# Notebook for classifying claims and non-claims, based on features and embeddings

In [1]:
import spacy
import numpy as np

from scripts.load_corpus import DaxenbergerModified
from scripts.load_corpus import StabGurevychCorpus
from scripts.load_embeddings import GloVe
from scripts.load_embeddings import Word2Vec
from scripts.stop_words import StopWords

from sklearn.base import TransformerMixin
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import make_scorer
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import cross_validate, StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler

Loading corpus with script

In [2]:
#corpus_1 = StabGurevychCorpus()
#df_all = corpus_1.df_all
#print(df_all.head)

corpus_2 = DaxenbergerModified()
df_all = corpus_2.df_all
print(df_all.head)

Length of minority and majority 

<bound method NDFrame.head of                                                    text  target
0     Finally , some assert that the ability to plan...       0
1     For example , a girl , who is interested in li...       0
2     As a result , experiences of difficulties teac...       1
3     Nowadays , the difference between school and s...       0
4     Last but not least , the Internet offers a mor...       0
...                                                 ...     ...
4181  To give a brief conclusion , Modern technogy h...       1
4182  The main advantage of high-tech medical care i...       1
4183  Nobody likes attending boring conferences , or...       0
4184  Hence , from this case , we are capable of sta...       1
4185  However , since the budget and the recourse ar...       0

[4186 rows x 2 columns]>


Loading embeddings with script

In [3]:
glove = GloVe()
glove_embeddings = glove.load_model()
print("### GloVe embeddings: ###")
print(list(glove_embeddings.items())[:2])

word2vec = Word2Vec()
word2vec_embeddings = word2vec.load_model()
print("### Word2Vec embeddings: ###")
print(word2vec_embeddings.most_similar('toronto'))

Loading GloVe from file...
### GloVe embeddings: ###
[('the', array([ 4.6560e-02,  2.1318e-01, -7.4364e-03, -4.5854e-01, -3.5639e-02,
        2.3643e-01, -2.8836e-01,  2.1521e-01, -1.3486e-01, -1.6413e+00,
       -2.6091e-01,  3.2434e-02,  5.6621e-02, -4.3296e-02, -2.1672e-02,
        2.2476e-01, -7.5129e-02, -6.7018e-02, -1.4247e-01,  3.8825e-02,
       -1.8951e-01,  2.9977e-01,  3.9305e-01,  1.7887e-01, -1.7343e-01,
       -2.1178e-01,  2.3617e-01, -6.3681e-02, -4.2318e-01, -1.1661e-01,
        9.3754e-02,  1.7296e-01, -3.3073e-01,  4.9112e-01, -6.8995e-01,
       -9.2462e-02,  2.4742e-01, -1.7991e-01,  9.7908e-02,  8.3118e-02,
        1.5299e-01, -2.7276e-01, -3.8934e-02,  5.4453e-01,  5.3737e-01,
        2.9105e-01, -7.3514e-03,  4.7880e-02, -4.0760e-01, -2.6759e-02,
        1.7919e-01,  1.0977e-02, -1.0963e-01, -2.6395e-01,  7.3990e-02,
        2.6236e-01, -1.5080e-01,  3.4623e-01,  2.5758e-01,  1.1971e-01,
       -3.7135e-02, -7.1593e-02,  4.3898e-01, -4.0764e-02,  1.6425e-02,
  

Define custom tokenizer for later preprocessing

In [4]:
# list of punctuation marks
#punctuations = string.punctuation

load_model = spacy.load("en_core_web_sm", disable = ['parser','ner'])


def custom_tokenizer(sentence):
    """
    A customizable tokenizer to lemmatize words or remove stop words.

    input: str of tokens
    return: list with lemmatized tokens
    """
    tokens = load_model(sentence)

    # SpaCy stopwords without conjunctions and conjunctive adverbs
    test_stop_words = StopWords()
    
    #stop_words = spacy.lang.en.stop_words.STOP_WORDS # all stop words
    #stop_words = test_stop_words.STOP_WORDS # stop words without conjunctions
    #stop_words = test_stop_words.ALL_CONJUNCTIONS
    
    
    
    # lemmatizing each word and converting it to lower case
    tokens = [token.lemma_ for token in tokens]

    # removing stop words
    #tokens = [word for word in tokens if word not in stop_words]
    
    return tokens

test_sent = "However, dogs will improve the qualities of both universities and students, thus placing people in positions which suit them most"

print(custom_tokenizer(test_sent))

def pos_tagging(sentence):
    """
    A simple POS tagger, converting each word to its POS tag according to
    SpaCy.

    input: str of tokens
    return: list of POS tags for each token
    """
    tokens = load_model(sentence)
    tokens = [token.pos_ for token in tokens]
    
    return tokens


['however', ',', 'dog', 'will', 'improve', 'the', 'quality', 'of', 'both', 'university', 'and', 'student', ',', 'thus', 'place', 'people', 'in', 'position', 'which', 'suit', 'they', 'most']


Initializing scikit-learn Transformers for custom features

In [5]:
load_model = spacy.load("en_core_web_sm", disable=['parser','ner'])


class StatementLen(TransformerMixin):
    """
    This class converts input text to its corresponding token length.
    """
    def fit(self, X, y=None):
        return self
    
    def text_len(self, text):
        return len(text.split())
    
    def transform(self, X):
        """
        Transform instances to their corresponding token length

        input: str of tokens
        return: dict with token length per instance
        """
        feature_dicts = []

        for text in X:
            feature_dict = {
                "n_words_in_sentence": self.text_len(text)
            }
            feature_dicts.append(feature_dict)

        return feature_dicts


class GloVeEmbeddings(TransformerMixin):
    """ 
    This class helps with transforming text to corresponding embeddings
    """
    def __init__(self):
        pass
    
    def fit(self, X, y):
        return self
    
    def embed(self, text):
        """
        Creates phrase embeddings with GloVe, leveraging the linearity of 
        word vectors.

        input: str
        return: numpy array for phrase embedding of str
        """
        embedding_dim = 300
        phrase_embedding = np.zeros(embedding_dim)
        count = 0

        for word in text:
            try:
                phrase_embedding = np.add(phrase_embedding, glove_embeddings[word])
            except KeyError:
                continue
            
            count += 1

        #if count != 0:
        #    phrase_embedding = phrase_embedding / count

        return phrase_embedding


    def transform(self, X):
        """
        Transform each instance into corresponding phrase embedding

        input: list of strings
        return: list of phrase embeddings
        """
        embeddings = []
        
        for text in X:

            # GloVe phrase embedding
            embeddings.append(self.embed(text))

        return embeddings


class Word2VecEmbeddings(TransformerMixin):
    """ 
    This class helps with transforming text to corresponding embeddings
    """
    def __init__(self):
        pass
    
    def fit(self, X, y):
        return self
    
    def embed(self, text):
        """
        Creates phrase embeddings with Word2Vec, leveraging the linearity of 
        word vectors.

        input: str
        return: numpy array for phrase embedding of str
        """
        embedding_dim = 300
        phrase_embedding = np.zeros(embedding_dim)
        count = 0

        for word in text:
            try:
                phrase_embedding = np.add(phrase_embedding, word2vec_embeddings[word])
            except:
                continue
            
            count += 1

        #if count != 0:
        #    phrase_embedding = phrase_embedding / count

        return phrase_embedding


    def transform(self, X):
        """
        Transform each instance into corresponding phrase embedding

        input: list of strings
        return: list of phrase embeddings
        """
        embeddings = []
        
        for text in X:

            # Word2Vec phrase embedding
            embeddings.append(self.embed(text))

        return embeddings


# testing methods

test_sent = "This will improve the quality of both universities and students, thus placing people in positions which suit them most"

len_test = StatementLen()
print(len_test.text_len(test_sent))

19


Splitting data into train, validation, and test set

In [6]:
X = df_all["text"]
y = df_all["target"]

X_train, X_rem, y_train, y_rem = train_test_split(X, y,
                                                  train_size=0.7,
                                                  random_state=42)
print(X_train.shape)
print(X_rem.shape)
X_valid, X_test, y_valid, y_test = train_test_split(X_rem,y_rem, 
                                                    test_size=0.5,
                                                    random_state=42)
print(X_valid.shape)
print(X_test.shape)

(2930,)
(1256,)
(628,)
(628,)


Defining classifier

In [7]:
# Logistic Regression classifier

classifier_LR = LogisticRegression(solver='lbfgs', max_iter=1000)

Creating pipeline union

In [8]:
bow_pipe = Pipeline([ 
    ("vectorizer", CountVectorizer(tokenizer=custom_tokenizer,
                                   ngram_range=(1,2),
                                   token_pattern=None)),
    ]
)

glove_pipe = Pipeline([
    ("phrase_embeddings", GloVeEmbeddings()),
    ]
)

word2vec_pipe = Pipeline([
    ("phrase_embeddings", Word2VecEmbeddings()),
    ]
)

statement_len_pipe = Pipeline([
    ("len", StatementLen()),
    ("dict_vectorizer", DictVectorizer()),
    ]
)

pos_pipe = Pipeline([
    ("vectorizer", CountVectorizer(tokenizer=pos_tagging,
                                   ngram_range=(1,1),
                                   token_pattern=None)),
    ]
)

combined_pipe = FeatureUnion(
    transformer_list=[
        ("bow", bow_pipe),
        #("glove_embeddings", glove_pipe),
        #("word2vec_embeddings", word2vec_pipe),
        ("pos_tag", pos_pipe),
        #("statement_len", statement_len_pipe),
    ]
)

pipe = Pipeline([
    ("combined_pipe", combined_pipe),
    #("standardscaler", StandardScaler(with_mean=False)),
    ("classifier", classifier_LR),
    ]
)

#print(X_train.head(), y_train.head())

pipe.fit(X_train, y_train)


Evaluation with validation set:

Prediction results: \
1 = claim \
0 = non-claim

In [9]:
y_pred_valid = pipe.predict(X_valid)

#for (sample, pred) in zip(X_valid, y_pred):
#    print(f"{sample} \n Prediction => {pred}")

cr = metrics.classification_report(y_valid.tolist(), y_pred_valid, digits=4)
print(cr)


              precision    recall  f1-score   support

           0     0.6895    0.7857    0.7344       308
           1     0.7617    0.6594    0.7069       320

    accuracy                         0.7213       628
   macro avg     0.7256    0.7225    0.7207       628
weighted avg     0.7263    0.7213    0.7204       628



Evaluation with test set:

Prediction results: \
1 = claim \
0 = non-claim

In [10]:
y_pred_test = pipe.predict(X_test)

cr = metrics.classification_report(y_test.tolist(), y_pred_test, digits=4)
print(cr)


              precision    recall  f1-score   support

           0     0.7370    0.7876    0.7615       306
           1     0.7841    0.7329    0.7576       322

    accuracy                         0.7596       628
   macro avg     0.7605    0.7603    0.7595       628
weighted avg     0.7611    0.7596    0.7595       628



 10 fold cross-validation with stratification

In [11]:
sk_folds = StratifiedKFold(n_splits = 10)

scoring = {'recall0': make_scorer(recall_score, average=None, labels=[0]),
       'recall1': make_scorer(recall_score, average=None, labels=[1]),
       'precision0': make_scorer(precision_score, average=None, labels=[0]),
       'precision1': make_scorer(precision_score, average=None, labels=[1]),
       }

scores = cross_validate(pipe, X, y,
                        scoring=scoring,
                        cv=sk_folds,
                        return_train_score=False)

print(scores.keys())

print("Recall 0: " + str(scores["test_recall0"].mean()))
print("Recall 1: " + str(scores["test_recall1"].mean()))

macro_recall = (scores["test_recall0"].mean() + scores["test_recall1"].mean())/2

print("Macro Recall: " + str(macro_recall))


print("Precision 0: " + str(scores["test_precision0"].mean()))
print("Precision 1: " + str(scores["test_precision1"].mean()))

macro_precision = (scores["test_precision0"].mean() + scores["test_precision1"].mean())/2

print("Macro Precision: " + str(macro_precision))

f1_0 = 2 * (scores["test_precision0"].mean() * scores["test_recall0"].mean() / (scores["test_precision0"].mean() + scores["test_recall0"].mean()))
f1_1 = 2 * (scores["test_precision1"].mean() * scores["test_recall1"].mean() / (scores["test_precision1"].mean() + scores["test_recall1"].mean()))
macro_f1 = 2 * ((macro_precision * macro_precision) / (macro_precision + macro_precision))

print("F1 score for 0: " + str(f1_0))
print("F1 score for 1: " + str(f1_1))
print("Macro F1: " + str(macro_f1))
      

dict_keys(['fit_time', 'score_time', 'test_recall0', 'test_recall1', 'test_precision0', 'test_precision1'])
Recall 0: 0.7496582365003418
Recall 1: 0.7229095465937572
Macro Recall: 0.7362838915470495
Precision 0: 0.7305730165655127
Precision 1: 0.7436494533133711
Macro Precision: 0.7371112349394419
F1 score for 0: 0.7399925898049815
F1 score for 1: 0.733132849279856
Macro F1: 0.7371112349394419
