# Notebook for classifying claims and non-claims, based on features and embeddings

In [1]:
import pandas as pd
import numpy as np
import spacy

from scripts.load_corpus import downsample, DaxenbergerModified, StabGurevychCorpus
from scripts.load_embeddings import GloVe, Word2Vec
from scripts.stop_words import StopWords

from sklearn.base import TransformerMixin
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import make_scorer
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import cross_validate, StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Loading corpus with script

In [2]:
#corpus_1 = StabGurevychCorpus()
#df_all = corpus_1.df_all
#print(df_all.head)

corpus_2 = DaxenbergerModified()
df_all = corpus_2.df_all
print(df_all.head)

<bound method NDFrame.head of                                                    text  target
0     How can anyone expect children could do well a...       0
1     Firstly , I think that the new high school wil...       1
2     With technological advances , children have mo...       0
3     Nowadays , many professors conduct research wh...       0
4     In today ' s world there are many great and us...       0
...                                                 ...     ...
7046  Last but not least , knowledge is worth mentio...       1
7047  To illustrate this point , I can write about m...       0
7048  Consider a circumstance in which a student who...       0
7049  in my opinion , reducing stress by listening t...       1
7050  In addition , the basic economic course can al...       1

[7051 rows x 2 columns]>


Loading embeddings with script

In [3]:
glove = GloVe()
glove_embeddings = glove.load_model()
print("### GloVe embeddings: ###")
#print(list(glove_embeddings.items())[:2])

word2vec = Word2Vec()
word2vec_embeddings = word2vec.load_model()
print("### Word2Vec embeddings: ###")
#print(word2vec_embeddings.most_similar('toronto'))

Loading GloVe from file...
### GloVe embeddings: ###
Loading Word2Vec from file...
### Word2Vec embeddings: ###


Define custom tokenizer for later preprocessing

In [4]:
# list of punctuation marks
#punctuations = string.punctuation

load_model = spacy.load("en_core_web_sm", disable = ['parser','ner'])


def custom_tokenizer(sentence):
    """
    A customizable tokenizer to lemmatize words or remove stop words.

    input: str of tokens
    return: list with lemmatized tokens
    """
    tokens = load_model(sentence)

    # SpaCy stopwords without conjunctions and conjunctive adverbs
    test_stop_words = StopWords()
    
    #stop_words = spacy.lang.en.stop_words.STOP_WORDS # all stop words
    #stop_words = test_stop_words.STOP_WORDS # stop words without conjunctions
    stop_words = test_stop_words.ALL_CONJUNCTIONS
    
    
    
    # lemmatizing each word and converting it to lower case
    tokens = [token.lemma_ for token in tokens]

    # removing stop words
    tokens = [word for word in tokens if word not in stop_words]
    
    return tokens

test_sent = "However, dogs will improve the qualities of both universities and students, thus placing people in positions which suit them most"

print(custom_tokenizer(test_sent))

def pos_tagging(sentence):
    """
    A simple POS tagger, converting each word to its POS tag according to
    SpaCy.

    input: str of tokens
    return: list of POS tags for each token
    """
    tokens = load_model(sentence)
    tokens = [token.pos_ for token in tokens]
    
    return tokens


[',', 'dog', 'will', 'improve', 'the', 'quality', 'of', 'both', 'university', 'student', ',', 'place', 'people', 'in', 'position', 'which', 'suit', 'they', 'most']


Initializing scikit-learn Transformers for custom features

In [5]:
load_model = spacy.load("en_core_web_sm", disable=['parser','ner'])


class StatementLen(TransformerMixin):
    """
    This class converts input text to its corresponding token length.
    """
    def fit(self, X, y=None):
        return self
    
    def text_len(self, text):
        return len(text.split())
    
    def transform(self, X):
        """
        Transform instances to their corresponding token length

        input: str of tokens
        return: dict with token length per instance
        """
        feature_dicts = []

        for text in X:
            feature_dict = {
                "n_words_in_sentence": self.text_len(text)
            }
            feature_dicts.append(feature_dict)

        return feature_dicts


class GloVeEmbeddings(TransformerMixin):
    """ 
    This class helps with transforming text to corresponding embeddings
    """
    def __init__(self):
        pass
    
    def fit(self, X, y):
        return self
    
    def embed(self, text):
        """
        Creates phrase embeddings with GloVe, leveraging the linearity of 
        word vectors.

        input: str
        return: numpy array for phrase embedding of str
        """
        embedding_dim = 300
        phrase_embedding = np.zeros(embedding_dim)
        count = 0

        for word in text:
            try:
                phrase_embedding = np.add(phrase_embedding, glove_embeddings[word])
            except KeyError:
                continue
            
            count += 1

        #if count != 0:
        #    phrase_embedding = phrase_embedding / count

        return phrase_embedding


    def transform(self, X):
        """
        Transform each instance into corresponding phrase embedding

        input: list of strings
        return: list of phrase embeddings
        """
        embeddings = []
        
        for text in X:

            # GloVe phrase embedding
            embeddings.append(self.embed(text))

        return embeddings


class Word2VecEmbeddings(TransformerMixin):
    """ 
    This class helps with transforming text to corresponding embeddings
    """
    def __init__(self):
        pass
    
    def fit(self, X, y):
        return self
    
    def embed(self, text):
        """
        Creates phrase embeddings with Word2Vec, leveraging the linearity of 
        word vectors.

        input: str
        return: numpy array for phrase embedding of str
        """
        embedding_dim = 300
        phrase_embedding = np.zeros(embedding_dim)
        count = 0

        for word in text:
            try:
                phrase_embedding = np.add(phrase_embedding, word2vec_embeddings[word])
            except:
                continue
            
            count += 1

        #if count != 0:
        #    phrase_embedding = phrase_embedding / count

        return phrase_embedding


    def transform(self, X):
        """
        Transform each instance into corresponding phrase embedding

        input: list of strings
        return: list of phrase embeddings
        """
        embeddings = []
        
        for text in X:

            # Word2Vec phrase embedding
            embeddings.append(self.embed(text))

        return embeddings


# testing methods

test_sent = "This will improve the quality of both universities and students, thus placing people in positions which suit them most"

len_test = StatementLen()
print(len_test.text_len(test_sent))

19


Splitting data into train, validation, and test set, applying single 
downsampling

In [6]:
X = df_all["text"]
y = df_all["target"]

X_train, X_rem, y_train, y_rem = train_test_split(X, y,
                                                  train_size=0.7,
                                                  random_state=42)

# downsampling after split:

X_tr = pd.concat([X_train, y_train], axis=1)
print(X_tr.shape)

downsample_man = downsample(X_tr)
X_train = downsample_man["text"]
y_train = downsample_man["target"]
print(X_train.shape)

X_valid, X_test, y_valid, y_test = train_test_split(X_rem,y_rem, 
                                                    test_size=0.5,
                                                    random_state=42)

(4935, 2)
(2980,)


Defining classifier

In [7]:
# Logistic Regression classifier

classifier_LR = LogisticRegression(solver='lbfgs', max_iter=1000)

Creating pipeline union

In [8]:
bow_pipe = Pipeline([ 
    ("vectorizer", CountVectorizer(tokenizer=custom_tokenizer,
                                   ngram_range=(1,2),
                                   token_pattern=None)),
    ]
)

glove_pipe = Pipeline([
    ("phrase_embeddings", GloVeEmbeddings()),
    ]
)

word2vec_pipe = Pipeline([
    ("phrase_embeddings", Word2VecEmbeddings()),
    ]
)

statement_len_pipe = Pipeline([
    ("len", StatementLen()),
    ("dict_vectorizer", DictVectorizer()),
    ]
)

pos_pipe = Pipeline([
    ("vectorizer", CountVectorizer(tokenizer=pos_tagging,
                                   ngram_range=(1,1),
                                   token_pattern=None)),
    ]
)

combined_pipe = FeatureUnion(
    transformer_list=[
        ("bow", bow_pipe),
        #("glove_embeddings", glove_pipe),
        #("word2vec_embeddings", word2vec_pipe),
        #("pos_tag", pos_pipe),
        ("statement_len", statement_len_pipe),
    ]
)

pipe = Pipeline([
    ("combined_pipe", combined_pipe),
    #("standardscaler", StandardScaler(with_mean=False)),
    ("classifier", classifier_LR),
    ]
)

#print(X_train.head(), y_train.head())

pipe.fit(X_train, y_train)


Evaluation with validation set:

Prediction results: \
1 = claim \
0 = non-claim

In [9]:
y_pred_valid = pipe.predict(X_valid)

#for (sample, pred) in zip(X_valid, y_pred):
#    print(f"{sample} \n Prediction => {pred}")

cr = metrics.classification_report(y_valid.tolist(), y_pred_valid, digits=4)
print(cr)


              precision    recall  f1-score   support

           0     0.8544    0.7200    0.7815       750
           1     0.5070    0.7013    0.5886       308

    accuracy                         0.7146      1058
   macro avg     0.6807    0.7106    0.6850      1058
weighted avg     0.7533    0.7146    0.7253      1058



Evaluation with test set:

Prediction results: \
1 = claim \
0 = non-claim

In [10]:
y_pred_test = pipe.predict(X_test)

cr = metrics.classification_report(y_test.tolist(), y_pred_test, digits=4)
print(cr)


              precision    recall  f1-score   support

           0     0.8592    0.7117    0.7785       763
           1     0.4836    0.6983    0.5714       295

    accuracy                         0.7079      1058
   macro avg     0.6714    0.7050    0.6750      1058
weighted avg     0.7544    0.7079    0.7208      1058



 10 fold cross-validation with stratification

In [11]:
# manual setup with downsampling

sk_folds = StratifiedKFold(n_splits = 10)
recall0_all = []
recall1_all = []
precision0_all = []
precision1_all = []

for i_train, i_test in sk_folds.split(X, y):
    X_train = df_all.iloc[i_train]
    X_test = df_all.iloc[i_test]

    down_sampled_X = downsample(X_train)

    pipe.fit(down_sampled_X["text"], down_sampled_X["target"])
    
    predictions = pipe.predict(X_test["text"])

    recall0 = recall_score(X_test["target"], predictions, pos_label=0)
    recall0_all.append(recall0)
    recall1 = recall_score(X_test["target"], predictions, pos_label=1)
    recall1_all.append(recall1)

    precision0 = precision_score(X_test["target"], predictions, pos_label=0)
    precision0_all.append(precision0)
    precision1 = precision_score(X_test["target"], predictions, pos_label=1)
    precision1_all.append(precision1)

print("Recall 0: " + str(np.mean(recall0_all)))
print("Recall 1: " + str(np.mean(recall1_all)))

macro_recall = (np.mean(recall0_all) + np.mean(recall1_all))/2
print("Macro Recall:" + str(macro_recall))

print("Precision 0: " + str(np.mean(precision0_all)))
print("Precision 1: " + str(np.mean(precision1_all)))

macro_precision = (np.mean(precision0_all) + np.mean(precision1_all))/2
print("Macro Precision:" + str(macro_precision))

f1_0 = 2 * (np.mean(precision0_all) * np.mean(recall0_all) / (np.mean(precision0_all)+ np.mean(recall0_all)))
f1_1 = 2 * (np.mean(precision1_all) * np.mean(recall1_all) / (np.mean(precision1_all)+ np.mean(recall1_all)))
macro_f1 = 2 * ((macro_precision * macro_precision) / (macro_precision + macro_precision))

print("F1 score for 0: " + str(f1_0))
print("F1 score for 1: " + str(f1_1))
print("Macro F1: " + str(macro_f1))

Recall 0: 0.7277069077875531
Recall 1: 0.7027842333105492
Macro Recall:0.7152455705490511
Precision 0: 0.8533068561392128
Precision 1: 0.5214824421951112
Macro Precision:0.687394649167162
F1 score for 0: 0.7855178845916085
F1 score for 1: 0.5987088363270728
Macro F1: 0.687394649167162
