# Scikit-Learn Classifier on Word/Document Embeddings

In [1]:
import pandas as pd
import numpy as np
import gensim
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from sklearn.metrics import classification_report
import joblib
from sklearn.linear_model import PassiveAggressiveClassifier, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from tqdm import tqdm

## Load and preprocess data

In [2]:
claims = pd.read_csv("../../../data/preprocessed_claims_new.csv", index_col=0)
claims.shape

(40608, 5)

In [3]:
train_idx = claims["date"].apply(lambda x : x.split("-")[0]) != "2022"
val_idx = claims["date"].apply(lambda x : x.split("-")[0]) == "2022"
# double check
"2022" in claims[train_idx]["date"].apply(lambda x: x.split("-")[0]).value_counts().index

False

In [4]:
X = claims[train_idx]["claim"].values
y = claims[train_idx]["truth_rating"].values
# validation data
X_val = claims[val_idx]["claim"].values
y_val = claims[val_idx]["truth_rating"].values
X.shape, y.shape, X_val.shape, y_val.shape

((37983,), (37983,), (2625,), (2625,))

## Fit the Vectorizer

In [5]:
def prepare_corpus(claims, test=False):
    for i, claim in enumerate(claims):
        tokenized_claim = gensim.utils.simple_preprocess(claim)
        if test:
            yield tokenized_claim
        else:
            yield gensim.models.doc2vec.TaggedDocument(tokenized_claim, [i])

In [6]:
train_corpus = list(prepare_corpus(X))

In [7]:
train_corpus[:3]

[TaggedDocument(words=['the', 'epa', 'wants', 'to', 'hire', 'new', 'government', 'regulators', 'that', 'will', 'cost', 'the', 'taxpayer', 'billion'], tags=[0]),
 TaggedDocument(words=['actual', 'video', 'of', 'iraqi', 'soldier', 'saying', 'goodbye', 'to', 'his', 'family'], tags=[1]),
 TaggedDocument(words=['bus', 'launched', 'in', 'august', 'in', 'pakistan', 'falls', 'into', 'sinkhole'], tags=[2])]

## Train Doc2vec model

In [71]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=100, min_count=3, epochs=100)

2023-01-11 20:56:12,184 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec<dm/m,d100,n5,w5,mc3,s0.001,t3>', 'datetime': '2023-01-11T20:56:12.184215', 'gensim': '4.3.0', 'python': '3.10.4 | packaged by conda-forge | (main, Mar 30 2022, 08:38:02) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'created'}


In [72]:
model.build_vocab(train_corpus)

2023-01-11 20:56:12,406 : INFO : collecting all words and their counts
2023-01-11 20:56:12,408 : INFO : PROGRESS: at example #0, processed 0 words (0 words/s), 0 word types, 0 tags
2023-01-11 20:56:12,599 : INFO : PROGRESS: at example #10000, processed 154422 words (812000 words/s), 15513 word types, 0 tags
2023-01-11 20:56:12,844 : INFO : PROGRESS: at example #20000, processed 307227 words (626396 words/s), 21011 word types, 0 tags
2023-01-11 20:56:13,061 : INFO : PROGRESS: at example #30000, processed 460075 words (712391 words/s), 24905 word types, 0 tags
2023-01-11 20:56:13,177 : INFO : collected 27492 word types and 37983 unique tags from a corpus of 37983 examples and 578574 words
2023-01-11 20:56:13,178 : INFO : Creating a fresh vocabulary
2023-01-11 20:56:13,341 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=3 retains 12669 unique words (46.08% of original 27492, drops 14823)', 'datetime': '2023-01-11T20:56:13.341762', 'gensim': '4.3.0', 'python': '3.10.4 | packa

In [73]:
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

2023-01-11 20:56:13,945 : INFO : Doc2Vec lifecycle event {'msg': 'training model with 3 workers on 12669 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2023-01-11T20:56:13.945762', 'gensim': '4.3.0', 'python': '3.10.4 | packaged by conda-forge | (main, Mar 30 2022, 08:38:02) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'train'}
2023-01-11 20:56:15,097 : INFO : EPOCH 0 - PROGRESS: at 17.00% examples, 75664 words/s, in_qsize 5, out_qsize 0
2023-01-11 20:56:16,151 : INFO : EPOCH 0 - PROGRESS: at 35.78% examples, 82574 words/s, in_qsize 6, out_qsize 0
2023-01-11 20:56:17,160 : INFO : EPOCH 0 - PROGRESS: at 49.65% examples, 78094 words/s, in_qsize 5, out_qsize 0
2023-01-11 20:56:18,219 : INFO : EPOCH 0 - PROGRESS: at 66.90% examples, 78878 words/s, in_qsize 5, out_qsize 0
2023-01-11 20:56:19,242 : INFO : EPOCH 0 - PROGRESS: at 80.63% examples, 76670 words/s, in_qsize 6, out_qsize 0
2023-01-1

In [74]:
model.save("doc2vec_model_100d_min3_100e.gensim")

2023-01-11 21:16:26,835 : INFO : Doc2Vec lifecycle event {'fname_or_handle': 'doc2vec_model_100d_min3_200e.gensim', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2023-01-11T21:16:26.835553', 'gensim': '4.3.0', 'python': '3.10.4 | packaged by conda-forge | (main, Mar 30 2022, 08:38:02) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'saving'}
2023-01-11 21:16:26,836 : INFO : not storing attribute cum_table
2023-01-11 21:16:26,923 : INFO : saved doc2vec_model_100d_min3_200e.gensim


### Load pre-trained word vectors

Download pre-trained static word embeddings from a github repository vai gensim. Word vectors were trained on Twitter (2B tweets, 27B tokens, 1.2M vocab, uncased).

In [5]:
import gensim.downloader as api

In [6]:
model = api.load("glove-twitter-100")

2023-01-14 08:29:53,904 : INFO : loading projection weights from C:\Users\nickr/gensim-data\glove-twitter-100\glove-twitter-100.gz
2023-01-14 08:35:13,024 : INFO : KeyedVectors lifecycle event {'msg': 'loaded (1193514, 100) matrix of type float32 from C:\\Users\\nickr/gensim-data\\glove-twitter-100\\glove-twitter-100.gz', 'binary': False, 'encoding': 'utf8', 'datetime': '2023-01-14T08:35:13.024175', 'gensim': '4.3.0', 'python': '3.10.4 | packaged by conda-forge | (main, Mar 30 2022, 08:38:02) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'load_word2vec_format'}


In [143]:
model.get_vector("test").shape

(100,)

### Train own FASTTEXT model

In [44]:
from gensim.models.fasttext import FastText

In [45]:
fasttext_corpus = [claim for claim in X]

In [46]:
fasttext = FastText(vector_size=100, epochs=100)

2023-01-12 00:07:54,682 : INFO : FastText lifecycle event {'params': 'FastText<vocab=0, vector_size=100, alpha=0.025>', 'datetime': '2023-01-12T00:07:54.682009', 'gensim': '4.3.0', 'python': '3.10.4 | packaged by conda-forge | (main, Mar 30 2022, 08:38:02) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'created'}


In [47]:
fasttext.build_vocab(fasttext_corpus)

2023-01-12 00:07:56,208 : INFO : collecting all words and their counts
2023-01-12 00:07:56,213 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2023-01-12 00:07:56,429 : INFO : PROGRESS: at sentence #10000, processed 995085 words, keeping 155 word types
2023-01-12 00:07:56,651 : INFO : PROGRESS: at sentence #20000, processed 1980526 words, keeping 183 word types
2023-01-12 00:07:56,882 : INFO : PROGRESS: at sentence #30000, processed 2966833 words, keeping 192 word types
2023-01-12 00:07:57,038 : INFO : collected 204 word types from a corpus of 3726818 raw words and 37983 sentences
2023-01-12 00:07:57,039 : INFO : Creating a fresh vocabulary
2023-01-12 00:07:57,041 : INFO : FastText lifecycle event {'msg': 'effective_min_count=5 retains 115 unique words (56.37% of original 204, drops 89)', 'datetime': '2023-01-12T00:07:57.041040', 'gensim': '4.3.0', 'python': '3.10.4 | packaged by conda-forge | (main, Mar 30 2022, 08:38:02) [MSC v.1916 64 bit (AMD64)]', 'platf

In [235]:
fasttext.train(fasttext_corpus,
               epochs= fasttext.epochs, 
               total_examples= fasttext.corpus_count,
               total_words= fasttext.corpus_total_words)

2023-01-11 23:49:31,618 : INFO : FastText lifecycle event {'msg': 'training model with 3 workers on 115 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2023-01-11T23:49:31.618220', 'gensim': '4.3.0', 'python': '3.10.4 | packaged by conda-forge | (main, Mar 30 2022, 08:38:02) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'train'}
2023-01-11 23:49:32,652 : INFO : EPOCH 0 - PROGRESS: at 28.70% examples, 252474 words/s, in_qsize 5, out_qsize 0
2023-01-11 23:49:33,659 : INFO : EPOCH 0 - PROGRESS: at 64.03% examples, 282395 words/s, in_qsize 5, out_qsize 0
2023-01-11 23:49:34,685 : INFO : EPOCH 0 - PROGRESS: at 98.70% examples, 287631 words/s, in_qsize 4, out_qsize 0
2023-01-11 23:49:34,697 : INFO : EPOCH 0: training on 3726818 raw words (887433 effective words) took 3.1s, 289367 effective words/s
2023-01-11 23:49:35,746 : INFO : EPOCH 1 - PROGRESS: at 27.60% examples, 242964 words/s, in_qsize 

(88695688, 372681800)

In [236]:
#fasttext.save("fasttext-100d-100e.gensim")

2023-01-11 23:54:17,019 : INFO : FastText lifecycle event {'fname_or_handle': 'fasttext-100d-100e.gensim', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2023-01-11T23:54:17.019278', 'gensim': '4.3.0', 'python': '3.10.4 | packaged by conda-forge | (main, Mar 30 2022, 08:38:02) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'saving'}
2023-01-11 23:54:17,020 : INFO : storing np array 'vectors_ngrams' to fasttext-100d-100e.gensim.wv.vectors_ngrams.npy
2023-01-11 23:54:19,538 : INFO : not storing attribute buckets_word
2023-01-11 23:54:19,539 : INFO : not storing attribute vectors
2023-01-11 23:54:19,540 : INFO : not storing attribute cum_table
2023-01-11 23:54:19,547 : INFO : saved fasttext-100d-100e.gensim


## Vectorize data

In [163]:
#model = gensim.models.doc2vec.Doc2Vec.load("doc2vec_model_100d_min3_100e.gensim")

In [5]:
# glove
model = gensim.models.keyedvectors.KeyedVectors.load("glove-twitter-100.gensim")

2023-01-12 00:34:34,693 : INFO : loading KeyedVectors object from glove-twitter-100.gensim
2023-01-12 00:34:35,946 : INFO : loading vectors from glove-twitter-100.gensim.vectors.npy with mmap=None
2023-01-12 00:34:36,374 : INFO : KeyedVectors lifecycle event {'fname': 'glove-twitter-100.gensim', 'datetime': '2023-01-12T00:34:36.374690', 'gensim': '4.3.0', 'python': '3.10.4 | packaged by conda-forge | (main, Mar 30 2022, 08:38:02) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'loaded'}


In [10]:
# fasttext
# model = gensim.models.fasttext.FastText.load("fasttext-100d-100e.gensim")

2023-01-12 00:02:40,497 : INFO : loading FastText object from fasttext-100d-100e.gensim
2023-01-12 00:02:40,502 : INFO : loading wv recursively from fasttext-100d-100e.gensim.wv.* with mmap=None
2023-01-12 00:02:40,503 : INFO : loading vectors_ngrams from fasttext-100d-100e.gensim.wv.vectors_ngrams.npy with mmap=None
2023-01-12 00:02:41,362 : INFO : setting ignored attribute buckets_word to None
2023-01-12 00:02:41,363 : INFO : setting ignored attribute vectors to None
2023-01-12 00:02:41,367 : INFO : setting ignored attribute cum_table to None
2023-01-12 00:02:41,369 : INFO : FastText lifecycle event {'fname': 'fasttext-100d-100e.gensim', 'datetime': '2023-01-12T00:02:41.369268', 'gensim': '4.3.0', 'python': '3.10.4 | packaged by conda-forge | (main, Mar 30 2022, 08:38:02) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'loaded'}


In [6]:
def vectorize_claim(claim):
    tokenized_claim = gensim.utils.simple_preprocess(claim)
    wv = model.infer_vector(tokenized_claim)
    return wv

In [7]:
def get_wv(w):
    try:
        return model.get_vector(w)
    except:
        return np.zeros((100,))

In [8]:
def retrieve_wv_average(claim):
    tokenized_claim = gensim.utils.simple_preprocess(claim)
    wv = np.array([get_wv(token) for token in tokenized_claim]).mean(axis=0)
    # this line for fasttext
    #wv = np.array([model.wv[token] for token in tokenized_claim]).mean(axis=0)
    return wv

In [9]:
def retrieve_wv_multiply(claim):
    tokenized_claim = gensim.utils.simple_preprocess(claim)
    wv = np.array([get_wv(token) for token in tokenized_claim])
    # this line for fasttext
    #wv = np.array([model.wv[token] for token in tokenized_claim])
    return np.prod(wv, axis=0)

In [10]:
X_glove = np.vstack([retrieve_wv_average(claim) for claim in X])
X_glove_val = np.vstack([retrieve_wv_average(claim) for claim in X_val])
X_glove.shape, X_glove_val.shape

((37983, 100), (2625, 100))

In [87]:
X_ = np.vstack([vectorize_claim(claim) for claim in X])
X_val_ = np.vstack([vectorize_claim(claim) for claim in X_val])
X_.shape, X_val_.shape

((37983, 100), (2625, 100))

## Fit a Classifier

In [11]:
def train_eval(clf, X_train, y_train, X_val, y_val):
    clf.fit(X_train, y_train)
    preds = clf.predict(X_val)
    print(classification_report(y_true=y_val,y_pred=preds))

In [19]:
paclf = PassiveAggressiveClassifier(random_state=417, n_jobs=-1, C=0.1) 
knn = KNeighborsClassifier(n_jobs=-1) 
rfc = RandomForestClassifier(random_state=417, n_jobs=-1, criterion="entropy") 
sgd = SGDClassifier(random_state=417, n_jobs=-1, loss="modified_huber")
mlp = MLPClassifier(random_state=417)

In [93]:
train_eval(knn, X_glove, y, X_glove_val, y_val)

              precision    recall  f1-score   support

       FALSE       0.86      0.81      0.84      2119
       OTHER       0.33      0.40      0.36       396
        TRUE       0.14      0.16      0.15       110

    accuracy                           0.72      2625
   macro avg       0.44      0.46      0.45      2625
weighted avg       0.75      0.72      0.74      2625





In [114]:
ps = [(10,), (20,), (30,), (40,)]
for p in ps:
    clf = MLPClassifier(random_state=417, hidden_layer_sizes=p)
    print(clf)
    train_eval(clf, X_, y, X_val_, y_val)
    print("_"*100)

MLPClassifier(hidden_layer_sizes=(10,), random_state=417)
              precision    recall  f1-score   support

       FALSE       0.84      0.78      0.81      2119
       OTHER       0.22      0.23      0.23       396
        TRUE       0.13      0.29      0.18       110

    accuracy                           0.68      2625
   macro avg       0.40      0.43      0.41      2625
weighted avg       0.72      0.68      0.70      2625

____________________________________________________________________________________________________
MLPClassifier(hidden_layer_sizes=(20,), random_state=417)
              precision    recall  f1-score   support

       FALSE       0.85      0.71      0.77      2119
       OTHER       0.22      0.30      0.26       396
        TRUE       0.10      0.32      0.16       110

    accuracy                           0.63      2625
   macro avg       0.39      0.44      0.40      2625
weighted avg       0.72      0.63      0.67      2625

_____________________



              precision    recall  f1-score   support

       FALSE       0.85      0.67      0.75      2119
       OTHER       0.22      0.30      0.25       396
        TRUE       0.09      0.35      0.15       110

    accuracy                           0.60      2625
   macro avg       0.39      0.44      0.38      2625
weighted avg       0.73      0.60      0.65      2625

____________________________________________________________________________________________________
MLPClassifier(hidden_layer_sizes=(40,), random_state=417)
              precision    recall  f1-score   support

       FALSE       0.85      0.60      0.70      2119
       OTHER       0.20      0.35      0.26       396
        TRUE       0.10      0.39      0.16       110

    accuracy                           0.55      2625
   macro avg       0.38      0.45      0.37      2625
weighted avg       0.72      0.55      0.61      2625

_______________________________________________________________________________



### Voting Classifier

In [13]:
from sklearn.ensemble import VotingClassifier

In [24]:
vtn = VotingClassifier(estimators= [("1", mlp), ("2", knn)], voting="soft", n_jobs=-1)
vtn.fit(X_glove, y)
pred = vtn.predict(X_glove_val)
print(classification_report(y_val, pred))

              precision    recall  f1-score   support

       FALSE       0.85      0.92      0.88      2119
       OTHER       0.44      0.31      0.37       396
        TRUE       0.21      0.10      0.13       110

    accuracy                           0.79      2625
   macro avg       0.50      0.44      0.46      2625
weighted avg       0.76      0.79      0.77      2625



### Custom KNN clf

In [26]:
from sklearn.metrics.pairwise import cosine_similarity

In [76]:
class Cosine_KNN():
    def __init__(self, n_neighbors) -> None:
        self.n_neighbors = n_neighbors

    def fit(self, wv, y):
        # just save training embeddings and labels
        self.wv = wv
        self.y = y
    
    def predict(self, X_test):
        similarities = cosine_similarity(X_test, self.wv)
        sorted_similarities = np.argsort(similarities, axis=0)
        n_closest = sorted_similarities[:, -self.n_neighbors:]
        labels = np.array([pd.Series(row).value_counts().index[0] for row in self.y[n_closest]])
        return labels

The following KNN was an idea to base the similarity on the word mover distance but the implementation is way to slow so I dumped it (took 1:18 for a single prediction...)

In [5]:
class WM_KNN():
    def __init__(self, n_neighbors, model) -> None:
        self.n_neighbors = n_neighbors
        self.model = model

    def fit(self, X, y):
        # save the raw sentences and labels
        self.X = X
        self.y = y
    
    def predict(self, X_test):
        predictions = []
        for claim in tqdm(X_test):
            dists = []
            for train_claim in self.X:
                dists.append(self.model.wmdistance(claim, train_claim))
            
            # decision
            n_closest = np.argsort(np.array(dists))[:self.n_neighbors]
            predictions.append(pd.Series(self.y[n_closest]).value_counts().index[0])
        return np.array(predictions)

In [106]:
cosine_knn = Cosine_KNN(n_neighbors=17)
cosine_knn.fit(X_glove, y)
predictions = cosine_knn.predict(X_glove_val)
print(classification_report(y_val, predictions))

              precision    recall  f1-score   support

       FALSE       0.81      0.92      0.86      2119
       OTHER       0.16      0.06      0.09       396
        TRUE       0.05      0.02      0.03       110

    accuracy                           0.76      2625
   macro avg       0.34      0.33      0.33      2625
weighted avg       0.68      0.76      0.71      2625



## Eval on test set

In [15]:
test_set = pd.read_csv("../../../data/test_set.csv", index_col=0)
test_set.shape

(1680, 3)

In [16]:
X_test = test_set["claim"].values
y_test = test_set["label"].values

In [97]:
X_test_ = np.vstack([vectorize_claim(claim) for claim in X_test])

In [17]:
X_test_glove = np.vstack([retrieve_wv_average(claim) for claim in X_test])
X_test_glove.shape


(1680, 100)

In [107]:
clf = cosine_knn
test_predictions = clf.predict(X_test_glove)
final_predictions = np.array(["NEITHER" if l == "OTHER" else l for l in test_predictions])
print(classification_report(y_true= y_test, y_pred=final_predictions))
pd.Series(final_predictions).value_counts()

              precision    recall  f1-score   support

       FALSE       0.42      0.93      0.58       700
     NEITHER       0.42      0.06      0.11       679
        TRUE       0.33      0.04      0.07       301

    accuracy                           0.42      1680
   macro avg       0.39      0.34      0.25      1680
weighted avg       0.40      0.42      0.30      1680



FALSE      1542
NEITHER     105
TRUE         33
dtype: int64