# Costum & Scikit-Learn Classifier on Combined Claim Embeddings

In [1]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.metrics import classification_report
from nltk.tokenize import word_tokenize
import nltk
nltk.download("punkt")
import gensim
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import joblib
from sklearn.linear_model import PassiveAggressiveClassifier, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.decomposition import TruncatedSVD
from sklearn.tree import DecisionTreeClassifier

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nickr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Load data

In [2]:
claims = pd.read_csv("../../../data/preprocessed_claims_new.csv", index_col=0)
claims.shape

(40608, 5)

In [3]:
train_idx = claims["date"].apply(lambda x : x.split("-")[0]) != "2022"
val_idx = claims["date"].apply(lambda x : x.split("-")[0]) == "2022"
# double check
"2022" in claims[train_idx]["date"].apply(lambda x: x.split("-")[0]).value_counts().index

False

In [4]:
X = claims[train_idx]["claim"].values
y = claims[train_idx]["truth_rating"].values
# validation data
X_val = claims[val_idx]["claim"].values
y_val = claims[val_idx]["truth_rating"].values
X.shape, y.shape, X_val.shape, y_val.shape

((37983,), (37983,), (2625,), (2625,))

## N_words_clf + Sentiment & Emotion features

In [106]:
class N_words_clf:
    def __init__(self, top_N=100, stopwords=None):
        self.N = top_N
        self.stopwords = stopwords
    
    def clean_and_tokenize(self, claim):
            
        claim = word_tokenize(claim)
        
        if self.stopwords:
            filtered_claim = []
            for w in claim:
                if w.lower() not in self.stopwords:
                    filtered_claim.append(w.lower())
            return filtered_claim
        else:
            return claim 
    
    def fit(self, X_train, y_train):
        labels = np.unique(y_train)
        profiles = {}
        for label in labels:
            subsample = X_train[y_train == label]
            profile = pd.Series(np.hstack([self.clean_and_tokenize(claim) for claim in subsample])).value_counts().index
            # save to profiles
            profiles[label] = profile
            
        self.profiles = profiles
                
        
    def predict(self, X_test):
        profiles = self.profiles
        
        predictions = []
        for claim in tqdm(X_test):
            claim_profile = word_tokenize(claim)
            
            best_score = -1
            best_label = list(profiles.keys())[0]
            
            for label, profile in profiles.items():
                score = np.array([w in profile[:self.N] for w in claim_profile]).sum()
                if score > best_score:
                    best_score = score
                    best_label = label
            
            predictions.append(best_label)    
        return np.array(predictions)

In [107]:
extracted_features = np.load("../../../data/extracted_features_preprocessed_claims_new.npy")
extracted_features.shape

(40608, 15)

In [108]:
STOPWORDS = nltk.corpus.stopwords.words("english")

In [109]:
def add_features_to_claims(claims, features):
    new_claims = []
    for i, claim in enumerate(claims):
        added_text = []
        feature = features[i]
        
        if feature[0] > 0:
            added_text.append("negative")
        if feature[1] > 0:
            added_text.append("neutral")
        if feature[2] > 0:
            added_text.append("positive")
        if feature[3] > 0.05:
            added_text.append("positive_compound")
        if feature[3] < -0.05:
            added_text.append("negative compound")
        if feature[4] > 0:
            added_text.append("angry")
        if feature[5] > 0:
            added_text.append("anxious")
        if feature[6] > 0:
            added_text.append("happy")
        if feature[7] > 0:
            added_text.append("sad")
        if feature[8] > 0:
            added_text.append("surprised")
        if feature[9] < 2.5:
            added_text.append("specific_emotion")
        if feature[9] >= 2.5 and feature[9] < 10.0:
            added_text.append("emotional")
        if feature[9] > 10.0:
            added_text.append("no_specific_emotion")
        
        sentiment_features = " ".join(added_text)
        
        new_claims.append(claim + " " + sentiment_features)
    
    return np.array(new_claims)

In [110]:
X_text = add_features_to_claims(X, extracted_features[train_idx])
X_text_val = add_features_to_claims(X_val, extracted_features[val_idx])

In [112]:
clf_w_stopwords = N_words_clf(top_N=1000, stopwords=STOPWORDS)

In [113]:
clf_w_stopwords.fit(X_text, y)

In [114]:
predictions = clf_w_stopwords.predict(X_text_val)

100%|██████████| 2625/2625 [00:14<00:00, 187.42it/s]


In [115]:
print(classification_report(y_true=y_val, y_pred=predictions))

              precision    recall  f1-score   support

       FALSE       0.85      0.87      0.86      2119
       OTHER       0.34      0.27      0.30       396
        TRUE       0.15      0.21      0.18       110

    accuracy                           0.75      2625
   macro avg       0.45      0.45      0.45      2625
weighted avg       0.75      0.75      0.75      2625



## Eval on test set

In [5]:
test_set = pd.read_csv("test_set.csv", index_col=0)
extracted_features_test = np.load("../../../data/extracted_features_test.npy")
test_set.shape, extracted_features_test.shape

((1680, 3), (1680, 15))

In [6]:
X_test = test_set["claim"].values
y_test = test_set["label"].values

In [51]:
X_test_ = add_features_to_claims(X_test, extracted_features_test)

array(["A TikTok video shows a March 2022 school walkout in protest of Florida's 'Don't Say Gay' bill. negative neutral negative compound angry sad emotional",
       'Mattel sent Barbie dolls to the International Space Station (ISS) in 2022, as part of its efforts to encourage more young people to go into STEM-related fields. neutral positive positive_compound surprised specific_emotion',
       "'No one visiting Disney can get in” because of protest. negative neutral negative compound no_specific_emotion",
       'Two years ago we were “drilling our own oil for $27 a barrel.” Now, thanks to Joe Biden, we’re “paying $105 a barrel to Russia.” neutral positive positive_compound no_specific_emotion',
       'Families could suffer a £2,000-a-year average hit from the cost of living crisis. negative neutral negative compound anxious happy surprised emotional'],
      dtype='<U595')

In [64]:
best_n_word_clf = N_words_clf(top_N=100, stopwords=STOPWORDS)
best_n_word_clf.fit(X_, y)

In [65]:
clf = best_n_word_clf
test_predictions = clf.predict(X_test_)
final_predictions = np.array(["NEITHER" if l == "OTHER" else l for l in test_predictions])
print(classification_report(y_true= y_test, y_pred=final_predictions))
pd.Series(final_predictions).value_counts()

100%|██████████| 1680/1680 [02:52<00:00,  9.74it/s]

              precision    recall  f1-score   support

       FALSE       0.47      0.90      0.62       700
     NEITHER       0.58      0.20      0.30       679
        TRUE       0.38      0.14      0.21       301

    accuracy                           0.48      1680
   macro avg       0.48      0.41      0.38      1680
weighted avg       0.50      0.48      0.42      1680






FALSE      1334
NEITHER     234
TRUE        112
dtype: int64

## Stack TFIDF + Doc2vec + glove + extracted features

If the loaded models are not present, please create them using `model_v2.ipynb`

In [7]:
# load
doc2vec = gensim.models.doc2vec.Doc2Vec.load("doc2vec_model_100d_min3_100e.gensim")

2023-01-14 13:43:26,520 : INFO : loading Doc2Vec object from doc2vec_model_100d_min3_100e.gensim
2023-01-14 13:43:26,574 : INFO : loading dv recursively from doc2vec_model_100d_min3_100e.gensim.dv.* with mmap=None
2023-01-14 13:43:26,576 : INFO : loading wv recursively from doc2vec_model_100d_min3_100e.gensim.wv.* with mmap=None
2023-01-14 13:43:26,578 : INFO : setting ignored attribute cum_table to None
2023-01-14 13:43:26,917 : INFO : Doc2Vec lifecycle event {'fname': 'doc2vec_model_100d_min3_100e.gensim', 'datetime': '2023-01-14T13:43:26.917001', 'gensim': '4.3.0', 'python': '3.10.4 | packaged by conda-forge | (main, Mar 30 2022, 08:38:02) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'loaded'}


In [8]:
glove = gensim.models.keyedvectors.KeyedVectors.load("glove-twitter-100.gensim")

2023-01-14 13:43:27,253 : INFO : loading KeyedVectors object from glove-twitter-100.gensim
2023-01-14 13:43:28,616 : INFO : loading vectors from glove-twitter-100.gensim.vectors.npy with mmap=None
2023-01-14 13:43:29,536 : INFO : KeyedVectors lifecycle event {'fname': 'glove-twitter-100.gensim', 'datetime': '2023-01-14T13:43:29.536192', 'gensim': '4.3.0', 'python': '3.10.4 | packaged by conda-forge | (main, Mar 30 2022, 08:38:02) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'loaded'}


In [9]:
extracted_features = np.load("../../../data/extracted_features_preprocessed_claims_new.npy")

#### TFIDF Features

In [10]:
vec = TfidfVectorizer()
vec.fit(X)

In [11]:
X_tfidf = vec.transform(X)
X_tfidf.shape

(37983, 28668)

In [12]:
X_tfidf_val = vec.transform(X_val)
X_tfidf_val.shape

(2625, 28668)

#### Doc2vec Features

In [13]:
def vectorize_claim(claim, model):
    tokenized_claim = gensim.utils.simple_preprocess(claim)
    wv = model.infer_vector(tokenized_claim)
    return wv

In [108]:
X_doc2vec = pd.Series(X).apply(lambda x: vectorize_claim(x, doc2vec))

In [120]:
X_doc2vec = np.vstack(X_doc2vec.values)
X_doc2vec.shape

AttributeError: 'numpy.ndarray' object has no attribute 'values'

In [127]:
X_doc2vec_val = pd.Series(X_val).apply(lambda x: vectorize_claim(x, doc2vec))

In [131]:
X_doc2vec_val = np.vstack(X_doc2vec_val.values)
X_doc2vec_val.shape

(2625, 100)

#### Glove Features

In [14]:
def get_wv(w, model):
    try:
        return model.get_vector(w)
    except:
        return np.zeros((100,))

In [15]:
def retrieve_wv_average(claim, model):
    tokenized_claim = gensim.utils.simple_preprocess(claim)
    wv = np.array([get_wv(token, model) for token in tokenized_claim]).mean(axis=0)
    # this line for fasttext
    #wv = np.array([model.wv[token] for token in tokenized_claim]).mean(axis=0)
    return wv

In [16]:
X_glove = np.vstack([retrieve_wv_average(claim, glove) for claim in X])
X_glove_val = np.vstack([retrieve_wv_average(claim, glove) for claim in X_val])
X_glove.shape, X_glove_val.shape

((37983, 100), (2625, 100))

#### Combine to final features

In [17]:
# for convinience
#X_doc2vec = np.load("doc2vec_features_train.npy")
#X_doc2vec_val = np.load("doc2vec_features_val.npy")
#X_doc2vec.shape, X_doc2vec_val.shape

((37983, 100), (2625, 100))

#### Optional Dimensionality Reduction

In [18]:
svd = TruncatedSVD(n_components=100, random_state=417)

In [19]:
svd.fit(X_tfidf)

In [20]:
X_tfidf = svd.transform(X_tfidf)
X_tfidf_val = svd.transform(X_tfidf_val)

In [21]:
# stack all features to final features
X_ = np.hstack((X_tfidf, X_doc2vec, X_glove, extracted_features[train_idx]))
X_val_ = np.hstack((X_tfidf_val, X_doc2vec_val, X_glove_val, extracted_features[val_idx]))
X_.shape, X_val_.shape

((37983, 315), (2625, 315))

In [21]:
# stack all features to final features
X_ = np.hstack((X_tfidf.toarray(), X_doc2vec, X_glove, extracted_features[train_idx]))
X_val_ = np.hstack((X_tfidf_val.toarray(), X_doc2vec_val, X_glove_val, extracted_features[val_idx]))
X_.shape, X_val_.shape

AttributeError: 'numpy.ndarray' object has no attribute 'toarray'

### Train models on final features

In [22]:
def train_eval(clf, X_train, y_train, X_val, y_val):
    clf.fit(X_train, y_train)
    preds = clf.predict(X_val)
    print(classification_report(y_true=y_val,y_pred=preds))

In [20]:
from scipy import sparse
X_sparse = sparse.csr_matrix(X_)
X_sparse_val = sparse.csr_matrix(X_val_)

In [21]:
X_sparse.shape, X_sparse_val.shape

((37983, 28883), (2625, 28883))

In [23]:
paclf = PassiveAggressiveClassifier(random_state=417, n_jobs=-1, C=0.01) 
knn = KNeighborsClassifier(n_jobs=-1, n_neighbors=10) 
rfc = RandomForestClassifier(random_state=417, n_jobs=-1, criterion="entropy") 
sgd = SGDClassifier(random_state=417, n_jobs=-1, loss="hinge")
mlp = MLPClassifier(random_state=417)
dtc = DecisionTreeClassifier(random_state=417)

In [44]:
train_eval(mlp, X_, y, X_val_, y_val)

              precision    recall  f1-score   support

       FALSE       0.86      0.86      0.86      2119
       OTHER       0.35      0.33      0.34       396
        TRUE       0.24      0.28      0.26       110

    accuracy                           0.75      2625
   macro avg       0.48      0.49      0.48      2625
weighted avg       0.75      0.75      0.75      2625



In [24]:
from sklearn.ensemble import VotingClassifier

In [40]:
vtn = VotingClassifier(estimators= [("1", knn), ("3", mlp), ("4", rfc)], voting="soft", n_jobs=-1)
vtn.fit(X_, y)
pred = vtn.predict(X_val_)

In [43]:
print(classification_report(y_true=y_val, y_pred=pred))

              precision    recall  f1-score   support

       FALSE       0.85      0.94      0.89      2119
       OTHER       0.46      0.25      0.32       396
        TRUE       0.33      0.18      0.24       110

    accuracy                           0.80      2625
   macro avg       0.55      0.46      0.48      2625
weighted avg       0.77      0.80      0.78      2625



## Eval on test 

In [26]:
test_set = pd.read_csv("../../../data/test_set.csv", index_col=0)
extracted_features_test = np.load("../../../data/extracted_features_test.npy")
test_set.shape, extracted_features_test.shape

((1680, 3), (1680, 15))

In [27]:
X_tfidf_test = vec.transform(X_test)
X_doc2vec_test = pd.Series(X_test).apply(lambda x: vectorize_claim(x, doc2vec))
X_doc2vec_test = np.vstack(X_doc2vec_test.values)
X_glove_test = np.vstack([retrieve_wv_average(claim, glove) for claim in X_test])

In [28]:
X_tfidf_test = svd.transform(X_tfidf_test)

In [29]:
# combine features for x_test
X_test = np.hstack((X_tfidf_test, X_doc2vec_test, X_glove_test, extracted_features_test))

In [30]:
X_test.shape

(1680, 315)

In [45]:
clf = mlp
test_predictions = clf.predict(X_test)
final_predictions = np.array(["NEITHER" if l == "OTHER" else l for l in test_predictions])
print(classification_report(y_true= y_test, y_pred=final_predictions))
pd.Series(final_predictions).value_counts()

              precision    recall  f1-score   support

       FALSE       0.51      0.85      0.64       700
     NEITHER       0.59      0.29      0.39       679
        TRUE       0.35      0.21      0.27       301

    accuracy                           0.51      1680
   macro avg       0.48      0.45      0.43      1680
weighted avg       0.51      0.51      0.47      1680



FALSE      1164
NEITHER     334
TRUE        182
dtype: int64