# Custom Classifier on Textual Features

In [1]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.metrics import classification_report
from nltk.tokenize import word_tokenize
import nltk
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nickr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Load Data

In [2]:
# Read the preprocessed claims CSV; its first column becomes the DataFrame index.
claims = pd.read_csv("../../../data/preprocessed_claims_new.csv", index_col=0)
# Display (n_rows, n_columns) as a quick sanity check on the load.
claims.shape

(40608, 5)

In [3]:
# Temporal split: claims dated 2022 form the validation set, everything else
# is training data. The year is the part of the "date" string before the
# first "-"; extract it once instead of re-parsing the column for every mask.
claim_year = claims["date"].str.split("-").str[0]
val_idx = claim_year == "2022"
train_idx = ~val_idx  # exact complement, so the two masks can never overlap
# double check: no 2022 claim may remain in the training portion (expect False)
"2022" in set(claim_year[train_idx])

False

In [4]:
# Training portion: raw claim text as features, truth rating as target.
X = claims.loc[train_idx, "claim"].values
y = claims.loc[train_idx, "truth_rating"].values
# Validation portion (rows selected by val_idx), same two columns.
X_val = claims.loc[val_idx, "claim"].values
y_val = claims.loc[val_idx, "truth_rating"].values
# Show all four shapes to confirm features and targets line up.
X.shape, y.shape, X_val.shape, y_val.shape

((37983,), (37983,), (2625,), (2625,))

In [5]:
# The stopword corpus is a separate NLTK resource from "punkt"; fetch it here
# so this cell also works on a fresh environment (no-op if already present).
nltk.download("stopwords", quiet=True)
# English stopword list used by the cleaning helpers and the classifier.
STOPWORDS = nltk.corpus.stopwords.words("english")

### Own models

In [6]:
def clean_claims(claim, stopwords):
    """Remove stopwords from a claim and return the remainder as one string.

    Parameters
    ----------
    claim : str
        Raw claim text.
    stopwords : collection of str
        Words to drop; tokens are compared to it in lowercase.

    Returns
    -------
    str
        The surviving tokens, in their original casing, joined by single spaces.
    """
    # word_tokenize works better here than gensim's tokenize()
    kept_tokens = [
        token for token in word_tokenize(claim) if token.lower() not in stopwords
    ]
    return " ".join(kept_tokens)

In [7]:
def clean_and_tokenize(claim, stopwords):
    """Tokenize a claim, lowercase the tokens and drop the stopwords.

    Parameters
    ----------
    claim : str
        Raw claim text.
    stopwords : collection of str
        Words to drop; compared against the lowercased tokens.

    Returns
    -------
    list of str
        Lowercased tokens that are not stopwords, in their original order.
    """
    lowered = (token.lower() for token in word_tokenize(claim))
    return [token for token in lowered if token not in stopwords]

In [8]:
class N_words_clf:
    """Classify a claim by its overlap with each label's most frequent words.

    ``fit`` ranks, for every label, the tokens of that label's training claims
    by frequency. ``predict`` assigns each claim the label whose ``top_N`` most
    frequent tokens match the largest number of the claim's tokens.

    Parameters
    ----------
    top_N : int, default 100
        Number of most frequent tokens per label used when scoring.
    stopwords : collection of str, optional
        If given and non-empty, tokens are lowercased and those contained in
        this collection are dropped. Otherwise tokens are used exactly as
        ``word_tokenize`` returns them (case preserved).
    """

    def __init__(self, top_N=100, stopwords=None):
        self.N = top_N
        self.stopwords = stopwords

    def clean_and_tokenize(self, claim):
        """Tokenize ``claim``; lowercase and remove stopwords if any are set.

        Returns a list of tokens.
        """
        tokens = word_tokenize(claim)

        if self.stopwords:
            lowered = (w.lower() for w in tokens)
            return [w for w in lowered if w not in self.stopwords]
        return tokens

    def fit(self, X_train, y_train):
        """Build one frequency-ranked token profile per label.

        Parameters
        ----------
        X_train : numpy array of str
            Training claims; indexed with a boolean mask per label.
        y_train : numpy array
            Label of each training claim, aligned with ``X_train``.

        The result is stored in ``self.profiles`` as a dict mapping each label
        to a pandas Index of tokens ordered from most to least frequent.
        """
        profiles = {}
        for label in np.unique(y_train):
            subsample = X_train[y_train == label]
            # Flatten the per-claim token lists into one list for counting.
            tokens = [
                token
                for claim in subsample
                for token in self.clean_and_tokenize(claim)
            ]
            # dtype=object keeps an empty token list from becoming a float Series.
            profiles[label] = pd.Series(tokens, dtype=object).value_counts().index

        self.profiles = profiles

    def predict(self, X_test):
        """Predict a label for every claim in ``X_test``.

        A claim's score for a label is the number of its tokens (repeats
        counted) that occur among that label's ``top_N`` most frequent tokens.
        The highest score wins; ties go to the label that comes first in
        ``self.profiles``.

        Returns a numpy array with one predicted label per claim.
        """
        profiles = self.profiles
        # Build the top-N lookup sets once per call instead of slicing the
        # profile again for every token of every claim.
        top_words = {
            label: set(profile[:self.N]) for label, profile in profiles.items()
        }

        predictions = []
        for claim in tqdm(X_test):
            # Use the same preprocessing as fit(): with stopwords set, the
            # profiles are lowercased, so raw tokens would miss every
            # capitalised word.
            claim_tokens = self.clean_and_tokenize(claim)

            best_score = -1
            best_label = list(profiles.keys())[0]

            for label, words in top_words.items():
                score = sum(w in words for w in claim_tokens)
                if score > best_score:
                    best_score = score
                    best_label = label

            predictions.append(best_label)
        return np.array(predictions)

In [26]:
# Classifier configured with the English stopword list and a 10-word profile per label.
clf_w_stopwords = N_words_clf(top_N=10, stopwords=STOPWORDS)

In [27]:
# Build the per-label word profiles from the training claims (all non-2022 rows).
clf_w_stopwords.fit(X, y)

In [28]:
# Predict a label for every 2022 validation claim.
predictions = clf_w_stopwords.predict(X_val)
# One prediction per validation claim is expected.
predictions.shape

100%|██████████| 2625/2625 [00:04<00:00, 636.38it/s]


(2625,)

In [34]:
# Same 10-word setup but without a stopword list, so tokens are used as
# word_tokenize returns them (no lowercasing, no filtering).
clf_wo_stopwords = N_words_clf(top_N = 10)
clf_wo_stopwords.fit(X, y)
# NOTE: this overwrites the `predictions` produced by the stopword variant above.
predictions = clf_wo_stopwords.predict(X_val)
predictions.shape

100%|██████████| 2625/2625 [00:03<00:00, 794.03it/s]


(2625,)

In [None]:
# Sweep the profile size on the validation set (no stopword list) and print a
# classification report for each value.
ps = [10, 100, 1000, 10000, 100000]
for p in ps:
    print(f"WORD_MODEL with top_N = {p}")
    # NOTE: rebinds `clf_wo_stopwords` and `predictions` on every iteration;
    # after the loop they hold the results of the last top_N only.
    clf_wo_stopwords = N_words_clf(top_N=p)
    clf_wo_stopwords.fit(X, y)
    predictions = clf_wo_stopwords.predict(X_val)
    print(classification_report(y_true=y_val, y_pred=predictions))
    # Visual separator between the reports.
    print("_"*100)

## Eval on test

In [9]:
# Read the held-out test set CSV; its first column becomes the DataFrame index.
test_set = pd.read_csv("../../../data/test_set.csv", index_col=0)
# Display (n_rows, n_columns) as a quick sanity check on the load.
test_set.shape

(1680, 3)

In [10]:
# Test features and targets. Note the target column is "label" here, whereas
# the claims frame uses "truth_rating".
X_test = test_set["claim"].values
y_test = test_set["label"].values

In [18]:
# Final model: 100-word profiles, no stopword list, fitted on the training split.
# NOTE(review): top_N=100 is presumably the pick from the validation sweep — confirm.
best_n_word_clf = N_words_clf(top_N=100)
best_n_word_clf.fit(X, y)

In [19]:
clf = best_n_word_clf
test_predictions = clf.predict(X_test)
# Rename predicted "OTHER" labels to "NEITHER" before scoring.
# NOTE(review): presumably the test set calls this class "NEITHER" while the
# training data calls it "OTHER" — confirm against both label columns.
final_predictions = np.array(["NEITHER" if l == "OTHER" else l for l in test_predictions])
print(classification_report(y_true= y_test, y_pred=final_predictions))
# Distribution of predicted labels, to see how skewed the predictions are.
pd.Series(final_predictions).value_counts()

100%|██████████| 1680/1680 [08:08<00:00,  3.44it/s]

              precision    recall  f1-score   support

       FALSE       0.44      0.92      0.60       700
     NEITHER       0.52      0.11      0.18       679
        TRUE       0.41      0.10      0.16       301

    accuracy                           0.45      1680
   macro avg       0.45      0.38      0.31      1680
weighted avg       0.47      0.45      0.35      1680






FALSE      1461
NEITHER     145
TRUE         74
dtype: int64