# Model Trained on Low-dimensional Handcrafted Features

In [3]:
import numpy as np
import pandas as pd
import gensim
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import text2emotion as te
from sklearn.metrics import classification_report
from nrclex import NRCLex
from nltk.tokenize import word_tokenize
import re
from tqdm import tqdm
# Register tqdm's `progress_apply` on pandas objects (used for the demo extraction below).
tqdm.pandas()

# Download the NLTK resources this notebook relies on: the VADER lexicon for
# SentimentIntensityAnalyzer and the "punkt" models for word_tokenize.
nltk.download("vader_lexicon")
nltk.download("punkt")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\nickr\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nickr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Load data

In [4]:
# Load the preprocessed claims; the first CSV column becomes the DataFrame index.
claims = pd.read_csv("../../../data/preprocessed_claims_new.csv", index_col=0)

## Feature extraction function

In [5]:
def extract_features(claim, sid):
    """Build the handcrafted feature vector for a single claim.

    The vector is the concatenation, in this order, of:

    1. the values returned by ``sid.polarity_scores(claim)``,
    2. the values returned by ``te.get_emotion(claim)``,
    3. ``emotion_strength``: ``1 / (k / 10 + 1e-8)`` where ``k`` is
       ``len(NRCLex(claim).raw_emotion_scores)``,
    4. four token ratios (each count divided by the number of tokens):
       purely numeric tokens, tokens containing ``!``, tokens containing
       ``?``, and tokens containing at least one ASCII uppercase letter,
    5. the number of tokens.

    Parameters
    ----------
    claim : str
        Raw claim text.
    sid : object
        Sentiment analyzer exposing ``polarity_scores(text)`` that returns a
        mapping of numeric scores (the notebook passes an NLTK
        ``SentimentIntensityAnalyzer``).

    Returns
    -------
    numpy.ndarray
        1-D ``float64`` feature vector (15 values in the demo run of this
        notebook).

    Notes
    -----
    A claim that tokenizes to nothing used to produce ``0 / 0 = NaN`` token
    ratios; the ratios are now reported as ``0.0`` in that case so the
    resulting matrix stays usable by the downstream classifiers.
    """
    # fast
    sentiment_score = np.fromiter(sid.polarity_scores(claim).values(), dtype=np.float64)
    # this is quite slow
    emotion_score = np.fromiter(te.get_emotion(claim).values(), dtype=np.float64)
    # fast
    # NOTE(review): when no emotion entry is found this evaluates to ~1e8, a
    # huge outlier for unscaled models. Left unchanged so freshly extracted
    # features stay comparable with the cached .npy files -- confirm intent.
    emotion_strength = 1 / (len(NRCLex(claim).raw_emotion_scores) / 10 + 1e-8)
    # fast
    tokenized_claim = word_tokenize(claim)
    N = len(tokenized_claim)

    num = 0.0   # tokens made up of digits only
    excl = 0.0  # tokens containing an exclamation mark
    qest = 0.0  # tokens containing a question mark
    ent = 0.0   # tokens with an uppercase letter (presumably an entity proxy -- confirm)

    for token in tokenized_claim:
        if token.isdigit():
            num += 1.0
        if "!" in token:
            excl += 1.0
        if "?" in token:
            qest += 1.0
        if re.search(r"[A-Z]", token) is not None:
            ent += 1.0

    counts = np.array([num, excl, qest, ent], dtype=np.float64)
    # Guard against empty token lists: dividing by zero would yield NaN ratios.
    word_scores = counts / N if N > 0 else np.zeros_like(counts)

    features = np.hstack((sentiment_score, emotion_score, emotion_strength, word_scores, np.array(N, dtype=np.float64)))

    return features

In [4]:
# Create the sentiment analyzer once; it is passed to extract_features for every claim.
sid = SentimentIntensityAnalyzer()

In [5]:
# Demo run on the first 100 claims only; the stored output shows about 43 s for these 100 rows.
extracted_features_demo = claims["claim"].iloc[:100].progress_apply(lambda x: extract_features(x, sid))

100%|██████████| 100/100 [00:43<00:00,  2.30it/s]


In [6]:
# Turn the Series of per-claim 1-D feature vectors into one (n_claims, n_features) matrix.
X = np.vstack(extracted_features_demo.tolist())
X.shape

(100, 15)

In [7]:
# Optional: cache the demo feature matrix on disk so the slow extraction above need not be repeated
#np.save("extracted_features_demo", X)

## Train a model on extracted features

In [6]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import PassiveAggressiveClassifier, SGDClassifier, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier

In [7]:
# Load the claim tables together with their pre-computed feature matrices.
claims = pd.read_csv("../../../data/preprocessed_claims_new.csv", index_col=0)
augmented_claims = pd.read_csv("../../../data/augmented_claims.csv", index_col=0)
extracted_features = np.load("../../../data/extracted_features_preprocessed_claims_new.npy")
augmented_extracted_features = np.load("../../../data/extracted_features_augmented_claims.npy")

# Features are paired with claims purely by row position, so a stale or
# truncated .npy file would silently attach features to the wrong labels.
assert len(claims) == len(extracted_features), "claims and extracted_features are misaligned"
assert len(augmented_claims) == len(augmented_extracted_features), "augmented claims and features are misaligned"

claims.shape, extracted_features.shape, augmented_claims.shape, augmented_extracted_features.shape

((40608, 5), (40608, 15), (26895, 5), (26895, 15))

In [8]:
# Temporal split: claims dated 2022 form the validation set, everything else is training data.
claim_year = claims["date"].apply(lambda x: x.split("-")[0])
train_idx = claim_year != "2022"
val_idx = claim_year == "2022"
# double check: no 2022 claim may remain on the training side
"2022" in claim_year[train_idx].value_counts().index

False

In [9]:
# Slice the feature matrix with the boolean masks and take the labels from the matching claim rows.
X, X_val = extracted_features[train_idx], extracted_features[val_idx]
y = claims.loc[train_idx, "truth_rating"].values
y_val = claims.loc[val_idx, "truth_rating"].values
X.shape, y.shape, X_val.shape, y_val.shape

((37983, 15), (37983,), (2625, 15), (2625,))

#### For augmented data (all augmented claims are training data)

In [13]:
# The augmented features and labels are used as-is (no train/validation split is applied here).
X_aug = augmented_extracted_features
y_aug = augmented_claims["truth_rating"].values
X_aug.shape , y_aug.shape

((26895, 15), (26895,))

#### Define some helper functions

In [10]:
def train_eval(clf, X_train, y_train, X_val, y_val):
    """Fit ``clf`` on the training data and print its validation report.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    X_train, y_train
        Training features and labels.
    X_val, y_val
        Validation features and labels.

    Returns
    -------
    None
        The classification report is printed, not returned.
    """
    clf.fit(X_train, y_train)
    report = classification_report(y_true=y_val, y_pred=clf.predict(X_val))
    print(report)

In [11]:
def train_eval_ensemble(clfs, labels, X_train, y_train, X_val, y_val, voting="hard"):
    """Fit a ``VotingClassifier`` over ``clfs`` and print its validation report.

    Parameters
    ----------
    clfs : iterable of estimators
        Base classifiers combined by the ensemble.
    labels : iterable of str
        One name per classifier, in the same order as ``clfs``.
    X_train, y_train
        Training features and labels.
    X_val, y_val
        Validation features and labels.
    voting : str, default "hard"
        Forwarded to ``VotingClassifier(voting=...)``.

    Returns
    -------
    None
        The classification report is printed, not returned.

    Raises
    ------
    ValueError
        If ``clfs`` and ``labels`` differ in length. Previously ``zip``
        silently dropped the surplus entries, so a classifier could be left
        out of the ensemble without any warning.
    """
    clfs = list(clfs)
    labels = list(labels)
    if len(clfs) != len(labels):
        raise ValueError(
            f"Got {len(clfs)} classifiers but {len(labels)} labels; "
            "each classifier needs exactly one label."
        )

    # Named `estimators` rather than `input` to avoid shadowing the builtin.
    estimators = list(zip(labels, clfs))
    clf = VotingClassifier(estimators=estimators, voting=voting, n_jobs=-1)
    clf.fit(X_train, y_train)
    preds = clf.predict(X_val)
    print(classification_report(y_true=y_val, y_pred=preds))

In [63]:
# Candidate classifiers; every estimator that takes a seed uses random_state=417.
knn = KNeighborsClassifier(n_neighbors=45, n_jobs=-1) # NOTE(review): old note read "n_neighbors 45 -> 45", presumably a tuning leftover
rfc = RandomForestClassifier(random_state=417, min_samples_leaf=3, n_jobs=-1)
mlp = MLPClassifier(hidden_layer_sizes=(100, 100, 100), random_state=417, max_iter=500)
# The passive-aggressive classifier is currently disabled:
#paclf = PassiveAggressiveClassifier(random_state=417, C=0.001, n_jobs=-1)
sgd = SGDClassifier(loss="hinge", random_state=417, n_jobs=-1)
dtc = DecisionTreeClassifier(random_state=417, criterion="entropy", min_samples_leaf=30)

In [64]:
# Fits `dtc` in place; the test-set cell further down reuses this fitted tree.
train_eval(dtc, X, y, X_val, y_val)

              precision    recall  f1-score   support

       FALSE       0.83      0.90      0.86      2119
       OTHER       0.25      0.15      0.19       396
        TRUE       0.14      0.11      0.12       110

    accuracy                           0.76      2625
   macro avg       0.41      0.39      0.39      2625
weighted avg       0.71      0.76      0.73      2625



In [82]:
# Sweep min_samples_leaf for the entropy-based decision tree and print one report per value.
# NOTE(review): the stored output of this cell reports 1695 validation rows while the
# other cells report 2625 -- presumably it stems from an earlier split; re-run to refresh.
ps = [17, 18, 19, 20, 25, 30]
for p in ps:
    clf = DecisionTreeClassifier(random_state=417, criterion="entropy", min_samples_leaf=p)
    print(clf)
    train_eval(clf, X, y, X_val, y_val)
    # visual separator between consecutive reports
    print("_" *100)

DecisionTreeClassifier(criterion='entropy', min_samples_leaf=17,
                       random_state=417)
              precision    recall  f1-score   support

       FALSE       0.77      0.89      0.83      1256
       OTHER       0.36      0.16      0.22       367
        TRUE       0.24      0.28      0.26        72

    accuracy                           0.71      1695
   macro avg       0.46      0.44      0.44      1695
weighted avg       0.66      0.71      0.67      1695

____________________________________________________________________________________________________
DecisionTreeClassifier(criterion='entropy', min_samples_leaf=18,
                       random_state=417)
              precision    recall  f1-score   support

       FALSE       0.77      0.89      0.83      1256
       OTHER       0.38      0.17      0.23       367
        TRUE       0.24      0.28      0.26        72

    accuracy                           0.71      1695
   macro avg       0.47      0.45 

In [23]:
# Soft-voting ensemble of the random forest and the decision tree, registered as "1" and "2".
train_eval_ensemble([rfc, dtc], ["1", "2"], X, y, X_val, y_val, voting="soft")

              precision    recall  f1-score   support

       FALSE       0.83      0.92      0.88      2119
       OTHER       0.28      0.15      0.20       396
        TRUE       0.12      0.06      0.08       110

    accuracy                           0.77      2625
   macro avg       0.41      0.38      0.39      2625
weighted avg       0.72      0.77      0.74      2625



In [24]:
# train_eval_ensemble does not return its classifier, so the same soft-voting
# ensemble is built and fitted again here to keep the fitted object in `vtn`.
vtn = VotingClassifier(estimators= [("1", rfc), ("2", dtc)], voting="soft", n_jobs=-1)
vtn.fit(X, y)

### Evaluation on the test set

In [12]:
# Load the pre-computed test features and the test claims; rows are paired by position.
extracted_features_test = np.load("../../../data/extracted_features_test.npy")
test_set = pd.read_csv("../../../data/test_set.csv", index_col=0)
test_set.shape, extracted_features_test.shape

((1680, 3), (1680, 15))

In [13]:
# The test labels live in the "label" column (the training tables use "truth_rating").
X_test = extracted_features_test
y_test = test_set["label"].values
X_test.shape, y_test.shape

((1680, 15), (1680,))

In [65]:
# Evaluate the already fitted decision tree on the held-out test set.
clf = dtc
test_predictions = clf.predict(X_test)
# The model predicts "OTHER" where the test labels say "NEITHER"; rename before scoring.
final_predictions = np.array([{"OTHER": "NEITHER"}.get(label, label) for label in test_predictions])
print(classification_report(y_true=y_test, y_pred=final_predictions))
pd.Series(final_predictions).value_counts()

              precision    recall  f1-score   support

       FALSE       0.46      0.90      0.61       700
     NEITHER       0.59      0.20      0.29       679
        TRUE       0.36      0.11      0.17       301

    accuracy                           0.48      1680
   macro avg       0.47      0.40      0.36      1680
weighted avg       0.50      0.48      0.40      1680



FALSE      1362
NEITHER     226
TRUE         92
dtype: int64

### Bonus: Feature importance 

In [62]:
# One importance value per feature column, in the order produced by extract_features.
dtc.feature_importances_

array([0.05205377, 0.06632929, 0.021511  , 0.05191646, 0.00439509,
       0.00693073, 0.03425165, 0.03262334, 0.04198849, 0.05775555,
       0.06107909, 0.00227528, 0.07133161, 0.32528313, 0.17027553])