# Scikit-Learn Classifiers on TF-IDF Features

In [33]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
import joblib
from sklearn.linear_model import PassiveAggressiveClassifier, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

In [34]:
# Load the preprocessed claims; the first CSV column is used as the row index.
claims = pd.read_csv(
    "../../../data/preprocessed_claims_new.csv",
    sep=",",
    index_col=0,
)
claims.shape

(40608, 5)

In [35]:
# Extract the year prefix of each "YYYY-MM-DD" date once, using the vectorised
# string accessor, instead of re-running a Python-level lambda three times.
claim_year = claims["date"].str.split("-").str[0]
# Claims from 2022 are held out for validation; everything else is training data.
val_idx = claim_year == "2022"
# Derive the training mask as the exact complement so the two can never overlap.
train_idx = ~val_idx
# double check: no 2022 claim may remain in the training split
"2022" in set(claim_year[train_idx])

False

Get the training data: use either the date-based training split (`train_idx`) or the augmented data set.

In [36]:
# training data: claim texts and their labels, selected with the training mask
X = claims.loc[train_idx, "claim"].values
y = claims.loc[train_idx, "truth_rating"].values
# validation data: same columns, selected with the validation mask
X_val = claims.loc[val_idx, "claim"].values
y_val = claims.loc[val_idx, "truth_rating"].values
tuple(arr.shape for arr in (X, y, X_val, y_val))

((37983,), (37983,), (2625,), (2625,))

In [5]:
# for augmented data
# This cell used to overwrite X and y unconditionally, so a top-to-bottom run
# silently trained every later cell on the augmented set. The override is now
# opt-in: flip the flag to True to train on the augmented claims instead.
USE_AUGMENTED = False

aug_claims = pd.read_csv("../../../data/augmented_claims.csv")
X_aug = aug_claims["claim"].values
y_aug = aug_claims["truth_rating"].values
if USE_AUGMENTED:
    X, y = X_aug, y_aug
print(X_aug.shape, y_aug.shape)

(26895,) (26895,)


#### Create naively balanced training data

In [6]:
# Random state used by the sampling and shuffling cells of this section.
Seed = 417
# Rows selected by the training mask, and how many of them carry each label.
training_set = claims.loc[train_idx]
label_dist = training_set["truth_rating"].value_counts()
label_dist

FALSE    21565
OTHER     9891
TRUE      6527
Name: truth_rating, dtype: int64

In [7]:
# Look up the rarest label and its count explicitly rather than relying on
# value_counts() having sorted the series so that the last entry is the minimum.
smallest_class, min_N = label_dist.idxmin(), label_dist.min()
smallest_class, min_N

('TRUE', 6527)

In [8]:
# Down-sample the FALSE and OTHER rows to min_N each (without replacement, fixed seed);
# the TRUE rows are taken as they are, without sampling.
false_claims = training_set.loc[training_set["truth_rating"] == "FALSE"].sample(
    n=min_N, replace=False, random_state=Seed
)
other_claims = training_set.loc[training_set["truth_rating"] == "OTHER"].sample(
    n=min_N, replace=False, random_state=Seed
)
true_claims = training_set.loc[training_set["truth_rating"] == "TRUE"]

In [9]:
# Stack the per-class frames row-wise, then shuffle all rows
# (frac=1.0 without replacement keeps every row exactly once).
balanced_train = (
    pd.concat([true_claims, other_claims, false_claims], axis=0)
    .sample(frac=1.0, replace=False, random_state=Seed)
)

In [10]:
# Peek at the first five shuffled rows.
balanced_train.head(n=5)

Unnamed: 0,ID,claim,date,truth_rating,n_token
9286,http://data.gesis.org/claimskg/creative_work/3...,"Muslim immigrants were responsible for 11,000 ...",2019-02-04,FALSE,17
36020,http://data.gesis.org/claimskg/creative_work/e...,Betsy DeVos and her family contributed million...,2017-02-08,TRUE,15
20651,http://data.gesis.org/claimskg/creative_work/8...,'Germany owes ... vast sums of money to NATO &...,2017-03-18,FALSE,28
39537,http://data.gesis.org/claimskg/creative_work/4...,Army protects protesters from police in Poland,2021-04-03,OTHER,7
29853,http://data.gesis.org/claimskg/creative_work/c...,A 14-year-old Texas girl became pregnant due t...,2015-10-07,OTHER,11


In [11]:
# Claim texts and labels of the balanced training frame, as arrays.
X_bal, y_bal = (
    balanced_train["claim"].values,
    balanced_train["truth_rating"].values,
)

## Prepare Features

In [37]:
# TF-IDF vectorizer with default settings, fitted on the training texts in X.
vec = TfidfVectorizer()
vec.fit(raw_documents=X)

In [38]:
# Map the training texts into the fitted TF-IDF space
X_ = vec.transform(X)
# validation data goes through the same fitted vectorizer
X_val_ = vec.transform(X_val)
tuple(mat.shape for mat in (X_, X_val_))

((37983, 28668), (2625, 28668))

### Apply dimensionality reduction (optional)
Run this step to obtain lower-dimensional features, e.g. for fitting an MLP classifier.

In [7]:
# Truncated SVD reducer: 1000 output components, fixed random state.
svd = TruncatedSVD(
    n_components=1000,
    random_state=417,
)

In [8]:
# Fit the SVD on the training TF-IDF matrix and reduce it in one step.
X_svd = svd.fit_transform(X=X_)
X_svd.shape

(37983, 1000)

In [9]:
# Reduce the validation matrix with the already-fitted SVD (no refit).
X_val_svd = svd.transform(X=X_val_)
X_val_svd.shape

(2625, 1000)

### Fit the model

In [39]:
def train_eval(clf, X_train, y_train, X_val, y_val):
    """Fit ``clf`` on the training data and print a validation report.

    Parameters
    ----------
    clf : estimator providing ``fit(X, y)`` and ``predict(X)``; fitted in place.
    X_train, y_train : features and labels passed to ``clf.fit``.
    X_val, y_val : features passed to ``clf.predict`` and the labels the
        predictions are compared against.

    Returns
    -------
    None. The ``classification_report`` text is printed, not returned.
    """
    clf.fit(X_train, y_train)
    report = classification_report(y_true=y_val, y_pred=clf.predict(X_val))
    print(report)

In [63]:
# Candidate classifiers; every estimator that accepts a random_state uses 417.
paclf = PassiveAggressiveClassifier(C=0.01, n_jobs=-1, random_state=417)
knn = KNeighborsClassifier(weights="distance", n_neighbors=6)
rfc = RandomForestClassifier(criterion="entropy", n_jobs=-1, random_state=417)
sgd = SGDClassifier(loss="squared_error", n_jobs=-1, random_state=417)
mlp = MLPClassifier(random_state=417)

In [64]:
# Fit the SGD classifier on the TF-IDF training matrix and report on validation.
train_eval(clf=sgd, X_train=X_, y_train=y, X_val=X_val_, y_val=y_val)

              precision    recall  f1-score   support

       FALSE       0.86      0.94      0.90      2119
       OTHER       0.52      0.33      0.40       396
        TRUE       0.33      0.16      0.22       110

    accuracy                           0.82      2625
   macro avg       0.57      0.48      0.51      2625
weighted avg       0.79      0.82      0.79      2625



In [None]:
# Sweep the C parameter of the PassiveAggressive classifier on the SVD-reduced
# features, printing the estimator and its validation report for each value.
ps = [
    2, 1.5, 1., 0.75, 0.5,
    0.25, 0.1, 0.01, 0.001,
]
for p in ps:
    clf = PassiveAggressiveClassifier(C=p, random_state=417, n_jobs=-1)
    print(clf)
    train_eval(clf=clf, X_train=X_svd, y_train=y, X_val=X_val_svd, y_val=y_val)
    print("_" * 100)

### Save the model

In [477]:
# Persistence is disabled: both dump calls are commented out, so running this
# cell writes nothing. Uncomment to store the fitted vectorizer and a model.
# NOTE(review): `paclf` is only constructed in this notebook; the visible cells
# fit `sgd` and the per-C `clf` instances, never `paclf`. Confirm which fitted
# estimator should be saved before enabling the second line.
#joblib.dump(vec, "../../../data/vectorizer_v1.pkl")
#joblib.dump(paclf, "../../../data/model_v1.pkl")

# Evaluate on the test set

In [42]:
# Read the held-out test set; the first column is used as the row index.
# NOTE(review): the file carries a .py extension but is parsed as CSV here; confirm the file name.
test_set = pd.read_csv(
    "../../../data/test_set.py",
    index_col=0,
)
test_set.shape

(1680, 3)

In [43]:
# Test texts and their gold labels (the test set's label column is named "label").
X_test, y_test = test_set["claim"].values, test_set["label"].values

In [44]:
# Vectorize the test texts with the fitted TF-IDF vectorizer.
X_test_ = vec.transform(X_test)
# Optional SVD path (disabled):
#X_test_svd = svd.transform(X_test_)
X_test_.shape  #, X_test_svd.shape

(1680, 28668)

In [65]:
# Score the SGD model on the test set. Predictions of "OTHER" are renamed to
# "NEITHER" before they are compared with the test labels.
clf = sgd
test_predictions = clf.predict(X_test_)
final_predictions = np.array(
    [pred if pred != "OTHER" else "NEITHER" for pred in test_predictions]
)
report = classification_report(y_true=y_test, y_pred=final_predictions)
print(report)
# Distribution of the predicted labels
pd.Series(final_predictions).value_counts()

              precision    recall  f1-score   support

       FALSE       0.48      0.93      0.63       700
     NEITHER       0.63      0.20      0.30       679
        TRUE       0.39      0.14      0.21       301

    accuracy                           0.49      1680
   macro avg       0.50      0.42      0.38      1680
weighted avg       0.52      0.49      0.42      1680



FALSE      1354
NEITHER     217
TRUE        109
dtype: int64