# Baseline TFIDF + PassiveAggressive Classifier

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
import joblib
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier

In [3]:
# Load the preprocessed claims; the first CSV column becomes the row index.
claims = pd.read_csv(
    "../../../data/preprocessed_claims.csv",
    sep=",",
    index_col=0,
)
# Show (n_rows, n_columns) as a quick sanity check.
claims.shape

(18897, 5)

In [4]:
# Temporal split: claims dated 2022 are held out for validation, every other
# year is used for training.
# The year prefix is extracted once with the vectorised `.str` accessor rather
# than with two separate `apply(lambda ...)` passes, and the training mask is
# defined as the exact complement of the validation mask, so the two masks can
# never overlap or leave a row unassigned.
# NOTE: a row with a missing date compares unequal to "2022" and therefore
# lands in the training split instead of raising.
claim_year = claims["date"].str.split("-").str[0]
val_idx = claim_year == "2022"
train_idx = ~val_idx

In [5]:
# Sanity check: no claim from 2022 may remain in the training split, so the
# expression below is expected to display False.
train_years = claims[train_idx]["date"].apply(lambda d: d.split("-")[0])
"2022" in set(train_years)

False

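Get the training data. Run only ONE of the following options, because each of them overwrites `X` and `y`: the date-based training split (`train_idx`), the augmented data set, or the balanced subsample built in the "balance data" section.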

In [5]:
# Claim texts of the training split (single .loc lookup instead of chained indexing).
X = claims.loc[train_idx, "claim"].values

In [6]:
# Truth-rating labels of the training split, row-aligned with X.
y = claims.loc[train_idx, "truth_rating"].values

In [7]:
# Features and labels must contain the same number of rows.
X.shape, y.shape

((17202,), (17202,))

In [7]:
# Alternative training source: the augmented claims file.
# NOTE: this rebinds X and y, replacing whatever an earlier cell assigned.
aug_claims = pd.read_csv("../../../data/augmented_claims.csv")
X, y = (aug_claims[column].values for column in ("claim", "truth_rating"))
print(X.shape, y.shape)

(26895,) (26895,)


#### balance data

In [8]:
# Random seed passed as random_state to the class sampling and the row shuffle
# in this section, so the balanced subset is reproducible.
Seed = 417

In [51]:
# All rows of the training split (every column kept), used for class balancing.
training_set = claims.loc[train_idx]

In [53]:
# Number of training claims per label; shows how imbalanced the classes are.
training_set.truth_rating.value_counts()

FALSE    9887
OTHER    4312
TRUE     3003
Name: truth_rating, dtype: int64

In [70]:
# Downsample the FALSE and OTHER classes to the size of the TRUE class so that
# all three labels are equally represented.
# The per-class sample size is derived from the data instead of being the
# hard-coded literal 3003, so the cell stays correct when the claims file
# changes. With the current data the result is identical.
# sample(..., replace=False) raises ValueError if a class has fewer rows than
# the TRUE class, which makes a violated assumption visible immediately.
true_claims = training_set[training_set.truth_rating == "TRUE"]
n_per_class = len(true_claims)
false_claims = training_set[training_set.truth_rating == "FALSE"].sample(n_per_class, replace=False, random_state=Seed)
other_claims = training_set[training_set.truth_rating == "OTHER"].sample(n_per_class, replace=False, random_state=Seed)

In [71]:
# Stack the three class subsets and shuffle the rows: sample(frac=1.0) without
# replacement returns every row exactly once, in a seeded random order.
class_subsets = [true_claims, other_claims, false_claims]
stacked_claims = pd.concat(class_subsets, axis=0)
balanced_train = stacked_claims.sample(frac=1.0, replace=False, random_state=Seed)

In [72]:
# Preview the first rows of the shuffled, balanced training frame.
balanced_train.head()

Unnamed: 0,ID,claim,date,truth_rating,n_token
6209,http://data.gesis.org/claimskg/creative_work/5...,Photo shows Roh Moo-hyun making heart gesture ...,2021-09-10,OTHER,12
15131,http://data.gesis.org/claimskg/creative_work/d...,The Centre for Social Justice recently produce...,2021-12-21,OTHER,29
4427,http://data.gesis.org/claimskg/creative_work/4...,Photographs show Fox News host Chris Wallace v...,2020-10-05,FALSE,17
12598,http://data.gesis.org/claimskg/creative_work/b...,A picture shows a woman holding a protest sign...,2019-03-28,OTHER,22
5453,http://data.gesis.org/claimskg/creative_work/4...,The website Loser.com redirected to U.S. Presi...,2020-11-11,TRUE,18


In [73]:
# Use the balanced subset as training data.
# NOTE: this rebinds X and y. The outputs recorded further down report 26895
# training rows, which matches the augmented data set rather than this balanced
# subset, so they appear to come from a run that did not execute this cell last.
X = balanced_train.loc[:, "claim"].values
y = balanced_train.loc[:, "truth_rating"].values

### Train the classifier

In [8]:
# TF-IDF vectorizer with default settings; the vocabulary and IDF weights are
# learned from the training texts only.
vec = TfidfVectorizer()
vec.fit(X)

In [9]:
# Vectorized training texts produced by the fitted vectorizer.
X_ = vec.transform(X)

In [10]:
# (number of training claims, vocabulary size)
X_.shape

(26895, 22994)

In [108]:
# Passive-Aggressive linear classifier with a fixed seed for reproducibility,
# a very small step-size parameter C, and all CPU cores enabled (n_jobs=-1).
paclf = PassiveAggressiveClassifier(
    C=0.0001,
    n_jobs=-1,
    random_state=417,
)

In [187]:
# Train the classifier on the TF-IDF training matrix and its labels.
paclf.fit(X_, y)

### Evaluate the model on the training and validation data

In [188]:
# Predict on the training data first; comparing these scores with the
# validation scores shows how much the model overfits.
training_predictions = paclf.predict(X_)

In [189]:
# Per-class precision / recall / F1 on the training data.
training_report = classification_report(y, training_predictions)
print(training_report)

              precision    recall  f1-score   support

       FALSE       1.00      1.00      1.00      9887
       OTHER       1.00      1.00      1.00      8414
        TRUE       1.00      1.00      1.00      8594

    accuracy                           1.00     26895
   macro avg       1.00      1.00      1.00     26895
weighted avg       1.00      1.00      1.00     26895



In [190]:
# now with validation data
X_val = claims[val_idx]["claim"].values
y_val = claims[val_idx]["truth_rating"].values

In [191]:
# Only transform here (no re-fit), so validation texts are encoded with the
# vocabulary learned from the training data.
X_val_ = vec.transform(X_val)

In [192]:
# Predicted labels for the validation claims.
val_predictions = paclf.predict(X_val_)

In [193]:
# Validation results are clearly weaker than the perfect training scores: in
# the recorded run accuracy is 0.66, macro F1 is 0.51 and precision for TRUE is
# only 0.16, so the model overfits and its TRUE predictions are unreliable.
print(classification_report(y_true= y_val, y_pred= val_predictions))

              precision    recall  f1-score   support

       FALSE       0.87      0.68      0.76      1256
       OTHER       0.45      0.62      0.52       367
        TRUE       0.16      0.46      0.24        72

    accuracy                           0.66      1695
   macro avg       0.49      0.59      0.51      1695
weighted avg       0.75      0.66      0.69      1695



In [194]:
# Distribution of predicted labels on the validation set.
pd.Series(val_predictions).value_counts()

FALSE    988
OTHER    502
TRUE     205
dtype: int64

### Save the model

In [202]:
# Persist the fitted vectorizer together with the classifier; both are needed
# to score new claims later.
vectorizer_path = "../../../data/vectorizer_v1.pkl"
model_path = "../../../data/model_v1.pkl"
joblib.dump(vec, vectorizer_path)
joblib.dump(paclf, model_path)

['../../../data/model_v1.pkl']

In [203]:
# Preview of the raw test claims; the frame is only displayed, not assigned to
# a variable.
pd.read_csv("../../../data/raw_test_claims.csv")

Unnamed: 0.1,Unnamed: 0,ID,claim
0,0,http://data.gesis.org/claimskg/creative_work/d...,A TikTok video shows a March 2022 school walko...
1,1,http://data.gesis.org/claimskg/creative_work/3...,Mattel sent Barbie dolls to the International ...
2,2,http://data.gesis.org/claimskg/creative_work/2...,'No one visiting Disney can get in” because of...
3,3,http://data.gesis.org/claimskg/creative_work/f...,Two years ago we were “drilling our own oil fo...
4,4,http://data.gesis.org/claimskg/creative_work/5...,"Families could suffer a £2,000-a-year average ..."
...,...,...,...
1675,1675,http://data.gesis.org/claimskg/creative_work/4...,"'Goodbye' originated with 'god be with ye,' ab..."
1676,1676,http://data.gesis.org/claimskg/creative_work/9...,A map shows the potential sites of Russian nuc...
1677,1677,http://data.gesis.org/claimskg/creative_work/2...,Did North Dakota Senator Kevin Cramer Say Savi...
1678,1678,http://data.gesis.org/claimskg/creative_work/d...,Goat’s milk is a good substitute for baby form...


# Load test

In [195]:
# Load the test set from the notebook's working directory; the first CSV
# column becomes the row index.
test_set = pd.read_csv("test_set.csv", index_col=0)
# Show (n_rows, n_columns).
test_set.shape

(1680, 3)

In [196]:
# Test claim texts and their gold labels.
X_test = test_set.loc[:, "claim"].values
y_test = test_set.loc[:, "label"].values

In [197]:
# Encode the test texts with the vectorizer fitted on the training data.
X_test_ = vec.transform(X_test)

In [198]:
# Predicted labels for the test claims.
test_predictions = paclf.predict(X_test_)

In [199]:
# Rename the predicted label "OTHER" to "NEITHER" before scoring against the
# test labels; every other label is passed through unchanged.
label_renames = {"OTHER": "NEITHER"}
final_predictions = np.array(
    [label_renames.get(label, label) for label in test_predictions]
)

In [200]:
# Distribution of predicted labels on the test set (after renaming).
pd.Series(final_predictions).value_counts()

FALSE      747
NEITHER    499
TRUE       434
dtype: int64

In [201]:
# Per-class precision / recall / F1 on the test set.
test_report = classification_report(y_test, final_predictions)
print(test_report)

              precision    recall  f1-score   support

       FALSE       0.49      0.52      0.50       700
     NEITHER       0.49      0.36      0.41       679
        TRUE       0.32      0.46      0.37       301

    accuracy                           0.44      1680
   macro avg       0.43      0.44      0.43      1680
weighted avg       0.46      0.44      0.44      1680

