## Set Up

In [1]:
import os
# Move the working directory two levels up before the project-local imports below run
# (presumably to the repository root so that `util`, `preprocessing` and `evaluator`
# resolve -- TODO confirm).
# NOTE: the path is relative, so re-running this cell without restarting the kernel
# moves up two more levels each time.
os.chdir('../..')
import warnings
# Silence all warnings for the rest of the session.
warnings.filterwarnings('ignore')
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from util.dataloader import DataLoader
from util.datasplitter import data_splitter
from preprocessing.preprocessor import Preprocessor
from preprocessing.fasttext_embeddings import FastTextEmbeddings
from evaluator import evaluate_classifier, get_summary_dataset
from nltk import download
#Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

#Load linguistic resources
# `quiet=True` hides download errors, so keep the status returned by every call
# instead of displaying only the result of the last one.
nltk_resources = ('stopwords', 'omw-1.4', 'punkt', 'wordnet')
nltk_download_ok = {name: download(name, quiet=True) for name in nltk_resources}
# Cell output: True only when every resource downloaded (or was already up to date);
# inspect `nltk_download_ok` to see which one failed otherwise.
all(nltk_download_ok.values())

True

In [2]:
# Single random seed, passed to data_splitter (seed=) and to every classifier (random_state=).
SEED = 42

## Prepare data

In [3]:
#Load data 
# DataLoader is given the single entry 'fake_news'; the loaded result is indexed by
# dataset name in later cells (data['politifact'], data['liar']).
dl = DataLoader(['fake_news'])
data = dl.load()

100%|███████████████████████████████████████████████████████████████████████████████| 624/624 [00:05<00:00, 119.02it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 432/432 [00:03<00:00, 119.87it/s]
100%|███████████████████████████████████████████████████████████████████████████| 13267/13267 [00:43<00:00, 306.55it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 5323/5323 [00:06<00:00, 835.35it/s]


In [4]:
#Initialize preprocessors
preprocessor = Preprocessor() #Preprocessor for standard text
# NOTE(review): tweet_preprocessor is not referenced by any cell visible in this section.
tweet_preprocessor = Preprocessor(is_tweet=True) #Preprocessor for tweets

In [5]:
# Split every dataset with the same options. data_splitter returns three parts;
# only the first (train) and last (test) are kept, the middle one (presumably the
# validation split requested by create_val_set=True) is discarded.
split_options = {'create_val_set': True, 'seed': SEED}

train_politifact, _, test_politifact = data_splitter(
    data['politifact'], preprocessor, **split_options)

# gossipcop is currently disabled; to re-enable it:
# train_gossipcop, _, test_gossipcop = data_splitter(
#     data['gossipcop'], preprocessor, **split_options)

train_liar, _, test_liar = data_splitter(
    data['liar'], preprocessor, **split_options)

1056 rows preprocessed in 12.166864156723022 seconds
10269 rows preprocessed in 2.6843976974487305 seconds
1283 rows preprocessed in 0.31337594985961914 seconds
1284 rows preprocessed in 0.30694580078125 seconds


In [6]:
%%time
#Load fasttext 
# The model path is relative, so it is resolved against the working directory
# set by os.chdir in the set-up cell. The recorded run took about 39 s.
fasttext = FastTextEmbeddings()
fasttext.load_model('fasttext/cc.en.300.bin')

Wall time: 38.9 s




In [7]:
#Generate embeddings
def _embed_with_labels(split):
    """Return the sentence embeddings of ``split['text']`` with a 'label' column attached.

    The labels are copied positionally (as a plain list) from ``split['label']``.
    """
    embedded = fasttext.generate_sentence_embeddings(split['text'])
    embedded['label'] = split['label'].to_list()
    return embedded


embedded_train_politifact = _embed_with_labels(train_politifact)
embedded_test_politifact = _embed_with_labels(test_politifact)

# gossipcop is currently disabled; to re-enable it:
# embedded_train_gossipcop = _embed_with_labels(train_gossipcop)
# embedded_test_gossipcop = _embed_with_labels(test_gossipcop)

embedded_train_liar = _embed_with_labels(train_liar)
embedded_test_liar = _embed_with_labels(test_liar)

starting to generate sentence embeddings


100%|████████████████████████████████████████████████████████████████████████████████| 675/675 [00:31<00:00, 21.28it/s]


starting to generate sentence embeddings


100%|████████████████████████████████████████████████████████████████████████████████| 212/212 [00:10<00:00, 20.92it/s]


starting to generate sentence embeddings


100%|███████████████████████████████████████████████████████████████████████████| 10269/10269 [00:17<00:00, 589.76it/s]


starting to generate sentence embeddings


100%|█████████████████████████████████████████████████████████████████████████████| 1283/1283 [00:05<00:00, 225.73it/s]


## Evaluate

**Note:** `fillna(0)` is used for texts that are represented by an empty string, whose fastText vector is therefore undefined.
**TODO:** Should we change that?

## Save data

In [10]:
def _make_models(tfidf_lr_c, ft_lr_c):
    """Build the four classifiers evaluated on one dataset.

    Parameters
    ----------
    tfidf_lr_c : float
        Value of ``C`` for the logistic regression stored under 'tfidf_lr'.
    ft_lr_c : float
        Value of ``C`` for the logistic regression stored under 'ft_lr'.

    Returns
    -------
    dict
        Keys 'tfidf_lr', 'ft_lr', 'tfidf_rf', 'ft_rf' (in that order), each mapped
        to a fresh, unfitted estimator seeded with SEED.
    """
    return {'tfidf_lr': LogisticRegression(C=tfidf_lr_c, random_state=SEED),
            'ft_lr': LogisticRegression(C=ft_lr_c, random_state=SEED),
            'tfidf_rf': RandomForestClassifier(random_state=SEED),
            'ft_rf': RandomForestClassifier(random_state=SEED)}


# Per-dataset C values for the two logistic regressions.
models_politifact = _make_models(tfidf_lr_c=3.7, ft_lr_c=9.626)
models_gossipcop = _make_models(tfidf_lr_c=8.959, ft_lr_c=7.346)
models_liar = _make_models(tfidf_lr_c=4.376, ft_lr_c=7.82)


_, _ = get_summary_dataset('politifact',train_politifact,test_politifact,
                           embedded_train_politifact,embedded_test_politifact,models_politifact)
# gossipcop is disabled: its split and embedding steps are commented out in the
# earlier cells, so train_gossipcop / embedded_train_gossipcop etc. do not exist and
# calling this on a fresh kernel raises NameError. Re-enable together with those cells.
# _, _ = get_summary_dataset('gossipcop',train_gossipcop,test_gossipcop,
#                            embedded_train_gossipcop,embedded_test_gossipcop,models_gossipcop)
_, _ = get_summary_dataset('liar',train_liar,test_liar,
                           embedded_train_liar,embedded_test_liar,models_liar)

(   tfidf_lr     ft_lr  tfidf_rf     ft_rf
 0  0.830189  0.830189  0.858491  0.849057
 1  0.794839  0.808048  0.836336  0.831746
 2  0.946790  0.904190  0.926944  0.910286
 3  0.928148  0.873039  0.901933  0.880884,
       label  tfidf_lr  ft_lr  tfidf_rf  ft_rf
 260       1         1      1         1      1
 832       0         0      1         0      1
 846       0         0      0         0      0
 1007      0         1      1         1      1
 88        1         1      1         1      1
 ...     ...       ...    ...       ...    ...
 72        1         1      1         1      1
 872       0         0      0         0      0
 44        1         1      1         1      1
 305       1         1      1         1      1
 321       1         1      1         1      1
 
 [212 rows x 5 columns])