# BOW and TFIDF with Deep Learning
In this notebook I implement a simple multi-layer perceptron network (fully connected / linear layers) with the PyTorch framework.

In [1]:
import seaborn as sns
import matplotlib.pyplot as plt

from myfunctions import *
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from deep_pytorch import *
import joblib



In [2]:
# Read the dataset and split it into train / validation / test subsets
data_full = pd.read_json('fake_news.json', lines=True)
data_full = data_full.drop(columns=['article_link'])  # remove link column
df_train_f, df_test = split_dataframe(data_full, test_size=0.25, seed=1509)
df_train, df_validate = split_dataframe(df_train_f, test_size=0.2, seed=1309)


def print_label_proportion(df, title):
    """Print the sample count and the sarcastic / not-sarcastic share of a subset.

    Parameters
    ----------
    df : pandas.DataFrame
        Subset whose ``is_sarcastic`` column is summed to count the sarcastic
        rows (so the column is assumed to hold 0/1 labels).
    title : str
        Name shown in the report banner, e.g. ``'TRAINING'``.
    """
    numb_total = len(df)
    numb_sarcastic = np.sum(df['is_sarcastic'].to_numpy())
    numb_not_sarcastic = numb_total - numb_sarcastic
    print(f'===== {title} SAMPLES =====\nTotal Sample: {numb_total}\nSarcastic: {numb_sarcastic} ({np.round(numb_sarcastic/numb_total*100,2)}%)\nNot Sarcastic: {numb_not_sarcastic} ({np.round(numb_not_sarcastic/numb_total*100,2)}%)')


def preprocess_headline(df):
    """Return a preprocessed copy of ``df``; the input frame is left untouched.

    Steps applied to the ``headline`` column:
      1. lower-case the text (overwrites ``headline`` in the copy),
      2. ``remove_symbol`` -> stored in ``headline_s1``,
      3. ``lemmatize_word`` with ``'v'`` and then ``'n'`` -> stored in ``headline_s2``
         (presumably verb then noun lemmatisation -- confirm in ``myfunctions``).

    Working on a copy keeps the cell idempotent on re-run and avoids writing
    new columns into frames that may be slices of ``data_full``.
    """
    out = df.copy()
    out['headline'] = out['headline'].str.lower()
    out['headline_s1'] = out['headline'].apply(remove_symbol)
    out['headline_s2'] = out['headline_s1'].apply(
        lambda text: lemmatize_word(lemmatize_word(text, 'v'), 'n')
    )
    return out


# Proportion of each subset
print_label_proportion(df_train, 'TRAINING')
print_label_proportion(df_validate, 'VALIDATING')
print_label_proportion(df_test, 'TESTING')

# Preprocess headlines: lower case -> remove symbols -> lemmatise
data_train = preprocess_headline(df_train)
data_train_f = preprocess_headline(df_train_f)
data_val = preprocess_headline(df_validate)
data_test = preprocess_headline(df_test)

===== TRAINING SAMPLES =====
Total Sample: 17171
Sarcastic: 8180 (47.64%)
Not Sarcastic: 8991 (52.36%)
===== VALIDATING SAMPLES =====
Total Sample: 4293
Sarcastic: 2045 (47.64%)
Not Sarcastic: 2248 (52.36%)
===== TESTING SAMPLES =====
Total Sample: 7155
Sarcastic: 3409 (47.65%)
Not Sarcastic: 3746 (52.35%)


# BOW - Raw
Use BOW on the original sentence (after discarding symbols)

In [3]:
# Bag-of-words features built from the symbol-stripped headlines (headline_s1).
# The vocabulary is learnt on the training split only and reused for validation.
vectorizer = CountVectorizer(tokenizer=str.split)  # whitespace tokenisation
all_string = data_train['headline_s1'].tolist()

# Tokenize and build the vocabulary
vectorizer.fit(all_string)

# Vocabulary terms ordered by their column index in the feature matrix
vocab = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)

# Encode the documents as dense count matrices and collect the labels
X_train = vectorizer.transform(all_string).toarray()
y_train = data_train['is_sarcastic'].to_numpy()

X_val = vectorizer.transform(data_val['headline_s1'].tolist()).toarray()
y_val = data_val['is_sarcastic'].to_numpy()

# Feature normalisation (normalize_data) is left disabled for this experiment.



In [4]:
# Wrap the encoded features and labels as PyTorch datasets (train first, then validation)
datasetTrain, datasetVal = (
    EncodingDataset(features, labels)
    for features, labels in ((X_train, y_train), (X_val, y_val))
)

In [5]:
# Declare the MLP for the BOW-raw features and train it for 50 epochs
model_mlp = ModelMLP(
    datasetTrain=datasetTrain,
    datasetVal=datasetVal,
    batch_size=1024,
    optimizer_choice='adam',
    init_lr=0.001,
    layers=[X_train.shape[1], 128, 1],  # first entry follows the feature count
    weight_decay=1e-3,
    dropout=0.9,
    batchnorm=True,
    checkpoint=None,  # no checkpoint given: training starts from scratch
    model_name='BOW_Raw',
)
model_mlp.train(numb_epoch=50)

TRAIN FROM SCRATCH
Training 0/49
Total iteration: 17
Epoch 1/50 [15122020-233200] [SAVE]
AccVal: 0.7176799440950384
AUCVal: 0.8919116367496454
Precision: 0.6331096291542053
Recall: 0.9687041640281677
F1Val: 0.7657518453871642
LossVal: 0.667226231098175
LossTrain: 0.6180618685834548
----------

Training 1/49
Total iteration: 17
Epoch 2/50 [15122020-233202] [SAVE]
AccVal: 0.8299557419054274
AUCVal: 0.9112187741997233
Precision: 0.7820677757263184
Recall: 0.8914425373077393
F1Val: 0.8331809812108414
LossVal: 0.6050172448158264
LossTrain: 0.44915131610982556
----------

Training 2/49
Total iteration: 17
Epoch 3/50 [15122020-233205] [SAVE]
AccVal: 0.8441649196366178
AUCVal: 0.9196655108806308
Precision: 0.834956169128418
Recall: 0.8386307954788208
F1Val: 0.8367894481903344
LossVal: 0.48506492376327515
LossTrain: 0.37431760570582223
----------

Training 3/49
Total iteration: 17
Epoch 4/50 [15122020-233207] [SAVE]
AccVal: 0.8483577917540182
AUCVal: 0.9233352765620513
Precision: 0.848848819732

Epoch 30/50 [15122020-233310]
AccVal: 0.8539482879105521
AUCVal: 0.9252273142548878
Precision: 0.8530876636505127
Recall: 0.8376528024673462
F1Val: 0.8452997802175508
LossVal: 0.38487491607666013
LossTrain: 0.11102377053569346
----------

Training 30/49
Total iteration: 17
Epoch 31/50 [15122020-233312]
AccVal: 0.853482413230841
AUCVal: 0.9250952762140104
Precision: 0.8525896668434143
Recall: 0.8371638059616089
F1Val: 0.8448062947977052
LossVal: 0.38619919419288634
LossTrain: 0.10548001308651532
----------

Training 31/49
Total iteration: 17
Epoch 32/50 [15122020-233315]
AccVal: 0.8539482879105521
AUCVal: 0.9250382845060865
Precision: 0.8530876636505127
Recall: 0.8376528024673462
F1Val: 0.8452997802175508
LossVal: 0.38732707500457764
LossTrain: 0.10483772132326574
----------

Training 32/49
Total iteration: 17
Epoch 33/50 [15122020-233317]
AccVal: 0.8544141625902633
AUCVal: 0.9249393103568291
Precision: 0.853233814239502
Recall: 0.8386307954788208
F1Val: 0.8458692832214665
LossVal: 0.38

In [6]:
# Restore the saved BOW_Raw checkpoint and evaluate it on the training set
dataloader = make_dalaloader(datasetTrain, batch_size=1024)
model_mlp = ModelMLP(
    datasetTrain=datasetTrain,
    datasetVal=datasetVal,
    batch_size=1024,
    optimizer_choice='adam',
    init_lr=0.001,
    layers=[X_train.shape[1], 128, 1],
    weight_decay=1e-3,
    dropout=0.9,
    batchnorm=True,
    checkpoint='BOW_Raw-15122020-233156.pth.tar',
    model_name='BOW_Raw',
)
model_mlp.load_trained_model()
metrics = model_mlp.evaluate(dataloader)

LOAD PRETRAINED MODEL AT BOW_Raw-15122020-233156.pth.tar


In [7]:
# Display the metrics returned by model_mlp.evaluate(dataloader) in the previous cell
metrics

{'accuracy': 0.9882942169937685,
 'precision': 0.98944914,
 'recall': 0.9859413,
 'f1': 0.9876921025509319,
 'tp': 8065,
 'tn': 8905,
 'fp': 86,
 'fn': 115,
 'auc': 0.9991705234166522}

# BOW - Lemmatised
Use BOW on the lemmatised sentence

In [7]:
# Bag-of-words features built from the lemmatised headlines (headline_s2).
# The vocabulary is learnt on the training split only and reused for validation.
vectorizer = CountVectorizer(tokenizer=str.split)  # whitespace tokenisation
all_string = data_train['headline_s2'].tolist()

# Tokenize and build the vocabulary
vectorizer.fit(all_string)

# Vocabulary terms ordered by their column index in the feature matrix
vocab = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)

# Encode the documents as dense count matrices and collect the labels
X_train = vectorizer.transform(all_string).toarray()
y_train = data_train['is_sarcastic'].to_numpy()

X_val = vectorizer.transform(data_val['headline_s2'].tolist()).toarray()
y_val = data_val['is_sarcastic'].to_numpy()

# Feature normalisation (normalize_data) is left disabled for this experiment.



In [8]:
# Shape of the dense training matrix produced by vectorizer.transform(...).toarray() above
X_train.shape

(17171, 18194)

In [9]:
# Wrap the encoded features and labels as PyTorch datasets (train first, then validation)
datasetTrain, datasetVal = (
    EncodingDataset(features, labels)
    for features, labels in ((X_train, y_train), (X_val, y_val))
)

In [18]:
# Declare the MLP for the BOW-lemmatised features and train it for 50 epochs
model_mlp = ModelMLP(
    datasetTrain=datasetTrain,
    datasetVal=datasetVal,
    batch_size=1024,
    optimizer_choice='adam',
    init_lr=0.001,
    layers=[X_train.shape[1], 128, 1],  # first entry follows the feature count
    weight_decay=1e-3,
    dropout=0.9,
    batchnorm=True,
    checkpoint=None,  # no checkpoint given: training starts from scratch
    model_name='BOW_Lem',
)
model_mlp.train(numb_epoch=50)

TRAIN FROM SCRATCH
Training 0/49
Total iteration: 17
Epoch 1/50 [15122020-041700] [SAVE]
AccVal: 0.7484276729559748
AUCVal: 0.8874455098365078
Precision: 0.6690017580986023
Recall: 0.9339853525161743
F1Val: 0.7795918781476346
LossVal: 0.6625605344772338
LossTrain: 0.6022102762671078
----------

Training 1/49
Total iteration: 17
Epoch 2/50 [15122020-041701] [SAVE]
AccVal: 0.8211041229909154
AUCVal: 0.9052567237163813
Precision: 0.7828976511955261
Recall: 0.8640586733818054
F1Val: 0.8214783789841473
LossVal: 0.5933645009994507
LossTrain: 0.41566961653092327
----------

Training 2/49
Total iteration: 17
Epoch 3/50 [15122020-041703] [SAVE]
AccVal: 0.8364779874213837
AUCVal: 0.9130209738186184
Precision: 0.8312777280807495
Recall: 0.8239609003067017
F1Val: 0.8276031425167972
LossVal: 0.4717401385307312
LossTrain: 0.3379380667910856
----------

Training 3/49
Total iteration: 17
Epoch 4/50 [15122020-041705] [SAVE]
AccVal: 0.8420684835779175
AUCVal: 0.9162430718095519
Precision: 0.839205980300

KeyboardInterrupt: 

In [10]:
# Restore the saved BOW_Lem checkpoint and evaluate it on the training set.
# model_name now matches the checkpoint / training cell ('BOW_Lem'); it was
# previously 'BOW_EMB', which does not correspond to any model in this notebook.
dataloader = make_dalaloader(datasetTrain, batch_size=1024)
model_mlp = ModelMLP(
    datasetTrain=datasetTrain,
    datasetVal=datasetVal,
    batch_size=1024,
    optimizer_choice='adam',
    init_lr=0.001,
    layers=[X_train.shape[1], 128, 1],
    weight_decay=1e-3,
    dropout=0.9,
    batchnorm=True,
    checkpoint='BOW_Lem-15122020-041600.pth.tar',
    model_name='BOW_Lem',
)
model_mlp.load_trained_model()
metrics = model_mlp.evaluate(dataloader)
metrics

LOAD PRETRAINED MODEL AT BOW_Lem-15122020-041600.pth.tar


{'accuracy': 0.9543416225030574,
 'precision': 0.95429975,
 'recall': 0.94963324,
 'f1': 0.9519607457548951,
 'tp': 7768,
 'tn': 8619,
 'fp': 372,
 'fn': 412,
 'auc': 0.9901168691103491}

# TFIDF - Raw
Use TFIDF on the original sentence (after discarding symbols)

In [8]:
# TF-IDF features built from the symbol-stripped headlines (headline_s1).
# The vocabulary / IDF weights are learnt on the training split only.
vectorizer = TfidfVectorizer(tokenizer=str.split)  # whitespace tokenisation
all_string = data_train['headline_s1'].tolist()

# Tokenize and build the vocabulary
vectorizer.fit(all_string)

# Vocabulary terms ordered by their column index in the feature matrix
vocab = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)

# Encode the documents as dense TF-IDF matrices and collect the labels
X_train = vectorizer.transform(all_string).toarray()
y_train = data_train['is_sarcastic'].to_numpy()

X_val = vectorizer.transform(data_val['headline_s1'].tolist()).toarray()
y_val = data_val['is_sarcastic'].to_numpy()

# Feature normalisation (normalize_data) is left disabled for this experiment.



In [9]:
# Wrap the encoded features and labels as PyTorch datasets (train first, then validation)
datasetTrain, datasetVal = (
    EncodingDataset(features, labels)
    for features, labels in ((X_train, y_train), (X_val, y_val))
)

In [10]:
# Declare the MLP for the TFIDF-raw features and train it for 50 epochs
model_mlp = ModelMLP(
    datasetTrain=datasetTrain,
    datasetVal=datasetVal,
    batch_size=1024,
    optimizer_choice='adam',
    init_lr=0.001,
    layers=[X_train.shape[1], 128, 1],  # first entry follows the feature count
    weight_decay=1e-3,
    dropout=0.9,
    batchnorm=True,
    checkpoint=None,  # no checkpoint given: training starts from scratch
    model_name='TFIDF_Raw',
)
model_mlp.train(numb_epoch=50)

TRAIN FROM SCRATCH
Training 0/49
Total iteration: 17
Epoch 1/50 [15122020-233925] [SAVE]
AccVal: 0.4798509201024924
AUCVal: 0.8912924501213793
Precision: 0.4780271053314209
Recall: 1.0
F1Val: 0.6468448428410005
LossVal: 0.6890757799148559
LossTrain: 0.6222280439208535
----------

Training 1/49
Total iteration: 17
Epoch 2/50 [15122020-233927] [SAVE]
AccVal: 0.7372466806429071
AUCVal: 0.914713866822125
Precision: 0.6499836444854736
Recall: 0.9716381430625916
F1Val: 0.7789102033347823
LossVal: 0.6644232273101807
LossTrain: 0.4446595293634078
----------

Training 2/49
Total iteration: 17
Epoch 3/50 [15122020-233929] [SAVE]
AccVal: 0.8238993710691824
AUCVal: 0.920556604512351
Precision: 0.7684298157691956
Recall: 0.90220046043396
F1Val: 0.8299595286977396
LossVal: 0.5944712162017822
LossTrain: 0.35879873703507814
----------

Training 3/49
Total iteration: 17
Epoch 4/50 [15122020-233930] [SAVE]
AccVal: 0.841835546238062
AUCVal: 0.9236713536183208
Precision: 0.8194574117660522
Recall: 0.85672

Epoch 30/50 [15122020-234010]
AccVal: 0.847891917074307
AUCVal: 0.9248934124546458
Precision: 0.8445544838905334
Recall: 0.8342298269271851
F1Val: 0.8393603768771332
LossVal: 0.39748517870903016
LossTrain: 0.09571446653674631
----------

Training 30/49
Total iteration: 17
Epoch 31/50 [15122020-234011]
AccVal: 0.8481248544141626
AUCVal: 0.9248605660886284
Precision: 0.8453148007392883
Recall: 0.8337408304214478
F1Val: 0.8394878952640944
LossVal: 0.3984334051609039
LossTrain: 0.09187938755049425
----------

Training 31/49
Total iteration: 17
Epoch 32/50 [15122020-234013]
AccVal: 0.8485907290938738
AUCVal: 0.9248355506443109
Precision: 0.8461538553237915
Recall: 0.8337408304214478
F1Val: 0.8399014819062943
LossVal: 0.3995016574859619
LossTrain: 0.09238653630018234
----------

Training 32/49
Total iteration: 17
Epoch 33/50 [15122020-234014]
AccVal: 0.8483577917540182
AUCVal: 0.9248127104560206
Precision: 0.8460774421691895
Recall: 0.8332518339157104
F1Val: 0.8396156612785078
LossVal: 0.400

In [11]:
# Restore the saved TFIDF_Raw checkpoint and evaluate it on the training set
dataloader = make_dalaloader(datasetTrain, batch_size=1024)
model_mlp = ModelMLP(
    datasetTrain=datasetTrain,
    datasetVal=datasetVal,
    batch_size=1024,
    optimizer_choice='adam',
    init_lr=0.001,
    layers=[X_train.shape[1], 128, 1],
    weight_decay=1e-3,
    dropout=0.9,
    batchnorm=True,
    checkpoint='TFIDF_Raw-15122020-233924.pth.tar',
    model_name='TFIDF_Raw',
)
model_mlp.load_trained_model()
metrics = model_mlp.evaluate(dataloader)
metrics

LOAD PRETRAINED MODEL AT TFIDF_Raw-15122020-233924.pth.tar


{'accuracy': 0.970065808630831,
 'precision': 0.9719281,
 'recall': 0.9650367,
 'f1': 0.9684701754146052,
 'tp': 7894,
 'tn': 8763,
 'fp': 228,
 'fn': 286,
 'auc': 0.9952982730081346}

# TFIDF - Lemmatised
Use TFIDF on the lemmatised sentence

In [14]:
# TF-IDF features built from the lemmatised headlines (headline_s2).
# The vocabulary / IDF weights are learnt on the training split only.
vectorizer = TfidfVectorizer(tokenizer=str.split)  # whitespace tokenisation
all_string = data_train['headline_s2'].tolist()

# Tokenize and build the vocabulary
vectorizer.fit(all_string)

# Vocabulary terms ordered by their column index in the feature matrix
vocab = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)

# Encode the documents as dense TF-IDF matrices and collect the labels
X_train = vectorizer.transform(all_string).toarray()
y_train = data_train['is_sarcastic'].to_numpy()

X_val = vectorizer.transform(data_val['headline_s2'].tolist()).toarray()
y_val = data_val['is_sarcastic'].to_numpy()

# Feature normalisation (normalize_data) is left disabled for this experiment.



In [15]:
# Wrap the encoded features and labels as PyTorch datasets (train first, then validation)
datasetTrain, datasetVal = (
    EncodingDataset(features, labels)
    for features, labels in ((X_train, y_train), (X_val, y_val))
)

In [35]:
# Declare the MLP for the TFIDF-lemmatised features and train it for 50 epochs
model_mlp = ModelMLP(
    datasetTrain=datasetTrain,
    datasetVal=datasetVal,
    batch_size=1024,
    optimizer_choice='adam',
    init_lr=0.001,
    layers=[X_train.shape[1], 128, 1],  # first entry follows the feature count
    weight_decay=1e-3,
    dropout=0.9,
    batchnorm=True,
    checkpoint=None,  # no checkpoint given: training starts from scratch
    model_name='TFIDF_Lem',
)
model_mlp.train(numb_epoch=50)

TRAIN FROM SCRATCH
Training 0/49
Total iteration: 17


  precision = TP / (TP+FP)


Epoch 1/50 [15122020-042503] [SAVE]
AccVal: 0.5236431399953413
AUCVal: 0.8901080232143324
Precision: nan
Recall: 0.0
F1Val: nan
LossVal: 0.6851996302604675
LossTrain: 0.6262577281278723
----------

Training 1/49
Total iteration: 17
Epoch 2/50 [15122020-042504] [SAVE]
AccVal: 0.6261355695317959
AUCVal: 0.9084861088150075
Precision: 0.9888888597488403
Recall: 0.2176039069890976
F1Val: 0.35671341384570426
LossVal: 0.66132572889328
LossTrain: 0.45166406736654396
----------

Training 2/49
Total iteration: 17
Epoch 3/50 [15122020-042505] [SAVE]
AccVal: 0.8052643838807361
AUCVal: 0.9140851090673372
Precision: 0.8995373249053955
Recall: 0.6655256748199463
F1Val: 0.7650365324443226
LossVal: 0.595960795879364
LossTrain: 0.37428174649967866
----------

Training 3/49
Total iteration: 17
Epoch 4/50 [15122020-042507] [SAVE]
AccVal: 0.8322851153039832
AUCVal: 0.9163172480400943
Precision: 0.8548473715782166
Recall: 0.780440092086792
F1Val: 0.815950897809967
LossVal: 0.4771660089492798
LossTrain: 0.32

KeyboardInterrupt: 

In [16]:
# Restore the saved TFIDF_Lem checkpoint and evaluate it on the training set.
# model_name now matches the checkpoint / training cell ('TFIDF_Lem'); it was
# previously 'TFIDF_Raw', a copy-paste leftover from the TFIDF-raw section.
dataloader = make_dalaloader(datasetTrain, batch_size=1024)
model_mlp = ModelMLP(
    datasetTrain=datasetTrain,
    datasetVal=datasetVal,
    batch_size=1024,
    optimizer_choice='adam',
    init_lr=0.001,
    layers=[X_train.shape[1], 128, 1],
    weight_decay=1e-3,
    dropout=0.9,
    batchnorm=True,
    checkpoint='TFIDF_Lem-15122020-042501.pth.tar',
    model_name='TFIDF_Lem',
)
model_mlp.load_trained_model()
metrics = model_mlp.evaluate(dataloader)
metrics

LOAD PRETRAINED MODEL AT TFIDF_Lem-15122020-042501.pth.tar


{'accuracy': 0.9705899481684235,
 'precision': 0.97219145,
 'recall': 0.96589243,
 'f1': 0.9690317072443284,
 'tp': 7901,
 'tn': 8765,
 'fp': 226,
 'fn': 279,
 'auc': 0.9956610372937458}