# Text Vectorization
Encoded headline into vector with 4 cases:

1. BOW without unknown token (ignore unknow token)
2. BOW with unknown token
3. BOW + Remove Stopwords w/o unknown token
4. BOW + Remove Stopwords w unknown token
3. TFIDF without unknown token
4. TFIDF with unknown token

## Read Data

In [1]:
from myfunctions import *
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from deep_pytorch import *
from sklearn.preprocessing import StandardScaler
import joblib



In [2]:
data_full = pd.read_json('fake_news.json', lines=True)
data_full = data_full.drop(columns=['article_link']) # remove link column
df_train_f, df_test = split_dataframe(data_full, test_size=0.25, seed=1509)
df_train, df_validate = split_dataframe(df_train_f, test_size=0.2, seed=1309)

# Proportion of each subsets
list_label = df_train['is_sarcastic'].tolist()
numb_total = len(list_label)
numb_sarcastic = np.sum(np.asarray(list_label))
numb_not_sarcastic = numb_total - numb_sarcastic
print(f'===== TRAINING SAMPLES =====\nTotal Sample: {numb_total}\nSarcastic: {numb_sarcastic} ({np.round(numb_sarcastic/numb_total*100,2)}%)\nNot Sarcastic: {numb_not_sarcastic} ({np.round(numb_not_sarcastic/numb_total*100,2)}%)')

list_label = df_validate['is_sarcastic'].tolist()
numb_total = len(list_label)
numb_sarcastic = np.sum(np.asarray(list_label))
numb_not_sarcastic = numb_total - numb_sarcastic
print(f'===== VALIDATING SAMPLES =====\nTotal Sample: {numb_total}\nSarcastic: {numb_sarcastic} ({np.round(numb_sarcastic/numb_total*100,2)}%)\nNot Sarcastic: {numb_not_sarcastic} ({np.round(numb_not_sarcastic/numb_total*100,2)}%)')

list_label = df_test['is_sarcastic'].tolist()
numb_total = len(list_label)
numb_sarcastic = np.sum(np.asarray(list_label))
numb_not_sarcastic = numb_total - numb_sarcastic
print(f'===== TESTING SAMPLES =====\nTotal Sample: {numb_total}\nSarcastic: {numb_sarcastic} ({np.round(numb_sarcastic/numb_total*100,2)}%)\nNot Sarcastic: {numb_not_sarcastic} ({np.round(numb_not_sarcastic/numb_total*100,2)}%)')

===== TRAINING SAMPLES =====
Total Sample: 18316
Sarcastic: 8726 (47.64%)
Not Sarcastic: 9590 (52.36%)
===== VALIDATING SAMPLES =====
Total Sample: 4579
Sarcastic: 2181 (47.63%)
Not Sarcastic: 2398 (52.37%)
===== TESTING SAMPLES =====
Total Sample: 5724
Sarcastic: 2727 (47.64%)
Not Sarcastic: 2997 (52.36%)


In [3]:
data_train = df_train
data_train['headline_s1'] = data_train.headline.apply(lambda row: remove_symbol(row))
data_train['headline_s2'] = data_train.headline_s1.apply(lambda row: lemmatize_word(row, 'v'))
data_train['headline_s2'] = data_train.headline_s2.apply(lambda row: lemmatize_word(row, 'n'))
data_train = data_train.drop(columns=['headline', 'headline_s1'])

data_val = df_validate
data_val['headline_s1'] = data_val.headline.apply(lambda row: remove_symbol(row))
data_val['headline_s2'] = data_val.headline_s1.apply(lambda row: lemmatize_word(row, 'v'))
data_val['headline_s2'] = data_val.headline_s2.apply(lambda row: lemmatize_word(row, 'n'))
data_val = data_val.drop(columns=['headline', 'headline_s1'])

In [4]:
data_train_rmsw = data_train.copy()
data_train_rmsw['headline_s3'] = data_train_rmsw.headline_s2.apply(lambda row: remove_stop_words(row))
data_train_rmsw = data_train_rmsw.drop(columns=['headline_s2'])

data_val_rmsw = data_val.copy()
data_val_rmsw['headline_s3'] = data_val_rmsw.headline_s2.apply(lambda row: remove_stop_words(row))
data_val_rmsw = data_val_rmsw.drop(columns=['headline_s2'])

## BOW entire dict without unknow token

### Bag of word

In [7]:
vectorizer = CountVectorizer()
all_string = data_train.headline_s2.tolist()
all_string_in_one = ' '.join(all_string)
# tokenize and build vocab
vectorizer.fit([all_string_in_one])
# encode document
vector = vectorizer.transform(data_train.headline_s2.tolist())
# summarize encoded vector
vector = vector.toarray()
data_train = data_train.drop(columns=['headline_s2'])

for idx in range(vector.shape[1]):
    if idx % 1000 == 0:
        print(f'Processing FT_{idx}')
    data_train[f"ft_{idx}"] = vector[:,idx]

Processing FT_0
Processing FT_1000
Processing FT_2000
Processing FT_3000
Processing FT_4000
Processing FT_5000
Processing FT_6000
Processing FT_7000
Processing FT_8000
Processing FT_9000
Processing FT_10000
Processing FT_11000
Processing FT_12000
Processing FT_13000
Processing FT_14000
Processing FT_15000
Processing FT_16000
Processing FT_17000
Processing FT_18000


In [8]:
# encode document
vector = vectorizer.transform(data_val.headline_s2.tolist())
# summarize encoded vector
vector = vector.toarray()
data_val = data_val.drop(columns=['headline_s2'])

In [9]:
for idx in range(vector.shape[1]):
    if idx % 1000 == 0:
        print(f'Processing FT_{idx}')
    data_val[f"ft_{idx}"] = vector[:,idx]

Processing FT_0
Processing FT_1000
Processing FT_2000
Processing FT_3000
Processing FT_4000
Processing FT_5000
Processing FT_6000
Processing FT_7000
Processing FT_8000
Processing FT_9000
Processing FT_10000
Processing FT_11000
Processing FT_12000
Processing FT_13000
Processing FT_14000
Processing FT_15000
Processing FT_16000
Processing FT_17000
Processing FT_18000


In [8]:
# joblib.dump(data_train, 'bow_entire_train_ft.joblib')
# joblib.dump(data_val, 'bow_entire_val_ft.joblib')

['bow_entire_val_ft.joblib']

### Train

In [None]:
# X_train = data_train.drop(columns=['is_sarcastic'])
# y_train = data_train['is_sarcastic']
# X_train = np.asarray(X_train)
# y_train = np.asarray(y_train)

# X_val = data_val.drop(columns=['is_sarcastic'])
# y_val = data_val['is_sarcastic']
# X_val = np.asarray(X_val)
# y_val = np.asarray(y_val)

# # SVM need normalization
# scaler = StandardScaler().fit(data_train.iloc[:,1:])
# data_train.iloc[:,1:] = scaler.transform(data_train.iloc[:,1:])
# data_val.iloc[:,1:] = scaler.transform(data_val.iloc[:,1:])

In [10]:
datasetTrain = EncodingDataset(data_train)
datasetVal = EncodingDataset(data_val)

In [12]:
model_mlp = ModelMLP(datasetTrain=datasetTrain, datasetVal=datasetVal, batch_size=1024, optimizer_choice='adam', 
                     init_lr=0.001, layers=[18337, 2048, 512, 128, 1], weight_decay=1e-2,
                     dropout=0.75, batchnorm=True, checkpoint=None)
model_mlp.train()

TRAIN FROM SCRATCH
Training 0/99
Total iteration: 18
Epoch 1/100 [01122020-003536] [SAVE]
AccVal: 0.5260974011792968
AUCVal: 0.8825737977429607
Precision: 1.0
Recall: 0.005043557845056057
F1Val: 0.010036496553280806
LossVal: 0.6696352601051331
LossTrain: 0.61982029179732
----------

Training 1/99
Total iteration: 18
Epoch 2/100 [01122020-003736] [SAVE]
AccVal: 0.7043022493994322
AUCVal: 0.9130960425144137
Precision: 0.9569060802459717
Recall: 0.39706557989120483
F1Val: 0.5612443186431356
LossVal: 0.582669472694397
LossTrain: 0.3939804749356376
----------

Training 2/99
Total iteration: 18
Epoch 3/100 [01122020-003936] [SAVE]
AccVal: 0.764359030355973
AUCVal: 0.9157929636457709
Precision: 0.9264705777168274
Recall: 0.5488308072090149
F1Val: 0.6893175865205168
LossVal: 0.4749447166919708
LossTrain: 0.2560974508523941
----------

Training 3/99
Total iteration: 18
Epoch 4/100 [01122020-004137] [SAVE]
AccVal: 0.816335444420179
AUCVal: 0.9086853671044073
Precision: 0.8738839030265808
Recall:

KeyboardInterrupt: 

## BOW entire with dict unknown token
Firstly we want to convert least common words into ```unknown``` token and keep the remaining words as BoW model. All words which are not belong to BoW will be considered as ```unknown``` token.

In [13]:
all_string = data_train.headline_s2.tolist()
all_string_in_one = ' '.join(all_string)
list_common_words, count_words = most_common_words(all_string_in_one, numb_words=-1)

cwdf = pd.DataFrame(np.asarray(count_words),
                    columns=['count_words'])
cwdf['words'] = list_common_words

In [14]:
s = cwdf.index[cwdf.iloc[:,0] == 1].tolist()[0]
print(f"Number of unique words remaining: {s}")
print(f"Total discard (Unknown Token): {cwdf.iloc[s:,0].sum()}")

Number of unique words remaining: 9468
Total discard (Unknown Token): 9266


In [15]:
cwdf.head()

Unnamed: 0,count_words,words
0,5784,to
1,4050,of
2,3455,the
3,2918,in
4,2640,be


In [17]:
list_vocab = cwdf.words[0:s].tolist()
list_vocab[0:10]

['to', 'of', 'the', 'in', 'be', 'a', 'for', 'on', 'and', 'with']

In [23]:
vectorizer = CountVectorizer()
all_string_in_one = ' '.join(list_vocab)
all_string_in_one += ' UNK'
# tokenize and build vocab
vectorizer.fit([all_string_in_one])

CountVectorizer()

In [26]:
vocab = list(vectorizer.vocabulary_.keys())

In [30]:
def add_unknown_token(sent, vocab):
    sent_s = sent.split()
    for idx, s in enumerate(sent_s):
        if s not in vocab:
            sent_s[idx] = 'UNK'
    psent = ' '.join(sent_s)
    return psent

data_train['preprocess'] = data_train.headline_s2.apply(lambda row: add_unknown_token(row, vocab))
# encode document
vector = vectorizer.transform(data_train.preprocess.tolist())
# summarize encoded vector
vector = vector.toarray()
data_train = data_train.drop(columns=['headline_s2', 'preprocess'])

for idx in range(vector.shape[1]):
    if idx % 1000 == 0:
        print(f'Processing FT_{idx}')
    data_train[f"ft_{idx}"] = vector[:,idx]
    
# encode document
data_val['preprocess'] = data_val.headline_s2.apply(lambda row: add_unknown_token(row, vocab))
vector = vectorizer.transform(data_val.preprocess.tolist())
# summarize encoded vector
vector = vector.toarray()
data_val = data_val.drop(columns=['headline_s2', 'preprocess'])

for idx in range(vector.shape[1]):
    if idx % 1000 == 0:
        print(f'Processing FT_{idx}')
    data_val[f"ft_{idx}"] = vector[:,idx]

Processing FT_0
Processing FT_1000
Processing FT_2000
Processing FT_3000
Processing FT_4000
Processing FT_5000
Processing FT_6000
Processing FT_7000
Processing FT_8000
Processing FT_9000
Processing FT_0
Processing FT_1000
Processing FT_2000
Processing FT_3000
Processing FT_4000
Processing FT_5000
Processing FT_6000
Processing FT_7000
Processing FT_8000
Processing FT_9000


In [32]:
data_train.shape

(18316, 9382)

In [33]:
datasetTrain = EncodingDataset(data_train)
datasetVal = EncodingDataset(data_val)

In [35]:
model_mlp = ModelMLP(datasetTrain=datasetTrain, datasetVal=datasetVal, batch_size=2048, optimizer_choice='adam', 
                     init_lr=0.001, layers=[9381, 1024, 128, 1], weight_decay=1e-2,
                     dropout=0.75, batchnorm=True, checkpoint=None)
model_mlp.train()

TRAIN FROM SCRATCH
Training 0/99
Total iteration: 9
Epoch 1/100 [01122020-021905] [SAVE]
AccVal: 0.7774623280192182
AUCVal: 0.8811015331054957
Precision: 0.8534063100814819
Recall: 0.6432828903198242
F1Val: 0.7335947605143323
LossVal: 0.6762522260348002
LossTrain: 0.6165444387329949
----------

Training 1/99
Total iteration: 9
Epoch 2/100 [01122020-022032] [SAVE]
AccVal: 0.7080148504040183
AUCVal: 0.9052672275038919
Precision: 0.9460887908935547
Recall: 0.41036221385002136
F1Val: 0.5724336239822184
LossVal: 0.6396872798601786
LossTrain: 0.4128439625104268
----------

Training 2/99
Total iteration: 9
Epoch 3/100 [01122020-022336] [SAVE]
AccVal: 0.6920725049137366
AUCVal: 0.913232465997379
Precision: 0.965018093585968
Recall: 0.36680421233177185
F1Val: 0.5315614673996237
LossVal: 0.5992210507392883
LossTrain: 0.31554121110174393
----------

Training 3/99
Total iteration: 9
Epoch 4/100 [01122020-022442] [SAVE]
AccVal: 0.6999344835116837
AUCVal: 0.9140359974439956
Precision: 0.958002269268

KeyboardInterrupt: 

## BOW entire dict without unknow token (Remove Stopwords)

In [5]:
vectorizer = CountVectorizer()
all_string = data_train_rmsw.headline_s3.tolist()
all_string_in_one = ' '.join(all_string)
# tokenize and build vocab
vectorizer.fit([all_string_in_one])
# encode document
vector = vectorizer.transform(data_train_rmsw.headline_s3.tolist())
# summarize encoded vector
vector = vector.toarray()
dt_train = data_train_rmsw.drop(columns=['headline_s3']) 
for idx in range(vector.shape[1]):
    if idx % 1000 == 0:
        print(f'Processing FT_{idx}')
    dt_train[f"ft_{idx}"] = vector[:,idx]
    
# encode document
vector = vectorizer.transform(data_val_rmsw.headline_s3.tolist())
# summarize encoded vector
vector = vector.toarray()
dt_val = data_val_rmsw.drop(columns=['headline_s3'])

for idx in range(vector.shape[1]):
    if idx % 1000 == 0:
        print(f'Processing FT_{idx}')
    dt_val[f"ft_{idx}"] = vector[:,idx]

Processing FT_0
Processing FT_1000
Processing FT_2000
Processing FT_3000
Processing FT_4000
Processing FT_5000
Processing FT_6000
Processing FT_7000
Processing FT_8000
Processing FT_9000
Processing FT_10000
Processing FT_11000
Processing FT_12000
Processing FT_13000
Processing FT_14000
Processing FT_15000
Processing FT_16000
Processing FT_17000
Processing FT_18000
Processing FT_0
Processing FT_1000
Processing FT_2000
Processing FT_3000
Processing FT_4000
Processing FT_5000
Processing FT_6000
Processing FT_7000
Processing FT_8000
Processing FT_9000
Processing FT_10000
Processing FT_11000
Processing FT_12000
Processing FT_13000
Processing FT_14000
Processing FT_15000
Processing FT_16000
Processing FT_17000
Processing FT_18000


In [6]:
dt_train.shape

(18316, 18265)

In [8]:
datasetTrain = EncodingDataset(dt_train)
datasetVal = EncodingDataset(dt_val)

model_mlp = ModelMLP(datasetTrain=datasetTrain, datasetVal=datasetVal, batch_size=512, optimizer_choice='adam', 
                     init_lr=0.001, layers=[18264, 512, 64, 1], weight_decay=1e-2,
                     dropout=0.75, batchnorm=True, checkpoint=None)
model_mlp.train()

TRAIN FROM SCRATCH
Training 0/99
Total iteration: 36
Epoch 1/100 [02122020-014804] [SAVE]
AccVal: 0.525660624590522
AUCVal: 0.8583078172663374
Precision: 1.0
Recall: 0.004126547370105982
F1Val: 0.008219177901774902
LossVal: 0.6823680202166239
LossTrain: 0.575858806570371
----------

Training 1/99
Total iteration: 36
Epoch 2/100 [02122020-015010] [SAVE]
AccVal: 0.7464511902162044
AUCVal: 0.8664425573963325
Precision: 0.8846153616905212
Recall: 0.537826657295227
F1Val: 0.6689478186892435
LossVal: 0.5098949273427328
LossTrain: 0.36468082583612865
----------

Training 2/99
Total iteration: 36
Epoch 3/100 [02122020-015218] [SAVE]
AccVal: 0.7746232801921817
AUCVal: 0.8597961429725751
Precision: 0.7911809682846069
Recall: 0.7157267332077026
F1Val: 0.7515647696878383
LossVal: 0.48627598418129814
LossTrain: 0.30643068502346676
----------

Training 3/99
Total iteration: 36
Epoch 4/100 [02122020-015424] [SAVE]
AccVal: 0.7759336099585062
AUCVal: 0.8595932763777242
Precision: 0.7753934264183044
Rec

KeyboardInterrupt: 

## BOW entire dict with unknow token (Remove Stopwords)

In [9]:
all_string = data_train_rmsw.headline_s3.tolist()
all_string_in_one = ' '.join(all_string)
list_common_words, count_words = most_common_words(all_string_in_one, numb_words=-1)

cwdf = pd.DataFrame(np.asarray(count_words),
                    columns=['count_words'])
cwdf['words'] = list_common_words

In [10]:
s = cwdf.index[cwdf.iloc[:,0] == 1].tolist()[0]
print(f"Number of unique words remaining: {s}")
print(f"Total discard (Unknown Token): {cwdf.iloc[s:,0].sum()}")
cwdf.head()

Number of unique words remaining: 9355
Total discard (Unknown Token): 9262


Unnamed: 0,count_words,words
0,1128,trump
1,1040,new
2,958,man
3,653,get
4,609,woman


In [11]:
list_vocab = cwdf.words[0:s].tolist()
list_vocab += ['UNK']
list_vocab[0:10]

['trump', 'new', 'man', 'get', 'woman', 'make', 'say', 'report', 'u', 'time']

In [12]:
vectorizer = CountVectorizer()
all_string_in_one = ' '.join(list_vocab)
# tokenize and build vocab
vectorizer.fit([all_string_in_one]);
vocab = list(vectorizer.vocabulary_.keys())

In [13]:
def add_unknown_token(sent, vocab):
    sent_s = sent.split()
    for idx, s in enumerate(sent_s):
        if s not in vocab:
            sent_s[idx] = 'UNK'
    psent = ' '.join(sent_s)
    return psent

data_train_rmsw['preprocess'] = data_train_rmsw.headline_s3.apply(lambda row: add_unknown_token(row, vocab))
# encode document
vector = vectorizer.transform(data_train_rmsw.preprocess.tolist())
# summarize encoded vector
vector = vector.toarray()
data_train_rmsw = data_train_rmsw.drop(columns=['headline_s3', 'preprocess'])

for idx in range(vector.shape[1]):
    if idx % 1000 == 0:
        print(f'Processing FT_{idx}')
    data_train_rmsw[f"ft_{idx}"] = vector[:,idx]
    
# encode document
data_val_rmsw['preprocess'] = data_val_rmsw.headline_s3.apply(lambda row: add_unknown_token(row, vocab))
vector = vectorizer.transform(data_val_rmsw.preprocess.tolist())
# summarize encoded vector
vector = vector.toarray()
data_val_rmsw = data_val_rmsw.drop(columns=['headline_s3', 'preprocess'])

for idx in range(vector.shape[1]):
    if idx % 1000 == 0:
        print(f'Processing FT_{idx}')
    data_val_rmsw[f"ft_{idx}"] = vector[:,idx]

Processing FT_0
Processing FT_1000
Processing FT_2000
Processing FT_3000
Processing FT_4000
Processing FT_5000
Processing FT_6000
Processing FT_7000
Processing FT_8000
Processing FT_9000
Processing FT_0
Processing FT_1000
Processing FT_2000
Processing FT_3000
Processing FT_4000
Processing FT_5000
Processing FT_6000
Processing FT_7000
Processing FT_8000
Processing FT_9000


In [14]:
data_train_rmsw.shape

(18316, 9286)

In [16]:
dt_train = data_train_rmsw.copy()
dt_val = data_val_rmsw.copy()
datasetTrain = EncodingDataset(dt_train)
datasetVal = EncodingDataset(dt_val)

model_mlp = ModelMLP(datasetTrain=datasetTrain, datasetVal=datasetVal, batch_size=512, optimizer_choice='adam', 
                     init_lr=0.001, layers=[9285, 1024, 128, 1], weight_decay=1e-5,
                     dropout=0.75, batchnorm=True, checkpoint=None)
model_mlp.train()

TRAIN FROM SCRATCH
Training 0/99
Total iteration: 36
Epoch 1/100 [02122020-033218] [SAVE]
AccVal: 0.5306835553614326
AUCVal: 0.8509029953510854
Precision: 1.0
Recall: 0.014672168530523777
F1Val: 0.028920017926528548
LossVal: 0.6690416137377421
LossTrain: 0.6094964543978373
----------

Training 1/99
Total iteration: 36
Epoch 2/100 [02122020-033328] [SAVE]
AccVal: 0.771129067481983
AUCVal: 0.8693792473400768
Precision: 0.8605983257293701
Recall: 0.6198991537094116
F1Val: 0.7206823128204218
LossVal: 0.4781721962822808
LossTrain: 0.40572494599554276
----------

Training 2/99
Total iteration: 36
Epoch 3/100 [02122020-033437] [SAVE]
AccVal: 0.7772439397248307
AUCVal: 0.8640366475348744
Precision: 0.8021863698959351
Recall: 0.7065566182136536
F1Val: 0.7513407793983922
LossVal: 0.4655287232663896
LossTrain: 0.32748667316304314
----------

Training 3/99
Total iteration: 36
Epoch 4/100 [02122020-033544]
AccVal: 0.7687267962437213
AUCVal: 0.8606485268367075
Precision: 0.793410062789917
Recall: 0.

KeyboardInterrupt: 

## TFIDF Entire w/o UNK

In [None]:
vectorizer = TfidfVectorizer()
# tokenize and build vocab
all_string = data_train.headline_s2.tolist()
vectorizer.fit(all_string)
vector = vectorizer.transform(data_train.headline_s2.tolist())
vector = vector.toarray()

dt_train = data_train.drop(columns=['headline_s2'])

for idx in range(vector.shape[1]):
    if idx % 1000 == 0:
        print(f'Processing FT_{idx}')
    dt_train[f"ft_{idx}"] = vector[:,idx]
    
# encode document
vector = vectorizer.transform(data_val.headline_s2.tolist())
# summarize encoded vector
vector = vector.toarray()
dt_val = data_val.drop(columns=['headline_s2'])

for idx in range(vector.shape[1]):
    if idx % 1000 == 0:
        print(f'Processing FT_{idx}')
    dt_val[f"ft_{idx}"] = vector[:,idx]

Processing FT_0
Processing FT_1000
Processing FT_2000
Processing FT_3000
Processing FT_4000
Processing FT_5000
Processing FT_6000
Processing FT_7000
Processing FT_8000
Processing FT_9000
Processing FT_10000
Processing FT_11000
Processing FT_12000
Processing FT_13000
Processing FT_14000
Processing FT_15000
Processing FT_16000


In [None]:
dt_train.shape

In [None]:
datasetTrain = EncodingDataset(dt_train)
datasetVal = EncodingDataset(dt_val)

model_mlp = ModelMLP(datasetTrain=datasetTrain, datasetVal=datasetVal, batch_size=256, optimizer_choice='adam', 
                     init_lr=0.001, layers=[18337, 2048, 128, 1], weight_decay=1e-3,
                     dropout=0.75, batchnorm=True, checkpoint=None)
model_mlp.train()