# Embedding Model
This notebook trains a word-embedding model under several vocabulary settings.

In [1]:
import seaborn as sns
import matplotlib.pyplot as plt

from myfunctions import *
import joblib

from embedding_pytorch import *
from tqdm import tqdm

# Rank the words of a sentence by how often they occur in it
def most_common_words(sent, numb_words=20):
    """Return the most frequent whitespace-separated words of ``sent``.

    Parameters
    ----------
    sent : str
        Text to analyse; it is tokenised with ``str.split()``.
    numb_words : int
        Maximum number of words to return. A negative value, or a value
        larger than the number of distinct words, returns every word.

    Returns
    -------
    (top_words, count_words) : tuple of two lists
        Words ordered from most to least frequent, and their counts.
    """
    ranked = Counter(sent.split()).most_common()
    # Out-of-range requests fall back to "all distinct words"
    limit = numb_words if 0 <= numb_words <= len(ranked) else len(ranked)
    head = ranked[:limit]
    top_words = [token for token, _ in head]
    count_words = [freq for _, freq in head]
    return top_words, count_words

def add_unknown_token(sent, vocab):
    """Replace every word of ``sent`` that is not in ``vocab`` by ``'<unk>'``.

    ``sent`` is split on whitespace and re-joined with single spaces, so
    runs of whitespace are collapsed. ``vocab`` is any container that
    supports ``in``.
    """
    return ' '.join(
        token if token in vocab else '<unk>'
        for token in sent.split()
    )

def remove_unknown_token(sent, vocab):
    """Drop every word of ``sent`` that is not in ``vocab``.

    ``sent`` is split on whitespace; the surviving words are re-joined
    with single spaces. ``vocab`` is any container that supports ``in``.
    """
    kept = []
    for token in sent.split():
        if token in vocab:
            kept.append(token)
    return ' '.join(kept)

# Get the embedded feature of a word from a pretrained model (Glove or Word2Vec)
def get_word_features(w, model):
    """Look up the embedding of ``w`` in ``model``.

    Parameters
    ----------
    w : hashable
        Lookup key, normally the word string.
    model : mapping-like
        Any object supporting ``model[w]`` that raises ``KeyError`` for
        a missing word (e.g. the ``embeddings_dict`` built from GloVe).

    Returns
    -------
    (feature, flag)
        ``flag`` is 1 and ``feature`` is ``model[w]`` when the word is found.
        Otherwise ``flag`` is 0 and ``feature`` is a fresh random vector of
        length 300 drawn uniformly from [-2.5, 2.5).

    Side effects
    ------------
    On a miss, ``w`` is appended as one line to ``KeyError_Glove.txt`` in
    the current working directory.
    """
    try:
        return model[w], 1
    except KeyError:
        # Only a missing key means "out of vocabulary". Other errors
        # (wrong model type, interrupts, ...) propagate instead of being
        # silently turned into a random vector.
        feature = np.random.uniform(-2.5, 2.5, 300)
        # The context manager closes the log even if the write fails
        with open("KeyError_Glove.txt", "a") as log_file:
            log_file.write(f'{w}\n')
        return feature, 0  # flag 0 = error checker: word not found



In [None]:
# Load pretrain Glove model
# Each line is parsed as "<token> <v1> <v2> ..."; the first field is the word
# and the remaining fields become a float32 vector.
embeddings_dict = {}
# Read explicitly as UTF-8: without `encoding`, open() uses the platform
# locale, which can fail or mis-decode non-ASCII tokens on some systems.
with open("../glove/glove.6B.300d.txt", 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings_dict[word] = vector

In [2]:
# Read dataset
data_full = pd.read_json('fake_news.json', lines=True)
data_full = data_full.drop(columns=['article_link']) # remove link column
df_train_f, df_test = split_dataframe(data_full, test_size=0.25, seed=1509)
df_train, df_validate = split_dataframe(df_train_f, test_size=0.2, seed=1309)


def print_label_balance(split_name, frame):
    """Print total / sarcastic / not-sarcastic counts (with %) for one split."""
    list_label = frame['is_sarcastic'].tolist()
    numb_total = len(list_label)
    numb_sarcastic = np.sum(np.asarray(list_label))
    numb_not_sarcastic = numb_total - numb_sarcastic
    print(f'===== {split_name} SAMPLES =====\nTotal Sample: {numb_total}\nSarcastic: {numb_sarcastic} ({np.round(numb_sarcastic/numb_total*100,2)}%)\nNot Sarcastic: {numb_not_sarcastic} ({np.round(numb_not_sarcastic/numb_total*100,2)}%)')


def add_headline_stages(frame):
    """Return a preprocessed COPY of ``frame``; the input is left untouched.

    Columns written:
      headline    - lower-cased headline
      headline_s1 - S1: symbols discarded from the headline (remove_symbol)
      headline_s2 - S2: headline_s1 lemmatised, first with 'v' then with 'n'
    """
    staged = frame.copy()
    staged['headline'] = staged.headline.apply(lambda row: row.lower())
    staged['headline_s1'] = staged.headline.apply(lambda row: remove_symbol(row))
    staged['headline_s2'] = staged.headline_s1.apply(lambda row: lemmatize_word(row, 'v'))
    staged['headline_s2'] = staged.headline_s2.apply(lambda row: lemmatize_word(row, 'n'))
    return staged


# Proportion of each subsets
print_label_balance('TRAINING', df_train)
print_label_balance('VALIDATING', df_validate)
print_label_balance('TESTING', df_test)

# Preprocessing
# The original cell created .copy() frames and then immediately rebound the
# names to the raw splits, so the df_* frames were mutated in place. Each
# data_* frame is now an independent, preprocessed copy of its split.
data_train = add_headline_stages(df_train)
data_train_f = add_headline_stages(df_train_f)
data_val = add_headline_stages(df_validate)
data_test = add_headline_stages(df_test)

===== TRAINING SAMPLES =====
Total Sample: 17171
Sarcastic: 8180 (47.64%)
Not Sarcastic: 8991 (52.36%)
===== VALIDATING SAMPLES =====
Total Sample: 4293
Sarcastic: 2045 (47.64%)
Not Sarcastic: 2248 (52.36%)
===== TESTING SAMPLES =====
Total Sample: 7155
Sarcastic: 3409 (47.65%)
Not Sarcastic: 3746 (52.35%)


# Embedding with UNK
Uses almost the entire training vocabulary; the 2000 rarest words are mapped to an UNK token.

In [3]:
# Work on private copies of the preprocessed train / validation frames
dt_train, dt_val = data_train.copy(), data_val.copy()
# Word frequencies over the whole training corpus (headline_s1 column)
all_string = dt_train['headline_s1'].tolist()
all_string_in_one = ' '.join(all_string)
# numb_words=-1 asks for every distinct word, most frequent first
list_common_words, count_words = most_common_words(
    all_string_in_one,
    numb_words=-1,
)

In [4]:
cwdf = pd.DataFrame(np.asarray(count_words),
                    columns=['count_words'])
cwdf['words'] = list_common_words

# Number of rarest (single-occurrence) words that are folded into the UNK token.
# This replaces the hard-coded offset `21115 - 11476`, which only held for one
# particular split of the data.
NUM_UNK_WORDS = 2000

# Find the index of words appeared only once in the corpus
# (cwdf is ordered from most to least frequent, so they sit at the tail)
first_singleton = cwdf.index[cwdf.iloc[:,0] == 1].tolist()[0]
# Keep everything except the last NUM_UNK_WORDS rows, but never cut above the
# first singleton, so only words seen exactly once can become UNK.
s = max(first_singleton, len(cwdf) - NUM_UNK_WORDS)
common_vocab = cwdf.words[0:s].tolist()
print(f"Number of unique words remaining: {s} / {len(cwdf)}")
print(f"Total discard (Unknown Token): {cwdf.iloc[s:,0].sum()}")

Number of unique words remaining: 21115 / 23115
Total discard (Unknown Token): 2000


In [5]:
# Convert unknown words into UNK tokens in the dataset
# A set gives constant-time membership tests; the list would otherwise be
# scanned once per token of every headline.
known_words = set(common_vocab)
dt_train['preprocess'] = dt_train.headline_s1.apply(lambda row: add_unknown_token(row, known_words))
dt_val['preprocess'] = dt_val.headline_s1.apply(lambda row: add_unknown_token(row, known_words))

In [6]:
# Put the unknown token at index 0, followed by the kept words
list_vocab = ['<unk>', *common_vocab]
print(f"Total Vocabulary: {len(list_vocab)}")

Total Vocabulary: 21116


In [7]:
# Initialise weight by using pretrained Glove model
# Row i of embedding_weight holds the 300-d vector of list_vocab[i];
# row 0 belongs to the '<unk>' token and gets a random vector.
embedding_weight = np.zeros((len(list_vocab), 300))
unknown_word = dict()  # word missing from GloVe -> random vector drawn for it
embedding_weight[0] = np.random.uniform(-2.5, 2.5, 300)
for idx in tqdm(range(1, len(list_vocab))):
    word = list_vocab[idx]
    word_split = word.split()
    if len(word_split) == 1:
        feature, flag = get_word_features(word_split[0], embeddings_dict)
        if flag == 0: # not exist word
            unknown_word[word_split[0]] = feature
    else:
        # Entry made of several tokens: average the vectors of its parts
        feature = np.zeros((len(word_split), 300))
        for idx_w, w in enumerate(word_split):
            ft, flag = get_word_features(w, embeddings_dict)
            if flag == 0:
                # Reuse the vector already drawn for this unknown token, if any
                ft = unknown_word.setdefault(w, ft)
            feature[idx_w] = ft
        feature = np.mean(feature, axis=0) # equal to divided by len(word_split)

    embedding_weight[idx] = feature

100%|██████████| 21115/21115 [00:00<00:00, 177586.47it/s]


In [8]:
# Wrap the train / validation frames in EmbeddingDataset objects, both reading
# the 'preprocess' column and sharing the same vocabulary
datasetTrain, datasetVal = (
    EmbeddingDataset(frame, col_name='preprocess', list_vocab=list_vocab)
    for frame in (dt_train, dt_val)
)

In [10]:
# Build the EMB_UNK_Raw model (no checkpoint given) and train it for 50 epochs
model_emb = ModelEmbedding(
    vocab_size=len(list_vocab),
    datasetTrain=datasetTrain,
    datasetVal=datasetVal,
    init_weight=embedding_weight,
    batch_size=2048,
    optimizer_choice='adam',
    init_lr=0.001,
    layers=[300,1],
    weight_decay=1e-4,
    dropout=0.8,
    batchnorm=True,
    checkpoint=None,
    model_name='EMB_UNK_Raw',
)

model_emb.train(numb_epoch=50)

TRAIN FROM SCRATCH
Training 0/49
Total iteration: 9
Epoch 1/50 [16122020-010000] [SAVE]
AccVal: 0.5490333100395993
AUCVal: 0.6901888339757589
Precision: 0.7853403091430664
Recall: 0.07334963232278824
F1Val: 0.13416815230449147
LossVal: 0.6795703371365865
LossTrain: 0.6858542097939385
----------

Training 1/49
Total iteration: 9
Epoch 2/50 [16122020-010006] [SAVE]
AccVal: 0.6361518751455858
AUCVal: 0.7355047464086523
Precision: 0.7098175287246704
Recall: 0.3995110094547272
F1Val: 0.5112640940677248
LossVal: 0.6651212175687155
LossTrain: 0.6716833975580003
----------

Training 2/49
Total iteration: 9
Epoch 3/50 [16122020-010012] [SAVE]
AccVal: 0.6727230375029117
AUCVal: 0.7538511820341254
Precision: 0.6939393877983093
Recall: 0.5599021911621094
F1Val: 0.6197563899801919
LossVal: 0.6528047323226929
LossTrain: 0.6584054430325826
----------

Training 3/49
Total iteration: 9
Epoch 4/50 [16122020-010018] [SAVE]
AccVal: 0.7044025157232704
AUCVal: 0.7834901330386588
Precision: 0.716035664081573

Epoch 31/50 [16122020-010306]
AccVal: 0.8490566037735849
AUCVal: 0.9261759434085393
Precision: 0.8408979773521423
Recall: 0.8425427675247192
F1Val: 0.8417195987272311
LossVal: 0.35854379336039227
LossTrain: 0.1458775583240721
----------

Training 31/49
Total iteration: 9
Epoch 32/50 [16122020-010312] [SAVE]
AccVal: 0.8502212904728628
AUCVal: 0.9260062734383837
Precision: 0.8426197171211243
Recall: 0.8430317640304565
F1Val: 0.8428256604123323
LossVal: 0.3593381444613139
LossTrain: 0.13978082769446903
----------

Training 32/49
Total iteration: 9
Epoch 33/50 [16122020-010318]
AccVal: 0.8483577917540182
AUCVal: 0.9257852674259761
Precision: 0.84033203125
Recall: 0.8415647745132446
F1Val: 0.8409479511131346
LossVal: 0.36033787329991657
LossTrain: 0.13402495864364836
----------

Training 33/49
Total iteration: 9
Epoch 34/50 [16122020-010325]
AccVal: 0.8483577917540182
AUCVal: 0.9255475119421557
Precision: 0.8399999737739563
Recall: 0.8420537710189819
F1Val: 0.8410255887450995
LossVal: 0.361

KeyboardInterrupt: 

In [10]:
# Reload the saved EMB_UNK_Raw checkpoint and evaluate it on the training set
# (the data loader below is built from datasetTrain only)
dataloader = make_EmbeddingDataLoader(datasetTrain, batch_size=2048)
model_emb = ModelEmbedding(
    vocab_size=len(list_vocab),
    datasetTrain=datasetTrain,
    datasetVal=datasetVal,
    init_weight=embedding_weight,
    batch_size=2048,
    optimizer_choice='adam',
    init_lr=0.001,
    layers=[300,1],
    weight_decay=1e-4,
    dropout=0.8,
    batchnorm=True,
    checkpoint='Checkpoint/EMB_UNK_Raw-16122020-005954.pth.tar',
    model_name='EMB_UNK_Raw',
)
model_emb.load_trained_model()
model_emb.evaluate(dataloader)

LOAD PRETRAINED MODEL AT Checkpoint/EMB_UNK_Raw-16122020-005954.pth.tar


{'accuracy': 0.9622619532933434,
 'precision': 0.9640217,
 'recall': 0.9564792,
 'f1': 0.960235626235471,
 'tp': 7824,
 'tn': 8699,
 'fp': 292,
 'fn': 356,
 'auc': 0.9928722528559528}

# Discard UNK

In [11]:
# Start again from fresh copies of the preprocessed frames
dt_train, dt_val = data_train.copy(), data_val.copy()
# Word frequencies over the whole training corpus (headline_s1 column)
all_string = dt_train['headline_s1'].tolist()
all_string_in_one = ' '.join(all_string)
# numb_words=-1 asks for every distinct word, most frequent first
list_common_words, count_words = most_common_words(
    all_string_in_one,
    numb_words=-1,
)

In [12]:
cwdf = pd.DataFrame(np.asarray(count_words),
                    columns=['count_words'])
cwdf['words'] = list_common_words

# Position of the first word that occurs exactly once; every row before it
# occurs more than once because the table is ordered by descending frequency.
# Only those earlier words are kept in the vocabulary.
is_singleton = cwdf['count_words'] == 1
s = int(cwdf.index[is_singleton][0])
common_vocab = cwdf['words'].iloc[:s].tolist()
print(f"Number of unique words remaining: {s} / {len(cwdf)}")
print(f"Total discard (Unknown Token): {cwdf['count_words'].iloc[s:].sum()}")

Number of unique words remaining: 11476 / 23115
Total discard (Unknown Token): 11639


In [13]:
# remove unknown words in the dataset
# A set gives constant-time membership tests; the list would otherwise be
# scanned once per token of every headline.
known_words = set(common_vocab)
dt_train['preprocess'] = dt_train.headline_s1.apply(lambda row: remove_unknown_token(row, known_words))
dt_val['preprocess'] = dt_val.headline_s1.apply(lambda row: remove_unknown_token(row, known_words))

In [14]:
# No UNK token in this setting: the vocabulary is exactly the kept words.
# Note that list_vocab is the same list object as common_vocab, not a copy.
list_vocab = common_vocab
print(f"Total Vocabulary: {len(list_vocab)}")

Total Vocabulary: 11476


In [15]:
# initialise weight by Glove model
# Row i of embedding_weight holds the 300-d vector of list_vocab[i]
# (no '<unk>' row in this setting).
embedding_weight = np.zeros((len(list_vocab), 300))
unknown_word = dict()  # word missing from GloVe -> random vector drawn for it
for idx in tqdm(range(len(list_vocab))):
    word = list_vocab[idx]
    word_split = word.split()
    if len(word_split) == 1:
        feature, flag = get_word_features(word_split[0], embeddings_dict)
        if flag == 0: # not exist word
            unknown_word[word_split[0]] = feature
    else:
        # Entry made of several tokens: average the vectors of its parts
        feature = np.zeros((len(word_split), 300))
        for idx_w, w in enumerate(word_split):
            ft, flag = get_word_features(w, embeddings_dict)
            if flag == 0:
                # Reuse the vector already drawn for this unknown token, if any
                ft = unknown_word.setdefault(w, ft)
            feature[idx_w] = ft
        feature = np.mean(feature, axis=0) # equal to divided by len(word_split)

    embedding_weight[idx] = feature

100%|██████████| 11476/11476 [00:00<00:00, 436366.41it/s]


In [16]:
# Wrap the train / validation frames in EmbeddingDataset objects, both reading
# the 'preprocess' column and sharing the same vocabulary
datasetTrain, datasetVal = (
    EmbeddingDataset(frame, col_name='preprocess', list_vocab=list_vocab)
    for frame in (dt_train, dt_val)
)

In [63]:
# Build the EMB_Raw model (no checkpoint given) and train it for 50 epochs
model_emb = ModelEmbedding(
    vocab_size=len(list_vocab),
    datasetTrain=datasetTrain,
    datasetVal=datasetVal,
    init_weight=embedding_weight,
    batch_size=2048,
    optimizer_choice='adam',
    init_lr=0.001,
    layers=[300,1],
    weight_decay=1e-4,
    dropout=0.8,
    batchnorm=True,
    checkpoint=None,
    model_name='EMB_Raw',
)

model_emb.train(numb_epoch=50)

TRAIN FROM SCRATCH
Training 0/49
Total iteration: 9
Epoch 1/50 [16122020-015801] [SAVE]
AccVal: 0.5709294199860238
AUCVal: 0.6686292189090657
Precision: 0.6661211252212524
Recall: 0.19902200996875763
F1Val: 0.3064759162158546
LossVal: 0.6795707941055298
LossTrain: 0.6873425311512418
----------

Training 1/49
Total iteration: 9
Epoch 2/50 [16122020-015805] [SAVE]
AccVal: 0.5942231539715817
AUCVal: 0.7610997876950116
Precision: 0.8110883235931396
Recall: 0.19315403699874878
F1Val: 0.31200630627394255
LossVal: 0.6661047538121542
LossTrain: 0.6713760428958468
----------

Training 2/49
Total iteration: 9
Epoch 3/50 [16122020-015809] [SAVE]
AccVal: 0.6561844863731656
AUCVal: 0.8084440176108728
Precision: 0.836686372756958
Recall: 0.3457212746143341
F1Val: 0.4892733689194232
LossVal: 0.6512021025021871
LossTrain: 0.6556229525142245
----------

Training 3/49
Total iteration: 9
Epoch 4/50 [16122020-015813] [SAVE]
AccVal: 0.7216398788725833
AUCVal: 0.8365469115714919
Precision: 0.826923072338104

Epoch 31/50 [16122020-020145]
AccVal: 0.8427672955974843
AUCVal: 0.9213899450965379
Precision: 0.8411354422569275
Recall: 0.8259168863296509
F1Val: 0.8334567288338386
LossVal: 0.3885838786760966
LossTrain: 0.15266832792096668
----------

Training 31/49
Total iteration: 9
Epoch 32/50 [16122020-020201]
AccVal: 0.8420684835779175
AUCVal: 0.9210983520260334
Precision: 0.8408977389335632
Recall: 0.824449896812439
F1Val: 0.8325926239373141
LossVal: 0.39121275146802265
LossTrain: 0.14745755907562044
----------

Training 32/49
Total iteration: 9
Epoch 33/50 [16122020-020216]
AccVal: 0.8416026088982064
AUCVal: 0.9207887043304998
Precision: 0.8403990268707275
Recall: 0.8239609003067017
F1Val: 0.832098787636169
LossVal: 0.3940335313479106
LossTrain: 0.14252269433604348
----------

Training 33/49
Total iteration: 9
Epoch 34/50 [16122020-020232]
AccVal: 0.8427672955974843
AUCVal: 0.920480470551384
Precision: 0.8421578407287598
Recall: 0.824449896812439
F1Val: 0.8332097940610321
LossVal: 0.397031774

KeyboardInterrupt: 

In [17]:
# Reload the saved EMB_Raw checkpoint and evaluate it on the training set
dataloader = make_EmbeddingDataLoader(datasetTrain, batch_size=2048)
model_emb = ModelEmbedding(
    vocab_size=len(list_vocab),
    datasetTrain=datasetTrain,
    datasetVal=datasetVal,
    init_weight=embedding_weight,
    batch_size=2048,
    optimizer_choice='adam',
    init_lr=0.001,
    layers=[300,1],
    weight_decay=1e-4,
    dropout=0.8,
    batchnorm=True,
    checkpoint='Checkpoint/EMB_Raw-16122020-015756.pth.tar',
    model_name='EMB_Raw',
)
model_emb.load_trained_model()
model_emb.evaluate(dataloader)

LOAD PRETRAINED MODEL AT Checkpoint/EMB_Raw-16122020-015756.pth.tar


{'accuracy': 0.933958418263351,
 'precision': 0.9351532,
 'recall': 0.9255501,
 'f1': 0.9303268938167982,
 'tp': 7571,
 'tn': 8466,
 'fp': 525,
 'fn': 609,
 'auc': 0.9819344405530224}

# Embedding with UNK and Selected Vocab
Uses the UNK token, but restricts the vocabulary to 2457 selected words.

In [18]:
dt_train = data_train.copy()
dt_val = data_val.copy()

# Load list of selected words
# NOTE: joblib.load unpickles the file, so only load artifacts you trust.
common_vocab = joblib.load('selected_vocab_s1.joblib')

# Replace unknown words with UNK token
# A set gives constant-time membership tests; the list would otherwise be
# scanned once per token of every headline.
known_words = set(common_vocab)
dt_train['preprocess'] = dt_train.headline_s1.apply(lambda row: add_unknown_token(row, known_words))
dt_val['preprocess'] = dt_val.headline_s1.apply(lambda row: add_unknown_token(row, known_words))

In [19]:
# Vocabulary = '<unk>' at index 0, followed by the selected words
unk_entry = ['<unk>']
list_vocab = unk_entry + common_vocab
print(f"Total Vocabulary: {len(list_vocab)}")

Total Vocabulary: 2458


In [20]:
# Initialise weight by Glove model
# Row i of embedding_weight holds the 300-d vector of list_vocab[i];
# row 0 belongs to the '<unk>' token and gets a random vector.
embedding_weight = np.zeros((len(list_vocab), 300))
unknown_word = dict()  # word missing from GloVe -> random vector drawn for it
embedding_weight[0] = np.random.uniform(-2.5, 2.5, 300)
for idx in tqdm(range(1, len(list_vocab))):
    word = list_vocab[idx]
    word_split = word.split()
    if len(word_split) == 1:
        feature, flag = get_word_features(word_split[0], embeddings_dict)
        if flag == 0: # not exist word
            unknown_word[word_split[0]] = feature
    else:
        # Entry made of several tokens: average the vectors of its parts
        feature = np.zeros((len(word_split), 300))
        for idx_w, w in enumerate(word_split):
            ft, flag = get_word_features(w, embeddings_dict)
            if flag == 0:
                # Reuse the vector already drawn for this unknown token, if any
                ft = unknown_word.setdefault(w, ft)
            feature[idx_w] = ft
        feature = np.mean(feature, axis=0) # equal to divided by len(word_split)

    embedding_weight[idx] = feature

100%|██████████| 2457/2457 [00:00<00:00, 386447.85it/s]


In [21]:
# Wrap the train / validation frames in EmbeddingDataset objects, both reading
# the 'preprocess' column and sharing the same vocabulary
datasetTrain, datasetVal = (
    EmbeddingDataset(frame, col_name='preprocess', list_vocab=list_vocab)
    for frame in (dt_train, dt_val)
)

In [69]:
# Build the EMB_UNK_Raw_Light model (no checkpoint given) and train it for 50 epochs
model_emb = ModelEmbedding(
    vocab_size=len(list_vocab),
    datasetTrain=datasetTrain,
    datasetVal=datasetVal,
    init_weight=embedding_weight,
    batch_size=2048,
    optimizer_choice='adam',
    init_lr=0.001,
    layers=[300,1],
    weight_decay=1e-4,
    dropout=0.8,
    batchnorm=True,
    checkpoint=None,
    model_name='EMB_UNK_Raw_Light',
)

model_emb.train(numb_epoch=50)

TRAIN FROM SCRATCH
Training 0/49
Total iteration: 9
Epoch 1/50 [16122020-020950] [SAVE]
AccVal: 0.59958071278826
AUCVal: 0.6392825135518451
Precision: 0.5726381540298462
Recall: 0.6283618807792664
F1Val: 0.5992073187360426
LossVal: 0.6808372537295023
LossTrain: 0.6865804261631436
----------

Training 1/49
Total iteration: 9
Epoch 2/50 [16122020-020953] [SAVE]
AccVal: 0.6245050081528069
AUCVal: 0.6931211878638116
Precision: 0.6638910174369812
Recall: 0.42885085940361023
F1Val: 0.5210933106476258
LossVal: 0.6709955135981241
LossTrain: 0.6756826506720649
----------

Training 2/49
Total iteration: 9
Epoch 3/50 [16122020-020955] [SAVE]
AccVal: 0.6429070580013976
AUCVal: 0.7224701772398612
Precision: 0.6916167736053467
Recall: 0.4518337547779083
F1Val: 0.5465838776113282
LossVal: 0.6614372134208679
LossTrain: 0.6651503841082255
----------

Training 3/49
Total iteration: 9
Epoch 4/50 [16122020-020958] [SAVE]
AccVal: 0.6655019799673888
AUCVal: 0.737598321572449
Precision: 0.6809269189834595
Re

Epoch 31/50 [16122020-021102] [SAVE]
AccVal: 0.8413696715583509
AUCVal: 0.9151934020134169
Precision: 0.8285163640975952
Recall: 0.8410757780075073
F1Val: 0.8347488323066976
LossVal: 0.38847771286964417
LossTrain: 0.25745272305276656
----------

Training 31/49
Total iteration: 9
Epoch 32/50 [16122020-021105] [SAVE]
AccVal: 0.8413696715583509
AUCVal: 0.9152609437130751
Precision: 0.8282002210617065
Recall: 0.8415647745132446
F1Val: 0.8348290138273278
LossVal: 0.3883997102578481
LossTrain: 0.2524148225784302
----------

Training 32/49
Total iteration: 9
Epoch 33/50 [16122020-021108]
AccVal: 0.8409037968786397
AUCVal: 0.9153600266251338
Precision: 0.8270893096923828
Recall: 0.8420537710189819
F1Val: 0.8345044594968324
LossVal: 0.38855721553166706
LossTrain: 0.24770012166765001
----------

Training 33/49
Total iteration: 9
Epoch 34/50 [16122020-021110]
AccVal: 0.8406708595387841
AUCVal: 0.9154124502953997
Precision: 0.8270062208175659
Recall: 0.8415647745132446
F1Val: 0.8342219847893265
Lo

In [22]:
# Reload the saved EMB_UNK_Raw_Light checkpoint and evaluate it on the training set
dataloader = make_EmbeddingDataLoader(datasetTrain, batch_size=2048)
model_emb = ModelEmbedding(
    vocab_size=len(list_vocab),
    datasetTrain=datasetTrain,
    datasetVal=datasetVal,
    init_weight=embedding_weight,
    batch_size=2048,
    optimizer_choice='adam',
    init_lr=0.001,
    layers=[300,1],
    weight_decay=1e-4,
    dropout=0.8,
    batchnorm=True,
    checkpoint='Checkpoint/EMB_UNK_Raw_Light-16122020-020947.pth.tar',
    model_name='EMB_UNK_Raw_Light',
)
model_emb.load_trained_model()
model_emb.evaluate(dataloader)

LOAD PRETRAINED MODEL AT Checkpoint/EMB_UNK_Raw_Light-16122020-020947.pth.tar


{'accuracy': 0.9093820977229049,
 'precision': 0.9015519,
 'recall': 0.9090465,
 'f1': 0.9052836756570761,
 'tp': 7436,
 'tn': 8179,
 'fp': 812,
 'fn': 744,
 'auc': 0.9695333951174756}

# Discard UNK and Selected Vocab
Does not use the UNK token; only the 2457 selected words are kept.

In [23]:
dt_train = data_train.copy()
dt_val = data_val.copy()

# Load list of selected words
# NOTE: joblib.load unpickles the file, so only load artifacts you trust.
common_vocab = joblib.load('selected_vocab_s1.joblib')

# remove unknown words
# A set gives constant-time membership tests; the list would otherwise be
# scanned once per token of every headline.
known_words = set(common_vocab)
dt_train['preprocess'] = dt_train.headline_s1.apply(lambda row: remove_unknown_token(row, known_words))
dt_val['preprocess'] = dt_val.headline_s1.apply(lambda row: remove_unknown_token(row, known_words))

# No UNK token in this setting: the vocabulary is exactly the selected words
list_vocab = common_vocab
print(f"Total Vocabulary: {len(list_vocab)}")

Total Vocabulary: 2457


In [24]:
# initialise weight by Glove model
# Row i of embedding_weight holds the 300-d vector of list_vocab[i]
# (no '<unk>' row in this setting).
embedding_weight = np.zeros((len(list_vocab), 300))
unknown_word = dict()  # word missing from GloVe -> random vector drawn for it
for idx in tqdm(range(len(list_vocab))):
    word = list_vocab[idx]
    word_split = word.split()
    if len(word_split) == 1:
        feature, flag = get_word_features(word_split[0], embeddings_dict)
        if flag == 0: # not exist word
            unknown_word[word_split[0]] = feature
    else:
        # Entry made of several tokens: average the vectors of its parts
        feature = np.zeros((len(word_split), 300))
        for idx_w, w in enumerate(word_split):
            ft, flag = get_word_features(w, embeddings_dict)
            if flag == 0:
                # Reuse the vector already drawn for this unknown token, if any
                ft = unknown_word.setdefault(w, ft)
            feature[idx_w] = ft
        feature = np.mean(feature, axis=0) # equal to divided by len(word_split)

    embedding_weight[idx] = feature

100%|██████████| 2457/2457 [00:00<00:00, 497389.11it/s]


In [25]:
# Wrap the train / validation frames in EmbeddingDataset objects, both reading
# the 'preprocess' column and sharing the same vocabulary
datasetTrain, datasetVal = (
    EmbeddingDataset(frame, col_name='preprocess', list_vocab=list_vocab)
    for frame in (dt_train, dt_val)
)

In [75]:
# Build the EMB_Raw_Light model (no checkpoint given) and train it for 50 epochs
model_emb = ModelEmbedding(
    vocab_size=len(list_vocab),
    datasetTrain=datasetTrain,
    datasetVal=datasetVal,
    init_weight=embedding_weight,
    batch_size=2048,
    optimizer_choice='adam',
    init_lr=0.001,
    layers=[300,1],
    weight_decay=1e-4,
    dropout=0.8,
    batchnorm=True,
    checkpoint=None,
    model_name='EMB_Raw_Light',
)

model_emb.train(numb_epoch=50)

TRAIN FROM SCRATCH
Training 0/49
Total iteration: 9
Epoch 1/50 [16122020-021736] [SAVE]
AccVal: 0.5718611693454461
AUCVal: 0.7141987661947811
Precision: 0.777479887008667
Recall: 0.14180928468704224
F1Val: 0.2398676499841472
LossVal: 0.6756773789723715
LossTrain: 0.6842303077379862
----------

Training 1/49
Total iteration: 9
Epoch 2/50 [16122020-021738] [SAVE]
AccVal: 0.663172606568833
AUCVal: 0.799472065362093
Precision: 0.8462427854537964
Recall: 0.3579462170600891
F1Val: 0.5030927672348566
LossVal: 0.6589230497678121
LossTrain: 0.6655566427442763
----------

Training 2/49
Total iteration: 9
Epoch 3/50 [16122020-021740] [SAVE]
AccVal: 0.7290938737479618
AUCVal: 0.8351411523636332
Precision: 0.830584704875946
Recall: 0.5418093204498291
F1Val: 0.6558153222405851
LossVal: 0.6399380366007487
LossTrain: 0.645358039273156
----------

Training 3/49
Total iteration: 9
Epoch 4/50 [16122020-021742] [SAVE]
AccVal: 0.7584439785697648
AUCVal: 0.8543557326697351
Precision: 0.828125
Recall: 0.6220

Epoch 31/50 [16122020-021837]
AccVal: 0.8341486140228278
AUCVal: 0.9063433728649862
Precision: 0.8381532430648804
Recall: 0.8078239560127258
F1Val: 0.8227091413153903
LossVal: 0.4416554967562358
LossTrain: 0.20218316051695082
----------

Training 31/49
Total iteration: 9
Epoch 32/50 [16122020-021839] [SAVE]
AccVal: 0.8346144887025391
AUCVal: 0.9060986565618774
Precision: 0.8390045762062073
Recall: 0.8078239560127258
F1Val: 0.8231190832605454
LossVal: 0.44562562306722003
LossTrain: 0.19903183645672268
----------

Training 32/49
Total iteration: 9
Epoch 33/50 [16122020-021841]
AccVal: 0.8332168646634055
AUCVal: 0.9058289248144507
Precision: 0.836796760559082
Recall: 0.8073349595069885
F1Val: 0.8218019214008023
LossVal: 0.44967320561408997
LossTrain: 0.19607407020197976
----------

Training 33/49
Total iteration: 9
Epoch 34/50 [16122020-021843]
AccVal: 0.831819240624272
AUCVal: 0.9055115549600188
Precision: 0.8352762460708618
Recall: 0.8058679699897766
F1Val: 0.8203085892968646
LossVal: 0

In [26]:
# Reload the saved EMB_Raw_Light checkpoint and evaluate it on the training set
dataloader = make_EmbeddingDataLoader(datasetTrain, batch_size=2048)
model_emb = ModelEmbedding(
    vocab_size=len(list_vocab),
    datasetTrain=datasetTrain,
    datasetVal=datasetVal,
    init_weight=embedding_weight,
    batch_size=2048,
    optimizer_choice='adam',
    init_lr=0.001,
    layers=[300,1],
    weight_decay=1e-4,
    dropout=0.8,
    batchnorm=True,
    checkpoint='Checkpoint/EMB_Raw_Light-16122020-021734.pth.tar',
    model_name='EMB_Raw_Light',
)

model_emb.load_trained_model()
model_emb.evaluate(dataloader)

LOAD PRETRAINED MODEL AT Checkpoint/EMB_Raw_Light-16122020-021734.pth.tar


{'accuracy': 0.9309300564905946,
 'precision': 0.9268799,
 'recall': 0.9282396,
 'f1': 0.9275592350974992,
 'tp': 7593,
 'tn': 8392,
 'fp': 599,
 'fn': 587,
 'auc': 0.9815640824198283}