# Local

In [None]:
# Base path for a local run; dataset and checkpoint paths in later cells are
# built by prefixing this string, so "" resolves them against the working directory.
root = ""

# Connect To Drive

In [None]:
# Colab-only cell: mount Google Drive and point `root` at the project folder.
from google.colab import drive
drive.mount('/content/drive')
# Overrides any `root` assigned earlier. The path is relative, so it resolves
# against the current working directory.
# NOTE(review): presumably the working directory is /content on Colab -- confirm.
root = "drive/MyDrive/CMPLabData/Lexicon/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import csv
import random
import pickle
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from tqdm import tqdm
# from tensorflow.keras.utils import plot_model

# Import Words and Nonwords Data

In [None]:
# Read [word, frequency] pairs from Items.csv. Column 0 is the word; column 2 is
# a count whose commas (thousands separators) are stripped before parsing.
words = []
frq = []
sum_frq = 0
with open(root + "Datasets/Items.csv", 'r') as file:
    reader = csv.reader(file)
    next(reader, None)  # skip the header row
    for row in reader:
        count = int(row[2].replace(',', ''))
        words.append([row[0], count])
        frq.append(count)
        sum_frq += count

# Most frequent words first.
words.sort(key=lambda entry: entry[1], reverse=True)

# Median split on the frequency ranking: top half is HF, bottom half is LF.
midpoint = int(len(words) / 2)
all_words = [entry[0] for entry in words]
hf_words = all_words[:midpoint]
lf_words = all_words[midpoint:]
# TODO: test a split criterion other than "one half of the words is HF and the
# other half is LF".

In [None]:
def categorise_by_log_freq(row, freq_rate=6):
    """Assign a frequency category to one row.

    Args:
        row: Mapping-like object with a 'label' entry and, for label 1,
            a 'log_freq' entry.
        freq_rate: Log-frequency threshold separating 'LF' from 'HF'.

    Returns:
        'NW' when the label is 0; 'HF' when the label is 1 and log_freq is
        above ``freq_rate``; 'LF' when the label is 1 and log_freq lies in
        ``[0, freq_rate]``; otherwise None.
    """
    label = row['label']
    if label == 0:
        return 'NW'
    if label != 1:
        # Labels other than 0/1 are left uncategorised.
        return None

    log_freq = row['log_freq']
    if log_freq > freq_rate:
        return 'HF'
    if 0 <= log_freq <= freq_rate:
        return 'LF'
    # Negative or NaN log-frequencies match neither band.
    return None

In [None]:
# NOTE: pickle.load can execute arbitrary code from the file -- only load
# pickles from a trusted source.
# This rebinds `words`, replacing the [word, frequency] list read from Items.csv.
with open(root+'Datasets/words.pkl', 'rb') as words_file:
  words = pickle.load(words_file)

with open(root+'Datasets/nonwords.pkl', 'rb') as nonwords_file:
  nonwords = pickle.load(nonwords_file)

In [None]:
# NOTE: these loops mutate the loaded lists in place, so running the cell a
# second time inserts the extra fields again.

# Real words: one-hot target [1, 0] at position 3, class label 1 at position 4.
for word_entry in words:
    word_entry.insert(3, [1, 0])
    word_entry.insert(4, 1)

# Nonwords: a 0 goes in at position 1 first (the 'freq' slot of the frame built
# from these rows), then the one-hot target [0, 1] and class label 0.
for nonword_entry in nonwords:
    nonword_entry.insert(1, 0)
    nonword_entry.insert(3, [0, 1])
    nonword_entry.insert(4, 0)

In [None]:
# One frame holding the real words followed by the nonwords.
word_df = pd.DataFrame(words + nonwords,
                       columns=['string', 'freq', 'represention', 'code', 'label'])
word_df['freq'] = word_df['freq'].astype('int')
# Rounded natural log of (freq + 1); the +1 maps a frequency of 0 to 0.
word_df['log_freq'] = np.round(np.log(word_df['freq'] + 1))

# First HF, LF, NW categorization (median split of the Items.csv ranking).
# A single dict lookup replaces scanning the hf/lf lists once per row. HF is
# written last so that a string present in both halves resolves to HF.
band_by_word = dict.fromkeys(lf_words, 'LF')
band_by_word.update(dict.fromkeys(hf_words, 'HF'))
category = word_df['string'].map(band_by_word)
# Rows labelled 0 are nonwords regardless of any lexicon match.
category = category.where(word_df['label'] != 0, 'NW')
if category.isna().any():
    # Fail loudly instead of producing a misaligned or partially empty column.
    unmatched = word_df.loc[category.isna(), 'string'].head().tolist()
    raise ValueError(
        "{} word rows are in neither hf_words nor lf_words, e.g. {}".format(
            int(category.isna().sum()), unmatched))
word_df['category'] = category

# Alternative HF, LF, NW categorization: log-frequency threshold of 8.
word_df['category_a'] = word_df.apply(lambda row: categorise_by_log_freq(row, 8), axis=1)

word_df

80822it [00:47, 1698.68it/s] 


Unnamed: 0,string,freq,represention,code,label,log_freq,category,category_a
0,a,10610626,"[0.11558813, 0.30192456, -0.114647746, 0.01000...","[1, 0]",1,16.0,HF,HF
1,aah,222,"[0.08188029, -0.12485198, -0.3285692, 0.374986...","[1, 0]",1,5.0,LF,LF
2,Aaron,10806,"[0.12105307, 0.058084395, -0.013443807, 0.1779...","[1, 0]",1,9.0,HF,HF
3,aback,387,"[-0.11328265, -0.23398337, -0.58877015, 0.2669...","[1, 0]",1,6.0,LF,LF
4,abacus,513,"[0.19533455, 0.10326072, -0.19377021, 0.234316...","[1, 0]",1,6.0,HF,LF
...,...,...,...,...,...,...,...,...
80817,declassificarion,0,"[-0.1627126, -0.21505767, 0.2029604, 0.0559478...","[0, 1]",0,0.0,NW,NW
80818,antifundamentadast,0,"[-0.09716947, -0.056558426, -0.18841882, -0.04...","[0, 1]",0,0.0,NW,NW
80819,transcontanental,0,"[0.22957236, -0.21539333, -0.027639901, 0.1270...","[0, 1]",0,0.0,NW,NW
80820,iv,0,"[-0.42683536, -0.50000775, -0.16295096, -0.305...","[0, 1]",0,0.0,NW,NW


In [None]:
# Oversample high-frequency words: every row whose category_a is 'HF' is
# repeated int(log_freq) times, with the copies adjacent and in frame order.
HF_LF_df = word_df.loc[word_df['log_freq'] != 0]
hf_rows = HF_LF_df.loc[HF_LF_df['category_a'] == 'HF']
repeat_counts = hf_rows['log_freq'].astype(int).to_numpy()
# Positional repeat avoids a Python-level loop over rows and keeps the column
# layout even when no row qualifies.
row_positions = np.repeat(np.arange(len(hf_rows)), repeat_counts)
duplicates_df = hf_rows.iloc[row_positions].reset_index(drop=True)

39965it [00:02, 15567.44it/s]


In [None]:
# Preview the oversampled HF rows (one line per repeated copy).
duplicates_df

Unnamed: 0,string,freq,represention,code,label,log_freq,category,category_a
0,a,10610626,"[0.11558813, 0.30192456, -0.114647746, 0.01000...","[1, 0]",1,16.0,HF,HF
1,a,10610626,"[0.11558813, 0.30192456, -0.114647746, 0.01000...","[1, 0]",1,16.0,HF,HF
2,a,10610626,"[0.11558813, 0.30192456, -0.114647746, 0.01000...","[1, 0]",1,16.0,HF,HF
3,a,10610626,"[0.11558813, 0.30192456, -0.114647746, 0.01000...","[1, 0]",1,16.0,HF,HF
4,a,10610626,"[0.11558813, 0.30192456, -0.114647746, 0.01000...","[1, 0]",1,16.0,HF,HF
...,...,...,...,...,...,...,...,...
63116,zoom,4920,"[-0.36649638, 0.68561196, 0.1494862, -0.219817...","[1, 0]",1,9.0,HF,HF
63117,zoom,4920,"[-0.36649638, 0.68561196, 0.1494862, -0.219817...","[1, 0]",1,9.0,HF,HF
63118,zoom,4920,"[-0.36649638, 0.68561196, 0.1494862, -0.219817...","[1, 0]",1,9.0,HF,HF
63119,zoom,4920,"[-0.36649638, 0.68561196, 0.1494862, -0.219817...","[1, 0]",1,9.0,HF,HF


## Concat

In [None]:
# Disabled variant: append the oversampled HF rows, shuffle, and re-index.
# mode_space_df = pd.concat([word_df, duplicates_df]).sample(frac=1.0).reset_index(drop=True)
# Active variant: a second name for the same DataFrame object (no copy), so
# columns added to `word_df` later are visible through `mode_space_df` too.
# NOTE: rows keep the order in which the frame was built (words, then nonwords).
mode_space_df = word_df
mode_space_df

Unnamed: 0,string,freq,represention,code,label,log_freq,category,category_a
0,a,10610626,"[0.11558813, 0.30192456, -0.114647746, 0.01000...","[1, 0]",1,16.0,HF,HF
1,aah,222,"[0.08188029, -0.12485198, -0.3285692, 0.374986...","[1, 0]",1,5.0,LF,LF
2,Aaron,10806,"[0.12105307, 0.058084395, -0.013443807, 0.1779...","[1, 0]",1,9.0,HF,HF
3,aback,387,"[-0.11328265, -0.23398337, -0.58877015, 0.2669...","[1, 0]",1,6.0,LF,LF
4,abacus,513,"[0.19533455, 0.10326072, -0.19377021, 0.234316...","[1, 0]",1,6.0,HF,LF
...,...,...,...,...,...,...,...,...
80817,declassificarion,0,"[-0.1627126, -0.21505767, 0.2029604, 0.0559478...","[0, 1]",0,0.0,NW,NW
80818,antifundamentadast,0,"[-0.09716947, -0.056558426, -0.18841882, -0.04...","[0, 1]",0,0.0,NW,NW
80819,transcontanental,0,"[0.22957236, -0.21539333, -0.027639901, 0.1270...","[0, 1]",0,0.0,NW,NW
80820,iv,0,"[-0.42683536, -0.50000775, -0.16295096, -0.305...","[0, 1]",0,0.0,NW,NW


In [None]:
# Shuffle before the positional 80/20 split. When mode_space_df is the plain
# word_df, its rows are ordered words first and nonwords last, so slicing it
# unshuffled leaves the validation set with nonwords only.
SPLIT_SEED = 42  # fixed seed keeps the split reproducible across runs
shuffled_df = mode_space_df.sample(frac=1.0, random_state=SPLIT_SEED)
n_train = int(len(shuffled_df) * 0.8)
train_dataframe = shuffled_df.iloc[:n_train, :]
valid_dataframe = shuffled_df.iloc[n_train:, :]

# Neural Network

In [None]:
# Sequential model: 300-d input -> Dense(300, relu) -> Dense(10, relu) -> 2-way softmax.
# Layer names are kept as-is because they are stored with the saved model.
model = keras.Sequential([
    layers.Input(shape=(300,)),
    layers.Dense(300, activation="relu", name="dense1"),
    layers.Dense(10, activation="relu", name="dense4"),
    layers.Dense(2, activation='softmax', name='clf'),
])

# `learning_rate` replaces the deprecated `lr` alias.
# NOTE(review): `decay` is a legacy-optimizer argument; newer Keras releases may
# reject it -- confirm against the installed TensorFlow version.
opt = keras.optimizers.SGD(learning_rate=0.001, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(optimizer=opt,
              loss=keras.losses.categorical_crossentropy,
              metrics=['accuracy'])

model.summary()
# plot_model(model, to_file='model.png', show_shapes=True)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense1 (Dense)              (None, 300)               90300     
                                                                 
 dense4 (Dense)              (None, 10)                3010      
                                                                 
 clf (Dense)                 (None, 2)                 22        
                                                                 
Total params: 93,332
Trainable params: 93,332
Non-trainable params: 0
_________________________________________________________________


  super(SGD, self).__init__(name, **kwargs)


In [None]:
# Turn the per-row vectors and one-hot codes into arrays Keras can consume.
train_inputs = np.array(list(train_dataframe['represention']))
train_targets = np.array(list(train_dataframe['code']))
valid_inputs = np.array(list(valid_dataframe['represention']))
valid_targets = np.array(list(valid_dataframe['code']))

# Stop once val_loss has gone 5 epochs without improving and restore the
# weights from the best epoch.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)

history = model.fit(
    x=train_inputs,
    y=train_targets,
    validation_data=(valid_inputs, valid_targets),
    callbacks=[early_stopping],
    epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Class probabilities for the training rows themselves (not held-out data).
y_preds = model.predict(np.array(list(train_dataframe['represention'])))

In [None]:
# Sanity check: one row per training example, one column per softmax unit.
y_preds.shape

(64657, 2)

In [None]:
# Peek at the raw softmax outputs.
y_preds

array([[0.9062808 , 0.09371914],
       [0.6021876 , 0.3978125 ],
       [0.8470599 , 0.1529401 ],
       ...,
       [0.01744834, 0.9825516 ],
       [0.00530944, 0.9946905 ],
       [0.01771074, 0.98228925]], dtype=float32)

In [None]:
# Training-set report. argmax turns the one-hot targets into class ids:
# 0 = word ([1, 0]), 1 = nonword ([0, 1]).
true_classes = np.array(list(train_dataframe['code'])).argmax(axis=1)
predicted_classes = y_preds.argmax(axis=1)
print(classification_report(true_classes, predicted_classes))

              precision    recall  f1-score   support

           0       0.95      0.93      0.94     40481
           1       0.89      0.91      0.90     24176

    accuracy                           0.92     64657
   macro avg       0.92      0.92      0.92     64657
weighted avg       0.92      0.92      0.92     64657



In [None]:
# Persist the trained model under <root>check_points/NN_model.
model.save(root+"check_points/NN_model")

INFO:tensorflow:Assets written to: drive/MyDrive/CMPLabData/Lexicon/check_points/NN_model/assets


# Check Probabilities

In [None]:
# Score every row of word_df, training and validation rows alike.
full_dataset_predictions = model.predict(np.array(list(word_df['represention'])))

In [None]:
# Sanity check: the row count should equal len(word_df), with two probability columns.
full_dataset_predictions.shape

(80822, 2)

In [None]:
# Softmax column 0 is P(word) and column 1 is P(nonword), following the
# [1, 0] / [0, 1] target codes.
word_df['word_prob'] = full_dataset_predictions[:, 0]
word_df['nword_prob'] = full_dataset_predictions[:, 1]
# Log-odds of "word". A saturated softmax can emit an exact 0, which would make
# the ratio +/-inf and distort any later mean/std. Floor both probabilities at
# the smallest normal float32; values at or above the floor are unchanged.
PROB_FLOOR = np.finfo(np.float32).tiny
word_df['logit'] = np.log(
    word_df['word_prob'].clip(lower=PROB_FLOOR)
    / word_df['nword_prob'].clip(lower=PROB_FLOOR)
)

In [None]:
# Show the frame with the new word_prob / nword_prob / logit columns.
word_df

Unnamed: 0,string,freq,represention,code,label,log_freq,category,category_a,word_prob,nword_prob,logit
0,a,10610626,"[0.11558813, 0.30192456, -0.114647746, 0.01000...","[1, 0]",1,16.0,HF,HF,0.906281,0.093719,2.269047
1,aah,222,"[0.08188029, -0.12485198, -0.3285692, 0.374986...","[1, 0]",1,5.0,LF,LF,0.602188,0.397812,0.414588
2,Aaron,10806,"[0.12105307, 0.058084395, -0.013443807, 0.1779...","[1, 0]",1,9.0,HF,HF,0.847060,0.152940,1.711725
3,aback,387,"[-0.11328265, -0.23398337, -0.58877015, 0.2669...","[1, 0]",1,6.0,LF,LF,0.928595,0.071405,2.565301
4,abacus,513,"[0.19533455, 0.10326072, -0.19377021, 0.234316...","[1, 0]",1,6.0,HF,LF,0.967652,0.032348,3.398307
...,...,...,...,...,...,...,...,...,...,...,...
80817,declassificarion,0,"[-0.1627126, -0.21505767, 0.2029604, 0.0559478...","[0, 1]",0,0.0,NW,NW,0.207975,0.792025,-1.337176
80818,antifundamentadast,0,"[-0.09716947, -0.056558426, -0.18841882, -0.04...","[0, 1]",0,0.0,NW,NW,0.008859,0.991141,-4.717467
80819,transcontanental,0,"[0.22957236, -0.21539333, -0.027639901, 0.1270...","[0, 1]",0,0.0,NW,NW,0.032687,0.967313,-3.387558
80820,iv,0,"[-0.42683536, -0.50000775, -0.16295096, -0.305...","[0, 1]",0,0.0,NW,NW,0.847766,0.152234,1.717187


In [None]:
# Mean, standard deviation and group size of both class probabilities and the
# log-odds, grouped by the median-split labels in `category`.
summary_stats = ['mean', 'std', 'count']
word_df.groupby(['category']).agg(
    {column: summary_stats for column in ('word_prob', 'nword_prob', 'logit')})

Unnamed: 0_level_0,word_prob,word_prob,word_prob,nword_prob,nword_prob,nword_prob,logit,logit,logit
Unnamed: 0_level_1,mean,std,count,mean,std,count,mean,std,count
category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
HF,0.907862,0.164081,20240,0.092138,0.164081,20240,3.46172,1.892805,20240
LF,0.840264,0.218612,20241,0.159736,0.218612,20241,2.633909,2.011414,20241
NW,0.154113,0.233773,40341,0.845887,0.233773,40341,-2.912167,2.297908,40341


In [None]:
# Same summary as for `category`, grouped by the log-frequency-threshold labels
# in `category_a`.
summary_stats = ['mean', 'std', 'count']
word_df.groupby(['category_a']).agg(
    {column: summary_stats for column in ('word_prob', 'nword_prob', 'logit')})

Unnamed: 0_level_0,word_prob,word_prob,word_prob,nword_prob,nword_prob,nword_prob,logit,logit,logit
Unnamed: 0_level_1,mean,std,count,mean,std,count,mean,std,count
category_a,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
HF,0.915882,0.153521,6423,0.084118,0.153521,6423,3.573848,1.885983,6423
LF,0.866175,0.202294,34058,0.133825,0.202294,34058,2.948598,2.001094,34058
NW,0.154113,0.233773,40341,0.845887,0.233773,40341,-2.912167,2.297908,40341


In [None]:
# Export frame: drop the vector-valued columns and the median-split labels, then
# expose the log-frequency split (category_a) under the plain name "category".
word_df_to_save = (
    word_df
    .drop(columns=['represention', 'code', 'category'])
    .rename(columns={"category_a": "category"})
)

In [None]:
# Write the per-item scores to <root>Datasets/full_dataset_prob_a.csv without the index.
# NOTE: header=0 is falsy, so pandas should omit the column-name row.
# NOTE(review): confirm downstream readers expect a headerless file.
word_df_to_save.to_csv(root+"Datasets/full_dataset_prob_a.csv", header=0, index=False)

In [None]:
# Preview the exported columns.
word_df_to_save

Unnamed: 0,string,freq,label,log_freq,category,word_prob,nword_prob,logit
0,a,10610626,1,16.0,HF,0.906281,0.093719,2.269047
1,aah,222,1,5.0,LF,0.602188,0.397812,0.414588
2,Aaron,10806,1,9.0,HF,0.847060,0.152940,1.711725
3,aback,387,1,6.0,LF,0.928595,0.071405,2.565301
4,abacus,513,1,6.0,LF,0.967652,0.032348,3.398307
...,...,...,...,...,...,...,...,...
80817,declassificarion,0,0,0.0,NW,0.207975,0.792025,-1.337176
80818,antifundamentadast,0,0,0.0,NW,0.008859,0.991141,-4.717467
80819,transcontanental,0,0,0.0,NW,0.032687,0.967313,-3.387558
80820,iv,0,0,0.0,NW,0.847766,0.152234,1.717187
