# Import the following libraries

In [None]:
import os, sys, pandas as pd, re, numpy as np, string, nltk, tqdm, functools, scipy, time
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm
from collections import Counter, defaultdict
from functools import reduce
from sklearn.decomposition import PCA, TruncatedSVD
from scipy.sparse import csr_matrix
import utils, torch_model_base, torch_autoencoder
from torch_autoencoder import TorchAutoencoder
from sklearn.metrics.pairwise import cosine_similarity
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.callbacks import EarlyStopping

# Import your dataset

In [None]:
# Load the raw dataset into `data`; replace 'your file' with the path to your CSV.
# Later cells read a text column named `comment`.
data = pd.read_csv('your file')

## In case you want to take a random sample of the above file

In [None]:
# Draw a reproducible (random_state=1) sample sized at 1% of the rows.
# replace=True samples with replacement, so the same row may appear more than once.
# NOTE(review): `data_mini` is not referenced by any later cell shown here.
data_mini = data.sample(frac=.01, replace=True, random_state=1)


In [None]:
# Print a summary of the loaded frame (columns, dtypes, non-null counts).
data.info()

In [None]:
# Every later cell works on `df`, which was never assigned. Bind it to a copy
# of the loaded data so the cleaning steps below do not modify `data`.
# Use `data_mini.copy()` instead to run the notebook on the random sample.
df = data.copy()
df.head(5)

In [None]:
# Lower-case every comment so tokens that differ only in case are merged.
df['comment'] = df['comment'].str.lower()
df.head()

In [None]:
# Replace every run of non-letter characters with a single space.
# Missing comments are blanked first: str(NaN) would otherwise inject the
# literal token 'nan' into the vocabulary.
_non_letters = re.compile(r'[^a-zA-Z]+')
df['comment'] = df['comment'].fillna('').apply(lambda x: _non_letters.sub(' ', str(x)))
df.head()

In [None]:
# Tokens to drop from every comment (runs of "x", presumably masked values).
new_stopwords=["xxxx","xxxxxxxx","xx","xxxx.","xxxxxxxx.","xx."]


def _drop_new_stopwords(text):
    """Return `text` re-joined with single spaces, minus the tokens in new_stopwords."""
    kept = []
    for token in text.split():
        if token not in new_stopwords:
            kept.append(token)
    return ' '.join(kept)


df['comment'] = df['comment'].apply(_drop_new_stopwords)
df.head()

In [None]:
# Remove every remaining literal 'xxxx' substring, including ones embedded
# inside longer tokens.
df['comment'] = df['comment'].str.replace('xxxx', '', regex=False)
df.head()

# Create co-occurrence matrix

In [None]:
# Plain Python list of the cleaned comments, one string per row.
df_list = df['comment'].tolist()


In [None]:
# Fit a CountVectorizer to obtain the vocabulary. The custom token_pattern also
# keeps single-character tokens, and no stop words are removed.
vect = CountVectorizer(stop_words=None, token_pattern=r"(?u)\b\w+\b")
# X is the document-term matrix; it is not used by the later cells shown here.
X = vect.fit_transform(df_list)
uniq_wrds = vect.get_feature_names_out()
# Convert the array of feature names into a plain list.
uniq_wrds = uniq_wrds.tolist()
n = len(uniq_wrds)
# Dense n x n matrix of zeros; rows are focus words, columns are context words.
co_mat = np.zeros((n,n))
print(n)

In [None]:
window_len = 10

# Cache of (vocabulary list, {word: index}); rebuilt whenever `uniq_wrds` is
# rebound to a different list object (e.g. after re-running the vectorizer cell).
_wrd_indx_cache = (None, {})


def _get_wrd_indx():
    """Return a word -> row/column index dict for the current `uniq_wrds` list."""
    global _wrd_indx_cache
    if _wrd_indx_cache[0] is not uniq_wrds:
        lookup = {}
        for indx, wrd in enumerate(uniq_wrds):
            # setdefault keeps the first position, matching list.index().
            lookup.setdefault(wrd, indx)
        _wrd_indx_cache = (uniq_wrds, lookup)
    return _wrd_indx_cache[1]


def update_co_mat(x):
    """Add the co-occurrence counts of one sentence to the global `co_mat`.

    Every word of `x` is taken in turn as the focus word. Each word within
    `window_len` positions on either side (the focus position itself included)
    adds 1 to co_mat[focus_index, context_index].

    Parameters
    ----------
    x : str
        Whitespace-separated sentence. Words are lower-cased before lookup.

    Notes
    -----
    Reads the globals `uniq_wrds` and `window_len` and mutates `co_mat` in place.
    Words missing from `uniq_wrds` are skipped, whether they occur as focus or
    as context word. Previously an out-of-vocabulary focus word raised
    ValueError.
    """
    # Dict lookups replace the repeated O(vocabulary) list scans.
    wrd_indx = _get_wrd_indx()

    # Vocabulary index of every word in the sentence; None marks unknown words.
    indices = [wrd_indx.get(wrd.lower()) for wrd in x.split()]
    n_wrds = len(indices)

    for focus_pos, row in enumerate(indices):
        if row is None:
            continue
        first = max(0, focus_pos - window_len)
        last = min(n_wrds, focus_pos + window_len + 1)
        for col in indices[first:last]:
            if col is not None:
                # Every context word counts equally; no distance-based scaling.
                co_mat[row, col] += 1


In [None]:
# Accumulate co-occurrence counts over every comment, with a progress bar.
for sentence in tqdm(df_list):
    update_co_mat(sentence)

In [None]:
# Sanity check: show the first row of the co-occurrence matrix.
print(co_mat[0])

In [None]:
# Label both axes of the matrix with the vocabulary so rows can be looked up by word.
df_cm=pd.DataFrame(co_mat, columns=uniq_wrds, index=uniq_wrds)
# Replace the placeholder below with the output path for the CSV.
df_cm.to_csv('save to your location')

In [None]:
# Both dimensions equal the vocabulary size.
df_cm.shape

In [None]:
# Preview the first rows of the labelled co-occurrence matrix.
df_cm.head()

In [None]:
# Create a single-column ('word') dataframe of the unique words
df_uniq_wrds=pd.DataFrame(uniq_wrds,columns=['word'])
# Replace the placeholder below with the output path for the CSV.
df_uniq_wrds.to_csv('save to your location')

In [None]:
# Preview the first vocabulary entries.
df_uniq_wrds.head()

# Find the most closely related words for any word in the vocabulary

In [None]:
# Find neighbors using cosine or euclidean distance
# Import the submodule explicitly: a bare `import scipy` does not guarantee
# that `scipy.spatial` has been loaded.
from scipy.spatial import distance as _sp_distance


def euclidean(w1, w2):
    """Return the Euclidean distance between the 1-D vectors `w1` and `w2`."""
    return _sp_distance.euclidean(w1, w2)


def cosine(w1, w2):
    """Return the cosine distance (1 - cosine similarity) between `w1` and `w2`."""
    return _sp_distance.cosine(w1, w2)


def neighbors(word, df, distfunc=cosine):
    """Rank every row of `df` by its distance to the row labelled `word`.

    Parameters
    ----------
    word : str
        Row label to look up in `df.index`.
    df : pandas.DataFrame
        One vector per row, indexed by word.
    distfunc : callable
        Function of two vectors returning a distance (default: `cosine`).

    Returns
    -------
    pandas.Series
        Distances indexed by word, sorted in ascending order. `word` itself
        is part of the result.

    Raises
    ------
    ValueError
        If `word` is not in `df.index`.
    """
    if word not in df.index:
        raise ValueError('{} is not in this Vector Space'.format(word))
    w = df.loc[word]
    dists = df.apply(lambda x: distfunc(w, x), axis=1)
    return dists.sort_values()

In [None]:
# The first entry of the sorted result is expected to be 'credit' itself
# (distance 0), so .iloc[1:6] shows its five nearest neighbors.
print(neighbors('credit', df_cm, distfunc=cosine).iloc[1:6])

# Re-weight the co-occurrence matrix using PMI

In [None]:
#...Calculate positive PMI ...
def observed_over_expected(df):
    """Divide each count by the count expected if rows and columns were independent.

    expected[i, j] = row_total[i] * col_total[j] / grand_total

    Cells whose row or column total is zero come out as NaN (0/0).
    """
    col_totals = df.sum(axis=0)
    total = col_totals.sum()
    row_totals = df.sum(axis=1)
    # A zero grand total makes this 0/0; silence the warning and leave NaN.
    with np.errstate(divide='ignore', invalid='ignore'):
        expected = np.outer(row_totals, col_totals) / total
    oe = df / expected
    return oe


def pmi(df, positive=True):
    """Return the pointwise mutual information re-weighting of a count matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Co-occurrence counts.
    positive : bool
        If True (default), negative PMI values are clipped to 0 (PPMI).

    Returns
    -------
    pandas.DataFrame
        log(observed / expected), with every undefined cell set to 0.
    """
    df = observed_over_expected(df)
    with np.errstate(divide='ignore', invalid='ignore'):
        df = np.log(df)
    # log(0) gives -inf and 0/0 gives NaN (rows or columns with no counts);
    # both are treated as "no association".
    df[~np.isfinite(df)] = 0.0
    if positive:
        df[df < 0] = 0.0
    return df

# Re-weight the raw co-occurrence counts with positive PMI.
wghtd_df=pmi(df_cm)

In [None]:
# Preview the first rows of the PMI-weighted matrix.
wghtd_df.head()

Find the most closely related words for any word in the vocabulary, using the reweighted (PMI) matrix

In [None]:
# Five nearest neighbors of 'credit' in the PMI-weighted space (position 0 is skipped).
print(neighbors('credit', wghtd_df, distfunc=cosine).iloc[1:6])

# Use LSA to reduce dimensionality

In [None]:
#Reduce dimensionality using LSA (Latent Semantic Analysis)
# Note: the decomposition is run on the raw counts (df_cm), not on the
# PMI-weighted matrix.
M_dense=df_cm.to_numpy()
M=csr_matrix(M_dense)
# TruncatedSVD cannot extract more components than the matrix has features,
# so cap the target of 500 for vocabularies of 500 words or fewer.
n_lsa_components = max(1, min(500, M.shape[1] - 1))
lsa = TruncatedSVD(n_components=n_lsa_components, n_iter=50, random_state=42)
M_lsa=lsa.fit_transform(M)
# One reduced vector per word, indexed by the vocabulary.
df_lsa=pd.DataFrame(M_lsa, index=uniq_wrds)

Find the most closely related words for any word in the vocabulary, using the LSA matrix

In [None]:
# Five nearest neighbors of 'credit' in the LSA space (position 0 is skipped).
print(neighbors('credit', df_lsa, distfunc=cosine).iloc[1:6])

# Find the most closely related words for any word in the vocabulary using all 3 matrices we have created so far

In [None]:
# Compare the seven nearest neighbors of one word across the three vector spaces.
word_lookup = 'mortgage'

spaces_to_compare = (
    (f'Top 7 Neighbors using Co-occurrence matrix: \n----------------------\n', df_cm),
    (f'Top 7 Neighbors using Weighted Co-occurrence matrix: \n----------------------\n', wghtd_df),
    (f'Top 7 Neighbors using LSA or reduced dimensionality of Co-occurence matrix: \n----------------------\n', df_lsa),
)

for banner, vectors in spaces_to_compare:
    print(banner)
    # Position 0 is skipped; it is expected to be the lookup word itself.
    ranked = neighbors(word_lookup, vectors, distfunc=cosine)
    print(ranked.iloc[1:8])


# Use Autoencoders to further reduce dimensionality

In [None]:
# Reduce dimensionality of the co-occurrence matrix using LSA and autoencoders:
# fit a TorchAutoencoder (hidden_dim=300) on the LSA vectors.
# NOTE(review): assumes .fit() returns the hidden representations as a DataFrame
# indexed like df_lsa (later cells use .shape, .loc and .index on the result);
# confirm against torch_autoencoder.
df_lsa_ae = TorchAutoencoder(max_iter=1000, hidden_dim=300, eta=0.01).fit(df_lsa)

In [None]:
# Check the dimensions of the autoencoder output.
df_lsa_ae.shape

Find the most closely related words for any word in the vocabulary, using the LSA + autoencoder matrix

In [None]:
# Five nearest neighbors of 'credit' in the LSA + autoencoder space (position 0 is skipped).
print(neighbors('credit', df_lsa_ae, distfunc=cosine).iloc[1:6])

Find the most closely related words for any word in the vocabulary using all 4 matrices we have created so far

In [None]:
# Compare the five nearest neighbors of one word across the four vector spaces.
lookup_word = 'home'

spaces_to_compare = (
    (f'Top 5 Neighbors using Co-occurrence matrix: \n----------------------\n', df_cm),
    (f'Top 5 Neighbors using Weighted Co-occurrence matrix: \n----------------------\n', wghtd_df),
    (f'Top 5 Neighbors using LSA or reduced dimensionality of Co-occurence matrix: \n----------------------\n', df_lsa),
    (f'Top 5 Neighbors using LSA+Autoencoders or reduced dimensionality of Co-occurence matrix: \n----------------------\n', df_lsa_ae),
)

for banner, vectors in spaces_to_compare:
    print(banner)
    # Position 0 is skipped; it is expected to be the lookup word itself.
    ranked = neighbors(lookup_word, vectors, distfunc=cosine)
    print(ranked.iloc[1:6])

# Snippet that takes two words and gives you the cosine distance

In [None]:
# The two words to compare; both must be present in df_lsa_ae.index.
a='bank'
b='loan'

# Look up each word's vector in the LSA + autoencoder space.
w_a=df_lsa_ae.loc[a]
w_b=df_lsa_ae.loc[b]

# cosine() is the notebook's wrapper around scipy.spatial.distance.cosine,
# so this is a distance (smaller means more similar), not a similarity.
dist_ab=cosine(w_a, w_b)
print("The distance between '{}' and '{}' is: {}".format(a, b, dist_ab))


# Snippet that takes a word and finds the most related word from the final matrix

In [None]:
def find_2nd_closest(input_array, embeddings_df):
    """Return the row position of the second most cosine-similar embedding.

    When `input_array` is itself a row of `embeddings_df`, the top match is
    that row, so the runner-up is its nearest other word.
    """
    query = np.array(input_array).reshape(1, -1)
    candidates = np.array(embeddings_df.values)
    scores = cosine_similarity(query, candidates)[0]
    # argsort is ascending, so the runner-up sits second from the end.
    ranking = np.argsort(scores)
    return ranking[-2]

# Word to look up and its vector in the LSA + autoencoder space.
input_word = 'fees'
input_emb = df_lsa_ae.loc[input_word]

# find_2nd_closest returns a row position; map it back to the word label.
closest_index = find_2nd_closest(input_emb, df_lsa_ae)
closest_word=df_lsa_ae.index[closest_index]

print("The nearest neighbor for '{}' is: {}".format(input_word, closest_word))

# Combine any two word vectors to see what will be the nearest word of the combination

In [None]:
def find_2nd_closest3(input_array, embeddings_df):
    """Return the row position of the most cosine-similar embedding.

    Despite the name, this picks the top match, not the runner-up.
    """
    query = np.array(input_array).reshape(1, -1)
    candidates = np.array(embeddings_df.values)
    scores = cosine_similarity(query, candidates)[0]
    # argsort is ascending, so the best match is the last position.
    best_first = np.argsort(scores)[::-1]
    return best_first[0]

# Vectors of the two words to combine, taken from the LSA + autoencoder space.
a1='checking'
arr1=df_lsa_ae.loc[a1]
a2='fees'
arr2=df_lsa_ae.loc[a2]
# The combination is the element-wise product of the two vectors. Dividing by 2
# only rescales it, which does not change a cosine-similarity ranking.
w_comb=(np.multiply(arr1, arr2))/2

# Top match for the combined vector; this can be one of the two input words.
closest_index = find_2nd_closest3(w_comb, df_lsa_ae)
closest_word=df_lsa_ae.index[closest_index]

print("If you combine '{}' and '{}' then you get: '{}'".format(a1, a2, closest_word))

# Create embeddings using LSTM 

In [None]:
dflen=len(df)
num_comments = dflen
# NOTE(review): max_features is later used as the Tokenizer's num_words and as the
# Embedding/Dense vocabulary size, yet it is set to the number of comments rather
# than the number of distinct words — confirm this is intended.
max_features = num_comments 
embedding_dim = 300  #controls the dimensionality
window_size = 7 # controls the window size


In [None]:
# The cleaned comments as a list of strings; `texts` is an alias of the same list.
comments = df['comment'].tolist()
texts = comments

In [None]:
# Tokenize text: fit the word -> integer index on the comments, then encode
# each comment as a list of integers.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

In [None]:
%%time
# Build (context, target) training pairs: each target is a token that has a
# full window of `window_size` tokens on both sides, and its context is those
# 2 * window_size surrounding tokens. Shorter sequences contribute nothing.
context = []
target = []
for token_ids in sequences:
    stop = len(token_ids) - window_size
    for center in range(window_size, stop):
        left = token_ids[center - window_size:center]
        right = token_ids[center + 1:center + window_size + 1]
        context.append(left + right)
        target.append(token_ids[center])

In [None]:
# Convert to arrays for Keras. Every context built above already holds exactly
# 2 * window_size tokens, so pad_sequences does not add padding or truncate.
context = pad_sequences(context, maxlen=window_size*2)
target = np.array(target)

In [None]:
# Define model: predict the center word from its 2 * window_size context words.
model = Sequential()
# The Embedding layer is added first, so its weight matrix is model.get_weights()[0].
model.add(Embedding(max_features, embedding_dim, input_length=window_size*2))
model.add(LSTM(300))
# One softmax output per vocabulary index; targets are integer word indices,
# hence the sparse categorical cross-entropy loss.
model.add(Dense(max_features, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Early-stopping callback: monitors the training loss with a patience of 2.
callback = EarlyStopping(monitor='loss', patience=2)

In [None]:
# Train for at most 15 epochs; the early-stopping callback may end training sooner.
model.fit(context, target, epochs=15, batch_size=200, callbacks=[callback])

In [None]:
# First weight array of the model; the Embedding layer was added first.
lstm_embeddings = model.get_weights()[0]

In [None]:
# Expected to be (max_features, embedding_dim).
lstm_embeddings.shape

In [None]:
# Build a table with one row per word: the word followed by its embedding.
word_index = tokenizer.word_index
reverse_word_index = {indx: word for word, indx in word_index.items()}

# The loop this replaces started at index 1, so row 0 of the embedding matrix
# is skipped here too. Stop at whichever is smallest: max_features, the real
# vocabulary size, or the number of embedding rows. Otherwise the word lookup
# raises KeyError when there are more comments than distinct words.
n_rows = min(max_features, len(word_index) + 1, lstm_embeddings.shape[0])

embedding_cols = [f'embedding_{i+1}' for i in range(embedding_dim)]
# Build the frame in one step: DataFrame.append was removed in pandas 2.0 and
# copied the whole frame on every call.
lstm_embeddings_df = pd.DataFrame(lstm_embeddings[1:n_rows], columns=embedding_cols)
lstm_embeddings_df.insert(0, 'word', [reverse_word_index[i] for i in range(1, n_rows)])

In [None]:
# Save DataFrame to CSV file
# NOTE(review): this is a hard-coded absolute path, unlike the placeholder
# paths used by the other save cells; change it before running elsewhere.
lstm_embeddings_df.to_csv('/Users/avi_patel/Documents/yt_project_3K_lstm_embeddings.csv', index=False) 

In [None]:
# Index the embeddings by their first column (the word) so neighbors() can
# look rows up by label.
first_column = lstm_embeddings_df.columns[0]
lstm_embeddings_df2 = lstm_embeddings_df.set_index(first_column)

In [None]:
# Preview the word-indexed LSTM embeddings.
lstm_embeddings_df2.head()

# Find the most closely related words for any word in the vocabulary using all the techniques, including LSTM


In [None]:
# Compare the five nearest neighbors of one word across all five vector spaces.
lookup_word = 'credit'

spaces_to_compare = (
    (f'Top 5 Neighbors using Co-occurrence matrix: \n----------------------\n', df_cm),
    (f'Top 5 Neighbors using Weighted Co-occurrence matrix: \n----------------------\n', wghtd_df),
    (f'Top 5 Neighbors using LSA or reduced dimensionality of Co-occurence matrix: \n----------------------\n', df_lsa),
    (f'Top 5 Neighbors using LSA+Autoencoders or reduced dimensionality of Co-occurence matrix: \n----------------------\n', df_lsa_ae),
    (f'Top 5 Neighbors using LSTM embeddings: \n----------------------\n', lstm_embeddings_df2),
)

for banner, vectors in spaces_to_compare:
    print(banner)
    # Position 0 is skipped; it is expected to be the lookup word itself.
    ranked = neighbors(lookup_word, vectors, distfunc=cosine)
    print(ranked.iloc[1:6])

In [None]:
def find_2nd_closest3(input_array, embeddings_df):
    """Return the row position of the embedding most cosine-similar to `input_array`.

    Despite the name, this picks the top match, not the runner-up.
    """
    as_row = np.array(input_array).reshape(1, -1)
    all_rows = np.array(embeddings_df.values)
    sims = cosine_similarity(as_row, all_rows)
    # Ascending argsort: the last entry is the position of the highest similarity.
    order = np.argsort(sims[0])
    top_position = order[len(order) - 1]
    return top_position

# Look up the LSTM embedding of each of the three words to combine.
a1='payment' #is late
arr1=lstm_embeddings_df2.loc[a1]
z1=arr1.array
a2='never' #fees fees
arr2=lstm_embeddings_df2.loc[a2]
z2=arr2.array
a3='posted' #charged penalty
arr3=lstm_embeddings_df2.loc[a3]
z3=arr3.array
# Element-wise product of the three vectors (not a dot product, despite the name).
dott=np.multiply.reduce((z1, z2, z3))

# Top match for the combined vector; this can be one of the three input words.
closest_index = find_2nd_closest3(dott, lstm_embeddings_df2)
closest_word=lstm_embeddings_df2.index[closest_index]

print(closest_word)
print("If you combine '{}' and '{}' and '{}' then you get: '{}'".format(a1, a2, a3, closest_word))