In [1]:
#import libraries
import pandas as pd
import numpy as np
import csv
import ast
import re
from prettytable import PrettyTable

import time
import random

#data visualization libraries
import matplotlib.pyplot as plt
from PIL import Image
from wordcloud import WordCloud, ImageColorGenerator

#NLP & ML libraries
from gensim.models import Word2Vec
import gensim.downloader as api
from textblob import TextBlob
from nltk import FreqDist

from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score

from sklearn.metrics.pairwise import cosine_similarity

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer, InterclusterDistance

from scipy.sparse import save_npz, load_npz

In [2]:
#set seed so that code output is deterministic
random.seed(30)  # Set the seed for Python's random module
np.random.seed(30)  # Set the seed for NumPy's random module

In [3]:
#import cleaned data

def list_converter(text):
    #to revert list->str conversion from pd.read_csv
    return ast.literal_eval(text)


data = pd.read_csv('Data/training_corpus.csv', converters ={'tokens':list_converter})

In [4]:
data = data.drop(columns = ['index'])
print (data.shape)
data.head()

(99186, 7)


Unnamed: 0,text_type,ID,year,long_text,clean_text,word_count,tokens
0,comment,gtfo2hl,2021,"*Cuntry roads, take me hoem*",cuntry roads hoem,3,"[cuntry, road, hoem]"
1,comment,gtfqkbv,2021,"That’s been there for several years, sent a pi...",years sent pic cuntry friend long time ago,8,"[year, send, pic, cuntry, friend, long, time, ..."
2,comment,gtfou07,2021,I am single and I have not traveled to any cun...,single traveled cuntry past year,5,"[single, travel, cuntry, past, year]"
3,comment,gtfrgpe,2021,What happens when you shop at dragon mart...,happens shop dragon mart,4,"[happen, shop, dragon, mart]"
4,comment,gthiiwi,2021,"That’s just absolutely hilarious, is this in t...",absolutely hilarious springs souk,4,"[absolutely, hilarious, spring, souk]"


## **Word2Vec Model** 

In [5]:
#convert df['tokens'] to list of strings for word2vec model
corpus = data['tokens'].tolist()#.apply(lambda token: ' '.join(token)).tolist()

corpus[:5]

[['cuntry', 'road', 'hoem'],
 ['year', 'send', 'pic', 'cuntry', 'friend', 'long', 'time', 'ago'],
 ['single', 'travel', 'cuntry', 'past', 'year'],
 ['happen', 'shop', 'dragon', 'mart'],
 ['absolutely', 'hilarious', 'spring', 'souk']]

In [6]:
#start_time = time.time() #track start time of execution

#code from codebasics and semicolon, both on youtube

#instantiate word2vec model
word2vec = Word2Vec (window = 5, min_count = 2, workers = 4)
   
#build a vocabulary
word2vec.build_vocab(corpus, progress_per = 1000)



In [7]:
word2vec.corpus_count

99186

In [8]:
#train the word2vec model

word2vec.train(
                corpus,
                total_examples = word2vec.corpus_count,
                epochs = word2vec.epochs
)

(6609385, 7045680)

In [9]:
word2vec.wv.most_similar ('lulu')

[('nesto', 0.9561541080474854),
 ('cafeteria', 0.9483346343040466),
 ('waitrose', 0.9481028318405151),
 ('veggie', 0.9419034719467163),
 ('spinney', 0.9393037557601929),
 ('carrefour', 0.9377502202987671),
 ('hypermarket', 0.9365057349205017),
 ('fish', 0.9346320033073425),
 ('fruit', 0.9335165023803711),
 ('vegetable', 0.932704746723175)]

In [10]:
word2vec.wv.most_similar(positive =['king', 'woman'], negative=['man'], topn=3)

[('palace', 0.8755813837051392),
 ('eg', 0.867764949798584),
 ('madinah', 0.8672471046447754)]

In [11]:
word2vec.wv.doesnt_match(['ejari', 'rera', 'dewa', 'drive'])

'drive'

## **Word2Vec Model** with custom stopwords removed from corpus

In [12]:
#created the filtered corpus from another notebook and imported it here for use
# Read the file
with open('updated_corpus.txt', 'r') as f:
    updated_data = f.readlines()

# Remove newline characters
updated_data = [line.strip() for line in updated_data]


In [13]:
print (f'updated_data is a {type(updated_data)}, and the first five elements are:\n',updated_data[:5])

updated_data is a <class 'list'>, and the first five elements are:
 ['road', '', 'single travel past', 'shop dragon mart', 'hilarious spring souk']


In [14]:
#convert list of strings to dataframe column with list of tokens on each row
from gensim.utils import tokenize

updated_data = pd.DataFrame(updated_data, columns = ['tokens'])
updated_data = updated_data['tokens'].apply(lambda x: list(tokenize(x)))

In [15]:
print(type(updated_data))
updated_data.head()

<class 'pandas.core.series.Series'>


0                       [road]
1                           []
2       [single, travel, past]
3         [shop, dragon, mart]
4    [hilarious, spring, souk]
Name: tokens, dtype: object

In [16]:
#convert ['tokens'] column to list of strings for word2vec model
updated_corpus = updated_data.tolist()#.apply(lambda token: ' '.join(token)).tolist()

updated_corpus[:5]

[['road'],
 [],
 ['single', 'travel', 'past'],
 ['shop', 'dragon', 'mart'],
 ['hilarious', 'spring', 'souk']]

In [17]:
#start_time = time.time() #track start time of execution

#code from codebasics and semicolon, both on youtube

#instantiate word2vec model
word2vec2 = Word2Vec (window = 5, min_count = 2, workers = 4)
   
#build a vocabulary
word2vec2.build_vocab(corpus, progress_per = 1000)



In [18]:
#number of sentences in vocabulary
word2vec2.corpus_count

99186

In [19]:
#train the word2vec model

word2vec2.train(
                updated_corpus,
                total_examples = word2vec2.corpus_count,
                epochs = word2vec2.epochs
)

(4198495, 4198495)

In [20]:
word2vec2.wv.most_similar ('lulu')

[('waitrose', 0.9908475875854492),
 ('spinney', 0.9897341728210449),
 ('hypermarket', 0.9820606112480164),
 ('mcdonalds', 0.980934739112854),
 ('carrefour', 0.9793255925178528),
 ('nesto', 0.9780416488647461),
 ('strawberry', 0.9738538265228271),
 ('coop', 0.9728703498840332),
 ('wholesale', 0.9718944430351257),
 ('vegetable', 0.969465434551239)]

## **Word2Vec with bigrams**

In [21]:
#first model on full corpus

#code from gensim documentation
from gensim.models.phrases import Phrases

#train bigram detector
bigram_transform = Phrases (corpus, min_count = 1) #default threshold is 10.0, fewer phrases will be created



In [22]:
#check performance of bigram transform model
new_sentence = corpus[330]
print(bigram_transform[new_sentence])

['anymore', 'new', 'labor_law', 'common', 'blatant', 'good', 'company', 'care', 'sale', 'promotional', 'bdm', 'post', 'hire', 'look', 'filter', 'photo_cv', 'rife', 'real_estate', 'industry', 'say', 'uae', 'young', 'country', 'learn', 'motivation', 'law', 'change', 'align', 'progressive', 'practice', 'ahead', 'way', 'nature', 'place', 'live']


In [23]:
#apply bi-gram transform on corpus for word2vec model

bigram_corpus = [bigram_transform[sentence] for sentence in corpus]

bigram_word2vec = Word2Vec (bigram_corpus,window = 5, min_count = 2, workers = 4)

In [24]:
bigram_word2vec.wv.most_similar('lgbt')

[('tolerant', 0.9880362749099731),
 ('lgbtq', 0.9879887104034424),
 ('respectful', 0.9858818650245667),
 ('criticize', 0.982750415802002),
 ('proud', 0.9826979637145996),
 ('rape', 0.9824931025505066),
 ('feminism', 0.9823785424232483),
 ('defend', 0.9820605516433716),
 ('liberal', 0.981878936290741),
 ('attack', 0.9818695187568665)]

In [25]:
#second model on filtered corpus --- additonal stopwords removed (informal conversation common words)

#train bigram detector
bigram_transform2 = Phrases (updated_corpus, min_count = 1)

In [26]:
#check performance of bigram transform model
new_sentence = corpus[330]
print(bigram_transform[new_sentence])

['anymore', 'new', 'labor_law', 'common', 'blatant', 'good', 'company', 'care', 'sale', 'promotional', 'bdm', 'post', 'hire', 'look', 'filter', 'photo_cv', 'rife', 'real_estate', 'industry', 'say', 'uae', 'young', 'country', 'learn', 'motivation', 'law', 'change', 'align', 'progressive', 'practice', 'ahead', 'way', 'nature', 'place', 'live']


In [27]:
#apply bi-gram transform on corpus for word2vec model

bigram_corpus2 = [bigram_transform2[sentence] for sentence in corpus]

bigram_word2vec2 = Word2Vec (bigram_corpus2,window = 5, min_count = 2, workers = 4)

In [28]:
bigram_word2vec2.wv.most_similar('lgbt')

[('lgbtq', 0.9786578416824341),
 ('intolerant', 0.9772281646728516),
 ('freedom_speech', 0.9766908288002014),
 ('hypocritical', 0.9748409986495972),
 ('tolerate', 0.974479615688324),
 ('agenda', 0.9733495712280273),
 ('violence', 0.9730541706085205),
 ('brainwash', 0.9726577997207642),
 ('extremist', 0.9689884781837463),
 ('anti_muslim', 0.9688097834587097)]

## **Word2Vec: Transfer Learning with wiki**

In [34]:
from gensim.models import KeyedVectors
pretrained_model = KeyedVectors.load("/pretrained_models/glove-wiki_gigaword-100")

FileNotFoundError: [Errno 2] No such file or directory: '/pretrained_models/glove-wiki_gigaword-100'

In [32]:
pretrained_model.most_similar('lgbt')

[('transgender', 0.8725160956382751),
 ('lesbian', 0.8088030815124512),
 ('bisexual', 0.7536906003952026),
 ('gay', 0.6802830696105957),
 ('glbt', 0.6522399187088013),
 ('lgbtq', 0.6519015431404114),
 ('transgendered', 0.6488721966743469),
 ('lesbians', 0.6255646347999573),
 ('advocacy', 0.6097405552864075),
 ('feminist', 0.568495512008667)]

## **CREATE DOCUMENT VECTORS FROM WORD EMBEDDINGS**