In [2]:
#import libraries
import pandas as pd
import numpy as np
import csv
import ast
import re
from prettytable import PrettyTable

import time
import random

#data visualization libraries
import matplotlib.pyplot as plt
from PIL import Image
from wordcloud import WordCloud, ImageColorGenerator

#NLP & ML libraries
from gensim.models import Word2Vec
from textblob import TextBlob
from nltk import FreqDist

from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score

from sklearn.metrics.pairwise import cosine_similarity

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

from scipy.sparse import save_npz, load_npz

In [3]:
#set seed so that code output is deterministic
random.seed(30)  # Set the seed for Python's random module
np.random.seed(30)  # Set the seed for NumPy's random module

In [4]:
#import cleaned data

def list_converter(text):
    #to revert list->str conversion from pd.read_csv
    return ast.literal_eval(text)


data = pd.read_csv('Data/training_corpus.csv', converters ={'tokens':list_converter})

In [5]:
data = data.drop(columns = ['index'])
print (data.shape)
data.head()

(99186, 7)


Unnamed: 0,text_type,ID,year,long_text,clean_text,word_count,tokens
0,comment,gtfo2hl,2021,"*Cuntry roads, take me hoem*",cuntry roads hoem,3,"[cuntry, road, hoem]"
1,comment,gtfqkbv,2021,"That’s been there for several years, sent a pi...",years sent pic cuntry friend long time ago,8,"[year, send, pic, cuntry, friend, long, time, ..."
2,comment,gtfou07,2021,I am single and I have not traveled to any cun...,single traveled cuntry past year,5,"[single, travel, cuntry, past, year]"
3,comment,gtfrgpe,2021,What happens when you shop at dragon mart...,happens shop dragon mart,4,"[happen, shop, dragon, mart]"
4,comment,gthiiwi,2021,"That’s just absolutely hilarious, is this in t...",absolutely hilarious springs souk,4,"[absolutely, hilarious, spring, souk]"


## **Word2Vec Model** 

In [54]:
#convert df['tokens'] to list of strings for word2vec model
corpus = data['tokens'].tolist()#.apply(lambda token: ' '.join(token)).tolist()

corpus[:5]

[['cuntry', 'road', 'hoem'],
 ['year', 'send', 'pic', 'cuntry', 'friend', 'long', 'time', 'ago'],
 ['single', 'travel', 'cuntry', 'past', 'year'],
 ['happen', 'shop', 'dragon', 'mart'],
 ['absolutely', 'hilarious', 'spring', 'souk']]

In [55]:
#start_time = time.time() #track start time of execution

#code from codebasics and semicolon, both on youtube

#instantiate word2vec model
word2vec = Word2Vec (window = 5, min_count = 2, workers = 4)
   
#build a vocabulary
word2vec.build_vocab(corpus, progress_per = 1000)



In [60]:
word2vec.corpus_count

99186

In [57]:
#train the word2vec model

word2vec.train(
                corpus,
                total_examples = word2vec.corpus_count,
                epochs = word2vec.epochs
)

(6609977, 7045680)

In [156]:
word2vec.wv.most_similar ('lulu')

[('waitrose', 0.9512330293655396),
 ('nesto', 0.9506546258926392),
 ('vegetable', 0.9461355805397034),
 ('spinneys', 0.9418869614601135),
 ('cafeteria', 0.9361352324485779),
 ('garlic', 0.935577929019928),
 ('spinney', 0.9343827962875366),
 ('carton', 0.9331820607185364),
 ('veggie', 0.9312180280685425),
 ('biryani', 0.9304446578025818)]

## **Word2Vec Model** with custom stopwords removed from corpus

In [97]:
#created the filtered corpus from another notebook and imported it here for use
# Read the file
with open('updated_corpus.txt', 'r') as f:
    updated_data = f.readlines()

# Remove newline characters
updated_data = [line.strip() for line in updated_data]


In [102]:
print (f'updated_data is a {type(updated_data)}, and the first five elements are:\n',updated_data[:5])

updated_data is a <class 'list'>, and the first five elements are:
 ['road', '', 'single travel past', 'shop dragon mart', 'hilarious spring souk']


In [103]:
#convert list of strings to dataframe column with list of tokens on each row
from gensim.utils import tokenize

updated_data = pd.DataFrame(updated_data, columns = ['tokens'])
updated_data = updated_data['tokens'].apply(lambda x: list(tokenize(x)))

In [104]:
print(type(updated_data))
updated_data.head()

<class 'pandas.core.series.Series'>


0                       [road]
1                           []
2       [single, travel, past]
3         [shop, dragon, mart]
4    [hilarious, spring, souk]
Name: tokens, dtype: object

In [109]:
#convert ['tokens'] column to list of strings for word2vec model
updated_corpus = updated_data.tolist()#.apply(lambda token: ' '.join(token)).tolist()

updated_corpus[:5]

[['road'],
 [],
 ['single', 'travel', 'past'],
 ['shop', 'dragon', 'mart'],
 ['hilarious', 'spring', 'souk']]

In [107]:
#start_time = time.time() #track start time of execution

#code from codebasics and semicolon, both on youtube

#instantiate word2vec model
word2vec2 = Word2Vec (window = 5, min_count = 2, workers = 4)
   
#build a vocabulary
word2vec2.build_vocab(corpus, progress_per = 1000)



In [108]:
#number of sentences in vocabulary
word2vec2.corpus_count

99186

In [111]:
#train the word2vec model

word2vec2.train(
                updated_corpus,
                total_examples = word2vec2.corpus_count,
                epochs = word2vec2.epochs
)

(4198495, 4198495)

In [157]:
word2vec2.wv.most_similar ('lulu')

[('hypermarket', 0.9551829099655151),
 ('waitrose', 0.9474326968193054),
 ('nesto', 0.9464415907859802),
 ('spinney', 0.9345912933349609),
 ('carrefour', 0.9311429858207703),
 ('spinneys', 0.919035792350769),
 ('viva', 0.9174704551696777),
 ('cafeteria', 0.9123212099075317),
 ('mcdonalds', 0.9016396999359131),
 ('kg', 0.8944598436355591)]

## **Word2Vec with bigrams**

In [123]:
#first model on full corpus

#code from gensim documentation
from gensim.models.phrases import Phrases, 

#train bigram detector
bigram_transform = Phrases (corpus, min_count = 1) #default threshold is 10.0, fewer phrases will be created



Phrases<816198 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000>


In [179]:
#check performance of bigram transform model
new_sentence = corpus[330]
print(bigram_transform[new_sentence])

['anymore', 'new', 'labor_law', 'common', 'blatant', 'good', 'company', 'care', 'sale', 'promotional', 'bdm', 'post', 'hire', 'look', 'filter', 'photo', 'cv', 'rife', 'real_estate', 'industry', 'say', 'uae', 'young', 'country', 'learn', 'motivation', 'law', 'change', 'align', 'progressive', 'practice', 'ahead', 'way', 'nature', 'place', 'live']


In [191]:
#apply bi-gram transform on corpus for word2vec model

bigram_corpus = [bigram_transform[sentence] for sentence in corpus]

bigram_word2vec = Word2Vec (bigram_corpus,window = 5, min_count = 2, workers = 4)

In [222]:
bigram_word2vec.wv.most_similar('lgbt')

[('hatred', 0.9805039763450623),
 ('agenda', 0.9785802960395813),
 ('disrespect', 0.977561354637146),
 ('ignorance', 0.9768002033233643),
 ('prejudice', 0.9764350056648254),
 ('propaganda', 0.9760943055152893),
 ('lgbtq', 0.9748927354812622),
 ('homosexuality', 0.9736032485961914),
 ('sexual', 0.9727028608322144),
 ('extremist', 0.9723531007766724)]

In [199]:
#second model on filtered corpus --- additonal stopwords removed (informal conversation common words)

#train bigram detector
bigram_transform2 = Phrases (updated_corpus, min_count = 1)

In [200]:
#check performance of bigram transform model
new_sentence = corpus[330]
print(bigram_transform[new_sentence])

['anymore', 'new', 'labor_law', 'common', 'blatant', 'good', 'company', 'care', 'sale', 'promotional', 'bdm', 'post', 'hire', 'look', 'filter', 'photo', 'cv', 'rife', 'real_estate', 'industry', 'say', 'uae', 'young', 'country', 'learn', 'motivation', 'law', 'change', 'align', 'progressive', 'practice', 'ahead', 'way', 'nature', 'place', 'live']


In [201]:
#apply bi-gram transform on corpus for word2vec model

bigram_corpus2 = [bigram_transform2[sentence] for sentence in corpus]

bigram_word2vec2 = Word2Vec (bigram_corpus2,window = 5, min_count = 2, workers = 4)

In [221]:
bigram_word2vec2.wv.most_similar('lgbt')

[('brainwash', 0.984478771686554),
 ('lgbtq', 0.9838911890983582),
 ('tolerate', 0.9793130159378052),
 ('homosexuality', 0.978199303150177),
 ('uneducated', 0.9781699180603027),
 ('hypocrite', 0.9765817523002625),
 ('universe', 0.9762008786201477),
 ('hatred', 0.9754001498222351),
 ('intolerant', 0.9738975167274475),
 ('moron', 0.9735695719718933)]

## **CLUSTERING WITH THE WORD EMBEDDINGS**

## **CREATE DOCUMENT VECTORS FROM WORD EMBEDDINGS**