# Visualising word cloud

In [None]:
!pip -q install yellowbrick==1.3
!pip -q install kneed
!pip install fuzzywuzzy
!pip install python-Levenshtein
!pip install plotly
!pip install squarify
!pip install chart_studio
!pip -q install factor_analyzer
! pip install distance
!pip install sentence-transformers

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import f_oneway
import statsmodels.api as sm
from statsmodels.formula.api import ols
import plotly
import plotly.express as px
import plotly.io as pio
import datetime
import squarify

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Pre-processing data
import nltk
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet
from nltk.corpus import stopwords
import functools
from functools import lru_cache
from bs4 import BeautifulSoup
import re

# Word embedding model Word2Vec
import gensim
import gensim.downloader as gensim_api
from gensim.models.word2vec import Word2Vec
from sklearn.decomposition import PCA

# Bag of words TF-IDF
from sklearn import feature_extraction, model_selection, naive_bayes, pipeline, manifold, preprocessing

# Advanced feature extraction
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# import data
from google.colab import files
import openpyxl

pio.renderers.default = 'colab' 
import distance

from sentence_transformers import SentenceTransformer, util
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Mount Google Drive inside the Colab runtime so the CSVs under
# /content/gdrive/MyDrive can be read by the cells below.
# force_remount=True is passed so the mount is redone even when a previous
# mount exists (going by the argument name — see the google.colab docs).
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

In [None]:
# Load the cleaned feature table from the mounted Drive folder.
# Later cells read its `question1_cleaned`, `question2_cleaned` and
# `is_duplicate` columns.
cleaned_features = pd.read_csv("/content/gdrive/MyDrive/CS3244 Machine Learning/Data/cleaned_features.csv")

In [None]:
# Preview the first rows to sanity-check the columns that were loaded.
cleaned_features.head()

In [None]:
# Draw a reproducible random sample of 100,000 rows (seed 3244) so the
# exploratory cells below work on a fixed subset of the data.
subsample = cleaned_features.sample(
    n=100000,
    random_state=3244,
)

# Count how many sampled rows fall under each `is_duplicate` value.
subsample[["is_duplicate"]].value_counts()

In [None]:
# Count how often each whitespace-separated token occurs across the sampled
# `question1_cleaned` strings.
from collections import Counter  # stdlib; imported here because the notebook's import cells do not provide it

# Counter is a dict subclass, so later cells that call `.items()` on
# `word_dict1` keep working. Tokens are inserted in first-seen order, which
# keeps the tie ordering of the stable sort below the same as with a plain
# dict filled token by token.
word_dict1 = Counter()
for question in subsample["question1_cleaned"]:
    word_dict1.update(question.split())

# Echo every (token, count) pair, rarest first.
# For only the most frequent tokens use `word_dict1.most_common(20)`.
sorted(word_dict1.items(), key=lambda pair: pair[1])

In [None]:
# Count how often each whitespace-separated token occurs across the sampled
# `question2_cleaned` strings.
from collections import Counter  # stdlib; imported here because the notebook's import cells do not provide it

# Counter is a dict subclass, so any later `.items()` access on `word_dict2`
# behaves as it would on a plain dict.
word_dict2 = Counter()
for question in subsample["question2_cleaned"]:
    word_dict2.update(question.split())

# Echo the 20 most frequent (token, count) pairs. `most_common` orders equal
# counts by first occurrence, matching a stable descending sort of `.items()`.
word_dict2.most_common(20)

In [None]:
# Look at the 20 least frequent question1 tokens: sort the (token, count)
# pairs by count in ascending order and keep the first 20.
sorted(word_dict1.items(), key=lambda x: x[1])[:20]

In [None]:
# Load the second feature table from the mounted Drive folder. Later cells
# read its `question1_stopwords_lemmatize` and `question2_stopwords_lemmatize`
# columns (presumably the questions after stopword removal and
# lemmatisation, going by the names).
stopwords_lemmatize = pd.read_csv(
    "/content/gdrive/MyDrive/CS3244 Machine Learning/Data/stopwords_lemmatize_features.csv"
)

# Preview the first rows.
stopwords_lemmatize.head()

In [None]:
# Draw 100,000 rows with the same n and random_state as the `subsample` draw
# from `cleaned_features`.
# NOTE(review): the sampled rows only line up with `subsample` if both CSVs
# have the same number of rows in the same order — confirm before comparing
# results across the two samples.
subsample_stopwords_lemmatize = stopwords_lemmatize.sample(n=100000, random_state = 3244)

In [None]:
# Visualise the word cloud for the sampled question1 texts.
from wordcloud import WordCloud

# Join every sampled question1 string into one comma-separated string, which
# is the input handed to `generate` below. Iterating the Series yields its
# values directly, so no intermediate list is needed.
long_string1 = ','.join(subsample_stopwords_lemmatize['question1_stopwords_lemmatize'])

# NOTE(review): per the wordcloud docs the contour_* options only have an
# effect when a mask is supplied — confirm whether they are needed here.
wordcloud1 = WordCloud(
    background_color="white",
    max_words=5000,
    contour_width=3,
    contour_color='steelblue',
    random_state=3244,  # fixed seed so the layout is reproducible
)

wordcloud1.generate(long_string1)
# Last expression of the cell: the notebook renders the resulting image.
wordcloud1.to_image()

In [None]:
# Visualise the word cloud for the sampled question2 texts.
# (`WordCloud` is imported in the question1 word-cloud cell.)

# Join every sampled question2 string into one comma-separated string, which
# is the input handed to `generate` below. Iterating the Series yields its
# values directly, so no intermediate list is needed.
long_string2 = ','.join(subsample_stopwords_lemmatize['question2_stopwords_lemmatize'])

# NOTE(review): per the wordcloud docs the contour_* options only have an
# effect when a mask is supplied — confirm whether they are needed here.
wordcloud2 = WordCloud(
    background_color="white",
    max_words=5000,
    contour_width=3,
    contour_color='steelblue',
    random_state=3244,  # fixed seed so the layout is reproducible
)

wordcloud2.generate(long_string2)
# Last expression of the cell: the notebook renders the resulting image.
wordcloud2.to_image()

In [None]:
# Rebuild the question1 word cloud with collocations switched off; all other
# settings match the first question1 cloud.
wordcloud1_remove_collocations = WordCloud(
    collocations=False,
    random_state=3244,
    background_color="white",
    max_words=5000,
    contour_width=3,
    contour_color='steelblue',
)

wordcloud1_remove_collocations.generate(long_string1)
# Last expression of the cell: the notebook renders the resulting image.
wordcloud1_remove_collocations.to_image()

In [None]:
# Rebuild the question2 word cloud with collocations switched off; all other
# settings match the first question2 cloud.
wordcloud2_remove_collocations = WordCloud(
    collocations=False,
    random_state=3244,
    background_color="white",
    max_words=5000,
    contour_width=3,
    contour_color='steelblue',
)

wordcloud2_remove_collocations.generate(long_string2)
# Last expression of the cell: the notebook renders the resulting image.
wordcloud2_remove_collocations.to_image()

In [None]:
# Tabulate the (word, value) pairs held in the question1 cloud's `words_`
# mapping; the resulting columns are the default 0 and 1.
wordcloud1_df = pd.DataFrame(
    list(wordcloud1_remove_collocations.words_.items())
)

# Show the last 10 rows; use `wordcloud1_df.head(10)` for the first 10.
wordcloud1_df.tail(10)

In [None]:
# Tabulate the (word, value) pairs held in the question2 cloud's `words_`
# mapping; the resulting columns are the default 0 and 1.
wordcloud2_df = pd.DataFrame(
    list(wordcloud2_remove_collocations.words_.items())
)

# Show the last 10 rows; use `wordcloud2_df.head(10)` for the first 10.
wordcloud2_df.tail(10)

# Topic Modelling

In [None]:
from gensim import corpora, models

In [None]:
# Fit a 20-topic LDA model on the sampled question1 texts and print the top
# 10 words of every topic.

# Tokenise each document on whitespace.
list_of_list_of_tokens = [
    document.split()
    for document in subsample_stopwords_lemmatize["question1_stopwords_lemmatize"]
]

# Build the token <-> id mapping, then prune it: no_below=3 drops tokens that
# appear in fewer than 3 documents (the other filter_extremes defaults still
# apply).
dictionary_LDA = corpora.Dictionary(list_of_list_of_tokens)
dictionary_LDA.filter_extremes(no_below=3)

# Bag-of-words representation of every document; corpus[k] corresponds to
# the k-th row of the sampled column.
corpus = [dictionary_LDA.doc2bow(list_of_tokens) for list_of_tokens in list_of_list_of_tokens]
num_topics = 20  # also reused by the question2 LDA cell

lda_model = models.LdaModel(
    corpus,
    num_topics=num_topics,
    id2word=dictionary_LDA,
    passes=4,
    alpha=[0.01] * num_topics,           # one prior value per topic
    eta=[0.01] * len(dictionary_LDA),    # one prior value per dictionary token
    random_state=3244,                   # same seed as the sampling cells, so topics are reproducible
)

for topic_id, topic in lda_model.show_topics(formatted=True, num_topics=num_topics, num_words=10):
    print(f"{topic_id}: {topic}")
    print()

In [None]:
# Fit an LDA model on the sampled question2 texts and print the top 10 words
# of every topic. Reuses `num_topics` from the question1 LDA cell.

# Tokenise each document on whitespace.
list_of_list_of_tokens2 = [
    document.split()
    for document in subsample_stopwords_lemmatize["question2_stopwords_lemmatize"]
]

# Build the token <-> id mapping, then prune it: no_below=3 drops tokens that
# appear in fewer than 3 documents (the other filter_extremes defaults still
# apply).
dictionary_LDA2 = corpora.Dictionary(list_of_list_of_tokens2)
dictionary_LDA2.filter_extremes(no_below=3)

# Bag-of-words representation of every document; corpus2[k] corresponds to
# the k-th row of the sampled column.
corpus2 = [dictionary_LDA2.doc2bow(list_of_tokens) for list_of_tokens in list_of_list_of_tokens2]

lda_model2 = models.LdaModel(
    corpus2,
    num_topics=num_topics,
    id2word=dictionary_LDA2,
    passes=4,
    alpha=[0.01] * num_topics,            # one prior value per topic
    eta=[0.01] * len(dictionary_LDA2),    # one prior value per dictionary token
    random_state=3244,                    # same seed as the sampling cells, so topics are reproducible
)

for topic_id, topic in lda_model2.show_topics(formatted=True, num_topics=num_topics, num_words=10):
    print(f"{topic_id}: {topic}")
    print()

In [None]:
# Inspect what the question1 LDA model returns for its first document.
# A notebook cell only echoes its last expression, so the intermediate
# results are printed explicitly.

# corpus[0] means the first document
print(lda_model[corpus[0]])

# Text of that same first document.
print(subsample_stopwords_lemmatize["question1_stopwords_lemmatize"].iloc[0])

# Run a hand-written token list through the same dictionary and model
# (presumably copied from the text printed above — verify).
test = 'safety precaution handle shotgun propose nra massachisetts'.split()
lda_model[dictionary_LDA.doc2bow(test)]

In [None]:
# Show the first sampled question2 text.
subsample_stopwords_lemmatize["question2_stopwords_lemmatize"].iloc[0]

In [None]:
# Apply the question1 model to corpus[1], i.e. the second question1 document.
# NOTE(review): the preceding cell displays the first question2 text; if the
# intent was to inspect that document, this should probably use
# lda_model2 / corpus2[0] instead — confirm.
lda_model[corpus[1]]