In [93]:
#import libraries
import pandas as pd
import numpy as np
import csv
import ast
import re
from prettytable import PrettyTable

import time

#spelling correction
import enchant
from spellchecker import SpellChecker
from autocorrect import Speller
from textblob import TextBlob


#data visualization libraries
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import plotly.io as pio
from PIL import Image
from wordcloud import WordCloud, ImageColorGenerator

#NLP & ML libraries
from nltk import FreqDist
from sklearn.feature_extraction.text import CountVectorizer

In [77]:
pd.reset_option('display.max_colwidth')
pd.reset_option ('display.max_column')

#pd.set_option('display.max_colwidth', None)

In [3]:
#import cleaned data

def list_converter(text):
    #to revert list->str conversion from pd.read_csv
    return ast.literal_eval(text)


data = pd.read_csv('Data/training_corpus.csv', converters ={'tokens':list_converter})

In [42]:
data = data.drop(columns = ['index'])
data

Unnamed: 0,text_type,ID,year,long_text,clean_text,word_count,tokens
0,comment,gtfo2hl,2021,"*Cuntry roads, take me hoem*",cuntry roads hoem,3,"[cuntry, road, hoem]"
1,comment,gtfqkbv,2021,"That’s been there for several years, sent a pi...",years sent pic cuntry friend long time ago,8,"[year, send, pic, cuntry, friend, long, time, ..."
2,comment,gtfou07,2021,I am single and I have not traveled to any cun...,single traveled cuntry past year,5,"[single, travel, cuntry, past, year]"
3,comment,gtfrgpe,2021,What happens when you shop at dragon mart...,happens shop dragon mart,4,"[happen, shop, dragon, mart]"
4,comment,gthiiwi,2021,"That’s just absolutely hilarious, is this in t...",absolutely hilarious springs souk,4,"[absolutely, hilarious, spring, souk]"
...,...,...,...,...,...,...,...
99181,submission,14f46ji,2023,"Best beauty saloons in Dubai? Hello fellas, I ...",best beauty saloons dubai hello fellas moved w...,35,"[good, beauty, saloon, dubai, hello, fellas, m..."
99182,submission,14f4uyi,2023,Found the r/dubai redditors who kept telling m...,found r dubai redditors kept telling know navi...,10,"[find, r, dubai, redditor, keep, tell, know, n..."
99183,submission,14f4ri3,2023,Scam ? Healthy.line My sister has a CBD debit ...,scam healthy line sister cbd debit card month ...,47,"[scam, healthy, line, sister, cbd, debit, card..."
99184,submission,14f4k3r,2023,Thoughts on Expo City properties? Anyone else ...,thoughts expo city properties checked expo cit...,21,"[thought, expo, city, property, check, expo, c..."


## **Bag of Words Model** (from Sklearn)

In [95]:
#create instance of CountVectorizer
cv = CountVectorizer()

#convert list of tokenized words to strings
input_data = data['tokens'].apply(lambda token: ' '.join(token))

#create matrix of word vectors
X_bow = cv.fit_transform(input_data)

print ("The program took %.3f seconds to complete. The ngram representation had %i features." % (time.time() - start_time, X_bow.shape[1]))

The program took 42.788 seconds to complete. The ngram representation had 39810 features.


In [83]:
#get feature names
bow_features = cv.get_feature_names()

In [84]:
#tabular presentation of a sample of the bag of words representation
text_df = data['clean_text'].loc[8000:8004].copy()
bow_df = pd.DataFrame(X_bow[8000:8005].toarray(), columns = bow_features)

text_bow = bow_df.copy()
text_bow = text_bow.loc[:, (text_bow == 1).any()]

text_bow.insert(0, 'clean_text', text_df.values)

text_bow

Unnamed: 0,clean_text,agent,asian,asset,brodsky,buy,car,center,check,city,...,thing,time,town,trade,uni,village,visit,want,world,year
0,dubai world trade center expo site rest taken ...,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,1,0
1,uni enjoyed felt brodsky comopolitan like town...,0,0,0,1,0,0,0,0,1,...,0,0,1,0,1,0,0,0,0,0
2,visit aus fri aus,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,kind intrigued feel reluctant time car maybe y...,0,1,0,0,0,1,0,1,0,...,0,1,0,0,0,1,0,1,0,1
4,real estate agent rereading sounds pro real es...,1,0,1,0,1,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0


## **N-Grams Model** with Sklearn's CountVectorizer

### **Bi-Grams**

In [94]:
start_time = time.time()
# create bi_gram instance of CountVectorizer
bi_cv = CountVectorizer(analyzer = 'word', ngram_range = (2,2))


#create matrix of word vectors
X_bigram = bi_cv.fit_transform(input_data)

print ("The program took %.3f seconds to complete. The ngram representation had %i features." % (time.time() - start_time, X_bigram.shape[1]))

The program took 5.903 seconds to complete. The ngram representation had 771375 features.


In [89]:
#get feature names
bigram_features = bi_cv.get_feature_names()

#tabular presentation of a sample of the bag of words representation

bigram_df = pd.DataFrame(X_bigram[8000:8005].toarray(), columns = bigram_features)

text_bigram = bigram_df.copy()
text_bigram = text_bigram.loc[:, (text_bigram == 1).any()]

text_bigram.insert(0, 'clean_text', text_df.values)

text_bigram

Unnamed: 0,clean_text,agent rereading,asian village,asset price,aus fri,brodsky comopolitan,buy peak,car maybe,center expo,chance repricing,...,time car,time stay,town city,trade center,uni enjoy,village goody,visit aus,want check,world trade,year want
0,dubai world trade center expo site rest taken ...,0,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,1,0
1,uni enjoyed felt brodsky comopolitan like town...,0,0,0,0,1,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,visit aus fri aus,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,kind intrigued feel reluctant time car maybe y...,0,1,0,0,0,0,1,0,0,...,1,0,0,0,0,1,0,1,0,1
4,real estate agent rereading sounds pro real es...,1,0,1,0,0,1,0,0,1,...,0,1,0,0,0,0,0,0,0,0


### **Tri-Grams**