# **FEATURE EXTRACTION - BAG OF WORDS**

In [12]:
#import libraries
import pandas as pd
import numpy as np
import csv
import ast
import re
from prettytable import PrettyTable

import time
import random

#data visualization libraries
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import plotly.io as pio
from PIL import Image
from wordcloud import WordCloud, ImageColorGenerator

#NLP & ML libraries
from gensim import corpora

from nltk import FreqDist

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


In [2]:
#set seed so that code output is deterministic
random.seed(0)  # Set the seed for Python's random module
np.random.seed(0)  # Set the seed for NumPy's random module

In [6]:
#import cleaned data

def list_converter(text):
    #to revert list->str conversion from pd.read_csv
    return ast.literal_eval(text)


data = pd.read_csv('../Data/training_data.csv', converters ={'tokens':list_converter})
data = data.drop(columns = ['index'])
data.head()

Unnamed: 0,text_type,ID,date_created,year,long_text,clean_text,tokens,word_count
0,comment,gtfou07,2021-04-05 13:13:23,2021,I am single and I have not traveled to any cun...,single travel past,"[single, travel, past]",3
1,comment,gtfrgpe,2021-04-05 13:56:09,2021,What happens when you shop at dragon mart...,shop dragon mart,"[shop, dragon, mart]",3
2,comment,gthiiwi,2021-04-05 23:18:56,2021,"That’s just absolutely hilarious, is this in t...",hilarious spring souk,"[hilarious, spring, souk]",3
3,comment,gtgfl4c,2021-04-05 18:21:42,2021,Is reel cinema and roxy part of emaar?,reel cinema roxy emaar,"[reel, cinema, roxy, emaar]",4
4,comment,gth5wdv,2021-04-05 21:42:41,2021,An innocent redditor here...can someone pls ex...,innocent pls explain everyday,"[innocent, pls, explain, everyday]",4


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65987 entries, 0 to 65986
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   text_type     65987 non-null  object
 1   ID            65987 non-null  object
 2   date_created  65987 non-null  object
 3   year          65987 non-null  int64 
 4   long_text     65987 non-null  object
 5   clean_text    65987 non-null  object
 6   tokens        65987 non-null  object
 7   word_count    65987 non-null  int64 
dtypes: int64(2), object(6)
memory usage: 4.0+ MB


In [11]:
#convert df['tokens'] to list of strings for bag-of-words model
docs = data['tokens'].apply(lambda token: ' '.join(token)).tolist()

docs[0:5]

['single travel past',
 'shop dragon mart',
 'hilarious spring souk',
 'reel cinema roxy emaar',
 'innocent pls explain everyday']

In [13]:
# Compute bigrams.
from gensim.models import Phrases

# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(docs, min_count=20)
for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            docs[idx].append(token)

2023-08-10 03:02:03,189 : INFO : collecting all words and their counts
2023-08-10 03:02:03,192 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2023-08-10 03:02:03,923 : INFO : PROGRESS: at sentence #10000, processed 645093 words and 606 word types
2023-08-10 03:02:04,652 : INFO : PROGRESS: at sentence #20000, processed 1290938 words and 619 word types
2023-08-10 03:02:05,364 : INFO : PROGRESS: at sentence #30000, processed 1915954 words and 620 word types
2023-08-10 03:02:06,089 : INFO : PROGRESS: at sentence #40000, processed 2570254 words and 621 word types
2023-08-10 03:02:06,815 : INFO : PROGRESS: at sentence #50000, processed 3217436 words and 621 word types
2023-08-10 03:02:07,512 : INFO : PROGRESS: at sentence #60000, processed 3840543 words and 622 word types
2023-08-10 03:02:08,163 : INFO : collected 622 token types (unigram + bigrams) from a corpus of 4421536 words and 65987 sentences
2023-08-10 03:02:08,164 : INFO : merged Phrases<622 vocab, min_count=2

AttributeError: 'str' object has no attribute 'append'

In [None]:
#from gensim bag of words documentation page

# Create a dictionary representation of the documents.
dictionary = corpora.Dictionary(docs)

# Filter out words that occur less than 20 documents, or more than 50% of the documents.
#dictionary.filter_extremes(no_below=20, no_above=0.5)

# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]

print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))