In [28]:
import pandas as pd
from nltk.tokenize import word_tokenize, RegexpTokenizer
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

In [2]:
#input_dir =  '/content/drive/My Drive/Colab Notebooks/Topics/Nature'
input_dir = r'C:\Users\sweta\OneDrive\Documents\GSU\Machine Learning\Final Proj\Text\Topic Analysis\Topic Analysis'

In [3]:
# Create a dataframe from the csv file.
im = pd.read_csv(input_dir + "/TA_Training_Set.csv")

In [4]:
im.columns

Index(['Comment', 'Topic'], dtype='object')

In [5]:
im.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 900000 entries, 0 to 899999
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   Comment  900000 non-null  object
 1   Topic    900000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 13.7+ MB


In [6]:
im["Comment"] = im["Comment"].str.lower()

im.head()

Unnamed: 0,Comment,Topic
0,very hot and sexy nail color;),21
1,"i know, right? like, an fmea. i definitely kno...",28
2,"hello, we only allow people with an account ag...",5
3,megumin reads das kapital??? new best girl.,2
4,freedom of speech ? but you use a random anon ...,13


# **Setting up the data **

In [9]:
# Cleaning [experiment with these choices downstream]
re_tokenizer = RegexpTokenizer("[\\w']+")
im['Comment'] = im['Comment'].astype(str)
# Tokenization using NLTK
im['text_nltk'] = im['Comment'].apply(re_tokenizer.tokenize)
im.head()


Unnamed: 0,Comment,Topic,text_nltk
0,very hot and sexy nail color;),21,"[very, hot, and, sexy, nail, color]"
1,"i know, right? like, an fmea. i definitely kno...",28,"[i, know, right, like, an, fmea, i, definitely..."
2,"hello, we only allow people with an account ag...",5,"[hello, we, only, allow, people, with, an, acc..."
3,megumin reads das kapital??? new best girl.,2,"[megumin, reads, das, kapital, new, best, girl]"
4,freedom of speech ? but you use a random anon ...,13,"[freedom, of, speech, but, you, use, a, random..."


In [12]:
# Dropping NLTK stop words
#stopwords_nltk = set(nltk.corpus.stopwords.words('english'))

#im['text_cleaned'] = im['text_nltk'].apply(lambda x: [word for word in x if word not in stopwords_nltk])

# Dropping punctuation and numbers:
punc = string.punctuation + '--' # Add characters as necessary 
im['text_cleaned'] = im['text_nltk'].apply(lambda x: [word for word in x if word not in punc])

digits = string.digits 
im['text_cleaned'] = im['text_cleaned'].apply(lambda x: [word for word in x if word not in digits])
im['text_cleaned'] = im['text_cleaned'].apply(lambda x: [word for word in x if len(word)>1]) 

# from my understanding, words like "https", "'s" and "'" can also be dropped
additional = ["https", "'s", "’","'re", "amp", ".co", "co", "“", "”", "''", "``", "weâ€™re", "//t.co/"]
im['text_cleaned'] = im['text_cleaned'].apply(lambda x: [word for word in x if word not in additional])
im['text_cleaned_string'] = [' '.join(map(str, l)) for l in im['text_cleaned']]

im['text_cleaned_string'] = [t.replace("//t.co/", "") for t in im['text_cleaned_string']]

im.head()

<bound method NDFrame.head of                                                   Comment  Topic  \
0                          very hot and sexy nail color;)     21   
1       i know, right? like, an fmea. i definitely kno...     28   
2       hello, we only allow people with an account ag...      5   
3             megumin reads das kapital??? new best girl.      2   
4       freedom of speech ? but you use a random anon ...     13   
...                                                   ...    ...   
899995  for many people with asd, this is not true. as...      1   
899996  hes too brashly for someone in crusading dista...     25   
899997  but if you replace the word “man” with “ conse...     36   
899998  you can sit down in the cubicle have a browse ...     37   
899999  some fried potatoes for you, m'lady. no wonder...     10   

                                                text_nltk  \
0                     [very, hot, and, sexy, nail, color]   
1       [i, know, right, like, 

In [13]:
# Additional cleaning:
from collections import Counter, defaultdict
# Count and list most frequent tokens
counter = Counter()
im['text_cleaned'].map(counter.update)
most_freq = pd.DataFrame(counter.most_common()) # Or add number within parens to list only x most common
most_freq.columns = ['token', 'count']


In [14]:
most_freq.head(20)

Unnamed: 0,token,count
0,the,1026198
1,to,690927
2,and,554732
3,you,433364
4,of,421881
5,it,366304
6,that,358075
7,is,353225
8,in,330552
9,this,265859


In [15]:
additional = []
im['text_cleaned'] = im['text_cleaned'].apply(lambda x: [word for word in x if word not in additional])
im['text_cleaned_string'] = [' '.join(map(str, l)) for l in im['text_cleaned']]

im['text_cleaned_string'] = [t.replace("//t.co/", "") for t in im['text_cleaned_string']]

im.head(3)

Unnamed: 0,Comment,Topic,text_nltk,text_cleaned,text_cleaned_string
0,very hot and sexy nail color;),21,"[very, hot, and, sexy, nail, color]","[very, hot, and, sexy, nail, color]",very hot and sexy nail color
1,"i know, right? like, an fmea. i definitely kno...",28,"[i, know, right, like, an, fmea, i, definitely...","[know, right, like, an, fmea, definitely, know...",know right like an fmea definitely know what o...
2,"hello, we only allow people with an account ag...",5,"[hello, we, only, allow, people, with, an, acc...","[hello, we, only, allow, people, with, an, acc...",hello we only allow people with an account age...


In [16]:
# Look at class balance/imbalance
im['Topic'].value_counts()

20    22572
28    22570
6     22564
15    22550
18    22548
1     22539
33    22533
25    22533
10    22530
29    22527
24    22523
11    22521
27    22519
12    22515
3     22513
19    22513
30    22510
32    22503
4     22503
17    22497
7     22492
8     22492
21    22491
38    22491
22    22486
36    22486
31    22483
35    22483
39    22482
9     22482
14    22469
2     22468
37    22464
23    22460
34    22458
16    22453
26    22452
13    22450
5     22439
40    22436
Name: Topic, dtype: int64

In [17]:
im.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 900000 entries, 0 to 899999
Data columns (total 5 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   Comment              900000 non-null  object
 1   Topic                900000 non-null  int64 
 2   text_nltk            900000 non-null  object
 3   text_cleaned         900000 non-null  object
 4   text_cleaned_string  900000 non-null  object
dtypes: int64(1), object(4)
memory usage: 34.3+ MB


In [20]:
X_train, X_test, Y_train, Y_test = train_test_split(im['text_cleaned'], 
                                                    im['Topic'],
                                                    test_size = 0.2)

In [24]:
# Convert each X list item to strings for the next step
X_train_str = [' '.join(map(str, l)) for l in X_train]
X_test_str = [' '.join(map(str, l)) for l in X_test]

In [25]:
# Create TF-IDF vectors
vectorizer = TfidfVectorizer(ngram_range=(1,2)) 
X_train_tfidf = vectorizer.fit_transform(X_train_str)
X_test_tfidf = vectorizer.transform(X_test_str)

In [26]:
# Look at this object -- what does the note mean?
X_train_tfidf

<720000x3720749 sparse matrix of type '<class 'numpy.float64'>'
	with 35265639 stored elements in Compressed Sparse Row format>

# ** SVM**

In [29]:
svc = LinearSVC(random_state = 1, max_iter = 10000, C = 0.5)
svc.fit(X_train_tfidf, Y_train)
print("train score:",svc.score(X_train_tfidf, Y_train))
print("test score:",svc.score(X_test_tfidf, Y_test))

train score: 0.9655847222222222
test score: 0.5053944444444445


In [30]:
# Create a dataframe from the csv file.
testdata = pd.read_csv(r'C:\Users\sweta\OneDrive\Documents\GSU\Machine Learning\Final Proj\Text\Topic Analysis\Topic Analysis Test File\Topic Analysis Test File\TA_Test_Set.csv')

In [31]:
testdata

Unnamed: 0,Comment
0,I bought a month and a half out on a stock tha...
1,"Parity used to be the justification, but that ..."
2,Yeah cartel. Legolas is gonna shoot your ass d...
3,"I do think he’s TA, but there’s one thing with..."
4,"Were trying, let you know if anything works"
...,...
99995,As I migraine sufferer I can tell you coffee i...
99996,If she was a drink she'd be room temperature t...
99997,I live on campus here. And was there when it h...
99998,bro nobody likes that shit


In [32]:
testdata["Comment"] = testdata["Comment"].str.lower()
testdata.head()

Unnamed: 0,Comment
0,i bought a month and a half out on a stock tha...
1,"parity used to be the justification, but that ..."
2,yeah cartel. legolas is gonna shoot your ass d...
3,"i do think he’s ta, but there’s one thing with..."
4,"were trying, let you know if anything works"


In [33]:
# Cleaning [experiment with these choices downstream]
re_tokenizer = RegexpTokenizer("[\\w']+")
testdata['Comment'] = testdata['Comment'].astype(str)
# Tokenization using NLTK
testdata['text_nltk'] = testdata['Comment'].apply(re_tokenizer.tokenize)
testdata.head()

Unnamed: 0,Comment,text_nltk
0,i bought a month and a half out on a stock tha...,"[i, bought, a, month, and, a, half, out, on, a..."
1,"parity used to be the justification, but that ...","[parity, used, to, be, the, justification, but..."
2,yeah cartel. legolas is gonna shoot your ass d...,"[yeah, cartel, legolas, is, gonna, shoot, your..."
3,"i do think he’s ta, but there’s one thing with...","[i, do, think, he, s, ta, but, there, s, one, ..."
4,"were trying, let you know if anything works","[were, trying, let, you, know, if, anything, w..."


In [35]:
# Dropping NLTK stop words
#stopwords_nltk = set(nltk.corpus.stopwords.words('english'))

#im['text_cleaned'] = im['text_nltk'].apply(lambda x: [word for word in x if word not in stopwords_nltk])

# Dropping punctuation and numbers:
punc = string.punctuation + '--' # Add characters as necessary 
testdata['text_cleaned'] = testdata['text_nltk'].apply(lambda x: [word for word in x if word not in punc])

digits = string.digits 
testdata['text_cleaned'] = testdata['text_cleaned'].apply(lambda x: [word for word in x if word not in digits])
testdata['text_cleaned'] = testdata['text_cleaned'].apply(lambda x: [word for word in x if len(word)>1]) 

# from my understanding, words like "https", "'s" and "'" can also be dropped
additional = ["https", "'s", "’","'re", "amp", ".co", "co", "“", "”", "''", "``", "weâ€™re", "//t.co/"]
testdata['text_cleaned'] = testdata['text_cleaned'].apply(lambda x: [word for word in x if word not in additional])
testdata['text_cleaned_string'] = [' '.join(map(str, l)) for l in testdata['text_cleaned']]

testdata['text_cleaned_string'] = [t.replace("//t.co/", "") for t in testdata['text_cleaned_string']]

testdata.head()

Unnamed: 0,Comment,text_nltk,text_cleaned,text_cleaned_string
0,i bought a month and a half out on a stock tha...,"[i, bought, a, month, and, a, half, out, on, a...","[bought, month, and, half, out, on, stock, tha...",bought month and half out on stock that has al...
1,"parity used to be the justification, but that ...","[parity, used, to, be, the, justification, but...","[parity, used, to, be, the, justification, but...",parity used to be the justification but that w...
2,yeah cartel. legolas is gonna shoot your ass d...,"[yeah, cartel, legolas, is, gonna, shoot, your...","[yeah, cartel, legolas, is, gonna, shoot, your...",yeah cartel legolas is gonna shoot your ass do...
3,"i do think he’s ta, but there’s one thing with...","[i, do, think, he, s, ta, but, there, s, one, ...","[do, think, he, ta, but, there, one, thing, wi...",do think he ta but there one thing with what y...
4,"were trying, let you know if anything works","[were, trying, let, you, know, if, anything, w...","[were, trying, let, you, know, if, anything, w...",were trying let you know if anything works


In [36]:
X_train = im['text_cleaned']
Y_train = im['Topic']

In [37]:
X_test = testdata['text_cleaned']

In [38]:
# Convert each X list item to strings for the next step
X_train_str = [' '.join(map(str, l)) for l in X_train]
X_test_str = [' '.join(map(str, l)) for l in X_test]

In [39]:
# Create TF-IDF vectors
vectorizer = TfidfVectorizer(ngram_range=(1,2)) 
X_train_tfidf = vectorizer.fit_transform(X_train_str)
X_test_tfidf = vectorizer.transform(X_test_str)

In [40]:
svc = LinearSVC(random_state = 1, max_iter = 10000, C = 0.5)
svc.fit(X_train_tfidf, Y_train)
print("train score:",svc.score(X_train_tfidf, Y_train))

train score: 0.9602977777777778


In [41]:
predicted = svc.predict(X_test_tfidf)

In [42]:
testdata['Topic'] = predicted

In [43]:
testdata.to_csv('T-Comments.csv')