In [63]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize, RegexpTokenizer
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import precision_score, recall_score, accuracy_score

In [10]:
#input_dir =  '/content/drive/My Drive/Colab Notebooks/Topics/Nature'
input_dir = r'C:\Users\sweta\OneDrive\Documents\GSU\Machine Learning\Final Proj\Text\Topic Analysis\Topic Analysis'

In [11]:
# Create a dataframe from the csv file.
im = pd.read_csv(input_dir + "/TA_Training_Set.csv")

In [12]:
im.columns

Index(['Comment', 'Topic'], dtype='object')

In [13]:
im.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 900000 entries, 0 to 899999
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   Comment  900000 non-null  object
 1   Topic    900000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 13.7+ MB


In [14]:
im["Comment"] = im["Comment"].str.lower()

im.head()

Unnamed: 0,Comment,Topic
0,very hot and sexy nail color;),21
1,"i know, right? like, an fmea. i definitely kno...",28
2,"hello, we only allow people with an account ag...",5
3,megumin reads das kapital??? new best girl.,2
4,freedom of speech ? but you use a random anon ...,13


# **Setting up the data for binary classification**

In [17]:
# Cleaning [experiment with these choices downstream]
re_tokenizer = RegexpTokenizer("[\\w']+")
im['Comment'] = im['Comment'].astype(str)
# Tokenization using NLTK
im['text_nltk'] = im['Comment'].apply(re_tokenizer.tokenize)
im.head()


Unnamed: 0,Comment,Topic,text_nltk
0,very hot and sexy nail color;),21,"[very, hot, and, sexy, nail, color]"
1,"i know, right? like, an fmea. i definitely kno...",28,"[i, know, right, like, an, fmea, i, definitely..."
2,"hello, we only allow people with an account ag...",5,"[hello, we, only, allow, people, with, an, acc..."
3,megumin reads das kapital??? new best girl.,2,"[megumin, reads, das, kapital, new, best, girl]"
4,freedom of speech ? but you use a random anon ...,13,"[freedom, of, speech, but, you, use, a, random..."


In [18]:
# Lowercase
im['text_nltk'] = im['text_nltk'].apply(lambda x: [word.lower() for word in x])
im.head(3)

Unnamed: 0,Comment,Topic,text_nltk
0,very hot and sexy nail color;),21,"[very, hot, and, sexy, nail, color]"
1,"i know, right? like, an fmea. i definitely kno...",28,"[i, know, right, like, an, fmea, i, definitely..."
2,"hello, we only allow people with an account ag...",5,"[hello, we, only, allow, people, with, an, acc..."


In [23]:
# Dropping NLTK stop words
stopwords_nltk = set(nltk.corpus.stopwords.words('english'))
im['text_cleaned'] = im['text_nltk'].apply(lambda x: [word for word in x if word not in stopwords_nltk])

# Dropping punctuation and numbers:
punc = string.punctuation + '--' # Add characters as necessary 
im['text_cleaned'] = im['text_cleaned'].apply(lambda x: [word for word in x if word not in punc])

digits = string.digits 
im['text_cleaned'] = im['text_cleaned'].apply(lambda x: [word for word in x if word not in digits])
im['text_cleaned'] = im['text_cleaned'].apply(lambda x: [word for word in x if len(word)>1]) 

# from my understanding, words like "https", "'s" and "'" can also be dropped
additional = ["https", "'s", "’","'re", "amp", ".co", "co", "“", "”", "''", "``", "weâ€™re", "//t.co/"]
im['text_cleaned'] = im['text_cleaned'].apply(lambda x: [word for word in x if word not in additional])
im['text_cleaned_string'] = [' '.join(map(str, l)) for l in im['text_cleaned']]

im['text_cleaned_string'] = [t.replace("//t.co/", "") for t in im['text_cleaned_string']]

im.head(3)

Unnamed: 0,Comment,Topic,text_nltk,text_cleaned,text_cleaned_string
0,very hot and sexy nail color;),21,"[very, hot, and, sexy, nail, color]","[hot, sexy, nail, color]",hot sexy nail color
1,"i know, right? like, an fmea. i definitely kno...",28,"[i, know, right, like, an, fmea, i, definitely...","[know, right, like, fmea, definitely, know, on...",know right like fmea definitely know one want ...
2,"hello, we only allow people with an account ag...",5,"[hello, we, only, allow, people, with, an, acc...","[hello, allow, people, account, age, days, 50,...",hello allow people account age days 50 comment...


In [24]:
# Additional cleaning:
from collections import Counter, defaultdict
# Count and list most frequent tokens
counter = Counter()
im['text_cleaned'].map(counter.update)
most_freq = pd.DataFrame(counter.most_common()) # Or add number within parens to list only x most common
most_freq.columns = ['token', 'count']

In [25]:
most_freq.head(20)

Unnamed: 0,token,count
0,like,128616
1,people,82484
2,would,80870
3,one,76515
4,get,72486
5,think,63561
6,com,59970
7,time,58380
8,reddit,52472
9,i'm,52174


In [26]:
pd.set_option('display.max_rows', None)

In [27]:
most_freq.head(100)

Unnamed: 0,token,count
0,like,128616
1,people,82484
2,would,80870
3,one,76515
4,get,72486
5,think,63561
6,com,59970
7,time,58380
8,reddit,52472
9,i'm,52174


In [28]:
additional = ["www", "com","gt","i'm’", "he's"]
im['text_cleaned'] = im['text_cleaned'].apply(lambda x: [word for word in x if word not in additional])
im['text_cleaned_string'] = [' '.join(map(str, l)) for l in im['text_cleaned']]

im['text_cleaned_string'] = [t.replace("//t.co/", "") for t in im['text_cleaned_string']]

im.head(3)

Unnamed: 0,Comment,Topic,text_nltk,text_cleaned,text_cleaned_string
0,very hot and sexy nail color;),21,"[very, hot, and, sexy, nail, color]","[hot, sexy, nail, color]",hot sexy nail color
1,"i know, right? like, an fmea. i definitely kno...",28,"[i, know, right, like, an, fmea, i, definitely...","[know, right, like, fmea, definitely, know, on...",know right like fmea definitely know one want ...
2,"hello, we only allow people with an account ag...",5,"[hello, we, only, allow, people, with, an, acc...","[hello, allow, people, account, age, days, 50,...",hello allow people account age days 50 comment...


In [29]:
# Look at class balance/imbalance
im['Topic'].value_counts()

20    22572
28    22570
6     22564
15    22550
18    22548
1     22539
33    22533
25    22533
10    22530
29    22527
24    22523
11    22521
27    22519
12    22515
3     22513
19    22513
30    22510
32    22503
4     22503
17    22497
7     22492
8     22492
21    22491
38    22491
22    22486
36    22486
31    22483
35    22483
39    22482
9     22482
14    22469
2     22468
37    22464
23    22460
34    22458
16    22453
26    22452
13    22450
5     22439
40    22436
Name: Topic, dtype: int64

In [30]:
# how to deal with data imbalance?

## **for customer focused column:**

In [33]:
X_train, X_test, Y_train, Y_test = train_test_split(im['text_cleaned'], 
                                                    im['Topic'],
                                                    test_size = 0.2)

In [34]:
# Convert each X list item to strings for the next step
X_train_str = [' '.join(map(str, l)) for l in X_train]
X_test_str = [' '.join(map(str, l)) for l in X_test]

In [37]:
# Create TF-IDF vectors
vectorizer = TfidfVectorizer(ngram_range=(1,2)) 
X_train_tfidf = vectorizer.fit_transform(X_train_str)
X_test_tfidf = vectorizer.transform(X_test_str)

In [38]:
# Look at this object -- what does the note mean?
X_train_tfidf

<720000x4745748 sparse matrix of type '<class 'numpy.float64'>'
	with 19972992 stored elements in Compressed Sparse Row format>

# ** SVM**

In [41]:
svc = LinearSVC(random_state = 1, max_iter = 10000, C = 0.5)
svc.fit(X_train_tfidf, Y_train)
print("train score:",svc.score(X_train_tfidf, Y_train))
print("test score:",svc.score(X_test_tfidf, Y_test))

train score: 0.9674583333333333
test score: 0.49740555555555555


In [42]:
# Create a dataframe from the csv file.
testdata = pd.read_csv(input_dir + "/TestFileTemplate 1.csv")

In [43]:
testdata

Unnamed: 0,Comment,Topic
0,      ,4
1, *Another One Bites the Dust* ,4
2, Banana  haha                ...,5
3, It also didnâ€™t take that long for ...,5
4, JAGEX IS POWERLESS AGAINST PVP CLANS,3
5, noooooooo!!!! Traitor !!!!!! \n\nJk;) g...,6
6, so sad. First character I learned. He could ...,7
7, Stop crying ,7
8,\n\nI'm online. IGN is annejar \n\n\nI've add...,6
9,\nI donâ€™t know about those characters but [S...,7


In [44]:
# Cleaning [experiment with these choices downstream]
re_tokenizer = RegexpTokenizer("[\\w']+")
testdata['Comment'] = testdata['Comment'].astype(str)
# Tokenization using NLTK
testdata['text_nltk'] = testdata['Comment'].apply(re_tokenizer.tokenize)
testdata

Unnamed: 0,Comment,Topic,text_nltk
0,      ,4,[]
1, *Another One Bites the Dust* ,4,"[Another, One, Bites, the, Dust]"
2, Banana  haha                ...,5,"[Banana, haha, Nice, video, man]"
3, It also didnâ€™t take that long for ...,5,"[It, also, didnâ, t, take, that, long, for, yo..."
4, JAGEX IS POWERLESS AGAINST PVP CLANS,3,"[JAGEX, IS, POWERLESS, AGAINST, PVP, CLANS]"


In [45]:
# Lowercase
testdata['text_nltk'] = testdata['text_nltk'].apply(lambda x: [word.lower() for word in x])
testdata

Unnamed: 0,Comment,Topic,text_nltk
0,      ,4,[]
1, *Another One Bites the Dust* ,4,"[another, one, bites, the, dust]"
2, Banana  haha                ...,5,"[banana, haha, nice, video, man]"
3, It also didnâ€™t take that long for ...,5,"[it, also, didnâ, t, take, that, long, for, yo..."
4, JAGEX IS POWERLESS AGAINST PVP CLANS,3,"[jagex, is, powerless, against, pvp, clans]"
5, noooooooo!!!! Traitor !!!!!! \n\nJk;) g...,6,"[noooooooo, traitor, jk, glad, you, did, what,..."
6, so sad. First character I learned. He could ...,7,"[so, sad, first, character, i, learned, he, co..."
7, Stop crying ,7,"[stop, crying]"
8,\n\nI'm online. IGN is annejar \n\n\nI've add...,6,"[i'm, online, ign, is, annejar, i've, added, you]"
9,\nI donâ€™t know about those characters but [S...,7,"[i, donâ, t, know, about, those, characters, b..."


In [47]:
# Dropping NLTK stop words
stopwords_nltk = set(nltk.corpus.stopwords.words('english'))
testdata['text_cleaned'] = testdata['text_nltk'].apply(lambda x: [word for word in x if word not in stopwords_nltk])

# Dropping punctuation and numbers:
punc = string.punctuation + '--' # Add characters as necessary 
testdata['text_cleaned'] = testdata['text_cleaned'].apply(lambda x: [word for word in x if word not in punc])

digits = string.digits 
testdata['text_cleaned'] = testdata['text_cleaned'].apply(lambda x: [word for word in x if word not in digits])
testdata['text_cleaned'] = testdata['text_cleaned'].apply(lambda x: [word for word in x if len(word)>1]) 

# from my understanding, words like "https", "'s" and "'" can also be dropped
additional = ["https", "'s", "’","'re", "amp", ".co", "co", "“", "”", "''", "``", "weâ€™re", "//t.co/"]
testdata['text_cleaned'] = testdata['text_cleaned'].apply(lambda x: [word for word in x if word not in additional])
testdata['text_cleaned_string'] = [' '.join(map(str, l)) for l in testdata['text_cleaned']]

testdata['text_cleaned_string'] = [t.replace("//t.co/", "") for t in testdata['text_cleaned_string']]

testdata

Unnamed: 0,Comment,Topic,text_nltk,text_cleaned,text_cleaned_string
0,      ,4,[],[],
1, *Another One Bites the Dust* ,4,"[another, one, bites, the, dust]","[another, one, bites, dust]",another one bites dust
2, Banana  haha                ...,5,"[banana, haha, nice, video, man]","[banana, haha, nice, video, man]",banana haha nice video man
3, It also didnâ€™t take that long for ...,5,"[it, also, didnâ, t, take, that, long, for, yo...","[also, didnâ, take, long, reply]",also didnâ take long reply
4, JAGEX IS POWERLESS AGAINST PVP CLANS,3,"[jagex, is, powerless, against, pvp, clans]","[jagex, powerless, pvp, clans]",jagex powerless pvp clans
5, noooooooo!!!! Traitor !!!!!! \n\nJk;) g...,6,"[noooooooo, traitor, jk, glad, you, did, what,...","[noooooooo, traitor, jk, glad, ammo, type, use]",noooooooo traitor jk glad ammo type use
6, so sad. First character I learned. He could ...,7,"[so, sad, first, character, i, learned, he, co...","[sad, first, character, learned, could, much, ...",sad first character learned could much better ...
7, Stop crying ,7,"[stop, crying]","[stop, crying]",stop crying
8,\n\nI'm online. IGN is annejar \n\n\nI've add...,6,"[i'm, online, ign, is, annejar, i've, added, you]","[i'm, online, ign, annejar, i've, added]",i'm online ign annejar i've added
9,\nI donâ€™t know about those characters but [S...,7,"[i, donâ, t, know, about, those, characters, b...","[donâ, know, characters, scorpion, www, instag...",donâ know characters scorpion www instagram co...


In [51]:
X_test1 = testdata['text_cleaned']

In [52]:
X_test1_str = [' '.join(map(str, l)) for l in X_test1]

In [53]:
X_test1_tfidf = vectorizer.transform(X_test1_str)

In [54]:
y_pred =  svc.predict(X_test1_tfidf)

In [55]:
y_pred

array([ 8, 13,  5,  5, 18,  6,  7, 40,  6,  7,  1, 12,  7,  7, 14, 25, 31,
       12, 12, 12, 12, 17, 12, 40, 21,  8, 36, 32, 35, 20, 28, 13,  2, 32,
       12, 40, 14, 15, 12, 19,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
        6,  6,  6,  6,  6, 12, 19, 12, 25, 12, 16, 12, 12, 12,  2,  1, 39,
       16, 26, 24, 23, 12, 36, 23,  5,  9, 12, 27,  7, 12, 21, 31, 12, 25,
       18, 12, 12, 15, 30, 17, 31, 12, 27,  3, 12, 26, 12, 12, 11, 40,  4,
       28, 37, 12,  9, 12, 22, 40, 40, 40, 12, 12, 12, 12, 12,  1, 19, 12,
       23, 21, 12,  4,  4, 17, 23, 12, 40, 22, 40, 35, 22, 12, 10, 12, 14,
       15, 14,  1, 12, 30,  9, 12, 35,  8,  8,  8, 39,  8,  8, 12, 12, 39,
       17, 24, 12,  9, 12, 37, 12, 35, 12, 32, 40, 29, 12, 14, 12, 40, 12,
       12,  2, 12, 12, 32, 12, 26, 12,  2, 40, 23, 39, 12, 33, 12, 11, 36,
       12, 12, 12, 23, 28, 12,  4,  1, 12, 16, 24, 15, 36, 33, 12, 12, 12,
       11, 35, 12, 37,  4, 12, 12, 19, 34, 12, 17,  9, 11, 33, 25,  9, 12,
       36, 12, 12, 16, 40

In [58]:
testdata['PredictTopic'] = y_pred

In [61]:
testdata[['Comment','Topic','PredictTopic']]

Unnamed: 0,Comment,Topic,PredictTopic
0,      ,4,8
1, *Another One Bites the Dust* ,4,13
2, Banana  haha                ...,5,5
3, It also didnâ€™t take that long for ...,5,5
4, JAGEX IS POWERLESS AGAINST PVP CLANS,3,18
5, noooooooo!!!! Traitor !!!!!! \n\nJk;) g...,6,6
6, so sad. First character I learned. He could ...,7,7
7, Stop crying ,7,40
8,\n\nI'm online. IGN is annejar \n\n\nI've add...,6,6
9,\nI donâ€™t know about those characters but [S...,7,7


In [66]:
acc = accuracy_score(testdata['Topic'], testdata['PredictTopic'])
print(acc)

0.8588235294117647
