-
Notifications
You must be signed in to change notification settings - Fork 1
/
NB_analyse.py
41 lines (30 loc) · 850 Bytes
/
NB_analyse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import pandas as pd
# Read NBdata.csv and bucket each row's `text` value under its `speaker`
# value, preserving row order: chat = {speaker: [text, text, ...]}.
data = pd.read_csv('NBdata.csv')
chat = {}
for _, message in data.iterrows():
    # setdefault creates the speaker's list on first sight, then we append.
    chat.setdefault(message.speaker, []).append(message.text)
import nltk
def str_stemmer(s):
    """Normalise raw text into lower-case words separated by single spaces.

    Despite the name, no stemming is performed. The text is cleaned in
    three steps:

    1. Every non-ASCII character (code point >= 128) becomes a space.
    2. Every digit and every punctuation character listed below becomes a
       space. Apostrophes and ``$`` are *not* in the list, so they survive.
    3. The result is lower-cased and split on whitespace (which also
       swallows newlines and tabs), then re-joined with single spaces.

    Args:
        s: The text to clean.

    Returns:
        The cleaned text; an empty string if nothing but separators remain.
    """
    # The backslash is written as an explicit "\\" so the literal contains
    # no invalid escape sequence.
    exclude = set('.,/?;:[{]}\\|><-_"*+^#%&();0123456789@!`~=')
    cleaned = ''.join(
        ' ' if (ord(ch) >= 128 or ch in exclude) else ch
        for ch in s
    )
    # split() with no argument collapses any run of whitespace, so no
    # separate newline handling is needed.
    return ' '.join(cleaned.lower().split())
# Join every message stored under the 'Sanchayan' key of `chat`, normalise it
# with str_stemmer, tokenise it with NLTK and print the 50 most frequent
# tokens as (token, count) pairs.
words = nltk.word_tokenize(str_stemmer(" ".join(chat['Sanchayan'])))
result = nltk.FreqDist(words)
# Call form of print: same output on Python 2, and valid on Python 3.
print(result.most_common(50))