-
Notifications
You must be signed in to change notification settings - Fork 0
/
generating_keywords.py
87 lines (61 loc) · 2.11 KB
/
generating_keywords.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import nltk
from nltk.corpus import stopwords,wordnet
from nltk.stem import WordNetLemmatizer,PorterStemmer,LancasterStemmer,SnowballStemmer
from collections import OrderedDict
import semanticsimilarity
# Semantic-similarity helper. Its givesim(a, b) score is compared against a
# 0.6 threshold by the stemming and lemmatising steps further down.
simobject = semanticsimilarity.semsim()

# Read the whole input document. The path "q1" is resolved relative to the
# current working directory. The context manager closes the handle even when
# read() raises, which a manual open()/read()/close() sequence does not
# guarantee.
with open("q1", "r") as f:
    file_content = f.read()
# Split the raw document text into individual tokens.
word_list = nltk.word_tokenize(file_content)

# Attach a part-of-speech tag to every token. Later steps read each entry as
# entry[0] (the token text) and entry[1] (its tag).
tagged_word_list = nltk.pos_tag(word_list)
# Drop English stop words from the tagged tokens.
#
# The set gets its own name on purpose: rebinding `stopwords` would shadow the
# imported nltk.corpus.stopwords reader, so running this step a second time in
# the same interpreter session would fail on `stopwords.words`.
english_stopwords = set(stopwords.words("english"))

# NLTK's English stop-word list is lower case, so compare in lower case;
# otherwise capitalised stop words (e.g. a sentence-initial "The") slip
# through the filter. The original token and its tag are kept untouched.
filtered_word_list = [
    pair for pair in tagged_word_list
    if pair[0].lower() not in english_stopwords
]
# Stem each remaining token, accepting the stem only when the similarity
# helper scores it at least 0.6 against the original word; otherwise the word
# is kept unchanged. Before this fix both branches of every check appended the
# original pair, so the computed stem was always thrown away and
# stemmed_word_list was just a copy of filtered_word_list.
# NOTE(review): assumes a higher givesim() score means "closer in meaning" --
# confirm against the semanticsimilarity module.

# Build the stemmers once instead of once per token.
lancaster_stemmer = LancasterStemmer()
porter_stemmer = PorterStemmer()

stemmed_word_list = []
for word, tag in filtered_word_list:
    # str.endswith is safe on an empty token, unlike indexing the last char.
    if word.endswith('ly'):
        # Words ending in "ly" go through the Lancaster stemmer.
        stem = lancaster_stemmer.stem(word)
    elif not word.endswith('e'):
        # Everything else that does not end in "e" uses the Porter stemmer.
        stem = porter_stemmer.stem(word)
    else:
        # Words ending in "e" are passed through without stemming.
        stemmed_word_list.append((word, tag))
        continue

    if simobject.givesim(stem, word) >= 0.6:
        stemmed_word_list.append((stem, tag))
    else:
        stemmed_word_list.append((word, tag))
# Lemmatise each token with WordNet, accepting the lemma only when the
# similarity helper scores it at least 0.6 against the incoming word;
# otherwise the word is kept unchanged. Before this fix both branches appended
# the incoming pair, so the lemma was computed and then discarded.
# NOTE(review): lemmatize() is called without a POS argument, so NLTK applies
# its default part of speech -- confirm that is what is wanted for adjectives.
lemmatizer = WordNetLemmatizer()

lemmatized_word_list = []
for word, tag in stemmed_word_list:
    lemma = lemmatizer.lemmatize(word)
    if simobject.givesim(lemma, word) >= 0.6:
        lemmatized_word_list.append((lemma, tag))
    else:
        lemmatized_word_list.append((word, tag))
# Finally, keep only the essential parts of speech (Penn Treebank tags):
# cardinal numbers, foreign words, nouns (common/proper, singular/plural) and
# adjectives.
essential_pos_tags = {'CD', 'FW', 'NN', 'NNS', 'NNP', 'NNPS', 'JJ'}

# Filter the output of the lemmatising step. Iterating tagged_word_list here
# would bypass the stop-word removal, stemming and lemmatising stages and make
# all of that work dead code.
final_processed_word_list = [
    word for word, tag in lemmatized_word_list if tag in essential_pos_tags
]

# Remove duplicates while preserving first-occurrence order, then emit the
# keyword list on stdout.
final_processed_word_list = list(OrderedDict.fromkeys(final_processed_word_list))
print(final_processed_word_list)