# coding: utf-8
# - Lexicon - the dictionary of words used as features
# - Bag of words - every word gets an ID (index) in a count vector (one-hot-style array)
# - All input vectors therefore have to be the same length (the lexicon size)
# - Stemming - chops off word endings (the result is not necessarily a real word)
# - Lemmatizing - reduces a word to its dictionary form (the result is a real word)
#   (a quick stem-vs-lemma demo follows the lemmatizer setup below)
# #### NLP Libraries
# In[7]:
import nltk
from nltk.tokenize import word_tokenize
import numpy as np
import random
import pickle
from collections import Counter
from nltk.stem import WordNetLemmatizer
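# word_tokenize needs NLTK's 'punkt' tokenizer and WordNetLemmatizer needs the
# 'wordnet' corpus; uncomment these one-time downloads if NLTK raises a LookupError:
# nltk.download('punkt')
# nltk.download('wordnet')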
# In[8]:
lemmatizer = WordNetLemmatizer()
hm_lines = 100000
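# Quick demo of the stemming vs. lemmatizing distinction above (illustrative
# only, not part of the pipeline; PorterStemmer is imported just for comparison):
# from nltk.stem import PorterStemmer
# PorterStemmer().stem('studies')    # -> 'studi' (not a real word)
# lemmatizer.lemmatize('studies')    # -> 'study' (a real word)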
# #### Pre-Processing
# In[9]:
def create_lexicon(pos, neg):
    lexicon = []
    # Tokenize every line of the positive and negative sample files
    with open(pos, 'r') as f:
        contents = f.readlines()
        for l in contents[:hm_lines]:
            all_words = word_tokenize(l)
            lexicon += list(all_words)
    with open(neg, 'r') as f:
        contents = f.readlines()
        for l in contents[:hm_lines]:
            all_words = word_tokenize(l)
            lexicon += list(all_words)
    lexicon = [lemmatizer.lemmatize(i) for i in lexicon]
    w_counts = Counter(lexicon)
    l2 = []
    for w in w_counts:
        #print(w_counts[w])
        # Keep only mid-frequency words: very common words ('the', 'and', ...)
        # and very rare words carry little signal
        if 1000 > w_counts[w] > 50:
            l2.append(w)
    print(len(l2))
    return l2
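# Illustrative usage (file names and sample tokens are hypothetical):
# lexicon = create_lexicon('pos.txt', 'neg.txt')   # prints the lexicon size
# lexicon[:3]                                      # e.g. ['film', 'movie', 'story']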
# In[10]:
def sample_handling(sample, lexicon, classification):
    featureset = []
    with open(sample, 'r') as f:
        contents = f.readlines()
        for l in contents[:hm_lines]:
            # Lowercase and lemmatize so tokens match the lexicon entries
            current_words = word_tokenize(l.lower())
            current_words = [lemmatizer.lemmatize(i) for i in current_words]
            # Bag-of-words vector: one count slot per lexicon word
            features = np.zeros(len(lexicon))
            for word in current_words:
                if word.lower() in lexicon:
                    index_value = lexicon.index(word.lower())
                    features[index_value] += 1
            features = list(features)
            featureset.append([features, classification])
    return featureset
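# Each featureset entry is [count_vector, one_hot_label]; e.g., with a
# hypothetical 5-word lexicon, one positive line could yield:
# [[0.0, 2.0, 0.0, 1.0, 0.0], [1, 0]]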
# #### Train-Test-Split
# In[11]:
def create_feature_sets_and_labels(pos, neg, test_size=0.1):
    lexicon = create_lexicon(pos, neg)
    features = []
    # One-hot class labels: [1,0] = positive, [0,1] = negative
    features += sample_handling(pos, lexicon, [1, 0])
    features += sample_handling(neg, lexicon, [0, 1])
    random.shuffle(features)
    features = np.array(features)

    testing_size = int(test_size * len(features))

    # Hold out the last test_size fraction of the shuffled data for testing
    train_x = list(features[:, 0][:-testing_size])
    train_y = list(features[:, 1][:-testing_size])
    test_x = list(features[:, 0][-testing_size:])
    test_y = list(features[:, 1][-testing_size:])

    return train_x, train_y, test_x, test_y
# #### Build and pickle the feature sets
# If the dataset takes long to build, you can pickle it and store it so that you don't have to build it again
# In[12]:
if __name__ == '__main__':
    train_x, train_y, test_x, test_y = create_feature_sets_and_labels(
        '/Users/gaddamnitish/Desktop/TensorFlow/SentDex/pos.txt',
        '/Users/gaddamnitish/Desktop/TensorFlow/SentDex/neg.txt')
    # if you want to pickle this data:
    with open('/Users/gaddamnitish/Desktop/TensorFlow/SentDex/sentiment_set.pickle', 'wb') as f:
        pickle.dump([train_x, train_y, test_x, test_y], f)
# This means that when we send data to the neural network, every input is a feature vector of length 423 (the lexicon size), not a raw string.
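# A minimal sketch of loading the pickled data back later (same path as above):
# with open('/Users/gaddamnitish/Desktop/TensorFlow/SentDex/sentiment_set.pickle', 'rb') as f:
#     train_x, train_y, test_x, test_y = pickle.load(f)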