In [1]:
import nltk
from nltk.tokenize import word_tokenize
import numpy as np
import random
import pickle
from collections import Counter
from nltk.stem import WordNetLemmatizer

# Single WordNet lemmatizer instance shared by the lexicon-building and
# feature-building functions defined in the following cells.
lemmatizer = WordNetLemmatizer()
# Maximum number of lines taken from each input file; the functions below use
# it as the slice bound on the list returned by readlines().
hm_lines = 100000

In [5]:
def create_lexicon(pos, neg, min_count=50, max_count=1000):
    """Build the vocabulary (lexicon) from the positive and negative sample files.

    Each of the two files is read, its first ``hm_lines`` lines are
    lower-cased, tokenized with ``word_tokenize`` and lemmatized, and the
    lemmas are counted across both files together.

    Args:
        pos: Path to the text file of positive samples.
        neg: Path to the text file of negative samples.
        min_count: Exclusive lower bound on a lemma's total count. Lemmas
            seen this many times or fewer are dropped as too rare.
        max_count: Exclusive upper bound on a lemma's total count. Lemmas
            seen this many times or more are dropped as too common.

    Returns:
        A list of lemmas whose count satisfies
        ``min_count < count < max_count``, in order of first appearance.
    """
    w_counts = Counter()
    for file in [pos, neg]:
        with open(file, 'r') as f:
            contents = f.readlines()
        for line in contents[:hm_lines]:
            # Count lemmas line by line rather than first materialising every
            # token of both files in one big list.
            w_counts.update(
                lemmatizer.lemmatize(token)
                for token in word_tokenize(line.lower())
            )

    # Keep only the mid-frequency band: very common words carry little
    # signal and very rare ones are mostly noise.
    return [
        word
        for word, count in w_counts.items()
        if max_count > count > min_count
    ]

In [12]:
def sample_handling(sample, lexicon, classification):
    """Turn each line of a text file into a bag-of-words count vector.

    The first ``hm_lines`` lines of ``sample`` are lower-cased, tokenized with
    ``word_tokenize`` and lemmatized. For every line a vector with one slot
    per lexicon entry is produced, where each slot holds how many times that
    entry occurred in the line.

    Args:
        sample: Path to the text file to featurize.
        lexicon: Sequence of vocabulary words; position in the sequence is
            the feature index. Entries must be hashable (e.g. strings).
        classification: Label attached to every line of this file. The same
            object is stored in each returned pair (it is not copied).

    Returns:
        A list of ``[features, classification]`` pairs, one per line, where
        ``features`` is a list of ``len(lexicon)`` float counts.
    """
    # Build the word -> index mapping once so each token costs one dict
    # lookup instead of a linear ``in`` scan followed by ``list.index``.
    # ``setdefault`` keeps the first position of a repeated word, which is
    # what ``list.index`` would have returned.
    word_to_index = {}
    for position, entry in enumerate(lexicon):
        word_to_index.setdefault(entry, position)

    featureset = []
    with open(sample, 'r') as f:
        contents = f.readlines()

    for line in contents[:hm_lines]:
        tokens = [lemmatizer.lemmatize(tok) for tok in word_tokenize(line.lower())]
        features = np.zeros(len(lexicon))
        for token in tokens:
            # Lemmas are lower-cased again before the lookup, as the lexicon
            # is compared against the lower-cased form.
            index_value = word_to_index.get(token.lower())
            if index_value is not None:
                features[index_value] += 1

        featureset.append([list(features), classification])
    return featureset

In [13]:
def create_feature_sets_and_labels(pos, neg, test_size=0.1):
    """Build shuffled train/test feature vectors and labels from two files.

    A lexicon is created from both files, every line of each file is turned
    into a count vector over that lexicon, the labelled vectors are shuffled
    with ``random.shuffle`` and the last ``test_size`` fraction is held out
    for testing.

    Args:
        pos: Path to the positive-sample text file (labelled ``[1, 0]``).
        neg: Path to the negative-sample text file (labelled ``[0, 1]``).
        test_size: Fraction of all samples to place in the test split.

    Returns:
        A tuple ``(train_x, train_y, test_x, test_y)`` of lists, where the
        ``*_x`` lists hold feature vectors and the ``*_y`` lists hold the
        matching labels.
    """
    lexicon = create_lexicon(pos, neg)

    # Featurize the files the caller passed in, so the samples always come
    # from the same files the lexicon was built from.
    samples = []
    samples += sample_handling(pos, lexicon, [1, 0])
    samples += sample_handling(neg, lexicon, [0, 1])
    random.shuffle(samples)

    # Clamp so an out-of-range test_size cannot yield a negative split index.
    testing_size = min(max(int(test_size * len(samples)), 0), len(samples))
    # Split on an explicit index: a ``[:-testing_size]`` slice would produce
    # an empty training set when testing_size is 0.
    split_at = len(samples) - testing_size
    train, test = samples[:split_at], samples[split_at:]

    # Plain list unpacking is used instead of a NumPy array because the
    # [vector, label] pairs are ragged and do not form a rectangular array.
    train_x = [features for features, _ in train]
    train_y = [label for _, label in train]
    test_x = [features for features, _ in test]
    test_y = [label for _, label in test]

    return train_x, train_y, test_x, test_y

# Driver: build the train/test lists from the positive and negative sample
# files and pickle them to data/sentiment.pickle.
if __name__ == '__main__':
    train_x, train_y, test_x, test_y = create_feature_sets_and_labels('data/pos.txt', 'data/neg.txt')
    with open('data/sentiment.pickle', 'wb') as f:
        # Stored as one list in the order [train_x, train_y, test_x, test_y];
        # readers of the pickle must unpack it in that same order.
        pickle.dump([train_x, train_y, test_x, test_y], f)