In [41]:
import nltk
import random
import pickle

import numpy as np

from nltk import word_tokenize
from nltk import WordNetLemmatizer
from collections import Counter

In [32]:
# Shared lemmatizer instance: create_lexicon and sample_handling both call
# lemmatizer.lemmatize(...) on every token they process.
lemmatizer = WordNetLemmatizer()

In [33]:
# Frequency window for the lexicon: create_lexicon keeps a word only when its
# count is strictly between word_freq_lb and word_freq_ub.
word_freq_ub = 1000
word_freq_lb = 50

# Input text files handed to create_feature_sets_and_labels, which labels the
# samples of the first file [1, 0] and those of the second file [0, 1].
pos_file = 'pos.txt'
neg_file = 'neg.txt'

In [27]:
def create_lexicon(pos, neg):
    """Build the vocabulary shared by the positive and negative corpora.

    Every line of both files is lower-cased, tokenized with ``word_tokenize``
    and lemmatized with the module-level ``lemmatizer``. A lemma is kept only
    when its total count across both files lies strictly between the
    module-level ``word_freq_lb`` and ``word_freq_ub``.

    Args:
        pos: Path of the first text file.
        neg: Path of the second text file.

    Returns:
        list[str]: The retained lemmas, in order of first appearance
        (``pos`` is read before ``neg``).

    Raises:
        OSError: If either file cannot be opened.
    """
    w_counts = Counter()

    for fname in (pos, neg):
        # Open the file named by the loop variable. The previous version
        # opened `pos` on both iterations, so `neg` never contributed and
        # every `pos` count was doubled.
        with open(fname) as fp:
            for line in fp:
                w_counts.update(
                    lemmatizer.lemmatize(w) for w in word_tokenize(line.lower())
                )

    # Counter preserves insertion order, so the lexicon order is stable for a
    # given pair of input files.
    return [
        w for w, count in w_counts.items()
        if word_freq_lb < count < word_freq_ub
    ]

In [28]:
def sample_handling(sample, lexicon, classification):
    """Turn each line of a text file into a bag-of-words count vector.

    Each line is lower-cased, tokenized with ``word_tokenize`` and lemmatized
    with the module-level ``lemmatizer``. For every lemma found in
    ``lexicon`` the count at that lemma's position is incremented.

    Args:
        sample: Path of the text file to read, one sample per line.
        lexicon: Sequence of words; position ``i`` of each vector counts
            occurrences of ``lexicon[i]``. If a word appears more than once
            in ``lexicon``, only its first position is used.
        classification: Label stored alongside every vector. The same object
            is referenced by every row (it is not copied).

    Returns:
        list: One ``[features, classification]`` pair per line, where
        ``features`` is a list of ``len(lexicon)`` float counts.

    Raises:
        OSError: If ``sample`` cannot be opened.
    """
    # Map each word to its first position once, instead of scanning the
    # lexicon twice (`in` and `.index`) for every token of every line.
    # setdefault keeps the first index, matching list.index semantics.
    word_to_index = {}
    for position, entry in enumerate(lexicon):
        word_to_index.setdefault(entry, position)

    feature_set = []
    with open(sample, 'r') as f:
        for line in f:
            features = np.zeros(len(lexicon))
            for token in word_tokenize(line.lower()):
                index = word_to_index.get(lemmatizer.lemmatize(token))
                if index is not None:
                    features[index] += 1
            feature_set.append([list(features), classification])
    return feature_set

In [39]:
def create_feature_sets_and_labels(pos, neg, test_size=0.2):
    """Build shuffled train/test feature vectors and labels from two files.

    The lexicon is built from both files with ``create_lexicon``; every line
    of ``pos`` is vectorized by ``sample_handling`` with label ``[1, 0]`` and
    every line of ``neg`` with label ``[0, 1]``. The combined rows are
    shuffled with ``random.shuffle`` and the last ``test_size`` fraction is
    held out for testing.

    Args:
        pos: Path of the file whose samples are labelled ``[1, 0]``.
        neg: Path of the file whose samples are labelled ``[0, 1]``.
        test_size: Fraction of the rows to hold out. The resulting row count
            is truncated to an int and clamped to ``[0, number of rows]``.

    Returns:
        tuple: ``(train_x, train_y, test_x, test_y)``, each a list; the ``x``
        lists hold feature vectors and the ``y`` lists hold the labels.
    """
    lexicon = create_lexicon(pos, neg)

    features = []
    features += sample_handling(pos, lexicon, [1, 0])
    features += sample_handling(neg, lexicon, [0, 1])
    random.shuffle(features)

    # Split with plain list slicing. Each row is a ragged
    # [feature_vector, label] pair, and np.array() on such rows raises
    # ValueError on NumPy >= 1.24 (inhomogeneous shape).
    testing_size = min(max(int(test_size * len(features)), 0), len(features))

    # Use an explicit split index rather than negative slices: with
    # testing_size == 0, `[:-0]` is empty and `[-0:]` is the whole list,
    # which would put every sample in the test set.
    split_at = len(features) - testing_size
    train_rows = features[:split_at]
    test_rows = features[split_at:]

    train_x = [row[0] for row in train_rows]
    train_y = [row[1] for row in train_rows]

    test_x = [row[0] for row in test_rows]
    test_y = [row[1] for row in test_rows]

    return train_x, train_y, test_x, test_y

In [40]:
# Build the train/test split from the two input files, using the default
# test_size of 0.2.
train_x, train_y, test_x, test_y = create_feature_sets_and_labels(pos_file, neg_file)

In [42]:
# Persist the four splits as one pickled list, in the order
# [train_x, train_y, test_x, test_y].
with open('sentiment_set.pickle', 'wb') as f:
    pickle.dump([train_x, train_y, test_x, test_y], f)


In [46]:
# Read 'sentiment_set.pickle' back into L.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load pickles produced by this notebook or another trusted source.
with open('sentiment_set.pickle', 'rb') as f:
    L = pickle.load(f)