In [7]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np
import random
from collections import Counter
import pickle
import sys
import tensorflow as tf
stdout = sys.stdout 
reload(sys)
sys.setdefaultencoding('ISO-8859-1')
sys.stdout = stdout

In [11]:
lemmatizer = WordNetLemmatizer()
lines = 10000

In [20]:
def create_lexicon(pos, neg):
    lexicon = []
    with open(pos, 'rb') as f:
        contents = f.readlines()
        for line in contents[:lines]:
            words_in_line = word_tokenize(line)
            lexicon+=words_in_line
        
    with open(neg,'r') as f:
        contents = f.readlines()
        for line in contents[:lines]:
            words_in_line = word_tokenize(line)
            lexicon+=words_in_line
            
    lemmatized_lexicon = [lemmatizer.lemmatize(word) for word in lexicon]
    count_dict = Counter(lemmatized_lexicon)
    lexicon = []
    for word in count_dict:
        if 50 < count_dict[word] < 1000:
            lexicon += word
    
    return lexicon

def get_feature_set(filename, lexicon, classification):
    feature_set = []
    with open(filename, 'r') as f:
        contents = f.readlines()
        for line in contents[:lines]:
            words = word_tokenize(line)
            lemmatized_words = [lemmatizer.lemmatize(word.lower()) for word in words]
            features = np.zeros(len(lexicon))
            for word in lemmatized_words:
                try:
                    index_of_word = lexicon.index(word)
                    features[index_of_word] += 1
                except:
                    pass
        
            feature_set.append((features, classification))
        return feature_set
    
def create_input_data(pos_file, neg_file,test_size = 0.1):
    lexicon = create_lexicon(pos_file, neg_file)
    features = []
    features += get_feature_set(pos_file, lexicon, [1, 0])
    features += get_feature_set(neg_file, lexicon, [0, 1])
    random.shuffle(features)
    
    features_length = len(features)
    testing_size = int((test_size*features_length))
    train_data =features[:-testing_size]
    test_data = features[-testing_size:]
    
    return train_data, test_data

def launch():
    train, test = create_input_data('/Users/neelbakshi/Documents/Machine Learning/Datasets/Sentiment Analysis/positive.txt', '/Users/neelbakshi/Documents/Machine Learning/Datasets/Sentiment Analysis/negative.txt')
    return train, test
    
    # if you want to pickle this data:
    with open('/Users/neelbakshi/Documents/Machine Learning/Datasets/Sentiment Analysis/sentiment_set.pickle','wb') as f:
        pickle.dump([train, test],f)
    

In [21]:
train, test = launch()

In [25]:
class Network:
    
    def __init__(self, structure, input_placeholder, output_placeholder):
        self.structure = structure
        self.input_placeholder = input_placeholder
        self.output_placeholder = output_placeholder
        layers = []
        for current_nodes, previous_nodes in zip(self.structure[1:], self.structure[:-1]):
            weights = tf.Variable(tf.random_normal([previous_nodes, current_nodes]))
            biases = tf.Variable(tf.random_normal([current_nodes]))
            layers.append({'weights': weights, 'biases': biases})
            
        layer_output = self.input_placeholder
        for layer in layers[:-1]:
            layer_output = tf.add(tf.matmul(layer_output, layer['weights']), layer['biases'])
            layer_output = tf.nn.relu(layer_output)
        self.final_output = tf.add(tf.matmul(layer_output, layers[-1]['weights']), layers[-1]['biases'])
        self.cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=self.output_placeholder, logits=self.final_output))
        self.optimizer = tf.train.AdamOptimizer().minimize(self.cost)
        
    def fit(self, train_data, epochs, batch_size, test_data):
        
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            for epoch in xrange(epochs):
                epoch_loss = 0
                for batch_no in range(int(len(train_data)/batch_size)):
                    epoch_data = train_data[(batch_no*len(train_data)):((batch_no + 1)*len(train_data))]
                    epoch_x = [x[0] for x in train_data]
                    epoch_y = [y[1] for y in train_data]
                    c, _ = sess.run([self.cost, self.optimizer], feed_dict={self.input_placeholder:epoch_x, self.output_placeholder:epoch_y})
                    epoch_loss += c
                print('Epoch', epoch, 'completed out of',epochs,'loss:',epoch_loss)
            
            correct = tf.equal(tf.argmax(self.final_output, 1), tf.argmax(self.output_placeholder, 1))

            accuracy = tf.reduce_mean(tf.cast(correct, 'float'))
            input_test = [x[0] for x in test_data]
            output_test = [y[1] for y in test_data]
            print('Accuracy:',accuracy.eval({self.input_placeholder:input_test, self.output_placeholder:output_test}))
            
            

In [26]:
network = Network([2121, 500, 500, 500, 2], tf.placeholder('float'), tf.placeholder('float'))

In [27]:
network.fit(train, 10, 200, test)

('Epoch', 0, 'completed out of', 10, 'loss:', 54801.202270507812)
('Epoch', 1, 'completed out of', 10, 'loss:', 6011.5266151428223)
('Epoch', 2, 'completed out of', 10, 'loss:', 3653.9004058837891)
('Epoch', 3, 'completed out of', 10, 'loss:', 4166.8995018005371)
('Epoch', 4, 'completed out of', 10, 'loss:', 3682.6616916656494)
('Epoch', 5, 'completed out of', 10, 'loss:', 4110.1790237426758)
('Epoch', 6, 'completed out of', 10, 'loss:', 5697.7448253631592)
('Epoch', 7, 'completed out of', 10, 'loss:', 4722.0098934173584)
('Epoch', 8, 'completed out of', 10, 'loss:', 4035.6975059509277)
('Epoch', 9, 'completed out of', 10, 'loss:', 4200.2036819458008)


NameError: global name 'input_data' is not defined