# IMDB Sentiment Analysis

#### (1) load in training data from folder

In [141]:
import os

pos_examples = [open('../data/train/pos/' + f).read() for f in os.listdir('../data/train/pos')]
neg_examples = [open('../data/train/neg/' + f).read() for f in os.listdir('../data/train/neg')]

X = pos_examples + neg_examples
y = [1 if i < len(pos_examples) else 0 for i in range(len(pos_examples) + len(neg_examples))]

#### (2) perform the following steps of preprocessing:
    (a) remove punctuation
    (b) remove stop words
    (c) stem
    (d) identify simple negations

In [144]:
import re
import math
import string
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from collections import Counter
import numpy as np

        
def clean(X):
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    X_clean = []
    negative = ['not', 'hardly', 'isnt', 'no', 'n\'t', 'never', 'can\'t', 'won\'t', 'don\'t', 'havn\'t', 'didn\'t', 'hasn\'t', 'wouldn\'t', 'couldn\'t', 'shouldn\'t']
    exclams = []
    lengths = []
    uppercases = []
    for x_i in X:
        exclam = x_i.count('!')
        length = len(x_i.split())
        exclams.append(exclam)
        lengths.append(length)
        x_i = re.sub(r'<.*?>', '', x_i)
        tokens = word_tokenize(x_i)
        uppers = len([w for w in tokens if w.isupper() and not w is 'I'])
        uppercases.append(uppers)
        x_i = x_i.lower()
        clean = [stemmer.stem(w) for w in tokens if not w in stop_words or w in negative]
        for c in string.punctuation:
            clean = [w.replace(c, '') for w in clean]
        negated = ['not_' + clean[i] if clean[i-1] in negative else clean[i] for i in range(len(clean))]
        remove_negative = [w for w in negated if w not in negative]
        clean_2 = " ".join(remove_negative)
        X_clean.append(clean_2)
    print('exclams')
    print(np.mean(exclams[:12500]))
    print(np.mean(exclams[12500:]))
    print('lengths')
    print(np.mean(lengths[:12500]))
    print(np.mean(lengths[12500:]))
    print('uppers')
    print(np.mean(uppercases[:12500]))
    print(np.mean(uppercases[12500:]))
    return X_clean

def test_clean(test_text_list):
    print(test_text_list[0])
    test_text_list = clean(test_text_list)
    print(test_text_list[0])
    
test_clean(['<br>Hi this?? ;is</br> not. Hello There DUDe the!! coolest thing I\'ve NEVER ever seen'])
X = clean(X)


<br>Hi this?? ;is</br> not. Hello There DUDe the!! coolest thing I've NEVER ever seen
exclams
2.0
nan
lengths
14.0
nan
uppers
1.0
nan
Hi    not_ hello there dude   coolest thing I ve not_ever seen
exclams
0.0
0.0
lengths
135.8932
132.46808
uppers
0.71536
0.7624


#### (3) split the dataset into 80% train, 20% validate
#### (4) TFIDF vectorization 
        - played around with parameters, using unigrams/bigrams and ~75k features is best
#### (5) Grid search for Logistic regression parameter tuning
        - varied C (inverse regulatization) and penalty (l1 or l2 regularization)

In [153]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from scipy.sparse import csr_matrix

train_X, validate_X, train_y, validate_y = train_test_split(X, y, test_size=0.1)

def mutual_info(X, y, words):
        if isinstance(X, list):
            X = np.array(X)
        n, m = X.shape
        print(len(words))
        class_counts = {0: 0, 1: 0}
        feature_counts = {0: np.zeros(m), 1: np.zeros(m)}
        sqr_diff = []
        class_probabilities = {}
        feature_probabilities = {}

        for y_i in y:
            class_counts[y_i] += 1

        sparse_matrix = sparse.csr_matrix(X).nonzero()
        (row, col) = sparse_matrix
        for i in range(len(row)):
            c = y[row[i]]
            feature_counts[c][col[i]] += 1

        class_probabilities = {0: class_counts[0]/float(n), 1: class_counts[1]/float(n)}
        feature_probabilities = {
            0: [(feature_count + 1)/float(class_counts[0] + 2) for feature_count in feature_counts[0]],
            1: [(feature_count + 1)/float(class_counts[1] + 2) for feature_count in feature_counts[1]]
        }
        info = []
        for i in range(m):
            prob_word_0 = feature_probabilities[0][i]
            prob_word_1 = feature_probabilities[1][i]
            prob_word = prob_word_0 + prob_word_1
            
            prob_0_1 = prob_word_0*math.log(((prob_word_0)/(prob_word*class_probabilities[0])))
            prob_0_0 = (1 - prob_word_0)*math.log((1 - prob_word_0)/(prob_word*class_probabilities[0]))
            
            prob_1_1 = prob_word_1*math.log((prob_word_1)/(prob_word*class_probabilities[1]))
            prob_1_0 = (1 - prob_word_1)*math.log((1 - prob_word_1))/(prob_word*class_probabilities[1])
            
            info.append((words[i], sum([prob_0_1, prob_0_0, prob_1_1, prob_1_0])))
        return info
    
def count_questions(X):
    return [x_i.count('?')/float(len(x_i.split())) for x_i in X]

questions_train, questions_validate = count_questions(train_X), count_questions(validate_X)

def get_vocab(X_t, y_t):
    stop_words = set(stopwords.words('english'))
    negatives = ['not', 'hardly', 'isnt', 'no', 'n\'t', 'never', 'can\'t', 'won\'t', 'don\'t', 'havn\'t', 'didn\'t', 'hasn\'t', 'wouldn\'t', 'couldn\'t', 'shouldn\'t']
    vectorizer = CountVectorizer(binary=False, analyzer='word', ngram_range=(1, 1), max_features=150000)
    vectorizer_2 = CountVectorizer(binary=False, analyzer='word', ngram_range=(2, 2), max_features=50000)
    
    single_counts = vectorizer.fit_transform(X_t)
    double_counts = vectorizer_2.fit_transform(X_t)
    single_vocab = vectorizer.get_feature_names()
    double_vocab = vectorizer_2.get_feature_names()
    
    info_1 = mutual_info(single_counts, y_t, single_vocab)
    info_2 = mutual_info(double_counts, y_t, double_vocab)
    
    all_info = info_1 + info_2
    all_info = sorted(all_info, key = lambda x: x[1])
    all_info = [x[0] for x in all_info]
    all_info = [w for w in all_info if w not in stop_words and w not in negatives]
    return all_info
    
#VOCAB = get_vocab(train_X, train_y)
#print(VOCAB[:100])
vect_count = CountVectorizer(binary=False, analyzer='word', ngram_range=(1, 2), max_features=250000)

counts = vect_count.fit_transform(train_X)
word_features = vect_count.get_feature_names()
print(len(word_features))

vect_tfidf = TfidfTransformer()

#print(vect_tfidf.get_feature_names())
#nb = BernoulliNB()
#nb.fit(train_features, train_y)

lr = LogisticRegression(solver='liblinear')

grid = {'C': [1, 5, 10, 15, 20, 25, 30]}

lr_cv = GridSearchCV(lr, grid, cv=10, error_score='raise')


250000


### (6) Include both training and validation data in the final model

In [154]:
from sklearn.metrics import classification_report
from scipy.sparse import hstack

vect_tfidf = TfidfTransformer()

lr = LogisticRegression(solver='liblinear')

grid = {'C': [1, 5, 10]}

lr_cv = GridSearchCV(lr, grid, cv=10, error_score='raise')

all_features = hstack(((np.array(questions_train)[:,None], vect_tfidf.fit_transform(counts))))
validate_features = hstack((np.array(questions_validate)[:,None], vect_tfidf.transform(vect_count.transform(validate_X))))
#all_features = vect_tfidf.fit_transform(counts)
#validate_features = vect_tfidf.transform(vect_count.transform(validate_X))
lr_cv.fit(all_features, train_y)
print(lr_cv.best_params_)
print(lr_cv.best_score_)

preds = lr_cv.best_estimator_.predict(validate_features)
print(classification_report(validate_y, preds))


{'C': 10}
0.8998222222222222
              precision    recall  f1-score   support

           0       0.90      0.88      0.89      1209
           1       0.89      0.91      0.90      1291

   micro avg       0.89      0.89      0.89      2500
   macro avg       0.89      0.89      0.89      2500
weighted avg       0.89      0.89      0.89      2500



In [147]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import get_tmpfile
from scipy.sparse import csr_matrix, hstack

all_y = train_y + validate_y
documents = [TaggedDocument(doc.split(), [i, all_y[i]]) for i, doc in enumerate(train_X + validate_X)]
model = Doc2Vec(documents, vector_size=10, window=2, min_count=1, workers=1)
print(model.docvecs[0])

feats = [list(model.docvecs[i]) for i in range(len(model.docvecs))]

X_training = feats[:len(train_X)]
X_validating = feats[len(train_X):]
#X_training_sparse = hstack((all_features, X_training))
#X_validating_sparse = hstack((validate_features, X_validating))
                         
print(X_training)
print(X_validating)

lr = LogisticRegression()

lr.fit(X_training, train_y)
pred = lr.predict(X_validating)
print(classification_report(validate_y, pred))


[-10.2965765    6.3056383  -12.041158    -4.344803     4.087066
   2.762565     1.173731     6.3358917   -0.56823426   4.7318172 ]


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [148]:

print(model.wv.most_similar('movi'))

[('film', 0.9676274061203003), ('probabl', 0.9330042004585266), ('hammerheadhuman', 0.9208669066429138), ('noteven', 0.9189362525939941), ('mayb', 0.9169903993606567), ('It', 0.912951648235321), ('bullshit', 0.912523090839386), ('yawninduc', 0.911584734916687), ('duplic', 0.9085726737976074), ('sequel', 0.9084675312042236)]


#### (7) Calculate test results and write to file

In [149]:
import csv

test = {}
for n in range(25000):
    filename = '../data/test/{}.txt'.format(n)
    test[n] = open(filename, 'r').read()
    
test_X = list(test.values())
test_X = clean(test_X)
print(test_X[0])

test_counts = vect_count.transform(test_X)
sum_words = test_counts.sum(axis=0)
words_freq = [(word, sum_words[0, idx]) for word, idx in vect_count.vocabulary_.items()]
words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)

print(words_freq[:100])

test_features = vect_tfidf.transform(test_counts)
predictions = lr_cv.best_estimator_.predict(test_features)

with open('results.csv', 'w') as results:
    writer = csv.writer(results, delimiter=',')
    writer.writerow(['Id', 'Category'])
    for i in range(25000):
        writer.writerow([i, predictions[i]])

exclams
1.0008
0.96752
lengths
226.79408
230.25928
uppers
2.02408
2.07096
think could get better worst assumpt I ever made  drivvl not_describ movi appropri enough  not_plot thin  I get emot act pet fish  It shame see pete postlethwait  I respect actor tri best littl work  I think cardboard cut stephen baldwin would done better job  fact animateavoid cost  thi could realli hazard health 
[('movi', 49543), ('film', 45922), ('nt', 32823), ('one', 25420), ('like', 21506), ('time', 14796), ('make', 13772), ('charact', 13769), ('good', 13671), ('see', 13547), ('watch', 13367), ('get', 13264), ('thi', 13119), ('would', 13042), ('stori', 11595), ('even', 11373), ('realli', 10644), ('scene', 10531), ('show', 9748), ('well', 9456), ('look', 9441), ('could', 9102), ('end', 8953), ('love', 8899), ('great', 8849), ('much', 8829), ('peopl', 8826), ('also', 8634), ('think', 8456), ('play', 8417), ('bad', 8400), ('go', 8352), ('act', 8295), ('first', 8150), ('thing', 8128), ('way', 7852), ('made', 72

ValueError: X has 145869 features per sample; expecting 145870

#### 