# IMDB Sentiment Analysis

#### (1) load in training data from folder

In [1]:
import os

def _read_reviews(directory):
    """Return the text of every file in `directory`, closing each file after reading."""
    reviews = []
    for name in os.listdir(directory):
        # `with` releases the handle immediately; the bare open(...).read()
        # left every descriptor open until the garbage collector got to it.
        with open(os.path.join(directory, name)) as handle:
            reviews.append(handle.read())
    return reviews


pos_examples = _read_reviews('../data/train/pos')
neg_examples = _read_reviews('../data/train/neg')

# Raw (uncleaned) corpus: all positive reviews first, then all negative ones.
X_ugly = pos_examples + neg_examples
# Labels aligned with X_ugly: 1 for a positive review, 0 for a negative one.
y = [1] * len(pos_examples) + [0] * len(neg_examples)

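#### (2) perform the following steps of preprocessing:
    (a) lowercase, strip HTML tags and remove punctuation
    (b) expand contractions (e.g. "havent" -> "have not")
    (c) stem (Porter stemmer)
    (d) remove stop words and identify simple negations -- both were tried but are currently disabled in `clean`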

In [7]:
import re
import math
import string
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
from collections import Counter
import numpy as np
# Maps a contraction whose apostrophes have already been stripped (clean()
# removes "'" before the lookup) to its expanded form.
# Several keys are also ordinary words ("hell", "ill", "its", "shed", "wont", ...);
# those tokens are expanded as well because the lookup is purely lexical.
# Capitalised keys are kept for completeness, but clean() lower-cases its input
# first, so only the lower-case variants can match there.
CONTRACTIONS = {
    "aint": "is not",
    "arent": "are not",
    "cant": "cannot",
    "cantve": "cannot have",
    "cause": "because",
    "couldve": "could have",
    "couldnt": "could not",
    "couldntve": "could not have",
    "didnt": "did not",
    "doesnt": "does not",
    "dont": "do not",
    "hadnt": "had not",
    "hadntve": "had not have",
    "hasnt": "has not",
    "havent": "have not",
    "hed": "he would",
    "hedve": "he would have",
    "hell": "he will",
    "hellve": "he will have",  # was "he he will have"
    "hes": "he is",
    "howd": "how did",
    "howdy": "how do you",
    "howll": "how will",
    "hows": "how is",
    "Id": "I would",
    "Idve": "I would have",
    "Ill": "I will",
    "Illve": "I will have",  # key previously still contained an apostrophe ("Ill've")
    "Im": "I am",
    "Ive": "I have",
    "id": "i would",
    "idve": "i would have",
    "ill": "i will",
    "illve": "i will have",
    "im": "i am",
    "ive": "i have",
    "isnt": "is not",
    "itd": "it would",
    "itdve": "it would have",
    "itll": "it will",
    "itllve": "it will have",
    "its": "it is",
    "lets": "let us",
    "maam": "madam",
    "maynt": "may not",
    "mightve": "might have",
    "mightnt": "might not",
    "mightntve": "might not have",
    "mustve": "must have",
    "mustnt": "must not",
    "mustntve": "must not have",
    "neednt": "need not",
    "needntve": "need not have",
    "oclock": "of the clock",
    "oughtnt": "ought not",
    "oughtntve": "ought not have",
    "shant": "shall not",  # duplicate entry removed
    "shantve": "shall not have",
    "shed": "she would",
    "shedve": "she would have",
    "shell": "she will",
    "shellve": "she will have",
    "shes": "she is",
    "shouldve": "should have",
    "shouldnt": "should not",
    "shouldntve": "should not have",
    "sove": "so have",
    "sos": "so as",
    "thatd": "that would",
    "thatdve": "that would have",
    "thats": "that is",
    "thered": "there would",
    "wasnt": "was not",
    "werent": "were not",
    "wont": "will not",
    "wontve": "will not have",
    "wouldnt": "would not",
    "wouldntve": "would not have",
}
def get_contractions(token):
    """Return the expansion of `token` from CONTRACTIONS, or `token` itself if it has none."""
    return CONTRACTIONS.get(token, token)
        
def clean(X):
    """Normalise raw review strings into space-joined Porter stems.

    For every string in `X` the steps are, in order:
      1. lower-case the text;
      2. replace anything that looks like an HTML tag (``<...>``) with a space;
      3. delete the characters ``, : ; ( ) [ ] ? ! '``;
      4. replace ``-`` and ``.`` with a space;
      5. tokenise, expand contractions via get_contractions(), re-tokenise
         (an expansion can be several words);
      6. Porter-stem every token and join the stems with single spaces.

    Stop words are NOT removed and negations are NOT marked.

    Parameters
    ----------
    X : iterable of str
        Raw documents.

    Returns
    -------
    list of str
        One cleaned string per input document, in the same order.
    """
    stemmer = PorterStemmer()
    # Raw-string character classes match exactly the same single characters as the
    # former "(\,)|(\:)|..." alternations, without relying on invalid string escapes.
    html_tag = re.compile(r'<.*?>')
    drop_chars = re.compile(r"[,:;()\[\]?!']")
    space_chars = re.compile(r"[-.]")

    X_clean = []
    for x_i in X:
        text = x_i.lower()
        text = html_tag.sub(' ', text)
        text = drop_chars.sub('', text)
        text = space_chars.sub(' ', text)
        # Expanding a contraction can turn one token into several words, so the
        # expanded text is re-joined and tokenised a second time before stemming.
        expanded = ' '.join(get_contractions(t) for t in word_tokenize(text))
        stems = [stemmer.stem(w) for w in word_tokenize(expanded)]
        X_clean.append(' '.join(stems))
    return X_clean

def test_clean(test_text_list):
    """Print the given texts, then print what clean() returns for them."""
    print(test_text_list)
    cleaned_texts = clean(test_text_list)
    print(cleaned_texts)
    
# Smoke-test the cleaner on two hand-written reviews (prints before/after).
test_clean([
    'i would see this again but not soon. haven\'t reccommended to anyone',
    '<br>Hi this?? ;is</br> not. Hello There DUDe the!! coolest thing I\'ve NEVER ever seen',
])

# Cleaned version of the whole labelled corpus, aligned index-for-index with X_ugly and y.
X = clean(X_ugly)


["i would see this again but not soon. haven't reccommended to anyone", "<br>Hi this?? ;is</br> not. Hello There DUDe the!! coolest thing I've NEVER ever seen"]
['i would see thi again but not soon have not reccommend to anyon', 'hi thi is not hello there dude the coolest thing i have never ever seen']


#### (3) split the dataset into 80% train, 20% validate
#### (4) TFIDF vectorization 
        - played around with parameters, using unigrams/bigrams and ~75k features is best
#### (5) Grid search for Logistic regression parameter tuning
        - varied C (inverse regularization strength) and penalty (l1 or l2 regularization)

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from scipy.sparse import csr_matrix

# Split the *cleaned* corpus X rather than the raw X_ugly: the test reviews are
# passed through clean() before they are vectorised, so training and validation
# must see the same stemmed text.  random_state pins the 80/20 partition so that
# a re-run reproduces the same split.
# Note: cleaned text contains no '?', so count_questions() on these splits is all zeros.
train_X, validate_X, train_y, validate_y = train_test_split(X, y, test_size=0.2, random_state=0)

def mutual_info(X, y, words):
    """Score every feature with a mutual-information-style statistic against a binary label.

    Parameters
    ----------
    X : list, ndarray or scipy sparse matrix, shape (n_documents, n_features)
        Document/feature matrix. Only whether an entry is non-zero is used.
    y : sequence of int
        Class label (0 or 1) for each row of `X`.
    words : sequence
        Feature names; `words[i]` labels column `i` of `X`.

    Returns
    -------
    list of (word, score) tuples, one per column of `X`, in column order.

    Raises
    ------
    KeyError
        If `y` contains a label other than 0 or 1.

    Notes
    -----
    Per-class presence probabilities are add-one smoothed:
    (documents of the class containing the word + 1) / (documents of the class + 2).
    NOTE(review): `prob_word` is the plain sum of the two class-conditional
    estimates and the "word absent" terms reuse it as their normaliser, so this
    is not the textbook mutual information -- confirm that this is intended.
    """
    if isinstance(X, list):
        X = np.array(X)
    n, m = X.shape
    print(len(words))

    class_counts = {0: 0, 1: 0}
    for y_i in y:
        class_counts[y_i] += 1

    # Count, per class, how many documents contain each feature. bincount over the
    # column indices of the non-zero entries replaces a Python loop over every entry.
    rows, cols = csr_matrix(X).nonzero()
    row_labels = np.asarray(y)[rows]
    feature_counts = {
        c: np.bincount(cols[row_labels == c], minlength=m) for c in (0, 1)
    }

    class_probabilities = {c: class_counts[c] / float(n) for c in (0, 1)}
    feature_probabilities = {
        c: (feature_counts[c] + 1) / float(class_counts[c] + 2) for c in (0, 1)
    }

    info = []
    for i in range(m):
        prob_word_0 = feature_probabilities[0][i]
        prob_word_1 = feature_probabilities[1][i]
        prob_word = prob_word_0 + prob_word_1

        norm_0 = prob_word * class_probabilities[0]
        norm_1 = prob_word * class_probabilities[1]

        prob_0_1 = prob_word_0 * math.log(prob_word_0 / norm_0)
        prob_0_0 = (1 - prob_word_0) * math.log((1 - prob_word_0) / norm_0)

        prob_1_1 = prob_word_1 * math.log(prob_word_1 / norm_1)
        # Fixed: the normaliser belongs inside the logarithm, exactly as in the
        # three sibling terms; it used to divide the log's result instead.
        prob_1_0 = (1 - prob_word_1) * math.log((1 - prob_word_1) / norm_1)

        info.append((words[i], sum([prob_0_1, prob_0_0, prob_1_1, prob_1_0])))
    return info
    
def count_questions(X):
    """Return, for each text in `X`, its number of '?' characters divided by (word count + 1).

    Words are whitespace-separated chunks; the +1 keeps the ratio defined for
    empty strings.
    """
    ratios = []
    for text in X:
        word_total = len(text.split())
        ratios.append(text.count('?') / float(word_total + 1))
    return ratios

# Question-mark density per review for each split. Only meaningful on text that
# still contains punctuation: clean() strips '?' characters.
questions_train = count_questions(train_X)
questions_validate = count_questions(validate_X)

def get_vocab(X_t, y_t):
    """Rank all unigrams and bigrams of `X_t` by their mutual_info() score.

    Parameters
    ----------
    X_t : iterable of str
        Documents to build the vocabulary from.
    y_t : sequence of int
        Binary label (0 or 1) for each document.

    Returns
    -------
    list of str
        Every unigram and bigram, sorted by ascending score (lowest score first).
    """
    def _feature_names(vectorizer):
        # get_feature_names() was removed from recent scikit-learn releases in
        # favour of get_feature_names_out(); support whichever one is installed.
        if hasattr(vectorizer, 'get_feature_names_out'):
            return list(vectorizer.get_feature_names_out())
        return vectorizer.get_feature_names()

    unigram_vectorizer = CountVectorizer(binary=True, analyzer='word', ngram_range=(1, 1), strip_accents='ascii')
    bigram_vectorizer = CountVectorizer(binary=True, analyzer='word', ngram_range=(2, 2), strip_accents='ascii')

    single_counts = unigram_vectorizer.fit_transform(X_t)
    double_counts = bigram_vectorizer.fit_transform(X_t)

    info_all = mutual_info(single_counts, y_t, _feature_names(unigram_vectorizer))
    info_all += mutual_info(double_counts, y_t, _feature_names(bigram_vectorizer))

    # One stable sort of the combined list yields the same order as sorting each
    # half first and then sorting the concatenation.
    info_all.sort(key=lambda pair: pair[1])
    return [word for word, _ in info_all]
    
#VOCAB = get_vocab(train_X + validate_X, train_y + validate_y)[:500000]
#print(VOCAB[:1000])

#vect_tfidf = TfidfTransformer()

#print(vect_tfidf.get_feature_names())
#nb = BernoulliNB()
#nb.fit(train_features, train_y)

#lr = LogisticRegression(solver='liblinear')

#lr_cv = GridSearchCV(lr, grid, cv=10, error_score='raise')


### (6) Include both training and validation data in the final model

In [None]:
from sklearn.metrics import classification_report, accuracy_score
from scipy.sparse import vstack, hstack

# Bag-of-n-grams counts (uni- to tri-grams, capped at 1,000,000 features),
# re-weighted by TF-IDF.
vect_count = CountVectorizer(binary=False, dtype=int, analyzer='word', ngram_range=(1, 3), max_features=1000000)
vect_tfidf = TfidfTransformer()

# Both transformers are fit on the training split only; the validation split is
# merely transformed, so it does not influence the vocabulary or IDF weights.
counts = vect_count.fit_transform(train_X)
train_features = vect_tfidf.fit_transform(counts)
validate_counts = vect_count.transform(validate_X)
validate_features = vect_tfidf.transform(validate_counts)

# 10-fold grid search over the inverse regularisation strength C.
lr = LogisticRegression(solver='liblinear')
grid = {'C': [40000, 50000, 100000, 500000]}
lr_cv = GridSearchCV(lr, grid, cv=10, error_score='raise')
lr_cv.fit(train_features, train_y)
print(lr_cv.cv_results_['mean_test_score'])

# Held-out accuracy of the best C.
preds = lr_cv.best_estimator_.predict(validate_features)
print(accuracy_score(validate_y, preds))

# Final model: refit the vectoriser, the TF-IDF weights and the grid search on
# every labelled review.  The cleaned corpus X is used (not the raw X_ugly)
# because the test reviews are passed through clean() before vect_count.transform;
# fitting on raw text would build a vocabulary of unstemmed words that the
# stemmed test tokens cannot match.
all_features = vect_tfidf.fit_transform(vect_count.fit_transform(X))

lr_cv.fit(all_features, y)
print(lr_cv.cv_results_['mean_test_score'])

In [None]:
import csv

# Read the 25,000 unlabeled test reviews, keyed by their numeric file name.
test = {}
for n in range(25000):
    filename = '../data/test/{}.txt'.format(n)
    # `with` closes each handle right away instead of leaking 25,000 descriptors.
    with open(filename, 'r') as review_file:
        test[n] = review_file.read()

# Same preprocessing as the training corpus; dict order is insertion order, so
# position i in test_X is file i.
test_X = clean(list(test.values()))
print(test_X[0])

# Reuse the already-fitted vectoriser, TF-IDF weights and tuned classifier.
test_counts = vect_count.transform(test_X)
test_features = vect_tfidf.transform(test_counts)
predictions = lr_cv.best_estimator_.predict(test_features)

print(test_features.shape)
# newline='' is what the csv module requires of files handed to csv.writer;
# without it, blank rows appear on platforms that translate '\n' to '\r\n'.
with open('results.csv', 'w', newline='') as results:
    writer = csv.writer(results, delimiter=',')
    writer.writerow(['Id', 'Category'])
    # One row per prediction; the row count follows the data rather than a literal.
    for i, label in enumerate(predictions):
        writer.writerow([i, label])

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import get_tmpfile
from scipy.sparse import csr_matrix, hstack

# Labels in the same order as the documents below (train split, then validation split).
all_y = train_y + validate_y

# Tag each review with its position only.  The class label used to be added as a
# second integer tag, which (a) collided with the tags of documents 0 and 1, so
# those vectors were shared with every review of that class, and (b) leaked the
# validation labels into the document vectors.
documents = [TaggedDocument(doc.split(), [i]) for i, doc in enumerate(train_X + validate_X)]
model = Doc2Vec(documents, vector_size=10, window=2, min_count=1, workers=1)
# NOTE(review): newer gensim releases expose the document vectors as `model.dv`;
# confirm which attribute the installed version expects.
print(model.docvecs[0])

# One 10-dimensional vector per review, in document order.
feats = [list(model.docvecs[i]) for i in range(len(documents))]

X_training = feats[:len(train_X)]
X_validating = feats[len(train_X):]
# Append the dense document vectors as extra columns next to the sparse TF-IDF
# features; they are converted explicitly so hstack only receives sparse blocks.
X_training_sparse = hstack((train_features, csr_matrix(np.array(X_training))))
X_validating_sparse = hstack((validate_features, csr_matrix(np.array(X_validating))))

lr = LogisticRegression()

# Fixed: fit() needs the training labels; calling it with features only raises TypeError.
lr.fit(X_training_sparse, train_y)
pred = lr.predict(X_validating_sparse)
print(accuracy_score(validate_y, pred))


In [None]:

# Show the tokens whose word vectors are closest to 'movi' in the Doc2Vec model.
# NOTE(review): 'movi' looks like a Porter stem, so presumably this lookup only
# succeeds when the model was trained on clean()-ed text -- confirm.
print(model.wv.most_similar('movi'))

#### (7) Calculate test results and write to file

#### 