In [30]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk import sent_tokenize, word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


[nltk_data] Downloading package punkt to /home/nihal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/nihal/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [31]:
# Labelled review sample; the relative path is resolved against the notebook's
# working directory.
REVIEWS_CSV = "imdb_small.csv"

df = pd.read_csv(REVIEWS_CSV)
# First five rows as a quick look at the columns that were read.
df.head(n=5)

Unnamed: 0,review,sentiment
0,I have to differ from the other comments poste...,negative
1,I saw this movie with low expectations and was...,negative
2,Taran Adarsh a reputed critic praised such a d...,negative
3,When I first heard that the subject matter for...,positive
4,"With the release of Peter Jackson's famed ""Lor...",positive


In [32]:
# Number of rows, measured as the non-null count of the first column.
# df.count() is indexed by column name, so the first entry is selected by
# position explicitly with .iloc; a bare integer key ([0]) on a label-indexed
# Series relies on a positional fallback that pandas has deprecated.
# (The earlier df.describe() call was dropped: as a non-final expression its
# result was computed and then discarded without being displayed.)
df.count().iloc[0]

5000

In [33]:
# taking care of symbols and HTML line breaks in the review
# Raw strings keep the regex escapes away from Python's own string-escape
# handling, which warns about sequences such as "\s" and "\[" in plain literals.

# Punctuation that is deleted outright (nothing is left in its place).
REPLACE_NO_SPACE = re.compile(r"""[.;:!'?,"()\[\]]""")
# Separators that turn into one space: any run of HTML line breaks
# (<br>, <br/>, <br />), a hyphen, or a forward slash.
REPLACE_WITH_SPACE = re.compile(r"(?:<br\s*/?>)+|-|/")


def preprocess_review(review):
    """Lower-case a review and strip punctuation and HTML line breaks.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    str
        The lower-cased text with the characters in ``REPLACE_NO_SPACE``
        removed and every match of ``REPLACE_WITH_SPACE`` replaced by a
        single space.

    Notes
    -----
    Punctuation is removed first. That pass does not touch ``<``, ``>`` or
    ``/``, so break tags are still intact when the second pass looks for them.
    A run of consecutive break tags collapses to one space; a lone ``<br />``
    is handled too instead of being reduced to a stray ``<br  >``.
    """
    without_punctuation = REPLACE_NO_SPACE.sub("", review.lower())
    return REPLACE_WITH_SPACE.sub(" ", without_punctuation)

# removing stopwords
try:
    english_stop_words = stopwords.words('english')
except LookupError:
    # NLTK raises LookupError when the corpus is not installed (e.g. on a
    # fresh machine); fetch it once and retry so the notebook runs top to bottom.
    nltk.download('stopwords')
    english_stop_words = stopwords.words('english')

# Hashed snapshot used for membership tests. The list above is left as-is for
# any other code that reads it; rebuild this set if the list is edited later.
_english_stop_word_set = frozenset(english_stop_words)


def remove_stop_words(review):
    """Return ``review`` without English stop words.

    Parameters
    ----------
    review : str
        Text to filter. It is split on whitespace and compared word by word,
        so the comparison is case-sensitive and punctuation-sensitive.

    Returns
    -------
    str
        The remaining words joined by single spaces (an empty string when
        nothing remains).
    """
    return ' '.join(
        word for word in review.split() if word not in _english_stop_word_set
    )

# tokenization
def tokenization_w(words):
    """Tokenize each text found in the first entry of ``words``.

    Parameters
    ----------
    words : indexable
        Container whose first entry (``words[:][0]``) is an iterable of
        strings. The original note says that for a NumPy input the loop
        should run over ``words[:]`` instead -- NOTE(review): confirm the
        intended input shape with the caller.

    Returns
    -------
    list of list of str
        One token list per text, skipping texts that yield no tokens.

    Notes
    -----
    The previous guard compared the token list with ``''``. A list never
    equals a string, so that test was always true and empty results were
    appended anyway; the guard now checks for an empty token list, which
    means the output can be shorter than the input.
    """
    token_lists = (word_tokenize(w) for w in words[:][0])
    return [w_token for w_token in token_lists if w_token]

# Lemmatization
def get_lemmatized_text(review):
    """Lemmatize every whitespace-separated word of ``review``.

    A new ``WordNetLemmatizer`` is created per call and ``lemmatize`` is
    invoked on each word with its default arguments. The results are joined
    back into one string with single spaces.
    """
    wordnet = WordNetLemmatizer()
    lemmas = map(wordnet.lemmatize, review.split())
    return ' '.join(lemmas)


# changing negative to 0, positive to 1
def text2bool(val):
    """Encode a sentiment label as an integer.

    Returns 1 when the first element of ``val`` equals ``'p'`` (as in
    "positive") and 0 for anything else. ``val`` must be indexable and
    non-empty, because only ``val[0]`` is inspected.
    """
    return 1 if val[0] == 'p' else 0

In [34]:
# Clean the review text in three passes, applied in this order:
# strip punctuation/markup, drop stop words, lemmatize what is left.
df['review'] = (
    df['review']
    .apply(preprocess_review)
    .apply(remove_stop_words)
    .apply(get_lemmatized_text)
)

# Encode the labels as integers. Run this cell once per load: text2bool indexes
# its argument, so it cannot be applied again to the already-encoded integers.
df['sentiment'] = df['sentiment'].apply(text2bool)


In [35]:
# Preview the first rows to check the cleaned review text and the integer labels.
df.head()

Unnamed: 0,review,sentiment
0,differ comment posted amid sporadic funny mome...,0
1,saw movie low expectation disappointed bad act...,0
2,taran adarsh reputed critic praised dubba movi...,0
3,first heard subject matter checking self orche...,1
4,release peter jackson famed lord ring trilogy ...,1


In [36]:
# Drop repeated reviews.
# Only the text and label columns are compared. The frame still carries the
# 'Unnamed: 0' column read from the CSV, which looks like a per-row id
# (0, 1, 2, ... in the preview); with it included, a whole-row comparison
# could not match any two rows and nothing would be removed.
df = df.drop_duplicates(subset=['review', 'sentiment'])

In [37]:
# Binary (presence/absence) features over unigrams and bigrams.
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2))

# Stratified 80/20 split with a fixed seed so the run is repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    df['review'],
    df['sentiment'],
    train_size=0.8,
    stratify=df['sentiment'],
    random_state=123,
    shuffle=True,
)

# The vocabulary is learned from the training reviews only; both splits are
# then encoded with it (the text Series are replaced by the encoded matrices).
ngram_vectorizer.fit(X_train)
X_train, X_val = (ngram_vectorizer.transform(part) for part in (X_train, X_val))

# Validation accuracy for a range of inverse regularisation strengths C.
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LogisticRegression(C=c).fit(X_train, y_train)
    val_accuracy = accuracy_score(y_val, lr.predict(X_val))
    print("Accuracy for C=%s: %s" % (c, val_accuracy))
    


Accuracy for C=0.01: 0.842
Accuracy for C=0.05: 0.851
Accuracy for C=0.25: 0.854
Accuracy for C=0.5: 0.854
Accuracy for C=1: 0.854


In [38]:
# Final model: the vocabulary and the classifier are both fitted on every row
# of df (no hold-out), using the same vectorizer settings as the validation run.
final_ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2)).fit(df['review'])

final_y_train = df['sentiment']
final_X_train = final_ngram_vectorizer.transform(df['review'])

final_ngram = LogisticRegression(C=0.5)
# Left as the cell's last expression so the fitted estimator is displayed.
final_ngram.fit(final_X_train, final_y_train)

LogisticRegression(C=0.5, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [39]:
# Load the unlabelled reviews and put them through the same three cleaning
# passes, in the same order, as the training text.
test_df = pd.read_csv("test.csv")
test_df['review'] = (
    test_df['review']
    .apply(preprocess_review)
    .apply(remove_stop_words)
    .apply(get_lemmatized_text)
)
test_df.describe()

Unnamed: 0,review
count,13
unique,13
top,consider bit connoisseur boxing movie one thin...
freq,1


In [44]:
# Encode the cleaned test reviews with the vocabulary learned by the final
# vectorizer (transform only -- nothing is re-fitted on the test text).
X_test = final_ngram_vectorizer.transform(test_df['review'])
# Predicted labels, in the encoding produced by text2bool for the training
# labels: 1 for labels starting with 'p' (positive), 0 otherwise.
final_ngram.predict(X_test)

array([0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0])