In [35]:
#importing libraries
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import nltk
import re
import string
from nltk.stem import WordNetLemmatizer
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_csv('IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
X_train, X_test, y_train, y_test = train_test_split(df['review'], df['sentiment'], test_size = .50) 

In [4]:
stopwords = nltk.corpus.stopwords.words('english')
lemmatizer = WordNetLemmatizer()
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nathanaelnam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
X_train = X_train.reset_index(drop = True)
X_test = X_test.reset_index(drop = True)
y_train = y_train.reset_index(drop = True)
y_test = y_test.reset_index(drop = True)

In [6]:
def sentiment_to_number(text):
    if text == 'positive':
        return 1
    else:
        return 0

In [7]:
y_train = y_train.apply(sentiment_to_number)
y_test = y_test.apply(sentiment_to_number)

In [8]:
#text pre processing
for i in range(len(X_train)):
    review = re.sub('[^a-zA-Z]', ' ', X_train[i])
    review = review.lower()
    review = review.split()
    review = [lemmatizer.lemmatize(word) for word in review if not word in set(stopwords)]
    review = ' '.join(review)
    X_train[i] = review

#text pre processing
for i in range(len(X_test)):
    review = re.sub('[^a-zA-Z]', ' ', X_test[i])
    review = review.lower()
    review = review.split()
    review = [lemmatizer.lemmatize(word) for word in review if not word in set(stopwords)]
    review = ' '.join(review)
    X_test[i] = review

In [9]:
X_train

0        seventh sign borrows lot rosemary baby omen ac...
1        example genius filmaking epic story three majo...
2        omen awakening made television sequel original...
3        gundam became movie trilogy u familiar lot she...
4        opening scene really got watching movie howeve...
                               ...                        
24995    adored fan unusually charming creativity holly...
24996    film making simplest best br br doubt even tho...
24997    saw alien v predator year ago say stupid seque...
24998    movie amusing time hell sometimes even downrig...
24999    many many year gaijin visited japan learning m...
Name: review, Length: 25000, dtype: object

In [10]:
#tf idf
tf_idf = TfidfVectorizer(analyzer = 'word')
#applying tf idf to training data
X_train_tf = tf_idf.fit_transform(X_train)
#applying tf idf to training data
X_train_tf = tf_idf.transform(X_train)

print("n_samples: %d, n_features: %d" % X_train_tf.shape)

n_samples: 25000, n_features: 67225


In [11]:
X_test_tf = tf_idf.transform(X_test)

print("n_samples: %d, n_features: %d" % X_test_tf.shape)

n_samples: 25000, n_features: 67225


In [12]:
# Logistic Regression 
logreg = LogisticRegression()
logreg.fit(X_train_tf, y_train)

# predicted y
y_pred = logreg.predict(X_test_tf)
accuracy_score(y_test, y_pred)

0.88872

In [37]:
print(f1_score(y_test, y_pred))

0.8898828372387586


In [33]:
print(metrics.confusion_matrix(y_test, y_pred))

[[10977  1537]
 [ 1245 11241]]


In [38]:
# KNN
KNN = KNeighborsClassifier()
KNN.fit(X_train_tf, y_train)

y_pred_KNN = KNN.predict(X_test_tf)
accuracy_score(y_test, y_pred_KNN)

0.7576

In [39]:
print(f1_score(y_test, y_pred_KNN))

0.7625019595547893


In [42]:
print(metrics.confusion_matrix(y_test, y_pred_KNN))

[[9212 3302]
 [2758 9728]]


In [16]:
# Decision Tree
decision_tree = DecisionTreeClassifier(max_depth = 15)
decision_tree = decision_tree.fit(X_train_tf, y_train)
y_pred_tree = decision_tree.predict(X_test_tf)
accuracy_score(y_test, y_pred_tree)

0.73364

In [36]:
print(f1_score(y_test, y_pred_tree))

0.7586531840092783


In [34]:
print(metrics.confusion_matrix(y_test, y_pred_tree))

[[ 7875  4639]
 [ 2020 10466]]


In [18]:
# Random Forests
random_forest = RandomForestClassifier(n_estimators = 50, max_depth = 15)
random_forest = random_forest.fit(X_train_tf, y_train)
y_pred_forest = random_forest.predict(X_test_tf)
accuracy_score(y_test, y_pred_forest)

0.81916

In [41]:
print(f1_score(y_test, y_pred_forest))

0.8253968253968254


In [40]:
print(metrics.confusion_matrix(y_test, y_pred_forest))

[[ 9793  2721]
 [ 1800 10686]]
