In [23]:
# import packages

import pandas as pd
import numpy as np
import re
import string
from nltk.corpus import stopwords
import xgboost as xgb
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn import metrics
from sklearn import model_selection

In [27]:
df_ = pd.read_csv("datasets/IMDB Dataset.csv")
# %10 of data
df_ = df_.sample(frac = 0.1).reset_index(drop = True)

In [28]:
df = df_.copy()

In [29]:
df.head()

Unnamed: 0,review,sentiment
0,"Watched this on KQED, with Frank Baxter commen...",positive
1,"This is the first movie I ever owned on video,...",positive
2,I have seen this movie when it was released an...,positive
3,"Red Eye, a movie that id had wanted to see for...",positive
4,Is the Cannes controversy-meter remarkably eso...,negative


In [30]:
def check_df(dataframe, head=5):
    print("##################### Shape #####################")
    print(dataframe.shape)
    print("##################### Types #####################")
    print(dataframe.dtypes)
    print("##################### Head #####################")
    print(dataframe.head(head))
    print("##################### Tail #####################")
    print(dataframe.tail(head))
    print("##################### NA #####################")
    print(dataframe.isnull().sum())
    print("##################### Quantiles #####################")
    #print(dataframe.quantile([0, 0.05, 0.50, 0.95, 0.99, 1]).T)
    print(dataframe.describe([0, 0.05, 0.50, 0.95, 0.99, 1]).T)
    
check_df(df)

##################### Shape #####################
(5000, 2)
##################### Types #####################
review       object
sentiment    object
dtype: object
##################### Head #####################
                                              review sentiment
0  Watched this on KQED, with Frank Baxter commen...  positive
1  This is the first movie I ever owned on video,...  positive
2  I have seen this movie when it was released an...  positive
3  Red Eye, a movie that id had wanted to see for...  positive
4  Is the Cannes controversy-meter remarkably eso...  negative
##################### Tail #####################
                                                 review sentiment
4995  The Life and Time of Little Richard, as told b...  positive
4996  Gorgeous Techicolor production telling the unu...  positive
4997  Quite one of the worst films I have ever seen....  negative
4998  Your first clue that this is a cheesy movie is...  negative
4999  This movie is a gem...an

In [31]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [32]:
# positive -> 1, negative -> 0
df["sentiment"] = [1 if i == "positive" else 0 for i in df["sentiment"]]

In [33]:
df.head()

Unnamed: 0,review,sentiment
0,"Watched this on KQED, with Frank Baxter commen...",1
1,"This is the first movie I ever owned on video,...",1
2,I have seen this movie when it was released an...,1
3,"Red Eye, a movie that id had wanted to see for...",1
4,Is the Cannes controversy-meter remarkably eso...,0


In [10]:
#create a new column called kfold and fill it with -1
df['kfold'] = -1
#randomize rows of the data
df = df.sample(frac = 1).reset_index(drop = True)
y = df.sentiment.values #labels

In [34]:
#clean text
def clean_text(text):
    #lowercase every letter
    text = text.split() #split by all white spaces
    
    #join tokens by single space, this will remove all kinds of weird spaces
    text = " ".join(text)
    #removes all punctuation using regex and string module
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
    
    
    return text

df.loc[:,'review'] = df.review.apply(clean_text)

In [37]:
df.head()

Unnamed: 0,review,sentiment
0,Watched this on KQED with Frank Baxter comment...,1
1,This is the first movie I ever owned on video ...,1
2,I have seen this movie when it was released an...,1
3,Red Eye a movie that id had wanted to see for ...,1
4,Is the Cannes controversymeter remarkably esot...,0


In [38]:
# 10-fold cross validation

y = df["sentiment"]
X = df.drop(["sentiment"], axis = 1)

In [54]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

In [55]:
# TF-IDF
tfidf = TfidfVectorizer(stop_words = "english", max_features = 50000)

X_train = tfidf.fit_transform(X_train.review)
X_test = tfidf.transform(X_test.review)

In [56]:
# XGBoost parameters
params = {
    "objective": "binary:logistic",
    "max_depth": 3,
    "learning_rate": 0.3,
    "verbosity": 0,
    "n_jobs": -1
}

# XGBoost
model = xgb.XGBClassifier(**params)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
print("F1 Score: ", metrics.f1_score(y_test, y_pred))
print("Precision: ", metrics.precision_score(y_test, y_pred))
print("Recall: ", metrics.recall_score(y_test, y_pred))

Accuracy:  0.8253333333333334
F1 Score:  0.8352201257861636
Precision:  0.8127294981640147
Recall:  0.8589909443725744


In [58]:
# sample prediction
text = "This movie is the best movie I have ever seen. Acting was awesome. \
        Visual effects were awesome. Storyline was awesome. I loved it. 10/10"

text = clean_text(text)

text = tfidf.transform([text])

model.predict(text)[0]

1

In [61]:
# sample prediction 2
text = "The movie wasn't great. Boring story, flat characters, and it just didn't click. Waste of time, to be honest."

text = clean_text(text)

text = tfidf.transform([text])

model.predict(text)[0]

0