In [231]:
# import packages

import pandas as pd
import numpy as np
import re
import string
from nltk.corpus import stopwords
import xgboost as xgb
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn import metrics
from sklearn import model_selection

In [232]:
df_ = pd.read_csv("datasets/IMDB Dataset.csv")
# %10 of data
df_ = df_.sample(frac = 0.1).reset_index(drop = True)

In [233]:
df = df_.copy()

In [234]:
# add another column to dataset (numeric) named review length
df["review_len"] = df["review"].apply(lambda x: len(x) - x.count(" "))


In [235]:
def check_df(xdf, xrow_count=5, xplot=False):
    print("*************** DATASET INFO ************************")
    print("*************** SHAPE ************************")
    print(xdf.shape)
    print("*************** INFO ************************")
    print(xdf.info())
    print("*************** TIPLER ************************")
    print(xdf.dtypes)
    print("*************** HEAD ************************")
    print(xdf.head(xrow_count))
    print("*************** TAIL ************************")
    print(xdf.tail(xrow_count))
    print("*************** Nan Numbers ************************")
    print(xdf.isnull().sum())
    print("*************** Describe Istatics ************************")
    print(xdf.describe().T)
    print("*************** UNIQUE VALUE NUMBERS ************************")
    print(xdf.nunique())
    print("***************  ************************")

In [236]:
check_df(df)

*************** DATASET INFO ************************
*************** SHAPE ************************
(5000, 3)
*************** INFO ************************
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   review      5000 non-null   object
 1   sentiment   5000 non-null   object
 2   review_len  5000 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 117.3+ KB
None
*************** TIPLER ************************
review        object
sentiment     object
review_len     int64
dtype: object
*************** HEAD ************************
                                              review sentiment  review_len
0  Difficult to call The Grudge a horror movie. A...  negative         549
1  At the beginning of the film we watch May and ...  positive        1949
2  This movie was made for people who found Greml...  negative        1217
3  Imagin

In [237]:
# Log Transformation

df['review_len_log'] = np.log1p(df['review_len'])

In [238]:
df.head()

# row 0 values (print like table)
# print(df.iloc[:10,:])

Unnamed: 0,review,sentiment,review_len,review_len_log
0,Difficult to call The Grudge a horror movie. A...,negative,549,6.309918
1,At the beginning of the film we watch May and ...,positive,1949,7.575585
2,This movie was made for people who found Greml...,negative,1217,7.104965
3,Imagine the most depressing winter you will ne...,positive,654,6.484635
4,Although I use this site quite frequently to s...,negative,913,6.817831


In [239]:
# one hot encoding for review_len_log with cut function

# df["review_len_log"].describe().T

# Kategorilere ayırmak için aralıkları belirle
bins = [3, 5, 6, 7, 8, 10]

# Kategori etiketlerini belirle
labels = ['a', 'b', 'c', 'd', 'e']

# cut fonksiyonunu kullanarak kategorilere ayır
df['review_len_category'] = pd.cut(df['review_len_log'], bins=bins, labels=labels)

# show random 30 rows
df["review_len_category"].value_counts()

review_len_category
c    2933
d    1411
b     434
e     203
a      19
Name: count, dtype: int64

In [240]:
# drop first column (one hot encoding)
df = pd.get_dummies(df, columns=['review_len_category'], drop_first=True)

In [241]:
df = df.drop(["review_len_log", "review_len"], axis = 1)
df.head()

Unnamed: 0,review,sentiment,review_len_category_b,review_len_category_c,review_len_category_d,review_len_category_e
0,Difficult to call The Grudge a horror movie. A...,negative,False,True,False,False
1,At the beginning of the film we watch May and ...,positive,False,False,True,False
2,This movie was made for people who found Greml...,negative,False,False,True,False
3,Imagine the most depressing winter you will ne...,positive,False,True,False,False
4,Although I use this site quite frequently to s...,negative,False,True,False,False


In [242]:
df.isnull().sum()

review                   0
sentiment                0
review_len_category_b    0
review_len_category_c    0
review_len_category_d    0
review_len_category_e    0
dtype: int64

In [243]:
# positive -> 1, negative -> 0
df["sentiment"] = [1 if i == "positive" else 0 for i in df["sentiment"]]
df["review_len_category_b"] = [1 if i == True else 0 for i in df["review_len_category_b"]]
df["review_len_category_c"] = [1 if i == True else 0 for i in df["review_len_category_c"]]
df["review_len_category_d"] = [1 if i == True else 0 for i in df["review_len_category_d"]]
df["review_len_category_e"] = [1 if i == True else 0 for i in df["review_len_category_e"]]

In [244]:
df.head()

Unnamed: 0,review,sentiment,review_len_category_b,review_len_category_c,review_len_category_d,review_len_category_e
0,Difficult to call The Grudge a horror movie. A...,0,0,1,0,0
1,At the beginning of the film we watch May and ...,1,0,0,1,0
2,This movie was made for people who found Greml...,0,0,0,1,0
3,Imagine the most depressing winter you will ne...,1,0,1,0,0
4,Although I use this site quite frequently to s...,0,0,1,0,0


In [245]:
#create a new column called kfold and fill it with -1
# df['kfold'] = -1
#randomize rows of the data
df = df.sample(frac = 1).reset_index(drop = True)
y = df.sentiment.values #labels

In [246]:
#clean text
def clean_text(text):
    #lowercase every letter
    text = text.split() #split by all white spaces
    
    #join tokens by single space, this will remove all kinds of weird spaces
    text = " ".join(text)
    #removes all punctuation using regex and string module
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
    
    
    return text

df.loc[:,'review'] = df.review.apply(clean_text)

In [247]:
df.head()

Unnamed: 0,review,sentiment,review_len_category_b,review_len_category_c,review_len_category_d,review_len_category_e
0,This is like something I have NEVER seen befor...,1,0,1,0,0
1,Being a fan of the series I thought how bad ca...,0,0,1,0,0
2,Long ago and far away they knew how to make a ...,1,0,1,0,0
3,I saw this pilot when it was first shown and I...,1,0,1,0,0
4,This was my first introduction to the world of...,1,0,1,0,0


In [248]:
# 10-fold cross validation

#X = df.drop(["sentiment", "review_len_log", "review_len"], axis = 1)
X = df.drop(["sentiment"], axis = 1)
y = df["sentiment"]

In [249]:
y

0       1
1       0
2       1
3       1
4       1
       ..
4995    0
4996    1
4997    0
4998    1
4999    0
Name: sentiment, Length: 5000, dtype: int64

In [250]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

In [251]:
# TF-IDF
tfidf = TfidfVectorizer(stop_words = "english", max_features = 42151)

X_train_tfidf = tfidf.fit_transform(X_train.review)
X_test_tfidf = tfidf.transform(X_test.review)

In [307]:
# XGBoost parameters
params = {
    "objective": "binary:logistic",
    "max_depth": 5,
    "learning_rate": 0.15,
    "verbosity": 0,
    "n_jobs": -1
}

# change parameters to use review and review_len_category_b, review_len_category_c, review_len_category_d, review_len_category_e to affect the model

# XGBoost
model = xgb.XGBClassifier(**params)

#model.fit(X_train_tfidf, y_train) (old)

# use review and review_len_category_b, review_len_category_c, review_len_category_d, review_len_category_e
model.fit(np.concatenate((X_train_tfidf.toarray(), X_train[["review_len_category_b", "review_len_category_c", "review_len_category_d", "review_len_category_e"]].to_numpy()), axis = 1), y_train)

# y_pred = model.predict(X_test_tfidf) (old)

# use review and review_len_category_b, review_len_category_c, review_len_category_d, review_len_category_e
y_pred = model.predict(np.concatenate((X_test_tfidf.toarray(), X_test[["review_len_category_b", "review_len_category_c", "review_len_category_d", "review_len_category_e"]].to_numpy()), axis = 1))

print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
print("F1 Score: ", metrics.f1_score(y_test, y_pred))
print("Precision: ", metrics.precision_score(y_test, y_pred))
print("Recall: ", metrics.recall_score(y_test, y_pred))

Accuracy:  0.8113333333333334
F1 Score:  0.8165910563836681
Precision:  0.7797029702970297
Recall:  0.8571428571428571


In [None]:
# use grid search to find best parameters


In [317]:
# sample prediction

text = "I grew up on this movie and I can remember when my brother and I used to play in the backyard and pretend we were in Carealot Now after so many years have passed I get to watch the movie with"

text = clean_text(text)

text = tfidf.transform([text])

review_len_category_b = 1
review_len_category_c = 0
review_len_category_d = 0
review_len_category_e = 0

my_pred = np.concatenate((text.toarray(), np.array([[review_len_category_b, review_len_category_c, review_len_category_d, review_len_category_e]])), axis = 1)

# predict
model.predict(my_pred)[0]

1

In [255]:
print(X_train_tfidf.shape)
print(X_test_tfidf.shape)
print(text.shape)
# print(category_encoded.shape)
# print(combined_features.shape)

(3500, 41791)
(1500, 41791)
(1, 41791)


In [304]:
df["review_len_category_e"].value_counts()

review_len_category_e
0    4797
1     203
Name: count, dtype: int64

In [313]:
# df[(df["sentiment"] == 0) & (df["review_len_category_b"] == 1) & (df["review_len_category_c"] == 0) & (df["review_len_category_d"] == 0) & (df["review_len_category_e"] == 0)].head()

df[df["sentiment"] == 1].head(30)

Unnamed: 0,review,sentiment,review_len_category_b,review_len_category_c,review_len_category_d,review_len_category_e
0,This is like something I have NEVER seen befor...,1,0,1,0,0
2,Long ago and far away they knew how to make a ...,1,0,1,0,0
3,I saw this pilot when it was first shown and I...,1,0,1,0,0
4,This was my first introduction to the world of...,1,0,1,0,0
5,I like cheap perfume better it doesnt last as ...,1,0,1,0,0
8,If youre interested in learning about the real...,1,1,0,0,0
9,This movie was fun if all over the boardbr br ...,1,0,1,0,0
11,This is a truly remarkable piece of cinematic ...,1,0,1,0,0
13,What would it be like to be accused of being a...,1,1,0,0,0
14,I was hooked from beginning to end Great horro...,1,0,0,1,0


In [33]:
# # sample prediction
# text = "This movie is the best movie I have ever seen. Acting was awesome. \
#         Visual effects were awesome. Storyline was awesome. I loved it. 10/10"
# 
# text = clean_text(text)
# 
# text = tfidf.transform([text])
# 
# model.predict(text)[0]

1

In [34]:
# # sample prediction 2
# text = "The movie wasn't great. Boring story, flat characters, and it just didn't click. Waste of time, to be honest."
# 
# text = clean_text(text)
# 
# text = tfidf.transform([text])
# 
# model.predict(text)[0]

0