In [1]:
#Load the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer, WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from wordcloud import WordCloud, STOPWORDS
from bs4 import BeautifulSoup

from textblob import TextBlob
from textblob import Word

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import re, string, unicodedata, os, warnings

warnings.filterwarnings('ignore')

In [2]:
imdbDF = pd.read_csv('data/IMDB.csv')
imdbDF.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
imdbDF.shape

(50000, 2)

In [4]:
imdbDF.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [5]:
imdbDF['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [6]:
# Clean data
def cleanTxt(txt):
    # removing html
    txt = BeautifulSoup(txt, "html.parser")
    txt = txt.get_text()
    # removing square brackets
    txt = re.sub('\[[^]]*\]', '', txt)
    # removing special characters
    pattern=r'[^a-zA-z0-9\s]'
    txt = re.sub(pattern, '', txt)
    return txt

# Apply function on review column
imdbDF['review'] = imdbDF['review'].apply(cleanTxt)

In [7]:
# Stemming

def stemming(txt):
    ps = nltk.porter.PorterStemmer()
    txt = ' '.join([ps.stem(word) for word in txt.split()])
    return txt

imdbDF['review'] = imdbDF['review'].apply(stemming)

In [8]:
# Removing stopwords
tokenizer = ToktokTokenizer()

stopwords = nltk.corpus.stopwords.words('english')

In [9]:
def remove_stopwords(txt):
    tokens = tokenizer.tokenize(txt)
    tokens = [token.strip() for token in tokens]
    filtered_tokens = [token for token in tokens if token.lower() not in stopwords]
    txt = ' '.join(filtered_tokens)
    return txt

imdbDF['review'] = imdbDF['review'].apply(remove_stopwords)

In [10]:
imdbDF.head()

Unnamed: 0,review,sentiment
0,one review ha mention watch 1 Oz episod youll ...,positive
1,wonder littl product film techniqu veri unassu...,positive
2,thought thi wa wonder way spend time hot summe...,positive
3,basic famili littl boy jake think zombi hi clo...,negative
4,petter mattei love time money visual stun film...,positive


In [11]:
imdbDFProcessed = imdbDF.copy()

In [12]:
lb = LabelBinarizer()
imdbDFProcessed['sentiment'] = lb.fit_transform(imdbDFProcessed['sentiment'])
imdbDFProcessed.head()

Unnamed: 0,review,sentiment
0,one review ha mention watch 1 Oz episod youll ...,1
1,wonder littl product film techniqu veri unassu...,1
2,thought thi wa wonder way spend time hot summe...,1
3,basic famili littl boy jake think zombi hi clo...,0
4,petter mattei love time money visual stun film...,1


In [13]:
imdbDFProcessed.iloc[0]

review       one review ha mention watch 1 Oz episod youll ...
sentiment                                                    1
Name: 0, dtype: object

In [14]:
X = imdbDFProcessed['review']
y = imdbDFProcessed["sentiment"]

In [15]:
type(X)

pandas.core.series.Series

In [16]:
# Count vectorizer for bag of words
cv = CountVectorizer(min_df=0, max_df=1, binary=False, ngram_range=(1,3))
# cv = CountVectorizer(stop_words='english', ngram_range=(1,2), min_df=0.1, max_df=0.7, max_features=100)
Xcv = cv.fit_transform(X)

In [17]:
Xcv.shape

(50000, 7528779)

In [18]:
Xcv.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [19]:
type(y)

pandas.core.series.Series

In [20]:
X_train, X_test, y_train, y_test = train_test_split(Xcv, y, test_size=0.3, random_state=4)

In [21]:
# training the model
lr = LogisticRegression(penalty='l2', C=1, random_state=0)

# Fitting the model for Bag of words
lr.fit(X_train, y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [22]:
y_predict = lr.predict(X_test)

In [23]:
score = accuracy_score(y_test, y_predict)
score

0.5036666666666667