In [1]:
#Load the libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from wordcloud import WordCloud, STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from bs4 import BeautifulSoup
import re,string,unicodedata
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer,WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from textblob import TextBlob
from textblob import Word
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

import os
import warnings
warnings.filterwarnings('ignore')

In [6]:
imdbDF = pd.read_csv('data/IMDB.csv')
imdbDF.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [7]:
imdbDF.shape

(50000, 2)

In [8]:
imdbDF.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,negative
freq,5,25000


In [9]:
imdbDF['sentiment'].value_counts()

negative    25000
positive    25000
Name: sentiment, dtype: int64

In [10]:
# Clean data
def cleanTxt(txt):
    # removing html
    txt = BeautifulSoup(txt, "html.parser")
    txt = txt.get_text()
    # removing square brackets
    txt = re.sub('\[[^]]*\]', '', txt)
    # removing special characters
    pattern=r'[^a-zA-z0-9\s]'
    txt = re.sub(pattern, '', txt)
    return txt

# Apply function on review column
imdbDF['review'] = imdbDF['review'].apply(cleanTxt)

In [7]:
# Stemming

def stemming(txt):
    ps = nltk.porter.PorterStemmer()
    txt = ' '.join([ps.stem(word) for word in txt.split()])
    return txt

imdbDF['review'] = imdbDF['review'].apply(stemming)

In [52]:
# Removing stopwords
tokenizer = ToktokTokenizer()
# nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')

In [53]:
def remove_stopwords(txt):
    tokens = tokenizer.tokenize(txt)
    tokens = [token.strip() for token in tokens]
    filtered_tokens = [token for token in tokens if token.lower() not in stopwords]
    txt = ' '.join(filtered_tokens)
    return txt

imdbDF['review'] = imdbDF['review'].apply(remove_stopwords)

In [11]:
imdbDF.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production The filming tech...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically theres a family where a little boy J...,negative
4,Petter Matteis Love in the Time of Money is a ...,positive


In [55]:
imdbDFProcessed = imdbDF.copy()

In [56]:
lb = LabelBinarizer()
imdbDFProcessed['sentiment'] = lb.fit_transform(imdbDFProcessed['sentiment'])
imdbDFProcessed.head()

Unnamed: 0,review,sentiment
0,One reviewers mentioned watching 1 Oz episode ...,1
1,wonderful little production filming technique ...,1
2,thought wonderful way spend time hot summer we...,1
3,Basically theres family little boy Jake thinks...,0
4,Petter Matteis Love Time Money visually stunni...,1


In [57]:
imdbDFProcessed.iloc[0]

review       One reviewers mentioned watching 1 Oz episode ...
sentiment                                                    1
Name: 0, dtype: object

In [58]:
X = imdbDFProcessed['review']
y = imdbDFProcessed["sentiment"]

In [59]:
type(X)

pandas.core.series.Series

In [60]:
# Count vectorizer for bag of words
cv = CountVectorizer(min_df=0, max_df=1, binary=False, ngram_range=(1,3))
# cv = CountVectorizer(stop_words='english', ngram_range=(1,2), min_df=0.1, max_df=0.7, max_features=100)
Xcv = cv.fit_transform(X)

In [61]:
Xcv.shape

(50000, 8132309)

In [17]:
Xcv.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [18]:
type(y)

pandas.core.series.Series

In [62]:
X_train, X_test, y_train, y_test = train_test_split(Xcv, y, test_size=0.3, random_state=4)

In [66]:
# training the model
lr = LogisticRegression(penalty='l2', C=1, random_state=0)

# Fitting the model for Bag of words
lr.fit(X_train, y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [68]:
y_predict = lr.predict(X_test)

In [69]:
score = accuracy_score(y_test, y_predict)
score

0.5036666666666667