In [1]:
import numpy as np
import pandas as pd
import nltk

## Dataset link
https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

In [2]:
imdb_data = pd.read_csv("IMDB Dataset.csv",nrows=20000)

In [3]:
imdb_data.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [4]:
imdb_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     20000 non-null  object
 1   sentiment  20000 non-null  object
dtypes: object(2)
memory usage: 312.6+ KB


In [5]:
imdb_data.shape

(20000, 2)

In [6]:
feature = imdb_data['review']
target = imdb_data['sentiment']

In [7]:
feature.shape

(20000,)

In [8]:
target.shape

(20000,)

In [9]:
target.value_counts()

negative    10097
positive     9903
Name: sentiment, dtype: int64

In [10]:
import seaborn as sns

In [11]:
import re

In [12]:
def data_cleaning(text):
    clean_text = re.sub('[^A-Za-z]+'," ",text)
    return clean_text

In [13]:
imdb_data['review'] = imdb_data['review'].apply(lambda text : data_cleaning(text))

In [14]:
imdb_data['review'] = imdb_data['review'].apply(lambda text : text.lower())

In [15]:
imdb_data.head(15)

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production br br the filmin...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there s a family where a little boy ...,negative
4,petter mattei s love in the time of money is a...,positive
5,probably my all time favorite movie a story of...,positive
6,i sure would like to see a resurrection of a u...,positive
7,this show was an amazing fresh innovative idea...,negative
8,encouraged by the positive comments about this...,negative
9,if you like original gut wrenching laughter yo...,positive


In [16]:
imdb_data['token_text'] = imdb_data['review'].apply(lambda text : text.split())

In [17]:
imdb_data.head(10)

Unnamed: 0,review,sentiment,token_text
0,one of the other reviewers has mentioned that ...,positive,"[one, of, the, other, reviewers, has, mentione..."
1,a wonderful little production br br the filmin...,positive,"[a, wonderful, little, production, br, br, the..."
2,i thought this was a wonderful way to spend ti...,positive,"[i, thought, this, was, a, wonderful, way, to,..."
3,basically there s a family where a little boy ...,negative,"[basically, there, s, a, family, where, a, lit..."
4,petter mattei s love in the time of money is a...,positive,"[petter, mattei, s, love, in, the, time, of, m..."
5,probably my all time favorite movie a story of...,positive,"[probably, my, all, time, favorite, movie, a, ..."
6,i sure would like to see a resurrection of a u...,positive,"[i, sure, would, like, to, see, a, resurrectio..."
7,this show was an amazing fresh innovative idea...,negative,"[this, show, was, an, amazing, fresh, innovati..."
8,encouraged by the positive comments about this...,negative,"[encouraged, by, the, positive, comments, abou..."
9,if you like original gut wrenching laughter yo...,positive,"[if, you, like, original, gut, wrenching, laug..."


In [18]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
from nltk.corpus import stopwords

In [20]:
stop_words = stopwords.words('english')

In [21]:
imdb_data['token_text'][0]

['one',
 'of',
 'the',
 'other',
 'reviewers',
 'has',
 'mentioned',
 'that',
 'after',
 'watching',
 'just',
 'oz',
 'episode',
 'you',
 'll',
 'be',
 'hooked',
 'they',
 'are',
 'right',
 'as',
 'this',
 'is',
 'exactly',
 'what',
 'happened',
 'with',
 'me',
 'br',
 'br',
 'the',
 'first',
 'thing',
 'that',
 'struck',
 'me',
 'about',
 'oz',
 'was',
 'its',
 'brutality',
 'and',
 'unflinching',
 'scenes',
 'of',
 'violence',
 'which',
 'set',
 'in',
 'right',
 'from',
 'the',
 'word',
 'go',
 'trust',
 'me',
 'this',
 'is',
 'not',
 'a',
 'show',
 'for',
 'the',
 'faint',
 'hearted',
 'or',
 'timid',
 'this',
 'show',
 'pulls',
 'no',
 'punches',
 'with',
 'regards',
 'to',
 'drugs',
 'sex',
 'or',
 'violence',
 'its',
 'is',
 'hardcore',
 'in',
 'the',
 'classic',
 'use',
 'of',
 'the',
 'word',
 'br',
 'br',
 'it',
 'is',
 'called',
 'oz',
 'as',
 'that',
 'is',
 'the',
 'nickname',
 'given',
 'to',
 'the',
 'oswald',
 'maximum',
 'security',
 'state',
 'penitentary',
 'it',
 'fo

In [22]:
def remove_stopwords(token_text):
    words = []
    for word in token_text:
        if word not in stop_words:
            words.append(word)
            
    return words

In [23]:
imdb_data['token_text'] = imdb_data['token_text'].apply(lambda token_text : remove_stopwords(token_text))

In [24]:
imdb_data['token_text'] = imdb_data['token_text'].apply(lambda token_text:[word for word in token_text if word not in stop_words])

In [25]:
from nltk.stem import PorterStemmer    
ps = PorterStemmer()

In [26]:
def porter_stemming(token_text):
    
    stem_words=[]
    for word in token_text:
        stem_word = ps.stem(word)
        stem_words.append(stem_word)
        
    return stem_words

In [27]:
imdb_data['stem_text'] = imdb_data['token_text'].apply(lambda token_text : porter_stemming(token_text))

In [28]:
imdb_data['stem_text'] = imdb_data['token_text'].apply(lambda text: [ps.stem(word) for word in text]) # stemming
imdb_data.head()

Unnamed: 0,review,sentiment,token_text,stem_text
0,one of the other reviewers has mentioned that ...,positive,"[one, reviewers, mentioned, watching, oz, epis...","[one, review, mention, watch, oz, episod, hook..."
1,a wonderful little production br br the filmin...,positive,"[wonderful, little, production, br, br, filmin...","[wonder, littl, product, br, br, film, techniq..."
2,i thought this was a wonderful way to spend ti...,positive,"[thought, wonderful, way, spend, time, hot, su...","[thought, wonder, way, spend, time, hot, summe..."
3,basically there s a family where a little boy ...,negative,"[basically, family, little, boy, jake, thinks,...","[basic, famili, littl, boy, jake, think, zombi..."
4,petter mattei s love in the time of money is a...,positive,"[petter, mattei, love, time, money, visually, ...","[petter, mattei, love, time, money, visual, st..."


In [29]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
wl=WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [30]:
def wordnet_lemma(token_text):
    lemma_words=[]
    for word in token_text:
        lemma_word = wl.lemmatize(word,pos='v')
        lemma_words.append(lemma_word)
        
    return lemma_words

In [31]:
imdb_data['lemma_text'] = imdb_data['token_text'].apply(lambda token_text : wordnet_lemma(token_text))

In [32]:
imdb_data['lemma_text'] = imdb_data['token_text'].apply(lambda token_text : [wl.lemmatize(word,pos='v') for word in token_text])

In [33]:
imdb_data['stem_text'] = imdb_data['stem_text'].apply(lambda x: ' '.join(x))

In [34]:
imdb_data.head(10)

Unnamed: 0,review,sentiment,token_text,stem_text,lemma_text
0,one of the other reviewers has mentioned that ...,positive,"[one, reviewers, mentioned, watching, oz, epis...",one review mention watch oz episod hook right ...,"[one, reviewers, mention, watch, oz, episode, ..."
1,a wonderful little production br br the filmin...,positive,"[wonderful, little, production, br, br, filmin...",wonder littl product br br film techniqu unass...,"[wonderful, little, production, br, br, film, ..."
2,i thought this was a wonderful way to spend ti...,positive,"[thought, wonderful, way, spend, time, hot, su...",thought wonder way spend time hot summer weeke...,"[think, wonderful, way, spend, time, hot, summ..."
3,basically there s a family where a little boy ...,negative,"[basically, family, little, boy, jake, thinks,...",basic famili littl boy jake think zombi closet...,"[basically, family, little, boy, jake, think, ..."
4,petter mattei s love in the time of money is a...,positive,"[petter, mattei, love, time, money, visually, ...",petter mattei love time money visual stun film...,"[petter, mattei, love, time, money, visually, ..."
5,probably my all time favorite movie a story of...,positive,"[probably, time, favorite, movie, story, selfl...",probabl time favorit movi stori selfless sacri...,"[probably, time, favorite, movie, story, selfl..."
6,i sure would like to see a resurrection of a u...,positive,"[sure, would, like, see, resurrection, dated, ...",sure would like see resurrect date seahunt ser...,"[sure, would, like, see, resurrection, date, s..."
7,this show was an amazing fresh innovative idea...,negative,"[show, amazing, fresh, innovative, idea, first...",show amaz fresh innov idea first air first yea...,"[show, amaze, fresh, innovative, idea, first, ..."
8,encouraged by the positive comments about this...,negative,"[encouraged, positive, comments, film, looking...",encourag posit comment film look forward watch...,"[encourage, positive, comment, film, look, for..."
9,if you like original gut wrenching laughter yo...,positive,"[like, original, gut, wrenching, laughter, lik...",like origin gut wrench laughter like movi youn...,"[like, original, gut, wrench, laughter, like, ..."


In [49]:
imdb_data['cleaned_text'] = imdb_data['lemma_text'].apply(lambda text: ' '.join(text))

In [50]:
from sklearn.feature_extraction.text import CountVectorizer
imdb_vectorizer = CountVectorizer()

imdb_features   = imdb_vectorizer.fit_transform(imdb_data['cleaned_text'])
imdb_features.get_shape()

(20000, 57892)

In [51]:
X = imdb_features    #### Features
y = imdb_data['sentiment']  #### Target

In [52]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 47, test_size = 0.25)

In [53]:
print('Training set :', X_train.shape)
print('Testing set :', X_test.shape)

Training set : (15000, 57892)
Testing set : (5000, 57892)


In [54]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(criterion = 'entropy')

In [55]:
clf.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy')

In [56]:
y_pred =  clf.predict(X_test)

In [57]:
from sklearn.metrics import accuracy_score
print('Accuracy Score on train data: ', accuracy_score(y_true=y_train, y_pred=clf.predict(X_train)))
print('Accuracy Score on test data: ', accuracy_score(y_true=y_test, y_pred=y_pred))

Accuracy Score on train data:  1.0
Accuracy Score on test data:  0.713
