In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import string
import nltk
import nltk.corpus
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [2]:
data = pd.read_csv("train.csv")

# Data Understanding

In [3]:
data.head()

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


In [4]:
data.shape

(40000, 2)

In [5]:
data.sample(10)

Unnamed: 0,text,label
24654,I only bought this DVD because it was dirt che...,0
3283,This is a romantic comedy with the emphasis on...,1
8366,This is a wonderful comedy short--one of Keato...,1
35069,definitely the best game for N64 ever. I most ...,1
16936,I'm both amused and disgusted by the people wh...,0
2327,I was very interested in seeing this movie des...,0
7787,"ONCE UPON A TIME, there were different types o...",1
9665,I had read many good things about this adaptat...,0
12264,I first remember seeing this one back in the 7...,1
36561,My complaints here concern the movie's pacing ...,0


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    40000 non-null  object
 1   label   40000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 625.1+ KB


In [7]:
data.isnull().mean()*100

text     0.0
label    0.0
dtype: float64

In [8]:
data.duplicated().sum()

277

In [9]:
data[data["text"].duplicated()].head()

Unnamed: 0,text,label
2164,"I find it rather useless to comment on this ""m...",0
3236,An absolutely atrocious adaptation of the wond...,0
3428,When I first saw this film it was not an impre...,0
3652,Before I watched this tv movie I did not know ...,1
3787,What a clunker!<br /><br />It MUST have been m...,0


In [10]:
data.drop_duplicates(inplace=True)

# Data Cleaning / Preprocessing

In [11]:
data["text"][0]

'I grew up (b. 1965) watching and loving the Thunderbirds. All my mates at school watched. We played "Thunderbirds" before school, during lunch and after school. We all wanted to be Virgil or Scott. No one wanted to be Alan. Counting down from 5 became an art form. I took my children to see the movie hoping they would get a glimpse of what I loved as a child. How bitterly disappointing. The only high point was the snappy theme tune. Not that it could compare with the original score of the Thunderbirds. Thankfully early Saturday mornings one television channel still plays reruns of the series Gerry Anderson and his wife created. Jonatha Frakes should hand in his directors chair, his version was completely hopeless. A waste of film. Utter rubbish. A CGI remake may be acceptable but replacing marionettes with Homo sapiens subsp. sapiens was a huge error of judgment.'

In [12]:
data["label"].value_counts()

1    19908
0    19815
Name: label, dtype: int64

In [13]:
#TOKENISATION AND REMOVING PUNCTUATIONS

def preprocessing1(text):
    tokens = word_tokenize(text)
    
    punctuations = string.punctuation
    txt = []
    for i in tokens:
        if i not in punctuations:
            txt.append(i)
    return txt

In [14]:
data["text"] = data["text"].apply(preprocessing1)

In [15]:
data["text"][0]

['I',
 'grew',
 'up',
 'b',
 '1965',
 'watching',
 'and',
 'loving',
 'the',
 'Thunderbirds',
 'All',
 'my',
 'mates',
 'at',
 'school',
 'watched',
 'We',
 'played',
 '``',
 'Thunderbirds',
 "''",
 'before',
 'school',
 'during',
 'lunch',
 'and',
 'after',
 'school',
 'We',
 'all',
 'wanted',
 'to',
 'be',
 'Virgil',
 'or',
 'Scott',
 'No',
 'one',
 'wanted',
 'to',
 'be',
 'Alan',
 'Counting',
 'down',
 'from',
 '5',
 'became',
 'an',
 'art',
 'form',
 'I',
 'took',
 'my',
 'children',
 'to',
 'see',
 'the',
 'movie',
 'hoping',
 'they',
 'would',
 'get',
 'a',
 'glimpse',
 'of',
 'what',
 'I',
 'loved',
 'as',
 'a',
 'child',
 'How',
 'bitterly',
 'disappointing',
 'The',
 'only',
 'high',
 'point',
 'was',
 'the',
 'snappy',
 'theme',
 'tune',
 'Not',
 'that',
 'it',
 'could',
 'compare',
 'with',
 'the',
 'original',
 'score',
 'of',
 'the',
 'Thunderbirds',
 'Thankfully',
 'early',
 'Saturday',
 'mornings',
 'one',
 'television',
 'channel',
 'still',
 'plays',
 'reruns',
 'of',

In [16]:
#REMOVING STOPWORDS & CONVERTING TO LOWERCASE

def preprocessing2(text):
    stopword = stopwords.words("english")
    ntxt = []
    for j in text:
        if j not in stopword:
            ntxt.append(j.lower())
    return ntxt

In [17]:
data["text"] = data["text"].apply(preprocessing2)

In [18]:
data["text"][0]

['i',
 'grew',
 'b',
 '1965',
 'watching',
 'loving',
 'thunderbirds',
 'all',
 'mates',
 'school',
 'watched',
 'we',
 'played',
 '``',
 'thunderbirds',
 "''",
 'school',
 'lunch',
 'school',
 'we',
 'wanted',
 'virgil',
 'scott',
 'no',
 'one',
 'wanted',
 'alan',
 'counting',
 '5',
 'became',
 'art',
 'form',
 'i',
 'took',
 'children',
 'see',
 'movie',
 'hoping',
 'would',
 'get',
 'glimpse',
 'i',
 'loved',
 'child',
 'how',
 'bitterly',
 'disappointing',
 'the',
 'high',
 'point',
 'snappy',
 'theme',
 'tune',
 'not',
 'could',
 'compare',
 'original',
 'score',
 'thunderbirds',
 'thankfully',
 'early',
 'saturday',
 'mornings',
 'one',
 'television',
 'channel',
 'still',
 'plays',
 'reruns',
 'series',
 'gerry',
 'anderson',
 'wife',
 'created',
 'jonatha',
 'frakes',
 'hand',
 'directors',
 'chair',
 'version',
 'completely',
 'hopeless',
 'a',
 'waste',
 'film',
 'utter',
 'rubbish',
 'a',
 'cgi',
 'remake',
 'may',
 'acceptable',
 'replacing',
 'marionettes',
 'homo',
 'sap

In [19]:
#STEMMING

def preprocessing3(text):
    stem = []
    Stemmer = PorterStemmer()
    for i in text:
        stem.append(Stemmer.stem(i))
    return " ".join(stem)

In [20]:
data["text"] = data["text"].apply(preprocessing3)

In [21]:
data["text"][0]

"i grew b 1965 watch love thunderbird all mate school watch we play `` thunderbird '' school lunch school we want virgil scott no one want alan count 5 becam art form i took children see movi hope would get glimps i love child how bitterli disappoint the high point snappi theme tune not could compar origin score thunderbird thank earli saturday morn one televis channel still play rerun seri gerri anderson wife creat jonatha frake hand director chair version complet hopeless a wast film utter rubbish a cgi remak may accept replac marionett homo sapien subsp sapien huge error judgment"

In [22]:
# REMOVING HTML TAGS

def preprocessing4(text):
    pattern = re.compile("<.*?>")
    return pattern.sub(r'',text)

In [23]:
data["text"] = data["text"].apply(preprocessing4)

In [24]:
data.head(10)

Unnamed: 0,text,label
0,i grew b 1965 watch love thunderbird all mate ...,0
1,when i put movi dvd player sat coke chip i exp...,0
2,whi peopl know particular time past like feel ...,0
3,even though i great interest biblic movi i bor...,0
4,im die hard dad armi fan noth ever chang i got...,1
5,a terribl movi everyon said what made laugh ca...,0
6,final watch shock movi last night disturb mind...,1
7,i caught film azn cabl it sound like would goo...,0
8,it may remak 1987 autumn 's tale eleven year d...,1
9,my super ex girlfriend turn pleasant surpris i...,1


In [25]:
data.isna().sum()

text     0
label    0
dtype: int64

# MODEL BUILDING

In [26]:
x = data["text"]
y = data["label"]

In [27]:
tfidf = TfidfVectorizer()
x = tfidf.fit_transform(x)

In [28]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=None)

**Logistic Regression**

In [29]:
logr = LogisticRegression()
model = logr.fit(x_train,y_train)
pred = model.predict(x_test)
accuracy_score(y_test, pred)*100

88.60918816865954

In [30]:
score = []
for i in range(20):
    x = data["text"]
    y = data["label"]
    x = tfidf.fit_transform(x)
    
    x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=i)
    
    logr = LogisticRegression()
    logr.fit(x_train, y_train)
    pred = logr.predict(x_test)
    score.append(accuracy_score(y_test,pred)*100)
print("i= ",np.argmax(score))
print(score[np.argmax(score)])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

i=  19
89.56576463184392


In [31]:
logr = LogisticRegression()
x = data["text"]
y = data["label"]
x = tfidf.fit_transform(x)
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=np.argmax(score))

model1 = logr.fit(x_train,y_train)
pred = model1.predict(x_test)
accuracy_score(y_test, pred)*100

89.56576463184392

In [32]:
# Training Data Accuracy

pred = model1.predict(x_train)
accuracy_score(y_train, pred)*100     #Generalized

92.65214928566932

In [33]:
#CROSS VALIDATION TO CHECK WHETHER THE MODEL IS GENERALIZED OR NOT

mod1 = cross_val_score(LogisticRegression(), x_train, y_train, cv=10)
print(mod1)
print(np.round(np.mean(mod1),2)*100)

[0.89175582 0.88451857 0.88955318 0.89049717 0.89207048 0.88042794
 0.88105727 0.89395846 0.88857413 0.88668555]
89.0


**Decision Tree Classifier**

In [34]:
dtc = DecisionTreeClassifier(random_state=4)
model = dtc.fit(x_train,y_train)
pred = model.predict(x_test)
accuracy_score(y_test, pred)*100

71.16425424795469

In [35]:
dtc = DecisionTreeClassifier(random_state=4)
x = data["text"]
y = data["label"]
x = tfidf.fit_transform(x)
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=19)

model2 = dtc.fit(x_train,y_train)
pred = model2.predict(x_test)
accuracy_score(y_test, pred)*100

71.16425424795469

In [36]:
# Training Data Accuracy

pred = model2.predict(x_train)
accuracy_score(y_train, pred)*100     #Overfitting

100.0

In [37]:
#CROSS VALIDATION TO CHECK WHETHER THE MODEL IS GENERALIZED OR NOT

mod2 = cross_val_score(DecisionTreeClassifier(random_state=4), x_train, y_train, cv=5)
print(mod2)
print(np.round(np.mean(mod2),2)*100)

[0.71585903 0.70232851 0.70736312 0.71738788 0.7202203 ]
71.0


**RandomForest Classifier**

In [38]:
rfc = RandomForestClassifier(random_state=4)
model = rfc.fit(x_train,y_train)
pred = model.predict(x_test)
accuracy_score(y_test, pred)*100

86.1296412838263

In [39]:
rfc = RandomForestClassifier()
x = data["text"]
y = data["label"]
x = tfidf.fit_transform(x)
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=19)

model3 = rfc.fit(x_train,y_train)
pred = model3.predict(x_test)
accuracy_score(y_test, pred)*100

85.43738200125865

In [40]:
# Training Data Accuracy

pred = model3.predict(x_train)
accuracy_score(y_train, pred)*100     #Overfitting

100.0

In [41]:
#CROSS VALIDATION TO CHECK WHETHER THE MODEL IS GENERALIZED OR NOT

mod3 = cross_val_score(RandomForestClassifier(random_state=4), x_train, y_train, cv=3)
print(mod3)
print(np.round(np.mean(mod3),2)*100)

[0.84319834 0.83895025 0.84072885]
84.0


**Support Vector Classifier**

In [43]:
svc = SVC(kernel="linear",tol=0.1)
model = svc.fit(x_train,y_train)
pred = model.predict(x_test)
accuracy_score(y_test, pred)*100

89.70421648835746

In [45]:
svc = SVC(kernel="linear",tol=0.1)
x = data["text"]
y = data["label"]
x = tfidf.fit_transform(x)
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=np.argmax(score))

model4 = svc.fit(x_train,y_train)
pred = model4.predict(x_test)
accuracy_score(y_test, pred)*100

89.70421648835746

In [46]:
# Training Data Accuracy

pred = model4.predict(x_train)
accuracy_score(y_train, pred)*100     #Underfitting

95.2766064572975

In [47]:
#CROSS VALIDATION TO CHECK WHETHER THE MODEL IS GENERALIZED OR NOT

mod4 = cross_val_score(SVC(kernel="linear",tol=0.1), x_train, y_train, cv=4)
print(mod4)
print(np.round(np.mean(mod4),2)*100)

[0.88999371 0.89011957 0.88381168 0.88783988]
89.0
