In [1]:
import pandas as pd
import numpy as np

In [2]:
test = pd.read_csv("test.csv")
train = pd.read_csv("train.csv")
train.head()

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


In [3]:
train.shape

(40000, 2)

In [4]:
train.isnull().mean()*100

text     0.0
label    0.0
dtype: float64

In [5]:
train.duplicated().sum()

277

In [6]:
train[train["text"].duplicated()].head()

Unnamed: 0,text,label
2164,"I find it rather useless to comment on this ""m...",0
3236,An absolutely atrocious adaptation of the wond...,0
3428,When I first saw this film it was not an impre...,0
3652,Before I watched this tv movie I did not know ...,1
3787,What a clunker!<br /><br />It MUST have been m...,0


In [7]:
train.drop_duplicates(inplace=True)

In [8]:
train.shape

(39723, 2)

In [9]:
train["label"].value_counts()

1    19908
0    19815
Name: label, dtype: int64

# TEXT PREPROCESSING

In [10]:
import re
from nltk.tokenize import word_tokenize
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

In [11]:
train["text"][0]

'I grew up (b. 1965) watching and loving the Thunderbirds. All my mates at school watched. We played "Thunderbirds" before school, during lunch and after school. We all wanted to be Virgil or Scott. No one wanted to be Alan. Counting down from 5 became an art form. I took my children to see the movie hoping they would get a glimpse of what I loved as a child. How bitterly disappointing. The only high point was the snappy theme tune. Not that it could compare with the original score of the Thunderbirds. Thankfully early Saturday mornings one television channel still plays reruns of the series Gerry Anderson and his wife created. Jonatha Frakes should hand in his directors chair, his version was completely hopeless. A waste of film. Utter rubbish. A CGI remake may be acceptable but replacing marionettes with Homo sapiens subsp. sapiens was a huge error of judgment.'

In [12]:
# REMOVING HTML TAGS

def remove_html_tags(data):
    pattern = re.compile("<.*?>")
    return pattern.sub(r'',data)

train["text"] = train["text"].apply(remove_html_tags)

In [13]:
# CONVERTING TO LOWERCASE

train["text"] = train["text"].apply(lambda x:x.lower())

In [14]:
# TOKENIZATION

#nltk.download('punkt')
def tokens(data):
    token = word_tokenize(data)
    return token

train["text"] = train["text"].apply(tokens)

In [15]:
train.head()

Unnamed: 0,text,label
0,"[i, grew, up, (, b, ., 1965, ), watching, and,...",0
1,"[when, i, put, this, movie, in, my, dvd, playe...",0
2,"[why, do, people, who, do, not, know, what, a,...",0
3,"[even, though, i, have, great, interest, in, b...",0
4,"[im, a, die, hard, dads, army, fan, and, nothi...",1


In [16]:
# REMOVING PUNCTUATIONS

punctuations = string.punctuation

def remove_punc(data):
    without_punc = []
    for i in data:
        if i not in punctuations:
            without_punc.append(i)
        
    return without_punc

train["text"] = train["text"].apply(remove_punc)

In [17]:
# REMOVING STOPWORDS

#nltk.download('stopwords')
stopwords = stopwords.words("english")

def remove_stopwords(data):
    no_stopwords = []
    for i in data:
        if i not in stopwords:
            no_stopwords.append(i)
    return no_stopwords

train["text"] = train["text"].apply(remove_stopwords)

In [18]:
train.head()

Unnamed: 0,text,label
0,"[grew, b, 1965, watching, loving, thunderbirds...",0
1,"[put, movie, dvd, player, sat, coke, chips, ex...",0
2,"[people, know, particular, time, past, like, f...",0
3,"[even, though, great, interest, biblical, movi...",0
4,"[im, die, hard, dads, army, fan, nothing, ever...",1


In [19]:
# LEMMETIZATION

#nltk.download('wordnet')
leme = WordNetLemmatizer()
#pst = PorterStemmer()

def lemet(data):
    lem = []
    for i in data:
        lem.append(leme.lemmatize(i))
    return " ".join(lem)
    

train["text"] = train["text"].apply(lemet)

In [20]:
train.head()

Unnamed: 0,text,label
0,grew b 1965 watching loving thunderbird mate s...,0
1,put movie dvd player sat coke chip expectation...,0
2,people know particular time past like feel nee...,0
3,even though great interest biblical movie bore...,0
4,im die hard dad army fan nothing ever change g...,1


In [21]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39723 entries, 0 to 39999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    39723 non-null  object
 1   label   39723 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.9+ MB


In [22]:
train.isna().sum()

text     0
label    0
dtype: int64

In [23]:
#train.to_csv("Cleaned_Review")
import pandas as pd
import numpy as np
train_c = pd.read_csv("Cleaned_Review")

# MODEL & TEXT VECTORIZATION

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, f1_score

In [25]:
x = train_c["text"]
y = train_c["label"]

In [26]:
tfid = TfidfVectorizer()
x = tfid.fit_transform(x)

In [27]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=25)
model = RandomForestClassifier()
model.fit(x_train,y_train)
pred = model.predict(x_test)
accuracy_score(y_test,pred)*100

85.92825676526117

In [28]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=None)
model3 = SVC(kernel="linear",tol=0.1)
model3.fit(x_train,y_train)
pred = model3.predict(x_test)
accuracy_score(y_test,pred)*100

89.66645689112649