---
## *Text Classification*
---

In [12]:

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import confusion_matrix

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Omar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


---

In [13]:
# Load the IMDB reviews. A forward slash is used as the path separator: it works
# on Windows, Linux and macOS alike, and avoids the invalid "\I" escape sequence
# that a backslash inside a normal (non-raw) string literal produces.
df = pd.read_csv('Data_Set/IMDB_Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [14]:
# Dimensions of the loaded dataset as (rows, columns).
df.shape

(50000, 2)

In [15]:
# Column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [16]:
# Keep only the first 10000 rows to save time
df=df.iloc[:10000]
df.shape

(10000, 2)

---
*Data Cleaning :*

In [17]:
# Data-quality summary: total number of missing cells across all columns and
# number of rows that are exact duplicates of an earlier row.
total_missing = df.isnull().sum().sum()
total_duplicates = df.duplicated().sum()
print('isnull : ', total_missing)
print('duplicated :', total_duplicates)

isnull :  0
duplicated : 17


In [18]:
# Keep the first occurrence of every row and discard its exact repeats, then
# confirm that no duplicated rows remain.
is_repeat = df.duplicated()
df = df[~is_repeat]
print('duplicated :', df.duplicated().sum())

duplicated : 0


---
*Basic Preprocessing :* 
* *Remove HTML tags*
* *Lowercase*  
* *Remove stopwords*  

In [19]:
## Remove html_tags
import re
# Compiled once instead of on every call. DOTALL lets "." cross line breaks so
# a tag that is split over several lines is still matched and removed.
_HTML_TAG_RE = re.compile(r'<.*?>', re.DOTALL)


def remove_html_tag(text):
    """Return ``text`` with every ``<...>`` HTML tag removed.

    Parameters
    ----------
    text : str
        Raw review text that may contain markup such as ``<br />``.

    Returns
    -------
    str
        The input with each shortest ``<...>`` span deleted. Nothing is
        inserted in place of a tag, so the text on either side is joined
        directly.
    """
    return _HTML_TAG_RE.sub('', text)

# Strip HTML markup from every review, then preview the cleaned text.
reviews_without_html = df["review"].apply(remove_html_tag)
df["review"] = reviews_without_html
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [20]:
## Lower case
# Use the vectorized .str accessor instead of a per-row Python lambda; it is
# the idiomatic pandas form and passes missing values through instead of
# raising AttributeError on them.
df['review'] = df['review'].str.lower()
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


In [21]:
## remove stopwords
sw_list = stopwords.words('english')
# Membership tests against a set are O(1); scanning the list once per token of
# every review is needlessly slow. The resulting text is unchanged.
sw_set = set(sw_list)
# Split on whitespace, drop stopword tokens, and re-join with single spaces in
# one pass per review.
df['review'] = df['review'].apply(
    lambda text: " ".join(word for word in text.split() if word not in sw_set)
)
df.head()

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz episode ...,positive
1,wonderful little production. filming technique...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically there's family little boy (jake) thi...,negative
4,"petter mattei's ""love time money"" visually stu...",positive


---

In [24]:
# Check whether the sentiment classes are balanced by counting rows per label
df["sentiment"].value_counts()

sentiment
positive    5023
negative    4960
Name: count, dtype: int64

In [25]:
# Features are the cleaned review texts; the target is the sentiment label.
X, y = df["review"], df["sentiment"]

In [26]:
from sklearn.preprocessing import LabelEncoder
# Encode straight from the dataframe column rather than from ``y`` itself, so
# that re-running this cell does not refit the encoder on already-encoded
# integers (which would replace the string labels in ``encoder.classes_``).
encoder = LabelEncoder()
y = encoder.fit_transform(df["sentiment"])

In [27]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed seed keeps the partition
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [28]:
# Sizes of the training and test splits.
X_train.shape,X_test.shape

((7986,), (1997,))

---
* *Applying Bag Of words - BoW :*

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Fit the vectorizer on the training reviews only, then reuse the fitted
# vectorizer to transform the test reviews.
cv = CountVectorizer()
# .toarray() converts each result to a dense array.
# NOTE(review): a dense matrix over the full vocabulary can be very large -
# confirm it fits in memory before scaling beyond the 10000-row sample.
X_train_bow = cv.fit_transform(X_train).toarray()
X_test_bow = cv.transform(X_test).toarray()

In [68]:
from sklearn.naive_bayes import GaussianNB
# Baseline model: Gaussian Naive Bayes trained on the dense bag-of-words counts.
gnb = GaussianNB()
gnb.fit(X_train_bow,y_train)
y_pred = gnb.predict(X_test_bow)
# accuracy_score and f1_score imported here are reused by the later
# evaluation cells, which do not import them again.
from sklearn.metrics import accuracy_score,confusion_matrix,f1_score
print('accuracy_score', accuracy_score(y_test,y_pred))
print('f1_score', f1_score(y_test,y_pred))

accuracy_score 0.6324486730095142
f1_score 0.5980284775465499


In [69]:
# Confusion matrix for the predictions currently in y_pred (GaussianNB on BoW).
# scikit-learn puts true labels on rows and predicted labels on columns.
# NOTE(review): LabelEncoder should map negative -> 0 and positive -> 1;
# confirm with encoder.classes_.
confusion_matrix(y_test,y_pred)

array([[717, 235],
       [499, 546]], dtype=int64)

In [70]:
from sklearn.ensemble import RandomForestClassifier
# Seed the forest so the reported scores are reproducible; an unseeded forest
# gives slightly different metrics on every run. The seed matches the one used
# for the train/test split.
rf = RandomForestClassifier(random_state=1)
rf.fit(X_train_bow,y_train)
y_pred = rf.predict(X_test_bow)
print('accuracy_score', accuracy_score(y_test,y_pred))
print('f1_score', f1_score(y_test,y_pred))

accuracy_score 0.8462694041061593
f1_score 0.8530397319291527


In [72]:
# Confusion matrix for the predictions currently in y_pred (random forest on
# BoW); rows are true labels, columns are predicted labels.
confusion_matrix(y_test,y_pred)

array([[799, 153],
       [154, 891]], dtype=int64)

---
* *Applying N_grams - BoW-N_grams :*

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Unigrams and bigrams, limited to 3000 features by max_features.
cv = CountVectorizer(ngram_range=(1,2),max_features=3000)
# Fit on the training reviews only; the test reviews are transformed with the
# same fitted vectorizer.
X_train_bow = cv.fit_transform(X_train).toarray()
X_test_bow = cv.transform(X_test).toarray()
# Seed the forest so the reported scores are reproducible across runs.
rf = RandomForestClassifier(random_state=1)
rf.fit(X_train_bow,y_train)
y_pred = rf.predict(X_test_bow)
print('accuracy_score', accuracy_score(y_test,y_pred))
print('f1_score', f1_score(y_test,y_pred))

accuracy_score 0.8412618928392589
f1_score 0.8475228475228475


In [74]:
# Confusion matrix for the predictions currently in y_pred (random forest on
# n-gram counts); rows are true labels, columns are predicted labels.
confusion_matrix(y_test,y_pred)

array([[799, 153],
       [164, 881]], dtype=int64)

---
* *Applying TF-IDF :*

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF features: the vectorizer is fitted on the training reviews only and
# then reused to transform the test reviews.
TFIDF = TfidfVectorizer()
# NOTE(review): RandomForestClassifier also accepts sparse input, so the
# .toarray() calls could probably be dropped to save memory - confirm nothing
# downstream needs dense arrays first.
X_train_TFIDF = TFIDF.fit_transform(X_train).toarray()
X_test_TFIDF = TFIDF.transform(X_test).toarray()
# Seed the forest so the reported scores are reproducible across runs.
rf = RandomForestClassifier(random_state=1)
rf.fit(X_train_TFIDF,y_train)
y_pred = rf.predict(X_test_TFIDF)
print('accuracy_score', accuracy_score(y_test,y_pred))
print('f1_score', f1_score(y_test,y_pred))

accuracy_score 0.8492739108662994
f1_score 0.8525232729054386


In [34]:
# Confusion matrix for the predictions currently in y_pred (random forest on
# TF-IDF features); rows are true labels, columns are predicted labels.
confusion_matrix(y_test,y_pred)

array([[826, 126],
       [175, 870]], dtype=int64)

---