<a href="https://colab.research.google.com/github/mahdiSheykhiGithub/IMDB-Reviews-NLP/blob/main/IMDB_Tfidf_ML_classic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries and Load data

In [20]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [21]:
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/IMDB Reviews/IMDB Dataset.csv')
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [22]:
data.shape

(50000, 2)

In [23]:
data['review'][1]

'A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well d

# Preprocessing

In [24]:
soup = BeautifulSoup(data['review'][1])
text = soup.get_text()
text

'A wonderful little production. The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well done.'

In [25]:
def clean_text(text):
    # html to text format
    soup = BeautifulSoup(text)
    text = soup.get_text()
    # lower case
    text = text.lower()
    # tokenize
    tokens = word_tokenize(text)
    # remove punctuation
    punc = list(string.punctuation)
    # stopword & stemmer
    stop_words = stopwords.words('english')
    stemmer = PorterStemmer()

    filtered = []
    for token in tokens:
        if (token not in stop_words) and (token not in punc):
            word = stemmer.stem(token)
            filtered.append(word)

    text = ' '.join(filtered)
    return text

In [26]:
data['review'] = data['review'].apply(clean_text)

  soup = BeautifulSoup(text)


In [27]:
vectorizer = TfidfVectorizer(min_df = 2)
X = vectorizer.fit_transform(data['review'])

In [28]:
features = vectorizer.get_feature_names_out()

In [29]:
len(features)

44871

In [30]:
data['sentiment'].unique()

array(['positive', 'negative'], dtype=object)

In [31]:
label_enc = LabelEncoder()
y = label_enc.fit_transform(data['sentiment'])

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Modeling & evaluation

In [33]:
clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
clf3 = KNeighborsClassifier()

In [34]:
X_train

<35000x44871 sparse matrix of type '<class 'numpy.float64'>'
	with 3325632 stored elements in Compressed Sparse Row format>

In [36]:
clf1.fit(X_train, y_train)
clf2.fit(X_train, y_train)
clf3.fit(X_train, y_train)

In [38]:
vc = VotingClassifier(
    estimators=[('lr', clf1), ('rf', clf2), ('knn', clf3)],
    voting='hard'
)
vc.fit(X_train, y_train)

In [42]:
models = {'LogisticRegression' : clf1, 'RandomForest': clf2, 'KNeighborsClassifier' : clf3, 'VotingClassifier' : vc }
for name, model in models.items():
  score =  model.score(X_test, y_test)
  print(f'{name} - Accuracy : {score}')

LogisticRegression - Accuracy : 0.8916
RandomForest - Accuracy : 0.8408
KNeighborsClassifier - Accuracy : 0.7838
VotingClassifier - Accuracy : 0.889


In [46]:
y_pred = clf1.predict(X_test)

In [47]:
print(classification_report(y_test, y_pred, target_names=['negative', 'positive']))

              precision    recall  f1-score   support

    negative       0.90      0.88      0.89      7411
    positive       0.88      0.90      0.89      7589

    accuracy                           0.89     15000
   macro avg       0.89      0.89      0.89     15000
weighted avg       0.89      0.89      0.89     15000



In [48]:
print(confusion_matrix(y_test, y_pred))

[[6520  891]
 [ 735 6854]]
