In [35]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/nayz/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
import re
import string

In [2]:
reviews_df = pd.read_csv('IMDB_Dataset.csv')

In [3]:
reviews_df.head(5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
def clean_text(text: str):
    text = re.sub(r'<,*?>', '', text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = text.lower() 
    return text

In [6]:
reviews_df['reviews_cleaned'] = reviews_df['review'].apply(clean_text)

In [7]:
reviews_df.head()

Unnamed: 0,review,sentiment,reviews_cleaned
0,One of the other reviewers has mentioned that ...,positive,one of the other reviewers has mentioned that ...
1,A wonderful little production. <br /><br />The...,positive,a wonderful little production br br the filmin...
2,I thought this was a wonderful way to spend ti...,positive,i thought this was a wonderful way to spend ti...
3,Basically there's a family where a little boy ...,negative,basically theres a family where a little boy j...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter matteis love in the time of money is a ...


In [None]:
import nltk
nltk.download('punkt_tab')
# Tokenization
reviews_df['tokens'] = reviews_df['reviews_cleaned'].apply(word_tokenize)


[nltk_data] Downloading package punkt_tab to /Users/nayz/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [13]:
reviews_df.head()

Unnamed: 0,review,sentiment,reviews_cleaned,tokens
0,One of the other reviewers has mentioned that ...,positive,one of the other reviewers has mentioned that ...,"[one, of, the, other, reviewers, has, mentione..."
1,A wonderful little production. <br /><br />The...,positive,a wonderful little production br br the filmin...,"[a, wonderful, little, production, br, br, the..."
2,I thought this was a wonderful way to spend ti...,positive,i thought this was a wonderful way to spend ti...,"[i, thought, this, was, a, wonderful, way, to,..."
3,Basically there's a family where a little boy ...,negative,basically theres a family where a little boy j...,"[basically, theres, a, family, where, a, littl..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter matteis love in the time of money is a ...,"[petter, matteis, love, in, the, time, of, mon..."


In [14]:
# Removing stopwords
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/nayz/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [15]:
stop_words = set(stopwords.words('english'))

In [17]:
reviews_df['tokens'] = reviews_df['tokens'].apply(lambda x: [word for word in x if word not in stop_words])

In [19]:
reviews_df.head()

Unnamed: 0,review,sentiment,reviews_cleaned,tokens
0,One of the other reviewers has mentioned that ...,positive,one of the other reviewers has mentioned that ...,"[one, reviewers, mentioned, watching, 1, oz, e..."
1,A wonderful little production. <br /><br />The...,positive,a wonderful little production br br the filmin...,"[wonderful, little, production, br, br, filmin..."
2,I thought this was a wonderful way to spend ti...,positive,i thought this was a wonderful way to spend ti...,"[thought, wonderful, way, spend, time, hot, su..."
3,Basically there's a family where a little boy ...,negative,basically theres a family where a little boy j...,"[basically, theres, family, little, boy, jake,..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter matteis love in the time of money is a ...,"[petter, matteis, love, time, money, visually,..."


In [20]:
# Lemmatization 
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

lemmatizer= WordNetLemmatizer()
reviews_df['tokens'] = reviews_df['tokens'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

[nltk_data] Downloading package wordnet to /Users/nayz/nltk_data...


In [21]:
reviews_df['tokens']

0        [one, reviewer, mentioned, watching, 1, oz, ep...
1        [wonderful, little, production, br, br, filmin...
2        [thought, wonderful, way, spend, time, hot, su...
3        [basically, there, family, little, boy, jake, ...
4        [petter, matteis, love, time, money, visually,...
                               ...                        
49995    [thought, movie, right, good, job, wasnt, crea...
49996    [bad, plot, bad, dialogue, bad, acting, idioti...
49997    [catholic, taught, parochial, elementary, scho...
49998    [im, going, disagree, previous, comment, side,...
49999    [one, expects, star, trek, movie, high, art, f...
Name: tokens, Length: 50000, dtype: object

In [22]:
reviews_df['review_processed'] = reviews_df['tokens'].apply(lambda x: ' '.join(x))

In [23]:
reviews_df['review_processed']

0        one reviewer mentioned watching 1 oz episode y...
1        wonderful little production br br filming tech...
2        thought wonderful way spend time hot summer we...
3        basically there family little boy jake think t...
4        petter matteis love time money visually stunni...
                               ...                        
49995    thought movie right good job wasnt creative or...
49996    bad plot bad dialogue bad acting idiotic direc...
49997    catholic taught parochial elementary school nu...
49998    im going disagree previous comment side maltin...
49999    one expects star trek movie high art fan expec...
Name: review_processed, Length: 50000, dtype: object

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

X = reviews_df['review_processed']
y = reviews_df['sentiment'].map({'positive': 1, 'negative': 0})

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [27]:
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [28]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=200)
model.fit(X_train_tfidf, y_train)

In [29]:
y_pred = model.predict(X_test_tfidf)

In [30]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [31]:

accuracy_score(y_test, y_pred)

0.8917

In [None]:
print(classification_report(y_test, y_pred, target_names=['Negative', 'Positive']))

              precision    recall  f1-score   support

    Negative       0.90      0.88      0.89      4926
    Positive       0.89      0.90      0.89      5074

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [34]:
print(confusion_matrix(y_test, y_pred))

[[4351  575]
 [ 508 4566]]
