In [2]:
# Core numerics, data handling and plotting libraries used by the notebook.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation and convergence warnings from the libraries below.
warnings.filterwarnings('ignore')
import nltk#stop words
from nltk.corpus import stopwords
# Make sure the NLTK stop-word corpus is available locally, then load the
# English list.
nltk.download('stopwords')
# NOTE(review): `stop_words` is not referenced by any later cell visible in
# this notebook (it is not passed to the vectorizer) - confirm it is needed.
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Load the review dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('imdb.csv')

In [4]:
# Preview the first rows to check the column layout (review text + sentiment label).
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
# (rows, columns) of the loaded frame.
df.shape

(500, 2)

In [6]:
# Class balance check: number of reviews per sentiment label.
df['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
negative,263
positive,237


In [7]:
# Count missing values in each column.
df.isnull().sum()

Unnamed: 0,0
review,0
sentiment,0


In [8]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

np.int64(0)

In [9]:
# Encode the target as integers: positive -> 1, negative -> 0.
label_codes = {'positive': 1, 'negative': 0}
# Guard against re-running the cell: a second `.map` over already-encoded
# 0/1 values would find no matching keys and turn the whole column into NaN.
if not df['sentiment'].isin(list(label_codes.values())).all():
    df['sentiment'] = df['sentiment'].map(label_codes)
# Fail loudly on labels outside the mapping instead of carrying NaN targets
# into the train/test split.
assert df['sentiment'].notna().all(), "unexpected sentiment label in imdb.csv"

In [10]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [11]:
# Hold out 20% of the reviews for evaluation; the fixed seed keeps the
# split identical from run to run.
train_data, test_data = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Sanity-check the size of the training partition.
train_data.shape

(400, 2)

In [13]:
# Sanity-check the size of the held-out partition.
test_data.shape

(100, 2)

In [14]:
# Upper bound on the vocabulary the tokenizer uses when encoding text.
vocab_size = 5000
# The word index is built from the training reviews only, so nothing from
# the held-out set leaks into it.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(train_data['review'])

In [15]:
# Fixed sequence length shared by the train and test matrices.
max_len = 200


def _to_padded_ids(texts):
    """Encode reviews as token-id sequences and pad them to `max_len`."""
    token_ids = tokenizer.texts_to_sequences(texts)
    return pad_sequences(token_ids, maxlen=max_len)


x_train = _to_padded_ids(train_data['review'])
x_test = _to_padded_ids(test_data['review'])

In [16]:
# Targets for each partition: the integer-encoded sentiment column.
y_train, y_test = train_data['sentiment'], test_data['sentiment']

In [17]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

# Raw review text -> TF-IDF features -> linear SVM, wrapped in one estimator
# so the same vectorisation is applied at fit and predict time.
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    # Seeded like the train/test split so that re-running the notebook
    # reproduces the same fitted model and the same reported metrics.
    ('svc', LinearSVC(random_state=42))
])
pipe.fit(train_data['review'], y_train)

In [18]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Score the held-out reviews with the fitted pipeline.
y_pred = pipe.predict(test_data['review'])

# Overall fraction of correct predictions.
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {test_accuracy:.2f}")

# Per-class precision, recall and F1.
report = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(report)

# Rows are the true labels, columns the predicted labels.
matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(matrix)

Test Accuracy: 0.84

Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.89      0.84        46
           1       0.90      0.80      0.84        54

    accuracy                           0.84       100
   macro avg       0.84      0.84      0.84       100
weighted avg       0.85      0.84      0.84       100


Confusion Matrix:
[[41  5]
 [11 43]]


In [23]:
import pickle as pkl

# Persist the fitted pipeline for reuse outside the notebook.
# The context manager closes the file handle (flushing its buffer) as soon
# as the dump finishes, instead of leaving that to garbage collection.
with open('model.pkl', 'wb') as model_file:
    pkl.dump(pipe, model_file)

In [25]:
# Persist the fitted Keras tokenizer.
# NOTE(review): `pipe` is fitted on raw text and vectorises it itself, so
# this file is only needed by code that consumes the padded sequences -
# none is visible in this notebook; confirm it is still required.
# The context manager makes sure the file handle is closed and flushed.
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    pkl.dump(tokenizer, tokenizer_file)

In [33]:
def predictive_system(review, model=None):
    """Print the predicted sentiment of a single review.

    Parameters
    ----------
    review : str
        Raw review text to classify.
    model : object, optional
        Any fitted estimator exposing ``predict(list_of_texts)``. Defaults to
        the notebook-level ``pipe``, so existing one-argument calls behave
        exactly as before.

    Raises
    ------
    TypeError
        If ``review`` is not a string.

    Prints ``Positive Review`` when the predicted label equals 1 and
    ``Negative Review`` otherwise. Returns None.
    """
    if not isinstance(review, str):
        raise TypeError(
            f"review must be a string, got {type(review).__name__}"
        )
    if model is None:
        # Looked up at call time, so this cell can be run before `pipe` exists.
        model = pipe

    # The model is called with a one-element batch; only its first label is used.
    prediction = model.predict([review])
    label = "Positive Review" if prediction[0] == 1 else "Negative Review"
    print(label)

In [34]:
# Smoke test: a short, clearly positive sentence.
predictive_system("This is a good movie")

Positive Review


In [35]:
# Smoke test: a short, clearly negative sentence with an upper-cased keyword.
predictive_system("This is a BAD movie")

Negative Review
