In [29]:
# import required libraries
import pandas as pd
import numpy as np
import re
import string
import contractions
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score

In [30]:
df=pd.read_csv('IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [31]:
df.shape

(50000, 2)

In [32]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [33]:
df['review']

0        One of the other reviewers has mentioned that ...
1        A wonderful little production. <br /><br />The...
2        I thought this was a wonderful way to spend ti...
3        Basically there's a family where a little boy ...
4        Petter Mattei's "Love in the Time of Money" is...
                               ...                        
49995    I thought this movie did a down right good job...
49996    Bad plot, bad dialogue, bad acting, idiotic di...
49997    I am a Catholic taught in parochial elementary...
49998    I'm going to have to disagree with the previou...
49999    No one expects the Star Trek movies to be high...
Name: review, Length: 50000, dtype: object

In [34]:
def preprocess_text(text):
    """
    Preprocess movie review text:
    - Remove HTML tags
    - Convert to lowercase
    - Remove punctuation
    - Remove extra spaces
    - Optional: remove stopwords
    - Optional: lemmatization
    """

    # 1. Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()

    # 2. Lowercase
    text = text.lower()

    # 3. Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))

    # 4. Remove numbers
    text = re.sub(r"\d+", "", text)

    # 5. Remove extra spaces
    text = re.sub(r"\s+", " ", text).strip()

    words = text.split()

    # 6. Remove stopwords
    stop_words = set(stopwords.words("english"))
    words = [w for w in words if w not in stop_words]

    # 7. Lemmatization (optional)
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(w) for w in words]

    return " ".join(words)


In [35]:
df['text']=df['review'].apply(preprocess_text)

  text = BeautifulSoup(text, "html.parser").get_text()


In [36]:
df['text']

0        one reviewer mentioned watching oz episode you...
1        wonderful little production filming technique ...
2        thought wonderful way spend time hot summer we...
3        basically there family little boy jake think t...
4        petter matteis love time money visually stunni...
                               ...                        
49995    thought movie right good job wasnt creative or...
49996    bad plot bad dialogue bad acting idiotic direc...
49997    catholic taught parochial elementary school nu...
49998    im going disagree previous comment side maltin...
49999    one expects star trek movie high art fan expec...
Name: text, Length: 50000, dtype: object

In [37]:
vectorizer = TfidfVectorizer(max_features=30000, ngram_range=(1,2))  
X = vectorizer.fit_transform(df['review'])  # df['review'] = your reviews column
y = df['sentiment']  # target column

print("Shape of TF-IDF matrix:", X.shape)


Shape of TF-IDF matrix: (50000, 30000)


In [38]:
X.shape

(50000, 30000)

In [39]:
X

<50000x30000 sparse matrix of type '<class 'numpy.float64'>'
	with 11219979 stored elements in Compressed Sparse Row format>

In [40]:
feature_name=vectorizer.get_feature_names_out()
feature_name

array(['00', '000', '000 000', ..., 'zoom', 'zorro', 'zucco'],
      dtype=object)

In [41]:
le=LabelEncoder()
y=le.fit_transform(y)

In [42]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [43]:
Lr=LogisticRegression()

In [44]:
Lr.fit(X_train,y_train)

In [45]:
y_pred=Lr.predict(X_test)

In [46]:
accuracy_score(y_pred,y_test)

0.9091

In [47]:
scores = cross_val_score(Lr, X, y, cv=5, scoring="accuracy")

print("Cross-validation scores:", scores)
print("Mean Accuracy:", scores.mean())
print("Standard Deviation:", scores.std())

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Cross-validation scores: [0.9084 0.9048 0.901  0.9047 0.9039]
Mean Accuracy: 0.90456
Standard Deviation: 0.0023635566420121897


In [49]:
params = {"C": [0.01, 0.1, 1,2,6,9,10]}
grid = GridSearchCV(LogisticRegression(max_iter=1000), params, cv=5, scoring="accuracy")
grid.fit(X, y)
print("Best Params:", grid.best_params_)
print("Best CV Accuracy:", grid.best_score_)


Best Params: {'C': 6}
Best CV Accuracy: 0.9099600000000001


In [50]:
import numpy as np

best_model = grid.best_estimator_  # your tuned model
feature_names = vectorizer.get_feature_names_out()
coef = best_model.coef_[0]

# Top 20 positive words
top_pos = np.argsort(coef)[-20:]
print("Top positive words:", feature_names[top_pos])

# Top 20 negative words
top_neg = np.argsort(coef)[:20]
print("Top negative words:", feature_names[top_neg])


Top positive words: ['perfectly' 'loved this' 'definitely worth' 'beautifully' 'gem'
 'incredible' 'loved' 'fun' 'refreshing' 'well worth' 'superb' 'enjoyable'
 'brilliant' 'today' 'hilarious' 'amazing' 'wonderful' 'perfect'
 'excellent' 'great']
Top negative words: ['worst' 'awful' 'boring' 'waste' 'bad' 'poor' 'terrible' 'the worst'
 'disappointment' 'horrible' 'disappointing' 'poorly' 'dull' 'worse'
 'not worth' 'nothing' 'lame' 'lacks' 'forgettable' 'ridiculous']


In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression(max_iter=1000, C=10)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.91      0.91      0.91      4961
           1       0.91      0.92      0.91      5039

    accuracy                           0.91     10000
   macro avg       0.91      0.91      0.91     10000
weighted avg       0.91      0.91      0.91     10000

[[4492  469]
 [ 419 4620]]


In [52]:
# Split data (keep indices)
X_train, X_test, y_train, y_test, train_idx, test_idx = train_test_split(
    X, y, df.index, test_size=0.2, random_state=42
)

# Train model
model = LogisticRegression(max_iter=1000, C=grid.best_params_["C"])
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)

# Create misclassified DataFrame
df_misclassified = pd.DataFrame({
    'review': df.loc[test_idx, 'review'],   # use loc with original indices
    'true_label': y_test,
    'predicted_label': y_pred,
    'predicted_prob': np.max(y_prob, axis=1)
})

# Filter misclassified
df_misclassified = df_misclassified[df_misclassified['true_label'] != df_misclassified['predicted_label']]

# Sort by confidence
df_misclassified = df_misclassified.sort_values(by='predicted_prob', ascending=False)

# Show top 10
print(df_misclassified.head(10))


                                                  review  true_label  \
42989  'Major Payne' is a film about a major who make...           0   
20634  normally i'm not the sort to be scared by horr...           0   
22292  Simon Wests pg-13 thriller about a babysitter ...           1   
43564  I really liked this quirky movie. The characte...           0   
37447  I loved this movie when it first came out(but ...           0   
23747  I saw this movie when it first came to the the...           0   
18684  The major flaw with the film is its uninspired...           0   
18692  A wonderful television mini-series completely ...           0   
48415  Based on the comments made so far, everyone se...           0   
16880  Despite some moments in heavy rain, an encount...           0   

       predicted_label  predicted_prob  
42989                1        0.995300  
20634                1        0.993827  
22292                0        0.993531  
43564                1        0.982833  
37

In [25]:
df_misclassified.shape

(875, 4)

In [26]:
def predict_sentiment(review, model, vectorizer):
    # Preprocess review (same as training step)
    review_clean = preprocess_text(review)
    
    # Transform using the fitted TF-IDF vectorizer
    X_vec = vectorizer.transform([review_clean])
    
    # Predict sentiment
    pred = model.predict(X_vec)[0]
    prob = model.predict_proba(X_vec).max()
    
    # Map prediction to label
    sentiment = "Positive 😀" if pred == 1 else "Negative 😡"
    
    return {"review": review, "prediction": sentiment, "confidence": round(prob, 3)}


In [27]:
temp=input('enter movie review:')
predict_sentiment(temp,Lr,vectorizer)

enter movie review:good


{'review': 'good', 'prediction': 'Positive 😀', 'confidence': 0.974}