## 15-04-2025

In [17]:
#Sentiment Analysis using Logistic Regression
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [18]:
df=pd.read_csv(r"C:\Users\91805\Downloads\NLP\IMDB Dataset.csv")
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
29994,"I enjoyed the first ""Toxic Avenger,"" but the s...",negative
29995,"New York, I Love You finally makes it to our s...",positive
29996,This movie makes you wish imdb would let you v...,negative
29997,"Space Camp, which had the unfortunate luck to ...",negative


In [19]:
df['review']=df['review'].str.lower()

In [20]:
import re
def remove_html_tag(text):
    p=re.compile('<.*?>')
    return p.sub('',text)

In [21]:
df['review']=df['review'].apply(remove_html_tag)

In [22]:
df

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive
...,...,...
29994,"i enjoyed the first ""toxic avenger,"" but the s...",negative
29995,"new york, i love you finally makes it to our s...",positive
29996,this movie makes you wish imdb would let you v...,negative
29997,"space camp, which had the unfortunate luck to ...",negative


In [23]:
import string
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score,roc_curve,confusion_matrix,classification_report

In [24]:
nltk.download('stopwords')
stop_words=set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\91805\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [25]:
exclude=string.punctuation
exclude

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [26]:
def preprocess(text):
    text = text.lower()
    text = ''.join([char for char in text if char not in string.punctuation])
    words = text.split()
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)

# Apply preprocessing
df['review'] = df['review'].apply(preprocess)

In [27]:
df

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz episode ...,positive
1,wonderful little production filming technique ...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically theres family little boy jake thinks...,negative
4,petter matteis love time money visually stunni...,positive
...,...,...
29994,enjoyed first toxic avenger sequel didnt work ...,negative
29995,new york love finally makes shores 10 short st...,positive
29996,movie makes wish imdb would let vote zero one ...,negative
29997,space camp unfortunate luck planned around tim...,negative


In [28]:
df['label']=df['sentiment'].map({'positive':1,'negative':0})

In [29]:
df

Unnamed: 0,review,sentiment,label
0,one reviewers mentioned watching 1 oz episode ...,positive,1
1,wonderful little production filming technique ...,positive,1
2,thought wonderful way spend time hot summer we...,positive,1
3,basically theres family little boy jake thinks...,negative,0
4,petter matteis love time money visually stunni...,positive,1
...,...,...,...
29994,enjoyed first toxic avenger sequel didnt work ...,negative,0
29995,new york love finally makes shores 10 short st...,positive,1
29996,movie makes wish imdb would let vote zero one ...,negative,0
29997,space camp unfortunate luck planned around tim...,negative,0


In [31]:
x_train,x_test,y_train,y_test=train_test_split(df['sentiment'],df['label'],test_size=0.2,random_state=42)

In [32]:
vector=TfidfVectorizer(max_features=5000)

In [33]:
x_train_vec=vector.fit_transform(x_train)

In [34]:
x_test_vec=vector.transform(x_test)

In [36]:
model=LogisticRegression(max_iter=1000)

In [37]:
model.fit(x_train_vec,y_train)

In [38]:
y_pred=model.predict(x_test_vec)
y_pred

array([1, 0, 0, ..., 0, 1, 1], dtype=int64)

In [40]:
y_prob=model.predict_proba(x_test_vec)[:,1]
y_prob

array([9.99386075e-01, 6.18082709e-04, 6.18082709e-04, ...,
       6.18082709e-04, 9.99386075e-01, 9.99386075e-01])

In [42]:
cr=classification_report(y_test,y_pred)
cr

'              precision    recall  f1-score   support\n\n           0       1.00      1.00      1.00      3025\n           1       1.00      1.00      1.00      2975\n\n    accuracy                           1.00      6000\n   macro avg       1.00      1.00      1.00      6000\nweighted avg       1.00      1.00      1.00      6000\n'

In [43]:
auc=roc_auc_score(y_test,y_prob)
auc

1.0

In [None]:
fpr,tpr,_=roc_curve(y_test,y_prob)
plt.figure(figsize=(8,6))
plt.plot(fpr,tpr,label=f'AUC={auc:.2f}',color='blue')
plt.plot([0,1],[1,0],'k--')
plt.xlabel('False Positive R')