In [27]:
import pandas as pd
import spacy
from sklearn.model_selection import train_test_split
# Load the spaCy pipeline 'en_core_web_sm'; PreProcess calls it to tokenize each review.
nlp = spacy.load('en_core_web_sm')

In [20]:
# Read the labelled movie-review CSV (path is relative to the notebook's working directory).
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a row index
# that was written out with the file - consider index_col=0 after confirming against the CSV.
df = pd.read_csv('./movies_sentiment_data (1).csv')
df.head()

Unnamed: 0,review,sentiment
0,I first saw Jake Gyllenhaal in Jarhead (2005) ...,positive
1,I enjoyed the movie and the story immensely! I...,positive
2,I had a hard time sitting through this. Every ...,negative
3,It's hard to imagine that anyone could find th...,negative
4,This is one military drama I like a lot! Tom B...,positive


In [21]:
# Check class balance before splitting: count the rows carrying each sentiment label.
df['sentiment'].value_counts()

sentiment
positive    9500
negative    9500
Name: count, dtype: int64

In [22]:
# Encode the text labels as integers for the classifiers: negative -> 0, positive -> 1.
df['sentiment_num'] = df['sentiment'].map({'negative': 0, 'positive': 1})
df.head()

Unnamed: 0,review,sentiment,sentiment_num
0,I first saw Jake Gyllenhaal in Jarhead (2005) ...,positive,1
1,I enjoyed the movie and the story immensely! I...,positive,1
2,I had a hard time sitting through this. Every ...,negative,0
3,It's hard to imagine that anyone could find th...,negative,0
4,This is one military drama I like a lot! Tom B...,positive,1


In [23]:
def PreProcess(txt):
    """Strip stop words and punctuation from ``txt``.

    The text is run through the module-level spaCy pipeline ``nlp``; every
    token flagged as a stop word or as punctuation is dropped, and the
    original text of the remaining tokens is joined with single spaces.

    Args:
        txt: The raw text to clean.

    Returns:
        A single space-separated string of the tokens that were kept.
    """
    kept = []
    for tok in nlp(txt):
        # Skip anything spaCy marks as a stop word or punctuation.
        if tok.is_stop or tok.is_punct:
            continue
        kept.append(tok.text)
    return ' '.join(kept)

In [24]:
# Clean every review with PreProcess. This overwrites the 'review' column, so the raw
# text is no longer available afterwards and re-running the cell re-processes cleaned text.
df['review'] = df['review'].map(PreProcess)

In [25]:
# Preview the frame after preprocessing to eyeball the cleaned 'review' text.
df.head()

Unnamed: 0,review,sentiment,sentiment_num
0,saw Jake Gyllenhaal Jarhead 2005 little watchi...,positive,1
1,enjoyed movie story immensely seen original(19...,positive,1
2,hard time sitting single twist turn predictabl...,negative,0
3,hard imagine find short favorite seen shorts k...,negative,0
4,military drama like lot Tom Berenger playing m...,positive,1


In [28]:
# Hold out 20% of the reviews for evaluation; stratify on the label so both splits keep
# the same class ratio, and fix the seed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    df['review'],
    df['sentiment_num'],
    test_size=0.2,
    stratify=df['sentiment_num'],
    random_state=42,
)

# Naive Bayes 👇👇

In [30]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report

# Bag-of-words model: unigram + bigram counts fed into multinomial Naive Bayes.
clf = Pipeline(steps=[
    ('cv', CountVectorizer(ngram_range=(1, 2))),
    ('mnb', MultinomialNB()),
])

# Fit the vectorizer and classifier on the training reviews only.
clf.fit(x_train, y_train)

In [31]:
# Score the fitted pipeline on the held-out reviews and print per-class metrics.
y_pred = clf.predict(x_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.85      0.88      0.87      1900
           1       0.87      0.85      0.86      1900

    accuracy                           0.86      3800
   macro avg       0.86      0.86      0.86      3800
weighted avg       0.86      0.86      0.86      3800



In [32]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

# Same Naive Bayes classifier, but on TF-IDF weighted features instead of raw counts.
clf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('mnb', MultinomialNB()),
])

# Fit the vectorizer and classifier on the training reviews only.
clf.fit(x_train, y_train)

In [33]:
# Score the fitted pipeline on the held-out reviews and print per-class metrics.
y_pred = clf.predict(x_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.84      0.88      0.86      1900
           1       0.87      0.84      0.85      1900

    accuracy                           0.86      3800
   macro avg       0.86      0.86      0.86      3800
weighted avg       0.86      0.86      0.86      3800



# KNN 👇👇

In [38]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

# TF-IDF features fed into a 5-nearest-neighbours classifier.
clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    # The step is labelled 'knn' so that clf.named_steps and any '<step>__<param>'
    # keys identify the estimator that is actually in the pipeline.
    ('knn', KNeighborsClassifier(n_neighbors=5)),
])

# Fit on the training reviews, then report per-class metrics on the held-out set.
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.74      0.75      1900
           1       0.75      0.76      0.75      1900

    accuracy                           0.75      3800
   macro avg       0.75      0.75      0.75      3800
weighted avg       0.75      0.75      0.75      3800



# SVM 👇👇

In [39]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

# TF-IDF features fed into a support-vector classifier with an RBF kernel.
clf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('svm', SVC(kernel='rbf')),
])

# Fit on the training reviews, then report per-class metrics on the held-out set.
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.90      0.87      0.88      1900
           1       0.87      0.91      0.89      1900

    accuracy                           0.89      3800
   macro avg       0.89      0.89      0.89      3800
weighted avg       0.89      0.89      0.89      3800

