In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("https://databootcamp-washu-reviews-project.s3.us-east-2.amazonaws.com/IMDB_Dataset.csv")

In [3]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
# Naive Bayes Pipeline
#   - reviews are tokenized and stop words removed
#   - TF - IDF which weights words based on their frequency, both per review and in all reviews
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
nb_clf = Pipeline([
    ('vect', CountVectorizer(stop_words="english")),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [5]:
nb_clf.fit(df["review"], df["sentiment"])

Pipeline(steps=[('vect', CountVectorizer(stop_words='english')),
                ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])

In [8]:
import numpy as np
docs_test = df["review"]
predicted = nb_clf.predict(docs_test)
np.mean(predicted == df["sentiment"])

0.90346

In [14]:
# Support Vector Machine Pipeline
#   - reviews are tokenized and stop words removed
#   - TF - IDF which weights words based on their frequency, both per review and in all reviews
from sklearn.linear_model import SGDClassifier
svm_clf = Pipeline([
     ('vect', CountVectorizer(stop_words='english')),
     ('tfidf', TfidfTransformer()),
     ('clf', SGDClassifier()),
])

In [15]:
svm_clf.fit(df["review"], df["sentiment"])

Pipeline(steps=[('vect', CountVectorizer(stop_words='english')),
                ('tfidf', TfidfTransformer()), ('clf', SGDClassifier())])

In [16]:
predicted = svm_clf.predict(docs_test)
np.mean(predicted == df["sentiment"])

# svm_clf.score(df['review'], df['sentiment'])

0.92218

In [23]:
from joblib import dump, load
dump(svm_clf, 'models/svm_model.joblib') 

['models/svm_model.joblib']

In [18]:
def detect_sentiment(text):
    df = pd.DataFrame([text])
    prediction = svm_clf.predict(df[0])

    return prediction[0]


In [26]:
pd.__version__

'1.1.5'