In [1]:
import pickle

In [2]:
import pandas as pd
import numpy as np

from nltk.corpus import stopwords
from nltk import word_tokenize, regexp_tokenize, FreqDist
from nltk.stem import WordNetLemmatizer
import string
import re

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline

import warnings
warnings.filterwarnings('ignore')

In [33]:
def text_processing(user_input):
    """Normalise a raw review string into a cleaned, space-separated string.

    The text is tokenised into alphabetic words (optionally followed by an
    apostrophe and lowercase letters, e.g. "don't"), lowercased, stripped of
    English stop words, punctuation and the domain words 'game', 'animal'
    and 'crossing', and finally lemmatised with WordNet.

    Parameters
    ----------
    user_input : str
        Raw review text.

    Returns
    -------
    str
        The remaining lemmatised tokens joined by single spaces; an empty
        string if no token survives the filtering.
    """
    pattern = "([a-zA-Z]+(?:'[a-z]+)?)"
    # Keep the tokens in a list throughout. Joining them back into a string
    # before filtering would make the loops below iterate over single
    # characters instead of words.
    tokens = [token.lower() for token in regexp_tokenize(user_input, pattern)]

    # A set gives constant-time membership checks for the filter below.
    stopwords_set = set(stopwords.words('english'))
    stopwords_set.update(string.punctuation)
    stopwords_set.update(['game', 'animal', 'crossing'])
    tokens = [token for token in tokens if token not in stopwords_set]

    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(token) for token in tokens]

    return ' '.join(lemmas)

In [4]:
# Load the user reviews from a CSV file (path relative to the notebook's working directory).
df = pd.read_csv('data/user_reviews.csv')
def sentiment_labels(row):
    """Translate a review row's numeric ``grade`` into a sentiment label.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by the key ``'grade'`` (e.g. a DataFrame row).

    Returns
    -------
    str
        ``'positive'`` for grades of 8 or more, ``'negative'`` for grades of
        4 or less, and ``'neutral'`` for everything else.
    """
    grade = row['grade']
    if grade >= 8:
        return 'positive'
    if grade <= 4:
        return 'negative'
    return 'neutral'
# Label every review: axis=1 passes each row to sentiment_labels, and the
# resulting labels are stored in a new 'sentiment' column.
df['sentiment'] = df.apply(sentiment_labels, axis=1)

In [5]:
# Display the derived sentiment column.
df['sentiment']

0       negative
1        neutral
2       negative
3       negative
4       negative
          ...   
2994    negative
2995    negative
2996    negative
2997    negative
2998    negative
Name: sentiment, Length: 2999, dtype: object

In [7]:
# Features: the review text column. Targets: the derived sentiment labels.
X = df.text
y = df.sentiment

In [8]:
# NOTE(review): with penalty='none' scikit-learn ignores C, so C=0.2 has no
# effect on the fitted model; newer scikit-learn releases spell this option
# penalty=None. Decide whether regularisation is actually wanted here.
model = LogisticRegression(C=0.2, penalty='none')

tfidf_vectorizer = TfidfVectorizer()
# Oversample every class except the majority one. The fixed random_state makes
# the synthetic samples, and therefore the pickled model, reproducible.
smote = SMOTE(sampling_strategy='not majority', random_state=42)

# imblearn's make_pipeline is required because SMOTE is a resampler, which a
# plain scikit-learn pipeline does not accept as an intermediate step.
pipeline = make_pipeline(tfidf_vectorizer, smote, model)

# NOTE(review): the pipeline is fit on the raw review text, while later cells
# call predict on text_processing/clean_text output — confirm that training
# and prediction are meant to use different preprocessing.
pipeline.fit(X, y)

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                ('smote', SMOTE(sampling_strategy='not majority')),
                ('logisticregression',
                 LogisticRegression(C=0.2, penalty='none'))])

In [9]:
# Persist the fitted pipeline. The context manager closes the file even if
# pickling raises, so no handle is leaked and the data is flushed to disk.
with open('model.pkl', 'wb') as file:
    pickle.dump(pipeline, file)

In [10]:
# Reload the pipeline written above. The context manager closes the handle,
# which the original cell left open.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

In [11]:
# Sanity check: the reloaded pipeline predicts on the same texts it was fit on.
loaded_model.predict(X)

array(['negative', 'neutral', 'negative', ..., 'negative', 'negative',
       'negative'], dtype=object)

In [19]:
def clean_text(text):
    """Return ``text`` converted to lowercase."""
    return text.lower()

In [53]:
# Take entry 63 of the review texts as a sample input and display it.
user_input = X[63]
user_input

'By far the best Animal Crossing game yet. Sure, one island per system is oldscool (it was there in every animal crossing game), but it should encourages beeing social and sharing one island with your family.'

In [55]:
# Clean the sample review with text_processing, then classify it with the
# reloaded pipeline. The text is wrapped in a one-element list for predict.
cleaned_text = text_processing(user_input)
loaded_model.predict([cleaned_text])

array(['negative'], dtype=object)

In [50]:
# Classify the sample review after lowercasing only (clean_text).
# NOTE(review): the recorded output for this cell is a NameError, so the cell
# defining clean_text must be run in the kernel before this one.
cleaned_text = clean_text(user_input)
loaded_model.predict([cleaned_text])

NameError: name 'clean_text' is not defined