In [1]:
import pickle

In [2]:
import pandas as pd
import numpy as np

from nltk.corpus import stopwords
from nltk import word_tokenize, regexp_tokenize, FreqDist
from nltk.stem import WordNetLemmatizer
import string
import re

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline

import warnings
warnings.filterwarnings('ignore')

In [3]:
def text_processing(user_input):
    pattern = "([a-zA-Z]+(?:'[a-z]+)?)"
    review_text = regexp_tokenize(user_input, pattern)
    review_text = ' '.join(review_text)
    review_text = review_text.lower()

    
    stopwords_list = stopwords.words('english')
    stopwords_list += list(string.punctuation)
    stopwords_list += ['game', 'animal', 'crossing']
    review_text = [w for w in review_text.split() if w not in stopwords_list]
    
    lemmatizer = WordNetLemmatizer()
    review_text = [lemmatizer.lemmatize(w) for w in review_text]
    review_text = ' '.join(review_text)
        
    return review_text

In [4]:
df = pd.read_csv('data/user_reviews.csv')
def sentiment_labels(row):
    if row['grade'] >= 8:
        val = 'positive'
    elif row['grade'] <= 4:
        val = 'negative'
    else:
        val = 'neutral'
    return val
df['sentiment'] = df.apply(sentiment_labels, axis=1)

In [5]:
df['sentiment']

0       negative
1        neutral
2       negative
3       negative
4       negative
          ...   
2994    negative
2995    negative
2996    negative
2997    negative
2998    negative
Name: sentiment, Length: 2999, dtype: object

In [6]:
X = df.text
y = df.sentiment

In [7]:
model_X  = X.apply(text_processing)
model_X

0       gf started playing option create island guy nd...
1       great really relaxing gorgeous can't ignore on...
2       wife looking forward playing released bought l...
3       need equal value opportunity player island wif...
4       beware multiple people house want play account...
                              ...                        
2994    island console limitation cannot play girlfrie...
2995    per giocare con figli fidanzate mogli persone ...
2996    one island per console pathetic limitation end...
2997    even though seems like great many item charact...
2998    fantastic nintendo deciding make one island pe...
Name: text, Length: 2999, dtype: object

In [8]:
model = LogisticRegression(C=0.2, penalty='none')

tfidf_vectorizer = TfidfVectorizer()
smote = SMOTE(sampling_strategy='not majority')
    
pipeline = make_pipeline(tfidf_vectorizer, smote, model)

model_X  = X.apply(text_processing)
    
pipeline.fit(model_X, y)

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                ('smote', SMOTE(sampling_strategy='not majority')),
                ('logisticregression',
                 LogisticRegression(C=0.2, penalty='none'))])

In [9]:
file = open('acnh_review_model.pkl', 'wb')
pickle.dump(pipeline, file)
file.close()

In [10]:
file = open('acnh_review_model.pkl', 'rb')
loaded_model = pickle.load(file)

In [11]:
loaded_model.predict(X)

array(['negative', 'neutral', 'negative', ..., 'negative', 'positive',
       'negative'], dtype=object)

In [12]:
def clean_text(text):
    new_text = text.lower()
    return new_text

In [18]:
user_input = "I love this game so much!"
user_input

'I love this game so much!'

In [19]:
cleaned_text = text_processing(user_input)
loaded_model.predict([cleaned_text])

array(['positive'], dtype=object)

In [15]:
cleaned_text = clean_text(user_input)
loaded_model.predict([cleaned_text])

array(['positive'], dtype=object)