In [None]:
https://www.kaggle.com/rogate16/amazon-reviews-2018-full-dataset

In [1]:
import pandas as pd

df = pd.read_csv("./amazon_reviews.csv")

df.drop(df.iloc[:, 1:9], inplace = True, axis = 1)
df.drop(['userName', 'reviewTime','summary','vote'], axis = 1,inplace = True)
df.head()


Unnamed: 0,rating,reviewText
0,5.0,super smooth and yummy with crunchy bits
1,5.0,Perfect for kombucha
2,5.0,Finally a harness that fits my puppy. I really...
3,5.0,I LOVE THEM!! I bought them at Micheals our of...
4,5.0,"I love this pen! I love the shape of it, the f..."


In [2]:
import numpy as np
df_train, df_validate, df_test = np.split(df.sample(frac=1, random_state=43), 
                       [int(.6*len(df)), int(.8*len(df))])
                       
print(df.size)
print(df_train.size)
print(df_validate.size)
print(df_test.size)


1102318
661390
220464
220464


In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from nltk.stem import PorterStemmer  
from nltk.stem import WordNetLemmatizer 

stemmer = PorterStemmer()

w_n_lemmatizer = WordNetLemmatizer() 

STOPWORDS = stopwords.words("english")

def is_html_tag(word):
    w = word.replace("\n", "")
    return w.startswith("<") or w.startswith(">") or w.startswith("/") or w.strip()[:2] == "br"

def remove_html_tags(sentence):
    single_spaces = " ".join(sentence.split())

    return " ".join([token for token in single_spaces.split(" ") if not is_html_tag(token)])

def tokenize_sentence(sentence):
    return word_tokenize(sentence.lower())

def remove_stopwords(sentence):
    return " ".join([word for word in sentence.split(" ") if not word in STOPWORDS])

def remove_punctuation(tokenized_sentence):
    return [stemmer.stem(w_n_lemmatizer.lemmatize(w)) for w in tokenized_sentence if w.isalpha()]

def preprocess(sentence):
    return remove_punctuation(tokenize_sentence(remove_stopwords(remove_html_tags(str(sentence)))))

print(preprocess(df_train["reviewText"].values[1]))


['great', 'varieti']


In [4]:
print(df_train)
print(len(df_train))

        rating                                         reviewText
139356     4.0                                    My dog likes it
521437     5.0                                      Great variety
199505     5.0  Use this every day for my 12+ year old dog.  H...
72805      5.0  Was definitely a win-win situation both dogs l...
308204     4.0                       Love the latte one the most.
...        ...                                                ...
476226     5.0  Well, here's the thing. If you're opening your...
444547     4.0                       Durable and holds up nicely.
450378     3.0                                         Nice phone
244791     4.0                                          cats love
32109      5.0  Yes, I love it along with Several other of the...

[330695 rows x 2 columns]
330695


In [5]:
def sentiment(value):
    if( value<3.0 ):
        return "negative"
    elif (value == 3.0):
        return "neutral"
    else:
        return "positive"

df_train["sentiment"] = [None] * len(df_train)
df_train["sentiment"] = df_train["rating"].apply(sentiment)
df_validate["sentiment"] = [None] * len(df_validate)
df_validate["sentiment"] = df_validate["rating"].apply(sentiment)
df_test["sentiment"] = [None] * len(df_test)
df_test["sentiment"] = df_test["rating"].apply(sentiment)

In [6]:
df_train["reviewText"] = df_train["reviewText"].apply(preprocess)
df_train.head()

Unnamed: 0,rating,reviewText,sentiment
139356,4.0,"[my, dog, like]",positive
521437,5.0,"[great, varieti]",positive
199505,5.0,"[use, everi, day, year, old, dog, he, still, a...",positive
72805,5.0,"[wa, definit, situat, dog, love, play, togeth,...",positive
308204,4.0,"[love, latt, one, most]",positive


In [7]:
def build_vocab(tokenized_input, vocab_size):
    d = dict()

    for tokens in tokenized_input:
        for token in tokens:
            # double check
            if token not in STOPWORDS and token.isalpha():
                d[token] = d.get(token, 0) + 1



    return {k for k, v in sorted(d.items(), key=lambda item: item[1], reverse=True)[:vocab_size]}

In [8]:
VOCAB_SIZE = 25000
VOCAB = build_vocab(df_train["reviewText"], VOCAB_SIZE)

In [9]:
len(VOCAB)

25000

In [10]:
def get_frequencies_for_sentiment(df):
    dict_freqs = {"positive": {}, "negative": {},"neutral":{}}
    
    for idx in range(df.shape[0]):
        tokens = df_train.iloc[idx]["reviewText"]
        sentiment = df_train.iloc[idx]["sentiment"]
        
        for token in tokens:
            if token in VOCAB:
                dict_freqs[sentiment][token] = dict_freqs[sentiment].get(token, 0) + 1
            
    return dict_freqs

In [11]:
frequency_table = get_frequencies_for_sentiment(df_train)

In [12]:
frequency_table["positive"]["love"]
frequency_table["negative"]["love"]
frequency_table["neutral"]["love"]

2042

In [13]:
import numpy as np

def extract_features(frequency_table, tweet_tokens):
    positives = 0
    negatives = 0
    neutrals = 0

    for t in set(tweet_tokens):
        positives += frequency_table["positive"].get(t, 0)
        negatives += frequency_table["negative"].get(t, 0)
        neutrals += frequency_table["neutral"].get(t, 0)
    
    return pd.Series({"positives": positives, "negatives": negatives,"neutrals": neutrals})

In [14]:
X_train_logistic = df_train["reviewText"].apply(lambda tokens: extract_features(frequency_table, tokens))

X_train_logistic

Unnamed: 0,positives,negatives,neutrals
139356,81698,14995,11014
521437,70469,2146,2512
199505,148429,24262,16158
72805,164257,22181,14577
308204,102419,9685,6769
...,...,...,...
476226,182032,32035,21128
444547,31598,2326,2632
450378,29833,2817,2622
244791,84871,5840,4559


In [15]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train_logistic = scaler.fit_transform(X_train_logistic)

In [16]:
y_train_logistic = df_train["rating"].values

In [17]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=0).fit(X_train_logistic, y_train_logistic)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [18]:
from sklearn.metrics import accuracy_score

preds_train = clf.predict(X_train_logistic)

print("Train accuracy:", accuracy_score(y_train_logistic, preds_train))

Train accuracy: 0.6965632985076884


In [19]:
df_validate["reviewText"] = df_validate["reviewText"].apply(preprocess)
X_val_logistic = df_validate["reviewText"].apply(lambda tokens: extract_features(frequency_table, tokens))
X_val_logistic = scaler.transform(X_val_logistic)
y_val_logistic = df_validate["rating"].values

In [20]:
preds_val = clf.predict(X_val_logistic)

print("Validation accuracy:", accuracy_score(y_val_logistic, preds_val))

Validation accuracy: 0.6939545685463386


In [21]:
df_test["reviewText"] = df_test["reviewText"].apply(preprocess)
X_test_logistic = df_test["reviewText"].apply(lambda tokens: extract_features(frequency_table, tokens))
X_test_logistic = scaler.transform(X_test_logistic)
y_test_logistic = df_test["rating"].values

In [22]:
preds_test = clf.predict(X_test_logistic)

print("Test accuracy:", accuracy_score(y_test_logistic, preds_test))

Test accuracy: 0.6942267218230641
