In [None]:
https://www.kaggle.com/rogate16/amazon-reviews-2018-full-dataset

In [199]:
import pandas as pd

df = pd.read_csv("./amazon_reviews.csv")

df.drop(df.iloc[:, 1:9], inplace = True, axis = 1)
df.drop(['userName', 'reviewTime','summary','vote'], axis = 1,inplace = True)
df.head()


Unnamed: 0,rating,reviewText
0,5.0,super smooth and yummy with crunchy bits
1,5.0,Perfect for kombucha
2,5.0,Finally a harness that fits my puppy. I really...
3,5.0,I LOVE THEM!! I bought them at Micheals our of...
4,5.0,"I love this pen! I love the shape of it, the f..."


In [236]:
import numpy as np
df_train, df_validate, df_test = np.split(df.sample(frac=0.4, random_state=200), 
                       [int(.2*len(df)), int(.3*len(df))])



In [237]:
print(df.size)
print(df_train.size)
print(df_validate.size)
print(df_test.size)


1102318
220462
110232
110234


In [238]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from nltk.stem import PorterStemmer  

stemmer = PorterStemmer()

STOPWORDS = stopwords.words("english")

def is_html_tag(word):
    w = word.replace("\n", "")
    return w.startswith("<") or w.startswith(">") or w.startswith("/") or w.strip()[:2] == "br"

def remove_html_tags(sentence):
    single_spaces = " ".join(sentence.split())

    return " ".join([token for token in single_spaces.split(" ") if not is_html_tag(token)])

def tokenize_sentence(sentence):
    return word_tokenize(sentence.lower())

def remove_stopwords(sentence):
    return " ".join([word for word in sentence.split(" ") if not word in STOPWORDS])

def remove_punctuation(tokenized_sentence):
    return [stemmer.stem(w_n_lemmatizer.lemmatize(w)) for w in tokenized_sentence if w.isalpha()]

def preprocess(sentence):
    return remove_punctuation(tokenize_sentence(remove_stopwords(remove_html_tags(str(sentence)))))

print(preprocess(df_train["reviewText"].values[1]))


['my', 'dog', 'love', 'it', 'smell', 'aw']


In [239]:
print(df_train)
print(len(df_train))

        rating                                         reviewText
517892     5.0  Great adapter to convert your small tanks to y...
299317     4.0              My dog loves it, but it smells awful.
262608     5.0  I was glad to find these refills online.  I lo...
148619     5.0  These worked fantastically. I'd never used thi...
488257     5.0                                              Yummy
...        ...                                                ...
416233     5.0  My cats love this food! Such a reasonable pric...
239020     5.0                           Dog unable to rip apart!
533511     5.0  I used this chord to,go,from my Bass peddle to...
8272       5.0  I've outsourced most of my lawn maintenance, b...
345317     3.0                      not enough, packets are small

[110231 rows x 2 columns]
110231


In [244]:
def sentiment(value):
    if( value<3.0 ):
        return "negative"
    elif (value == 3.0):
        return "neutral"
    else:
        return "positive"

df_train["sentiment"] = [None] * len(df_train)
df_train["sentiment"] = df_train["rating"].apply(sentiment)
df_validate["sentiment"] = [None] * len(df_validate)
df_validate["sentiment"] = df_validate["rating"].apply(sentiment)
df_test["sentiment"] = [None] * len(df_test)
df_test["sentiment"] = df_test["rating"].apply(sentiment)

In [241]:
df_train["reviewText"] = df_train["reviewText"].apply(preprocess)
df_train.head()

Unnamed: 0,rating,reviewText,sentiment
517892,5.0,"[great, adapt, convert, small, tank, big, one]",positive
299317,4.0,"[my, dog, love, it, smell, aw]",positive
262608,5.0,"[i, glad, find, refil, onlin, i, love, paper, ...",positive
148619,5.0,"[these, work, fantast, i, never, use, type, to...",positive
488257,5.0,[yummi],positive


In [245]:
def build_vocab(tokenized_input, vocab_size):
    d = dict()

    for tokens in tokenized_input:
        for token in tokens:
            # double check
            if token not in STOPWORDS and token.isalpha():
                d[token] = d.get(token, 0) + 1



    return {k for k, v in sorted(d.items(), key=lambda item: item[1], reverse=True)[:vocab_size]}

In [246]:
VOCAB_SIZE = 4000
VOCAB = build_vocab(df_train["reviewText"], VOCAB_SIZE)

In [247]:
len(VOCAB)

4000

In [248]:
def get_frequencies_for_sentiment(df):
    dict_freqs = {"positive": {}, "negative": {},"neutral":{}}
    
    for idx in range(df.shape[0]):
        tokens = df_train.iloc[idx]["reviewText"]
        sentiment = df_train.iloc[idx]["sentiment"]
        
        for token in tokens:
            if token in VOCAB:
                dict_freqs[sentiment][token] = dict_freqs[sentiment].get(token, 0) + 1
            
    return dict_freqs

In [249]:
frequency_table = get_frequencies_for_sentiment(df_train)

In [252]:
frequency_table["positive"]["love"]

21124

In [253]:
import numpy as np

def extract_features(frequency_table, tweet_tokens):
    positives = 0
    negatives = 0
    neutrals = 0

    for t in set(tweet_tokens):
        positives += frequency_table["positive"].get(t, 0)
        negatives += frequency_table["negative"].get(t, 0)
        neutrals += frequency_table["neutral"].get(t, 0)
    
    return pd.Series({"positives": positives, "negatives": negatives,"neutrals": neutrals})

In [254]:
X_train_logistic = df_train["reviewText"].apply(lambda tokens: extract_features(frequency_table, tokens))

X_train_logistic

Unnamed: 0,positives,negatives,neutrals
517892,43357,4593,3572
299317,36134,3740,2463
262608,34790,2920,2460
148619,100284,15175,10514
488257,554,9,11
...,...,...,...
416233,46425,4184,3068
239020,13780,2797,1692
533511,33821,5283,3819
8272,288096,45760,34325


In [255]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train_logistic = scaler.fit_transform(X_train_logistic)

In [256]:
y_train_logistic = df_train["rating"].values

In [257]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=0).fit(X_train_logistic, y_train_logistic)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [258]:
from sklearn.metrics import accuracy_score

preds_train = clf.predict(X_train_logistic)

print("Train accuracy:", accuracy_score(y_train_logistic, preds_train))

Train accuracy: 0.6959385290889133


In [259]:
df_validate["reviewText"] = df_validate["reviewText"].apply(preprocess)
X_val_logistic = df_validate["reviewText"].apply(lambda tokens: extract_features(frequency_table, tokens))
X_val_logistic = scaler.transform(X_val_logistic)
y_val_logistic = df_validate["rating"].values

In [260]:
preds_val = clf.predict(X_val_logistic)

print("Validation accuracy:", accuracy_score(y_val_logistic, preds_val))

Validation accuracy: 0.6964765222439945


In [261]:
df_test["reviewText"] = df_test["reviewText"].apply(preprocess)
X_test_logistic = df_test["reviewText"].apply(lambda tokens: extract_features(frequency_table, tokens))
X_test_logistic = scaler.transform(X_test_logistic)
y_test_logistic = df_test["rating"].values

In [262]:
preds_test = clf.predict(X_test_logistic)

print("Test accuracy:", accuracy_score(y_test_logistic, preds_test))

Test accuracy: 0.6944862746521037
