Sentiment analysis NLP Amazon reviews

In [1]:

import pandas as pd
import numpy as np

from nltk import sent_tokenize
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from gensim.models import Word2Vec


In [2]:
import nltk
nltk.download("stopwords")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\michael\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
def preprocess(docs):
    lemmatizer = WordNetLemmatizer()
    stemmer = SnowballStemmer("english")
    preprocessed = []

    for doc in docs:
        tokenized = word_tokenize(doc)

        cleaned = [
            stemmer.stem(lemmatizer.lemmatize(token.lower()))
            for token in tokenized
            if not token.lower() in stopwords.words("english")
            if token.isalpha()
        ]

        untokenized = " ".join(cleaned)
        preprocessed.append(untokenized)

    return preprocessed


from nltk.corpus import stopwords
import re
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

def remove_stopwords(input_text):
    stopwords_list = stopwords.words("english")
    # Some words which might indicate a certain sentiment are kept via a whitelist
    whitelist = ["n't", "not", "no"]
    words = input_text.split()
    clean_words = [
        word
        for word in words
        if (word not in stopwords_list or word in whitelist) and len(word) > 1
    ]
    return " ".join(clean_words)


def stem_list(word_list):
    stemmed = []
    for word in word_list:
        stemmedword = stemmer.stem(word)
        stemmed.append(stemmedword)
    return stemmed


def normalize(terms):
    terms = terms.lower()
    terms = remove_stopwords(terms)
    word_delimiters = u"[\\[\\]\n.!?,;:\t\\-\\\"\\(\\)\\'\u2019\u2013 ]"
    term_list = re.split(word_delimiters, terms)
    trimmed = [x.rstrip() for x in term_list]
    stemmed = stem_list(trimmed)
    space = " "
    normed = space.join(stemmed)
    normed = normed.replace("  ", " ")
    return normed

In [4]:

am_rev = pd.read_csv(
    "Reviews.csv"
)

In [5]:
am_rev.head(3)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...


Clean and preliminary analysis of data

In [6]:
am_rev.isna().mean()

Id                        0.000000
ProductId                 0.000000
UserId                    0.000000
ProfileName               0.000028
HelpfulnessNumerator      0.000000
HelpfulnessDenominator    0.000000
Score                     0.000000
Time                      0.000000
Summary                   0.000047
Text                      0.000000
dtype: float64

In [7]:
am_rev = am_rev.dropna()

In [8]:
am_rev.shape

(568411, 10)

In [9]:
drop_cols = ["Id", "ProductId", "UserId", "ProfileName", "Time"]
am_rev = am_rev.drop(columns=drop_cols)

In [10]:

am_rev["normalized"] = am_rev["Text"].apply(normalize)

In [11]:
am_rev.head()

Unnamed: 0,HelpfulnessNumerator,HelpfulnessDenominator,Score,Summary,Text,normalized
0,1,1,5,Good Quality Dog Food,I have bought several of the Vitality canned d...,bought sever vital can dog food product found ...
1,0,0,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,product arriv label jumbo salt peanut the pea...
2,1,1,4,"""Delight"" says it all",This is a confection that has been around a fe...,confect around centuri light pillowi citru gel...
3,3,3,2,Cough Medicine,If you are looking for the secret ingredient i...,look secret ingredi robitussin believ found it...
4,0,0,5,Great taffy,Great taffy at a great price. There was a wid...,great taffi great price wide assort yummi taff...


In [12]:

# score of 1-2 will be negative,
# score of 3 will ne neutral
# score of 4-5 will be positive
# Delete rows containing a score of 3 since only predicting positive or negative.

am_rev = am_rev[am_rev["Score"] != 3]

In [13]:
# change score column to 1 for positive and 0 for negative
# delete original Score column

am_rev.loc[am_rev["Score"] <= 2, "Sentiment"] = 0
am_rev.loc[am_rev["Score"] > 2, "Sentiment"] = 1
am_rev = am_rev.drop(columns=["Score"])
am_rev.head()

Unnamed: 0,HelpfulnessNumerator,HelpfulnessDenominator,Summary,Text,normalized,Sentiment
0,1,1,Good Quality Dog Food,I have bought several of the Vitality canned d...,bought sever vital can dog food product found ...,1.0
1,0,0,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,product arriv label jumbo salt peanut the pea...,0.0
2,1,1,"""Delight"" says it all",This is a confection that has been around a fe...,confect around centuri light pillowi citru gel...,1.0
3,3,3,Cough Medicine,If you are looking for the secret ingredient i...,look secret ingredi robitussin believ found it...,0.0
4,0,0,Great taffy,Great taffy at a great price. There was a wid...,great taffi great price wide assort yummi taff...,1.0


Converting text features into numerical form

In [14]:
#test train
X = am_rev[["Summary", "Text", "normalized"]]
y = am_rev["Sentiment"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=85
)

In [15]:
vectorizer = CountVectorizer()
vectors = vectorizer.fit_transform(am_rev)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
model = Pipeline([("vect", CountVectorizer()), ("clf", LogisticRegression())])

(394329, 3) (131444, 3) (394329,) (131444,)


In [16]:
X_train = vectorizer.fit_transform(X_train["Summary"])
X_test = vectorizer.transform(X_test["Summary"])

lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_prediction = lr.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [17]:
lr.score(X_train, y_train)

0.9314506414694329

In [18]:
lr.score(X_test, y_test)


0.9254435348893826

good scores orverall. train is verry close to test so not over fitting

Converting text features into numerical form and using a dif vectorizer

In [19]:

X = am_rev[["Summary", "Text", "normalized"]]
y = am_rev["Sentiment"]


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=85
)

In [20]:
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(am_rev)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
model = Pipeline([("vect", CountVectorizer()), ("clf", LogisticRegression())])

(394329, 3) (131444, 3) (394329,) (131444,)


In [21]:
X_train = vectorizer.fit_transform(X_train["Summary"])
X_test = vectorizer.transform(X_test["Summary"])

In [22]:
tfid_lr = LogisticRegression()
tfid_lr.fit(X_train, y_train)
lr_prediction = tfid_lr.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [23]:
tfid_lr.score(X_train, y_train)

0.9307684699831866

In [24]:
tfid_lr.score(X_test, y_test)

0.9265694896686041

In [25]:
.92656948-0.92544353

0.0011259499999999312

good scores overall. train is verry close to test so not over fitting

 After preprocessing I dropped identifer columns. the goal is to predict whether the review is Positive or Negative. Scoring is between 1-5: A score of 1-2 will be negative, a score of 3 will ne neutral and deleted, and a score of 4-5 will be positive. I tried two diffent vectorizing methods with Logistic Regression. (Count vectorizer and  tfidf vectorizer). Count vectorizer had a train score of 93% and a test score of 93% and the tfidf vectorizer had a train score of 93% and test score of 93%. The train and test scores are similar due to the fact that both vectorizers perform silimar transformations. the tfidf vectoriver is .11% higher but thats not significant enough to say it did better