### Import the Required Packages

In [206]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re

import nltk
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from nltk import FreqDist

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

nltk.download('stopwords')
nltk.download('vader_lexicon')

import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mande\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\mande\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


### Reading the dataframe

In [157]:
amazon_df = pd.read_csv("amazon_reviews.csv")

In [158]:
amazon_df.head()

Unnamed: 0.1,Unnamed: 0,reviewerName,overall,reviewText,reviewTime,day_diff,helpful_yes,helpful_no,total_vote,score_pos_neg_diff,score_average_rating,wilson_lower_bound
0,0,,4.0,No issues.,2014-07-23,138,0,0,0,0,0.0,0.0
1,1,0mie,5.0,"Purchased this for my device, it worked as adv...",2013-10-25,409,0,0,0,0,0.0,0.0
2,2,1K3,4.0,it works as expected. I should have sprung for...,2012-12-23,715,0,0,0,0,0.0,0.0
3,3,1m2,5.0,This think has worked out great.Had a diff. br...,2013-11-21,382,0,0,0,0,0.0,0.0
4,4,2&amp;1/2Men,5.0,"Bought it with Retail Packaging, arrived legit...",2013-07-13,513,0,0,0,0,0.0,0.0


In [159]:
amazon_df.shape

(4915, 12)

In [160]:
round((amazon_df.isna().sum()/amazon_df.shape[0])*100,3)

Unnamed: 0              0.00
reviewerName            0.02
overall                 0.00
reviewText              0.02
reviewTime              0.00
day_diff                0.00
helpful_yes             0.00
helpful_no              0.00
total_vote              0.00
score_pos_neg_diff      0.00
score_average_rating    0.00
wilson_lower_bound      0.00
dtype: float64

In [161]:
amazon_df.dropna(inplace = True)

In [162]:
amazon_df.shape

(4913, 12)

### Text Preprocessing

In [163]:
def preprocess_reviews(df):
    df['reviewText'] = df['reviewText'].apply(lambda x: " ".join(x.lower() for x in str(x).split()))
    df['reviewText'] = df['reviewText'].str.replace('[^\w\s]', '')
    df['reviewText'] = df['reviewText'].str.replace('\d', '')

    stop_words = set(stopwords.words("english"))
    stop_words.discard("no")
    stop_words.discard("not")
    df['reviewText'] = df['reviewText'].apply(lambda x: " ".join(x for x in str(x).split() if x not in stop_words))

    document = df['reviewText']
    all_words = [word for sentence in document for word in sentence.split()]
    count_word_frequency = FreqDist(all_words)
    common_words_list = [word for word, count in count_word_frequency.items() if count >= 1]
    df['reviewText'] = df['reviewText'].apply(lambda x: " ".join(x for x in str(x).split() if x in common_words_list))

    lemmatizer = WordNetLemmatizer()
    df['reviewText'] = df['reviewText'].apply(lambda x: " ".join(lemmatizer.lemmatize(x) for x in str(x).split()))

    return df

preprocessed_df = preprocess_reviews(amazon_df.copy())

In [164]:
preprocessed_df['reviewText'].head()

1    purchased device, worked advertised. never muc...
2    work expected. sprung higher capacity. think m...
3    think worked great.had diff. bran 64gb card we...
4    bought retail packaging, arrived legit, orange...
5    mini storage. anything else not supposed to. p...
Name: reviewText, dtype: object

### Sentiment Analyzer

In [170]:
sia = SentimentIntensityAnalyzer()
preprocessed_df['polarity_score'] = preprocessed_df['reviewText'].apply(lambda x : sia.polarity_scores(x)["compound"])
preprocessed_df['polarity_score'] = preprocessed_df['polarity_score'].apply(lambda x : 1 if x>=0 else 0)

### Selecting Revelant Features to build the model

In [171]:
data = preprocessed_df[['polarity_score','reviewText']]

In [172]:
data.head()

Unnamed: 0,polarity_score,reviewText
1,0,"purchased device, worked advertised. never muc..."
2,0,work expected. sprung higher capacity. think m...
3,1,think worked great.had diff. bran 64gb card we...
4,1,"bought retail packaging, arrived legit, orange..."
5,1,mini storage. anything else not supposed to. p...


In [173]:
data.isna().sum()

polarity_score    0
reviewText        0
dtype: int64

### Train Test Split

In [186]:
X_train, X_test, y_train, y_test = train_test_split(data["reviewText"], data["polarity_score"], random_state= 42)

### Word to Vectors

#### Bag of Words

In [197]:
def word_to_vectors_bow(X_train, X_test):
    count_vectorizer = CountVectorizer()
    X_train_count_vectorizer = count_vectorizer.fit_transform(X_train)
    X_test_count_vectorizer = count_vectorizer.transform(X_test)
    return X_train_count_vectorizer, X_test_count_vectorizer

#### TFIDF 

In [204]:
def word_to_vectors_tf_idf(X_train, X_test):
    tf_idf_word_vectorizer = TfidfVectorizer()
    X_train_tf_idf_word = tf_idf_word_vectorizer.fit_transform(X_train)
    X_test_tf_idf_word = tf_idf_word_vectorizer.transform(X_test)
    return X_train_tf_idf_word, X_test_tf_idf_word

In [210]:
def word_to_vectors_tf_idf_ngram(X_train, X_test):
    tf_idf_ngram_vectorizer = TfidfVectorizer(ngram_range= (2,3))
    X_train_tf_idf_ngram = tf_idf_ngram_vectorizer.fit_transform(X_train)
    X_test_tf_idf_ngram = tf_idf_ngram_vectorizer.transform(X_test)
    return X_train_tf_idf_ngram, X_test_tf_idf_ngram

In [213]:
def word_to_vectors_tf_idf_char(X_train, X_test):
    tf_idf_char_vectorizer = TfidfVectorizer(analyzer="char", ngram_range=(2,3))
    X_train_tf_idf_char = tf_idf_char_vectorizer.fit_transform(X_train)
    X_test_tf_idf_char = tf_idf_char_vectorizer.transform(X_test)
    return X_train_tf_idf_char, X_test_tf_idf_char

###  Model Training

In [201]:
x_train_count_vectorizer, x_test_count_vectorizer = word_to_vectors_bow(X_train, X_test)
logistic_model = LogisticRegression()
logistic_model.fit(x_train_count_vectorizer, y_train)
y_predict = logistic_model.predict(x_test_count_vectorizer)
accuracy = accuracy_score(y_test, y_predict)
print("Accuracy:", accuracy)

Accuracy: 0.8681855166802278


In [205]:
X_train_tf_idf_word, X_test_tf_idf_word = word_to_vectors_tf_idf(X_train, X_test)
logistic_model = LogisticRegression()
logistic_model.fit(X_train_tf_idf_word, y_train)
y_predict = logistic_model.predict(X_test_tf_idf_word)
accuracy = accuracy_score(y_test, y_predict)
print("Accuracy:", accuracy)

Accuracy: 0.8510984540276648


In [211]:
X_train_tf_idf_ngram, X_test_tf_idf_ngram = word_to_vectors_tf_idf_ngram(X_train, X_test)
logistic_model = LogisticRegression()
logistic_model.fit(X_train_tf_idf_ngram, y_train)
y_predict = logistic_model.predict(X_test_tf_idf_ngram)
accuracy = accuracy_score(y_test, y_predict)
print("Accuracy:", accuracy)

Accuracy: 0.8218063466232709


In [214]:
X_train_tf_idf_char, X_test_tf_idf_char = word_to_vectors_tf_idf_char(X_train, X_test)
logistic_model = LogisticRegression()
logistic_model.fit(X_train_tf_idf_char, y_train)
y_predict = logistic_model.predict(X_test_tf_idf_char)
accuracy = accuracy_score(y_test, y_predict)
print("Accuracy:", accuracy)

Accuracy: 0.8421480878763222
