In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import string
from spacy.lang.en.stop_words import STOP_WORDS
import spacy
from sklearn.svm import LinearSVC
from google_trans_new import google_translator
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Training data

In [2]:
data = pd.read_csv('amazon_data.txt', sep='\t', header=None)

In [3]:
data.columns = ["Review", "Sentiment"]
data.head()

Unnamed: 0,Review,Sentiment
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [4]:
data["Sentiment"].value_counts() # checking data

1    500
0    500
Name: Sentiment, dtype: int64

In [5]:
data.isnull().sum() # checking for null values

Review       0
Sentiment    0
dtype: int64

In [6]:
## helpers for data cleaning
punctuation = string.punctuation
stop_words = list(STOP_WORDS)
nlp = spacy.load("en_core_web_sm")
numbers = string.digits
# print(stop_words)
## function that cleans input text
def cleaning_function(input_text):
    text = nlp(input_text)
    tokens = []
    for token in text:
        temp = token.lemma_.lower()
        tokens.append(temp)

    cleaned_tokens = []
    for token in tokens:
        if token not in stop_words and token not in punctuation and token not in numbers:
            cleaned_tokens.append(token)
    return cleaned_tokens

In [7]:
## test
cleaned_text = cleaning_function(data["Review"][4])
print(cleaned_text)

['mic', 'great']


In [8]:
X = data["Review"]
y = data["Sentiment"]

In [9]:
## SVC using tfidf (bag of words)

tfidf = TfidfVectorizer(tokenizer = cleaning_function)
classifier = LinearSVC()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=50)
SVC_clf = Pipeline([('tfidf', tfidf), ('clf', classifier)])
SVC_clf.fit(X_train, y_train)
y_pred = SVC_clf.predict(X_test)

print("Classification Report for SVC:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix for SVC:")
print(confusion_matrix(y_test, y_pred))

Classification Report for SVC:
              precision    recall  f1-score   support

           0       0.75      0.82      0.79       154
           1       0.79      0.72      0.75       146

    accuracy                           0.77       300
   macro avg       0.77      0.77      0.77       300
weighted avg       0.77      0.77      0.77       300

Confusion Matrix for SVC:
[[126  28]
 [ 41 105]]


In [10]:
def predict_func(string):
    temp = SVC_clf.predict([string])
    if (temp[0]==0):
        return("Negative Review")
    else:
        return("Positive Review")

In [11]:
## For testing 
# user_input_text = input("Enter review: ")
# user_input_pred = predict_func(user_input_text)
# print("-->",user_input_pred)

# Text Scraping

In [12]:
## Add feature: input URL by user and output how many positive and negative comments

In [17]:
from selenium import webdriver
from pathlib import Path 
from bs4 import BeautifulSoup

DRIVER_PATH= str(Path('chromedriver').resolve())

user_input_URL = input("Enter Amazon URL: ")

# print(user_input_URL)

# https://www.amazon.com/Acer-SB220Q-Ultra-Thin-Frame-Monitor/dp/B07CVL2D2S/ref=lp_16225007011_1_10?th=1

Enter Amazon URL: https://www.amazon.com/Acer-SB220Q-Ultra-Thin-Frame-Monitor/dp/B07CVL2D2S/ref=lp_16225007011_1_10?th=1


In [18]:
translator = google_translator()
driver = webdriver.Chrome(executable_path=DRIVER_PATH)
driver.get(user_input_URL)
html = driver.page_source
soup = BeautifulSoup(html, 'lxml')

all_reviews_button = driver.find_elements_by_xpath('//*[@id="cr-pagination-footer-0"]/a')[0]
all_reviews_button.click()

all_pages_reached = False

comment_arr = []
ratings_arr = []

pages = 0
while pages < 15:
# while not all_pages_reached:
    pages += 1
    comments = soup.find_all('span', {'data-hook':'review-body'})
    ratings = soup.find_all('i', {'data-hook':'review-star-rating'})
    
    for comment in comments:
        comment_temp = comment.span.text
        time.sleep(0.2)
        
##         Translator gets blocked from Google's side if there is too many requests
#         if((translator.detect(comment_temp))[0]!='en'):
#             comment_temp = translator.translate(comment_temp, lang_tgt='en')
            
        comment_arr.append(comment_temp)
    
    for rating in ratings:
        rating_temp = float(rating.text[:3])
        ratings_arr.append(rating_temp)
    
    try:
        next_page_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, '//*[@id="cm_cr-pagination_bar"]/ul/li[2]/a'))
        )
        next_page_button.click()
    except:
        print("Finished")
        all_pages_reached = True 
        break

In [19]:
## PREDICT

pred_arr = []
pos = 0
neg = 0

for comment in comment_arr:
    comment_pred = predict_func(comment)
    if comment_pred=='Positive Review':
        pos+=1
    else:
        neg+=1
    pred_arr.append(comment_pred)
    
# print(pred_arr)
print("Total number of positive reviews:", pos)
print("Total number of negative reviews:", neg)

# from scraping
print("Average rating from reviewers:", '%.2f' % (sum(ratings_arr)/len(ratings_arr)))

Total number of positive reviews: 135
Total number of negative reviews: 45
Average rating from reviewers: 4.29


In [20]:
driver.quit()