In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import string
from spacy.lang.en.stop_words import STOP_WORDS
import spacy
from sklearn.svm import LinearSVC

In [2]:
data = pd.read_csv('amazon_data.txt', sep='\t', header=None)

In [3]:
data.columns = ["Review", "Sentiment"]
data.head()

Unnamed: 0,Review,Sentiment
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [4]:
data["Sentiment"].value_counts() # checking data

1    500
0    500
Name: Sentiment, dtype: int64

In [5]:
data.isnull().sum() # checking for null values

Review       0
Sentiment    0
dtype: int64

In [6]:
## helpers for data cleaning
punctuation = string.punctuation
stop_words = list(STOP_WORDS)
nlp = spacy.load("en_core_web_sm")
numbers = string.digits
# print(stop_words)
## function that cleans input text
def cleaning_function(input_text):
    text = nlp(input_text)
    tokens = []
    for token in text:
        temp = token.lemma_.lower()
        tokens.append(temp)

    cleaned_tokens = []
    for token in tokens:
        if token not in stop_words and token not in punctuation and token not in numbers:
            cleaned_tokens.append(token)
    return cleaned_tokens

In [7]:
## test
cleaned_text = cleaning_function(data["Review"][4])
print(cleaned_text)

['mic', 'great']


In [8]:
X = data["Review"]
y = data["Sentiment"]

In [9]:
## SVC using tfidf (bag of words)

tfidf = TfidfVectorizer(tokenizer = cleaning_function)
classifier = LinearSVC()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=50)
SVC_clf = Pipeline([('tfidf', tfidf), ('clf', classifier)])
SVC_clf.fit(X_train, y_train)
y_pred = SVC_clf.predict(X_test)

print("Classification Report for SVC:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix for SVC:")
print(confusion_matrix(y_test, y_pred))

Classification Report for SVC:
              precision    recall  f1-score   support

           0       0.75      0.82      0.79       154
           1       0.79      0.72      0.75       146

    accuracy                           0.77       300
   macro avg       0.77      0.77      0.77       300
weighted avg       0.77      0.77      0.77       300

Confusion Matrix for SVC:
[[126  28]
 [ 41 105]]


In [10]:
def predict_func(string):
    temp = SVC_clf.predict([string])
    if (temp[0]==0):
        return("Negative Review")
    else:
        return("Positive Review")

In [11]:
user_input_text = input("Enter review: ")
user_input_pred = predict_func(user_input_text)
print("-->",user_input_pred)

Enter review: hi
--> Negative Review


In [12]:
## Add feature: input URL by user and output how many positive and negative comments

In [13]:
from selenium import webdriver
from pathlib import Path 
from bs4 import BeautifulSoup

DRIVER_PATH= str(Path('chromedriver').resolve())

user_input_URL = input("Enter Amazon URL: ")

print(user_input_URL)

# https://www.amazon.com/Acer-SB220Q-Ultra-Thin-Frame-Monitor/dp/B07CVL2D2S/ref=lp_16225007011_1_10?th=1

Enter Amazon URL: https://www.amazon.com/Acer-SB220Q-Ultra-Thin-Frame-Monitor/dp/B07CVL2D2S/ref=lp_16225007011_1_10?th=1
https://www.amazon.com/Acer-SB220Q-Ultra-Thin-Frame-Monitor/dp/B07CVL2D2S/ref=lp_16225007011_1_10?th=1


In [14]:
driver = webdriver.Chrome(executable_path=DRIVER_PATH)
driver.get(user_input_URL)
html = driver.page_source
soup = BeautifulSoup(html, 'lxml')
comments = soup.find_all('div', {'data-hook':'review-collapsed'})

In [15]:
# print(comments[0].span.text)
# for card in cards:
#     print (card)

comment_arr = []

for comment in comments:
    comment_arr.append(comment.span.text)

In [16]:
print(comment_arr[0])


  This monitor is definitely a good value. Does it have superb color and contrast? No. Does it boast the best refresh rate on the market? No. But if you're tight on money, this thing looks and preforms great for the money. It has a Matte screen which does a great job at eliminating glare. The chassis it's enclosed within is absolutely stunning. It features a VGA and and HDMI port. The screen automatically detects signals and turns on when a source is turned on. When the selected source turns off it quickly scans and switches to any other sources. If no connections are available it goes into standby automatically. The power chord is a good 6 or more feet and easy to wrap with half of it being very thin wire. The ON LED is not distracting or too bright. When it's off it looks like a borderless monitor. And unlike what other people say it does have tilt, just not forward or left to right. It stands up straight or leans back. Unfortunately it doesn't have VESA mount holes, but under 100 y

In [18]:
## PREDICT

pred_arr = []

for comment in comment_arr:
    comment_pred = predict_func(comment)
    pred_arr.append(comment_pred)
    
print(pred_arr)

['Positive Review', 'Positive Review', 'Positive Review', 'Negative Review', 'Positive Review', 'Negative Review', 'Positive Review', 'Positive Review', 'Negative Review', 'Positive Review', 'Positive Review', 'Positive Review']

  I bought two of these and enjoy them quite a bit. A few reviews complain about ghosting, which I experienced until I realized in the default settings on the monitors had "overdrive" or "OD" on. As long as you disable this feature you will no longer have ghosting trouble! Hope this helps anyone who bought these and was disappointed, I saw nothing online about fixing this issue other than figuring it out myself by chance.



In [21]:
print(comment_arr[10])


  Compré este monitor por el simple hecho de que era un 1080p 75hz de Acer. Me sorprendió que estas características estuviese a sólo 1,700mxn. No esperaba mucho de él, pero en estos tres meses me dio una gran sorpresa.El monitor es increíblemente liviano y delgado. La fuente de poder es externa, lo que ahorra costos y ayuda a su estética general.Cumple con todo, el full HD, los 75hz,  y es IPS ( no he probado el FreeSYNC, no uso AMD)Pros:+Es muy ligero, facilita mucho la limpieza del escritorio y el transporte.+Costo/beneficio, es de los mejores que puedes encontrar, superando a monitores 1080p60hz+Tiene varios perfiles personalizables y puedes alternar entre ellos fácilmente.+Tiene unos 30 grados de inclinación para mejor visualizaciónCons:-La base es de un plástico ligero y barato. Aunque para el peso del monitor, no necesita más.-Por default tiene un filtro verde muy molesto. Pero Se arregla en 10 minutos corrigiendo el color.-Los botones los tiene en la parte inferior, Cuesta algo