In [41]:
import pandas as pd
import numpy as np
import spacy
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [43]:
df=pd.read_csv(r"C:\Users\omgai\Downloads\amazon_alexa.tsv",sep='\t')
df.head()

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1


In [45]:
df.shape

(3150, 5)

In [47]:
df.rating.value_counts()

rating
5    2286
4     455
1     161
3     152
2      96
Name: count, dtype: int64

In [64]:
df.isnull().sum()

rating              0
date                0
variation           0
verified_reviews    1
feedback            0
Sentiment           0
dtype: int64

In [70]:
df = df.dropna()

In [72]:
df.isnull().sum()

rating              0
date                0
variation           0
verified_reviews    0
feedback            0
Sentiment           0
dtype: int64

In [74]:
df.dtypes

rating               int64
date                object
variation           object
verified_reviews    object
feedback             int64
Sentiment            int64
dtype: object

let's assumed that the 5 & 4 stars ratting are positive feedback.
3 for neutral &
1  &  2 stars are negative reviews.

now lets create the new column with the 0 for positive , 1 for neutral & 2 for negative reviews.

In [78]:
df['Sentiment']=df.rating.map({
     5:0,
    4:0,
    3:1,
    2:2,
    1:2  
})
df.head()

Unnamed: 0,rating,date,variation,verified_reviews,feedback,Sentiment
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1,0
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1,0
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1,0
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1,0
4,5,31-Jul-18,Charcoal Fabric,Music,1,0


In [80]:
df.Sentiment.value_counts()

Sentiment
0    2741
2     256
1     152
Name: count, dtype: int64

In [82]:
min_count=df.Sentiment.value_counts().min()

see there is huge imbalance in the dataset to ovecome this problem lets use undersampling for this small learning project even it is not good practice to lose data

In [85]:
df_positive=df[df.Sentiment==0].sample(min_count)
df_positive.shape

(152, 6)

let's do it for remaining ones

In [88]:
df_neutral=df[df.Sentiment==1].sample(min_count)
df_negative=df[df.Sentiment==2].sample(min_count)

df=pd.concat([df_positive,df_neutral,df_negative])
df.head()

Unnamed: 0,rating,date,variation,verified_reviews,feedback,Sentiment
2562,5,30-Jul-18,Black Dot,"Just like it's big sister, the Echo, only smal...",1,0
1560,5,30-Jul-18,Black Show,I love the Echo Show!! I have the dot and have...,1,0
3099,4,30-Jul-18,Black Dot,Good device,1,0
2593,5,30-Jul-18,Black Dot,Alexa is phenomenal!,1,0
1929,4,26-Jul-18,Black Plus,I love my Echo Plus. The only drawback is that...,1,0


In [90]:
df.Sentiment.value_counts()

Sentiment
0    152
1    152
2    152
Name: count, dtype: int64

#  Preprocessing 

in preprocessing we are going to use Spacy -> tokenizations, stop words , lemmatization. 

In [95]:
def preprocessing(text):
    nlp=spacy.load('en_core_web_sm')
    doc=nlp(text)

    words=[]
    for token in doc:
        if not token.is_stop and not token.is_punct:
            words.append(token.lemma_)

    return " ".join(words)

In [97]:
df['preprocessed_text']=df.verified_reviews.apply(preprocessing)
df.head()

Unnamed: 0,rating,date,variation,verified_reviews,feedback,Sentiment,preprocessed_text
2562,5,30-Jul-18,Black Dot,"Just like it's big sister, the Echo, only smal...",1,0,like big sister Echo small slightly audio fide...
1560,5,30-Jul-18,Black Show,I love the Echo Show!! I have the dot and have...,1,0,love Echo dot 6 month well sound quality a...
3099,4,30-Jul-18,Black Dot,Good device,1,0,good device
2593,5,30-Jul-18,Black Dot,Alexa is phenomenal!,1,0,Alexa phenomenal
1929,4,26-Jul-18,Black Plus,I love my Echo Plus. The only drawback is that...,1,0,love Echo Plus drawback battery plug time make...


In [105]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df.preprocessed_text, 
    df.Sentiment, 
    test_size=0.2, # 20% samples will go to test dataset
    random_state=2022,
    stratify=df.Sentiment
)

In [109]:
clf=Pipeline([
    ('vectorizer',CountVectorizer(ngram_range=(1,3)) ),
    ('model', RandomForestClassifier())
])
clf.fit(X_train, y_train)
y_pred=clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.71      0.65      0.68        31
           1       0.80      0.52      0.63        31
           2       0.50      0.73      0.59        30

    accuracy                           0.63        92
   macro avg       0.67      0.63      0.63        92
weighted avg       0.67      0.63      0.63        92



In [117]:
from sklearn.feature_extraction.text import TfidfVectorizer
clf=Pipeline([
    ('vectorizer',TfidfVectorizer() ),
    ('model', RandomForestClassifier())
])
clf.fit(X_train, y_train)
y_pred=clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.61      0.75        31
           1       0.74      0.65      0.69        31
           2       0.53      0.80      0.64        30

    accuracy                           0.68        92
   macro avg       0.74      0.69      0.69        92
weighted avg       0.74      0.68      0.69        92



In [119]:
from sklearn.neighbors import KNeighborsClassifier
clf=Pipeline([
    ('vectorizer',TfidfVectorizer() ),
    ('model', KNeighborsClassifier())
])
clf.fit(X_train, y_train)
y_pred=clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.16      0.27        31
           1       0.45      0.45      0.45        31
           2       0.42      0.77      0.54        30

    accuracy                           0.46        92
   macro avg       0.57      0.46      0.42        92
weighted avg       0.57      0.46      0.42        92



In [121]:
from sklearn.naive_bayes import MultinomialNB
clf=Pipeline([
    ('vectorizer',TfidfVectorizer() ),
    ('model', MultinomialNB())
])
clf.fit(X_train, y_train)
y_pred=clf.predict(X_test)

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.96      0.74      0.84        31
           1       0.68      0.61      0.64        31
           2       0.60      0.80      0.69        30

    accuracy                           0.72        92
   macro avg       0.75      0.72      0.72        92
weighted avg       0.75      0.72      0.72        92



In [123]:
from xgboost import XGBClassifier
clf=Pipeline([
    ('vectorizer',TfidfVectorizer() ),
    ('model', XGBClassifier())
])
clf.fit(X_train, y_train)
y_pred=clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.52      0.65        31
           1       0.70      0.61      0.66        31
           2       0.49      0.77      0.60        30

    accuracy                           0.63        92
   macro avg       0.69      0.63      0.64        92
weighted avg       0.70      0.63      0.64        92



so by trying various algos following works well so lets import this model

In [126]:
clf=Pipeline([
    ('vectorizer',TfidfVectorizer() ),
    ('model', RandomForestClassifier())
])
clf.fit(X_train, y_train)
y_pred=clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.55      0.68        31
           1       0.72      0.58      0.64        31
           2       0.50      0.80      0.62        30

    accuracy                           0.64        92
   macro avg       0.70      0.64      0.65        92
weighted avg       0.71      0.64      0.65        92



In [128]:
import joblib

# Save to a file
joblib.dump(clf, "basic_alexa_review_model.pkl")

['basic_alexa_review_model.pkl']