In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk   ### nltk / spacy
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay

In [None]:
data = pd.read_csv('amazon_alexa.tsv', sep='\t')
data

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1
...,...,...,...,...,...
3145,5,30-Jul-18,Black Dot,"Perfect for kids, adults and everyone in betwe...",1
3146,5,30-Jul-18,Black Dot,"Listening to music, searching locations, check...",1
3147,5,30-Jul-18,Black Dot,"I do love these things, i have them running my...",1
3148,5,30-Jul-18,White Dot,Only complaint I have is that the sound qualit...,1


In [None]:
data.isna().sum()

Unnamed: 0,0
rating,0
date,0
variation,0
verified_reviews,1
feedback,0


In [None]:
data.dropna(inplace=True)

In [None]:
data.duplicated().sum()

715

In [None]:
### Classification
### check data imbalance
data['rating'].value_counts()

Unnamed: 0_level_0,count
rating,Unnamed: 1_level_1
5,2286
4,455
1,161
3,152
2,95


In [None]:
data['feedback'].value_counts()

Unnamed: 0_level_0,count
feedback,Unnamed: 1_level_1
1,2893
0,256


In [None]:
#### 1.Lower case
data['verified_reviews'] = data['verified_reviews'].str.lower()

In [None]:
s = " Ahmed Adel is am n't i 123124T%$#%$12312423"
re.sub('[^a-zA-z]', ' ', s)

' Ahmed Adel is am n t i       T             '

In [None]:
#### 2.special charcter
def clean_text(text):
    text = re.sub('[^a-zA-z]', ' ', text)
    return text
data['verified_reviews'] = data['verified_reviews'].apply(clean_text)



In [None]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
### 3.Tokinzation
def tokenize(text):
    return word_tokenize(text)
data['verified_reviews'] = data['verified_reviews'].apply(tokenize)

In [None]:
data['verified_reviews']

Unnamed: 0,verified_reviews
0,"[love, my, echo]"
1,"[loved, it]"
2,"[sometimes, while, playing, a, game, you, can,..."
3,"[i, have, had, a, lot, of, fun, with, this, th..."
4,[music]
...,...
3145,"[perfect, for, kids, adults, and, everyone, in..."
3146,"[listening, to, music, searching, locations, c..."
3147,"[i, do, love, these, things, i, have, them, ru..."
3148,"[only, complaint, i, have, is, that, the, soun..."


In [None]:
### 4.stopwords
nltk.download('stopwords')
stop_words = stopwords.words('english')
### exclude not
stop_words.remove('not')
stop_words.remove('no')

def remove_stopwords(text):
    return [word for word in text if word not in stop_words] ### list comprehnsion
data['verified_reviews'] = data['verified_reviews'].apply(remove_stopwords)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
### 5.lemmetization
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
def stimizer(text):
    return [stemmer.stem(word) for word in text]
data['verified_reviews'] = data['verified_reviews'].apply(stimizer)

In [None]:
data['verified_reviews']

Unnamed: 0,verified_reviews
0,"[love, echo]"
1,[love]
2,"[sometim, play, game, answer, question, correc..."
3,"[lot, fun, thing, yr, old, learn, dinosaur, co..."
4,[music]
...,...
3145,"[perfect, kid, adult, everyon]"
3146,"[listen, music, search, locat, check, time, lo..."
3147,"[love, thing, run, entir, home, tv, light, the..."
3148,"[complaint, sound, qualiti, great, mostli, use..."


In [None]:
###
data['verified_reviews'] = data['verified_reviews'].apply(lambda x : ' '.join(x))

In [None]:
data['verified_reviews']

Unnamed: 0,verified_reviews
0,love echo
1,love
2,sometim play game answer question correctli al...
3,lot fun thing yr old learn dinosaur control li...
4,music
...,...
3145,perfect kid adult everyon
3146,listen music search locat check time look weat...
3147,love thing run entir home tv light thermostat ...
3148,complaint sound qualiti great mostli use comma...


In [None]:
bow = CountVectorizer()
bow_x = bow.fit_transform(data['verified_reviews'])

In [None]:
tf = TfidfVectorizer()
tf_x = tf.fit_transform(data['verified_reviews'])

In [None]:
### fix data imbalance
from imblearn.over_sampling import SMOTE
sm = SMOTE()
x_sm, y_sm = sm.fit_resample(tf_x, data['feedback'])

In [None]:
y_sm.value_counts()


Unnamed: 0_level_0,count
feedback,Unnamed: 1_level_1
1,2893
0,2893


In [None]:
x_train, x_test, y_train, y_test = train_test_split(x_sm, y_sm)

In [None]:
dt = DecisionTreeClassifier(min_samples_leaf=5, min_samples_split=10)
dt.fit(x_train, y_train)

In [None]:
print(accuracy_score(y_train, dt.predict(x_train)))
print(accuracy_score(y_test, dt.predict(x_test)))

0.9469923945609587
0.9129232895646164


In [None]:
lr = LogisticRegression()
lr.fit(x_train, y_train)

In [None]:
print(accuracy_score(y_train, lr.predict(x_train)))
print(accuracy_score(y_test, lr.predict(x_test)))

0.9529845586540677
0.9364201796821009


In [None]:
from sklearn.svm import SVC
svc = SVC()
svc.fit(x_train, y_train)

In [None]:
print(accuracy_score(y_train, svc.predict(x_train)))
print(accuracy_score(y_test, svc.predict(x_test)))

0.9875547361143121
0.9889426399447132


In [None]:
#### Grid Search
from sklearn.model_selection import GridSearchCV
param = {'kernel':['linear', 'rbf', 'poly', 'sigmoid'], 'C':[1, 10, 100, 1000]}
grid = GridSearchCV(svc, param)
grid.fit(x_train, y_train)

In [None]:
grid.best_params_

{'C': 10, 'kernel': 'rbf'}

In [None]:
grid.best_score_

0.9836381611468117

In [None]:
e  = grid.best_estimator_

In [None]:
e.predict(x_test)

array([1, 1, 0, ..., 0, 1, 1])

In [None]:
### SVM , dt , lr
def text_preprocessing(text):
  ## lower case
  text = text.lower()
  ## special charcter
  text = re.sub('[^a-zA-z]', ' ', text)
  ## Tokinzation
  text = word_tokenize(text)
  ## stopwords
  text = [word for word in text if word not in stop_words]
  ## lemmetization
  text = [stemmer.stem(word) for word in text]
  text = ' '.join(text)
  text = tf.transform([text])
  return text

text = text_preprocessing("I donot like it ")
lr.predict(text)

array([1])

In [None]:
import pickle
pickle.dump(tf, open('tf.pkl', 'wb'))