In [1]:
import pandas as pd
import re
import nltk   ### nltk / spacy
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay
from nltk.stem import PorterStemmer
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
import pickle


In [4]:
data = pd.read_csv('D:/AMIT/projects/New folder/amazon_alexa.tsv', sep='\t')
data

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1
...,...,...,...,...,...
3145,5,30-Jul-18,Black Dot,"Perfect for kids, adults and everyone in betwe...",1
3146,5,30-Jul-18,Black Dot,"Listening to music, searching locations, check...",1
3147,5,30-Jul-18,Black Dot,"I do love these things, i have them running my...",1
3148,5,30-Jul-18,White Dot,Only complaint I have is that the sound qualit...,1


In [5]:
data.isna().sum()

rating              0
date                0
variation           0
verified_reviews    1
feedback            0
dtype: int64

In [6]:
data.dropna(inplace=True)

In [7]:
data.duplicated().sum()

np.int64(715)

In [8]:
data.drop_duplicates(inplace=True)

In [9]:
data['rating'].value_counts()

rating
5    1756
4     345
1     141
3     113
2      79
Name: count, dtype: int64

In [11]:
feedback_counts = data['feedback'].value_counts()
feedback_percentages = data['feedback'].value_counts(normalize=True) * 100

print("Value counts:")
print(feedback_counts)
print("\nPercentages:")
print(feedback_percentages)


Value counts:
feedback
1    2214
0     220
Name: count, dtype: int64

Percentages:
feedback
1    90.96138
0     9.03862
Name: proportion, dtype: float64


In [12]:
data['verified_reviews'] = data['verified_reviews'].str.lower()

In [13]:
s = " Ahmed Gamal is am n't i 123124T%$#%$1231@#$24%^23"
cleaned_s = re.sub('[^a-zA-Z]', ' ', s)
print(cleaned_s)

 Ahmed Gamal is am n t i       T                  


In [14]:
def clean_text(text):
    text = re.sub('[^A-Za-z]', ' ', text)  
    return text

data['verified_reviews'] = data['verified_reviews'].fillna('').apply(clean_text)

In [16]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\alprofesur\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [17]:
def tokenize(text):
    return word_tokenize(text)

data['verified_reviews'] = data['verified_reviews'].apply(tokenize)

In [18]:
data['verified_reviews']

0                                        [love, my, echo]
1                                             [loved, it]
2       [sometimes, while, playing, a, game, you, can,...
3       [i, have, had, a, lot, of, fun, with, this, th...
4                                                 [music]
                              ...                        
2796    [i, do, love, these, things, i, have, them, ru...
2797    [only, complaint, i, have, is, that, the, soun...
2798                                               [good]
2799                     [nice, little, unit, no, issues]
2800    [the, echo, dot, was, easy, to, set, up, and, ...
Name: verified_reviews, Length: 2434, dtype: object

In [20]:
nltk.download('stopwords')
stop_words = stopwords.words('english')

stop_words.remove('not')
stop_words.remove('no')

def remove_stopwords(text):
    return [word for word in text if word not in stop_words]
data['verified_reviews'] = data['verified_reviews'].apply(remove_stopwords)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alprofesur\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [21]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def lemmatizer_func(text):
    return [lemmatizer.lemmatize(word) for word in text]

data['verified_reviews'] = data['verified_reviews'].apply(lemmatizer_func)

In [22]:
data['verified_reviews']

0                                            [love, echo]
1                                                 [loved]
2       [sometimes, playing, game, answer, question, c...
3       [lot, fun, thing, yr, old, learns, dinosaur, c...
4                                                 [music]
                              ...                        
2796    [love, thing, running, entire, home, tv, light...
2797    [complaint, sound, quality, great, mostly, use...
2798                                               [good]
2799                      [nice, little, unit, no, issue]
2800    [echo, dot, easy, set, use, help, provide, mus...
Name: verified_reviews, Length: 2434, dtype: object

In [23]:
data['verified_reviews'] = data['verified_reviews'].apply(lambda x: ' '.join(x))
data['verified_reviews']

0                                               love echo
1                                                   loved
2       sometimes playing game answer question correct...
3       lot fun thing yr old learns dinosaur control l...
4                                                   music
                              ...                        
2796    love thing running entire home tv light thermo...
2797    complaint sound quality great mostly use comma...
2798                                                 good
2799                            nice little unit no issue
2800    echo dot easy set use help provide music etc s...
Name: verified_reviews, Length: 2434, dtype: object

In [24]:
bow = CountVectorizer()
bow_x = bow.fit_transform(data['verified_reviews'])

In [25]:
tf = TfidfVectorizer()
tf_x = tf.fit_transform(data['verified_reviews'])

In [26]:
sm = SMOTE()
x_sm, y_sm = sm.fit_resample(tf_x, data['feedback'])

In [27]:
y_sm.value_counts()

feedback
1    2214
0    2214
Name: count, dtype: int64

In [28]:
x_train, x_test, y_train, y_test = train_test_split(x_sm, y_sm)

In [29]:
dt = DecisionTreeClassifier(min_samples_leaf=5, min_samples_split=10)
dt.fit(x_train, y_train)

In [30]:
print(accuracy_score(y_train, dt.predict(x_train)))
print(accuracy_score(y_test, dt.predict(x_test)))

0.9373682625715146
0.8798554652213189


In [31]:
lr = LogisticRegression()
lr.fit(x_train, y_train)

In [32]:
print(accuracy_score(y_train, lr.predict(x_train)))
print(accuracy_score(y_test, lr.predict(x_test)))

0.9515206263173743
0.912375790424571


In [33]:
from sklearn.svm import SVC
svc = SVC()
svc.fit(x_train, y_train)

In [34]:
print(accuracy_score(y_train, svc.predict(x_train)))
print(accuracy_score(y_test, svc.predict(x_test)))

0.9921710328214394
0.982836495031617


In [35]:
from sklearn.model_selection import GridSearchCV
param = {'kernel':['linear', 'rbf', 'poly', 'sigmoid'], 'C':[1, 10, 100, 1000]}
grid = GridSearchCV(svc, param)
grid.fit(x_train, y_train)

In [36]:
grid.best_params_

{'C': 100, 'kernel': 'rbf'}

In [37]:
grid.best_score_

np.float64(0.9864480478304195)

In [38]:
e  = grid.best_estimator_

In [39]:
e.predict(x_test)

array([1, 0, 1, ..., 1, 1, 0], shape=(1107,))

In [41]:
def text_preprocessing(text):
    text = text.lower()
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = word_tokenize(text)
    text = [word for word in text if word not in stop_words]
    text = [lemmatizer.lemmatize(word) for word in text]
    text = ' '.join(text)
    text = tf.transform([text])
    return text
text = text_preprocessing("I donot like it ")

In [42]:
lr.predict(text)

array([1])

In [43]:
pickle.dump(lr, open('lr.pkl', 'wb'))
pickle.dump(tf, open('tf.pkl', 'wb'))
pickle.dump(dt, open('dt.pkl', 'wb'))
pickle.dump(svc, open('svc.pkl', 'wb'))
