### Importing the main toolkits

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re 
import seaborn as sns
import string
import nltk
import warnings 
# Hide DeprecationWarning messages for the rest of the session.
warnings.filterwarnings('ignore', category=DeprecationWarning)
# Turn off pandas' chained-assignment warning (it is 'warn' by default).
pd.options.mode.chained_assignment = None  # default='warn'

%matplotlib inline

## 1- Reading the Dataset 

In [None]:
# NOTE(review): the saved output of this cell looks like the tail of a pandas
# warning (presumably DtypeWarning for mixed-type columns -- TODO confirm).
# low_memory=False makes pandas parse the file in a single pass so each
# column's dtype is inferred from all of its values.
temp = pd.read_csv('1429_1.csv', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
# Only the last expression of a cell is rendered automatically, so the preview
# has to go through display() to show up next to the rating values.
display(temp.head())

# Distinct star ratings present in the raw data (including missing values).
temp['reviews.rating'].unique()

array([ 5.,  4.,  2.,  1.,  3., nan])

In [None]:
# Column-level overview of the raw frame: non-null counts and dtypes.
temp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34660 entries, 0 to 34659
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    34660 non-null  object 
 1   name                  27900 non-null  object 
 2   asins                 34658 non-null  object 
 3   brand                 34660 non-null  object 
 4   categories            34660 non-null  object 
 5   keys                  34660 non-null  object 
 6   manufacturer          34660 non-null  object 
 7   reviews.date          34621 non-null  object 
 8   reviews.dateAdded     24039 non-null  object 
 9   reviews.dateSeen      34660 non-null  object 
 10  reviews.didPurchase   1 non-null      object 
 11  reviews.doRecommend   34066 non-null  object 
 12  reviews.id            1 non-null      float64
 13  reviews.numHelpful    34131 non-null  float64
 14  reviews.rating        34627 non-null  float64
 15  reviews.sourceURLs 

In [None]:
# Keep only the review body and its star rating, under shorter column names.
column_names = {'reviews.text': 'text', 'reviews.rating': 'rating'}
df = temp[list(column_names)].rename(columns=column_names).copy()
df.head()

Unnamed: 0,text,rating
0,This product so far has not disappointed. My c...,5.0
1,great for beginner or experienced person. Boug...,5.0
2,Inexpensive tablet for him to use and learn on...,5.0
3,I've had my Fire HD 8 two weeks now and I love...,4.0
4,I bought this for my grand daughter when she c...,5.0


In [None]:
# Count the missing entries in each column.
df.isna().sum()

text       1
rating    33
dtype: int64

In [None]:
# Discard every row with a missing text or rating, then confirm what is left.
df = df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 34626 entries, 0 to 34659
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   text    34626 non-null  object 
 1   rating  34626 non-null  float64
dtypes: float64(1), object(1)
memory usage: 811.5+ KB


In [None]:
# Binary target: a rating of 4 or more becomes 1, anything lower becomes 0.
df['label'] = df['rating'].ge(4).astype('int64')

# The raw rating is no longer needed once the label exists.
df = df.drop(columns=['rating'])

df.head()

Unnamed: 0,text,label
0,This product so far has not disappointed. My c...,1
1,great for beginner or experienced person. Boug...,1
2,Inexpensive tablet for him to use and learn on...,1
3,I've had my Fire HD 8 two weeks now and I love...,1
4,I bought this for my grand daughter when she c...,1


## 2- Text Preprocessing

In [None]:
def remove_pattern(text, pattern):
    """
    Remove every match of a regular-expression pattern from a string.

    Parameters
    ----------
    text : str
        The text to clean.
    pattern : str
        Regular expression (``re`` syntax) whose matches are deleted.

    Returns
    -------
    str
        ``text`` with all non-overlapping matches of ``pattern`` removed.
        The input is returned unchanged when nothing matches.

    """
    # re.sub already replaces every non-overlapping match in a single pass,
    # so there is no need to collect the matches first and loop over them.
    return re.sub(pattern, '', text)

In [None]:
# Corpora needed below: the English stop-word list and WordNet for lemmatizing.
nltk.download('stopwords')
nltk.download('wordnet')

# lower case every word to ease the upcoming processes
df['text'] = df['text'].str.lower()

# tokenize on whitespace so stop words can be filtered out token by token
df['tokenized_text'] = df['text'].apply(lambda x: x.split())

# a set gives constant-time membership tests, unlike a list
stopWords = set(nltk.corpus.stopwords.words('english'))
df['tokenized_text'] = df['tokenized_text'].apply(
    lambda x: [word for word in x if word not in stopWords])

# lemmatize every token as a verb (e.g. 'bought' -> 'buy')
lemma = nltk.stem.WordNetLemmatizer()
pos = nltk.corpus.wordnet.VERB
df['tokenized_text'] = df['tokenized_text'].apply(
    lambda x: [lemma.lemmatize(word, pos) for word in x])

# strip periods from each token; the raw string keeps '\.' a valid regex escape
# NOTE(review): only '.' is removed here, and this runs after stop-word
# filtering and lemmatizing, so tokens such as 'it.' or 'on,' slip through
# those steps -- consider stripping punctuation first.
df['tokenized_text'] = df['tokenized_text'].apply(
    lambda x: [remove_pattern(word, r'\.') for word in x])

# rejoin the tokens to get the cleaned text
df['cleaned_text'] = df['tokenized_text'].apply(lambda x: ' '.join(x))

df.drop(labels=['tokenized_text'], axis=1, inplace=True)

df.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


Unnamed: 0,text,label,cleaned_text
0,this product so far has not disappointed. my c...,1,product far disappointed children love use lik...
1,great for beginner or experienced person. boug...,1,great beginner experience person buy gift love
2,inexpensive tablet for him to use and learn on...,1,"inexpensive tablet use learn on, step nabi thr..."
3,i've had my fire hd 8 two weeks now and i love...,1,i've fire hd 8 two weeks love it tablet great ...
4,i bought this for my grand daughter when she c...,1,"buy grand daughter come visit set user, enter ..."


## 3- Features Extraction 

### i- Bag of Words Method

In [None]:
from sklearn.feature_extraction.text import CountVectorizer


# Bag-of-words counts over the cleaned text: drop English stop words, ignore
# terms found in more than 90% of the reviews or in fewer than 2 of them, and
# cap the vocabulary at 1000 terms.
# NOTE(review): the vocabulary is fitted on the whole corpus, i.e. before the
# train/test split performed later in the notebook.
bow_vectorizer = CountVectorizer(max_df=0.9, min_df=2, stop_words='english', max_features=1000)

bow_features = bow_vectorizer.fit_transform(df['cleaned_text'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(bow_vectorizer, 'get_feature_names_out'):
    bow_columns = bow_vectorizer.get_feature_names_out()
else:
    bow_columns = bow_vectorizer.get_feature_names()

bow_df = pd.DataFrame(bow_features.toarray(), columns=bow_columns)

bow_df.head()

Unnamed: 0,10,100,11,12,15,16,1st,20,2nd,30,35,3rd,40,4k,4th,50,8gb,ability,able,absolutely,access,account,activate,actual,actually,ad,add,addition,additional,adjust,adjustable,ads,adult,adults,advance,advantage,advertise,advertised,affordable,age,...,wake,wall,want,wanted,warranty,waste,watch,way,ways,wear,weather,web,week,weeks,weight,wemo,white,wi,wife,wifi,wire,wireless,wish,wonderful,word,work,working,works,world,worry,worth,wrong,xmas,year,years,yes,young,younger,youtube,yr
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### ii- TF-IDF Method

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weights over the cleaned text, with the same vocabulary limits as the
# bag-of-words features: English stop words removed, terms in more than 90% of
# the reviews or in fewer than 2 ignored, at most 1000 terms kept.
tfidf_Vectorizer = TfidfVectorizer(max_df=0.9, min_df=2, max_features=1000, stop_words='english')

tfidf_features = tfidf_Vectorizer.fit_transform(df['cleaned_text'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(tfidf_Vectorizer, 'get_feature_names_out'):
    tfidf_columns = tfidf_Vectorizer.get_feature_names_out()
else:
    tfidf_columns = tfidf_Vectorizer.get_feature_names()

tfidf_df = pd.DataFrame(tfidf_features.toarray(), columns=tfidf_columns)

tfidf_df.head()

Unnamed: 0,10,100,11,12,15,16,1st,20,2nd,30,35,3rd,40,4k,4th,50,8gb,ability,able,absolutely,access,account,activate,actual,actually,ad,add,addition,additional,adjust,adjustable,ads,adult,adults,advance,advantage,advertise,advertised,affordable,age,...,wake,wall,want,wanted,warranty,waste,watch,way,ways,wear,weather,web,week,weeks,weight,wemo,white,wi,wife,wifi,wire,wireless,wish,wonderful,word,work,working,works,world,worry,worth,wrong,xmas,year,years,yes,young,younger,youtube,yr
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.362623,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.132138,0.0,0.149575,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.124324,0.0,0.0,0.0,0.0,0.0,0.0,0.185408,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.139682,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.18298,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.329195,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 4- Dataset Split

In [None]:
from sklearn.model_selection import train_test_split


def _split_80_10_10(features, labels):
    """Split into 80% train and a 20% hold-out, then halve the hold-out.

    Both splits use random_state=42. Returns the eight pieces in the order
    (X_train, X_metric, y_train, y_metric, X_test, X_valid, y_test, y_valid).
    """
    X_train, X_metric, y_train, y_metric = train_test_split(
        features, labels, test_size=0.2, random_state=42)
    X_test, X_valid, y_test, y_valid = train_test_split(
        X_metric, y_metric, test_size=0.5, random_state=42)
    return X_train, X_metric, y_train, y_metric, X_test, X_valid, y_test, y_valid


# Bag-of-words features
(X_train_bow, X_metric_bow, y_train_bow, y_metric_bow,
 X_test_bow, X_valid_bow, y_test_bow, y_valid_bow) = _split_80_10_10(bow_df, df['label'])

# TF-IDF features
(X_train_tfidf, X_metric_tfidf, y_train_tfidf, y_metric_tfidf,
 X_test_tfidf, X_valid_tfidf, y_test_tfidf, y_valid_tfidf) = _split_80_10_10(tfidf_df, df['label'])

## 5- Building a Machine Learning Model 

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score

# One independent estimator per feature set. A chained assignment
# (a = b = Estimator()) binds both names to a single object, so fitting it on
# the TF-IDF features would overwrite the model trained on bag-of-words.
clf_bow = AdaBoostClassifier(n_estimators=100, learning_rate=0.001)
clf_tfidf = AdaBoostClassifier(n_estimators=100, learning_rate=0.001)

In [None]:
# Train on the bag-of-words features, then on the TF-IDF features.
# NOTE(review): the two names must refer to distinct estimator objects; if they
# are aliases of one instance, the second fit() replaces what the first learned.
clf_bow.fit(X_train_bow, y_train_bow)
clf_tfidf.fit(X_train_tfidf, y_train_tfidf)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
                   learning_rate=0.001, n_estimators=100, random_state=None)

In [None]:
# Predict labels for the test split of each feature set.
pred_bow   = clf_bow.predict(X_test_bow)
pred_tfidf = clf_tfidf.predict(X_test_tfidf)

In [None]:
!pip install colorama
from colorama import Fore, Style

print(f'AdaBoost Classifier Results: \n',
      f'{Fore.RED}Bag of words{Style.RESET_ALL} \n',
      f'Accuracy Socre: {Fore.LIGHTBLUE_EX}%0.2f %%{Style.RESET_ALL} \n'%(100 * accuracy_score(y_test_bow, pred_bow)))
print(classification_report(y_test_bow, pred_bow))

print(f'{Fore.RED}TF-IDF{Style.RESET_ALL} \n',
      f'Accuracy Socre: {Fore.LIGHTBLUE_EX}%0.2f %%{Style.RESET_ALL} \n'%(100 * accuracy_score(y_test_tfidf, pred_tfidf)))
print(classification_report(y_test_tfidf, pred_tfidf))

Collecting colorama
  Downloading https://files.pythonhosted.org/packages/44/98/5b86278fbbf250d239ae0ecb724f8572af1c91f4a11edf4d36a206189440/colorama-0.4.4-py2.py3-none-any.whl
Installing collected packages: colorama
Successfully installed colorama-0.4.4
AdaBoost Classifier Results: 
 [31mBag of words[0m 
 Accuracy Socre: [94m93.85 %[0m 

              precision    recall  f1-score   support

           0       0.48      0.09      0.16       211
           1       0.94      0.99      0.97      3252

    accuracy                           0.94      3463
   macro avg       0.71      0.54      0.56      3463
weighted avg       0.92      0.94      0.92      3463

[31mTF-IDF[0m 
 Accuracy Socre: [94m93.82 %[0m 

              precision    recall  f1-score   support

           0       0.46      0.09      0.15       211
           1       0.94      0.99      0.97      3252

    accuracy                           0.94      3463
   macro avg       0.70      0.54      0.56      3463
wei

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# One independent estimator per feature set. A chained assignment
# (a = b = Estimator()) binds both names to a single object, so fitting it on
# the TF-IDF features would overwrite the model trained on bag-of-words.
clf_bow_knn = KNeighborsClassifier()
clf_tfidf_knn = KNeighborsClassifier()

In [None]:
# Train on the bag-of-words features, then on the TF-IDF features.
# NOTE(review): the two names must refer to distinct estimator objects; if they
# are aliases of one instance, the second fit() replaces what the first learned.
clf_bow_knn.fit(X_train_bow, y_train_bow)
clf_tfidf_knn.fit(X_train_tfidf, y_train_tfidf)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [None]:
# Predict labels for the test split of each feature set.
pred_bow_knn   = clf_bow_knn.predict(X_test_bow)
pred_tfidf_knn = clf_tfidf_knn.predict(X_test_tfidf)

In [None]:
# Same report for each feature set: coloured heading, accuracy as a
# percentage, then the per-class precision/recall table.
print('KNN Classifier Results: ')
for heading, y_true, y_pred in (
    ('Bag of words', y_test_bow, pred_bow_knn),
    ('TF-IDF', y_test_tfidf, pred_tfidf_knn),
):
    accuracy_pct = 100 * accuracy_score(y_true, y_pred)
    print(f'{Fore.RED}{heading}{Style.RESET_ALL} \n',
          f'Accuracy Score: {Fore.LIGHTBLUE_EX}{accuracy_pct:0.2f} %{Style.RESET_ALL} \n')
    print(classification_report(y_true, y_pred))

KNN Classifier Results: 
 [31mBag of words[0m 
 Accuracy Socre: [94m93.82 %[0m 

              precision    recall  f1-score   support

           0       0.38      0.02      0.04       211
           1       0.94      1.00      0.97      3252

    accuracy                           0.94      3463
   macro avg       0.66      0.51      0.51      3463
weighted avg       0.91      0.94      0.91      3463

[31mTF-IDF[0m 
 Accuracy Socre: [94m93.76 %[0m 

              precision    recall  f1-score   support

           0       0.14      0.00      0.01       211
           1       0.94      1.00      0.97      3252

    accuracy                           0.94      3463
   macro avg       0.54      0.50      0.49      3463
weighted avg       0.89      0.94      0.91      3463

