# Transformer approach (HuggingFace API) 

I will try DistilBERT and ALBERT for this task.

## Setup

In [13]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification


# Load only the identifier, review body and review title columns from the CSV.
df = pd.read_csv(
    '1429_1.csv',
    usecols=['id', 'reviews.text', 'reviews.title'],
)

In [14]:
# Show a heading followed by the first five rows of the loaded frame.
print("Head", df.head(), sep="\n")

Head
                     id                                       reviews.text  \
0  AVqkIhwDv8e3D1O-lebb  This product so far has not disappointed. My c...   
1  AVqkIhwDv8e3D1O-lebb  great for beginner or experienced person. Boug...   
2  AVqkIhwDv8e3D1O-lebb  Inexpensive tablet for him to use and learn on...   
3  AVqkIhwDv8e3D1O-lebb  I've had my Fire HD 8 two weeks now and I love...   
4  AVqkIhwDv8e3D1O-lebb  I bought this for my grand daughter when she c...   

                             reviews.title  
0                                   Kindle  
1                                very fast  
2  Beginner tablet for our 9 year old son.  
3                                  Good!!!  
4                Fantastic Tablet for kids  


In [15]:
print("Info")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line after the report.
df.info()

Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34660 entries, 0 to 34659
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             34660 non-null  object
 1   reviews.text   34659 non-null  object
 2   reviews.title  34654 non-null  object
dtypes: object(3)
memory usage: 812.5+ KB
None


In [16]:
# Heading plus count / unique / top / freq statistics for each column.
print(f"Describe\n{df.describe()}")

Describe
                          id  \
count                  34660   
unique                    42   
top     AVphgVaX1cnluZ0-DR74   
freq                   10966   

                                             reviews.text  reviews.title  
count                                               34659          34654  
unique                                              34659          19766  
top     This product so far has not disappointed. My c...  Great product  
freq                                                    1            645  


                         id  \
3083   AVsRjfwAU2_QcyX9PHqe   
3111   AVsRjfwAU2_QcyX9PHqe   
21738  AV1YnR7wglJLPUi8IJmi   
21953  AVpfl8cLLJeJML43AE3S   
25515  AVpfl8cLLJeJML43AE3S   
28578  AVzoGHhAglJLPUi8GfzY   
28680  AVpidLjVilAPnD_xEVpI   

                                            reviews.text reviews.title  
3083                                   A must have item.           NaN  
3111                                                 NaN    Five Stars  
21738  Good device for a school age child convent use...           NaN  
21953                 I like it. I use mostly for music.           NaN  
25515  I was totally stoked about this product becaus...           NaN  
28578                           I'm in LOVE with ALEXA!!           NaN  
28680  Love it ! I started with a kindle,then a fire ...           NaN  


In [21]:
print("Null values")
# how='all' removes a row only when BOTH the text and the title are missing;
# rows that lack just one of the two are kept and listed below.
text_and_title = ['reviews.text', 'reviews.title']
df.dropna(subset=text_and_title, how='all', inplace=True)
print(df.isna().sum())

# Rows where at least one of the two review fields is still missing.
null_rows = df[df[text_and_title].isna().any(axis=1)]
print(null_rows)

Null values
id               0
reviews.text     1
reviews.title    6
dtype: int64
                         id  \
3083   AVsRjfwAU2_QcyX9PHqe   
3111   AVsRjfwAU2_QcyX9PHqe   
21738  AV1YnR7wglJLPUi8IJmi   
21953  AVpfl8cLLJeJML43AE3S   
25515  AVpfl8cLLJeJML43AE3S   
28578  AVzoGHhAglJLPUi8GfzY   
28680  AVpidLjVilAPnD_xEVpI   

                                            reviews.text reviews.title  
3083                                   A must have item.           NaN  
3111                                                 NaN    Five Stars  
21738  Good device for a school age child convent use...           NaN  
21953                 I like it. I use mostly for music.           NaN  
25515  I was totally stoked about this product becaus...           NaN  
28578                           I'm in LOVE with ALEXA!!           NaN  
28680  Love it ! I started with a kindle,then a fire ...           NaN  


In [25]:
# Fetch the NLTK resources used by word_tokenize and the stop-word list.
for resource_name in ('punkt', 'stopwords'):
    nltk.download(resource_name)

# Words that preprocess() keeps verbatim instead of stemming.
exceptions = ["kindle"]

# Lazily filled cache so the stop-word list and stemmer are built only once.
_PREPROCESS_RESOURCES = {}


def _preprocess_resources():
    """Return the (stop-word set, stemmer) pair, creating it on first use.

    preprocess() is applied to every row of the frame, so rebuilding the
    stop-word set and the stemmer on each call is wasted work.
    """
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['stemmer'] = PorterStemmer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['stemmer']


def preprocess(text):
    """Normalise one review string into a space-joined sequence of stemmed tokens.

    Steps: lowercase, tokenize with ``word_tokenize``, keep only purely
    alphanumeric tokens, drop English stop words, then Porter-stem every
    token that is not listed in ``exceptions``.

    Args:
        text: The raw review text or title. Non-string values are converted
            with ``str()``; missing values (``None`` or float NaN) are
            treated as empty.

    Returns:
        The processed tokens joined by single spaces, or ``''`` when the
        input is missing.
    """
    # Missing cells arrive as float NaN; str(NaN) would otherwise inject the
    # bogus token "nan" into the output. (NaN is the only float != itself.)
    if text is None or (isinstance(text, float) and text != text):
        return ''

    stop_words, stemmer = _preprocess_resources()
    tokens = word_tokenize(str(text).lower())
    kept = [
        word if word in exceptions else stemmer.stem(word)
        for word in tokens
        if word.isalnum() and word not in stop_words
    ]
    return ' '.join(kept)

# Derive a cleaned companion column for the review body and for the title.
for source_column, target_column in (
    ('reviews.text', 'preprocessed_text'),
    ('reviews.title', 'preprocessed_title'),
):
    df[target_column] = df[source_column].apply(preprocess)

print(df.head())

     

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/pedrorenan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pedrorenan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                     id                                       reviews.text  \
0  AVqkIhwDv8e3D1O-lebb  This product so far has not disappointed. My c...   
1  AVqkIhwDv8e3D1O-lebb  great for beginner or experienced person. Boug...   
2  AVqkIhwDv8e3D1O-lebb  Inexpensive tablet for him to use and learn on...   
3  AVqkIhwDv8e3D1O-lebb  I've had my Fire HD 8 two weeks now and I love...   
4  AVqkIhwDv8e3D1O-lebb  I bought this for my grand daughter when she c...   

                             reviews.title  \
0                                   Kindle   
1                                very fast   
2  Beginner tablet for our 9 year old son.   
3                                  Good!!!   
4                Fantastic Tablet for kids   

                                   preprocessed_text  \
0  product far disappoint children love use like ...   
1     great beginn experienc person bought gift love   
2  inexpens tablet use learn step nabi thrill lea...   
3  fire hd 8 two week love tab