# Amazon UK product classification with NLP

## 1 - Import dataset

### 1.1 download dataset

In [None]:
! mkdir -p data/raw
!curl -o './data/raw/amazon_reviews_multilingual_UK_v1_00.tsv.gz' 'https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_multilingual_UK_v1_00.tsv.gz'

### 1.2 unpack gz

In [None]:
!gunzip './data/raw/amazon_reviews_multilingual_UK_v1_00.tsv.gz'

## Data Preparation

In [36]:
import pandas as pd

# The raw TSV contains a few malformed rows with more fields than the header.
# on_bad_lines='warn' skips each such row and reports it, which is what the
# removed error_bad_lines=False option did together with the default
# warn_bad_lines=True.
df = pd.read_csv(
    './data/raw/amazon_reviews_multilingual_UK_v1_00.tsv',
    sep='\t',
    on_bad_lines='warn',
)

b'Skipping line 101882: expected 15 fields, saw 22\nSkipping line 115512: expected 15 fields, saw 22\n'
b'Skipping line 328404: expected 15 fields, saw 22\n'
b'Skipping line 1412642: expected 15 fields, saw 22\n'


In [37]:
# Keep the product identifiers, the category label and the free-text fields;
# every other column of the raw file is dropped.
columns_to_keep = [
    'product_id',
    'product_parent',
    'product_title',
    'product_category',
    'review_body',
    'review_headline',
]
df = df[columns_to_keep]

In [38]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis='index', how='any')

In [39]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,product_id,product_parent,product_title,product_category,review_body,review_headline
0,B00MWK7BWG,307651059,My Favourite Faded Fantasy,Music,The best album ever!,Five Stars
1,B006CHML4I,835010224,Seiko 5 Men's Automatic Watch with Black Dial ...,Watches,What a great watch. Both watches and strap is ...,Great watch from casio.
2,B00IIFCJX0,271687675,Dexter Season 8,Digital_Video_Download,"love watching all the episodes of Dexter, when...",fantastic
3,B000W7JWUA,211383699,The Settlers of Catan Board Game - discontinue...,Toys,Excellent game!!!,Five Stars
4,B005JTAP4S,182965893,Peter: A Darkened Fairytale (Vol 1),Digital_Ebook_Purchase,"This cute, quick read is very different to say...",A twist on Tales


In [42]:
# Number of rows per product category (the class distribution of the target).
df.groupby('product_category').size()

product_category
Apparel                          2
Automotive                     530
Baby                          4328
Beauty                           2
Books                       257803
Camera                        6427
Digital_Ebook_Purchase      289112
Digital_Music_Purchase       29264
Digital_Video_Download       31422
Electronics                   5846
Health & Personal Care         246
Home                          2690
Home Entertainment             117
Home Improvement               950
Kitchen                         21
Lawn and Garden                240
Luggage                         10
Mobile_Apps                 218031
Music                       329865
Musical Instruments           2832
Office Products                984
PC                           16258
Personal_Care_Appliances       100
Pet Products                    43
Shoes                         1718
Software                        77
Sports                        2336
Toys                         24496
Vid

In [5]:
from sklearn.model_selection import train_test_split

# 70/30 row split with a fixed seed so the partition is repeatable.
train, test = train_test_split(df, test_size=0.3, train_size=0.7, random_state=42)

# Free-text columns used as model inputs; product_category is the target.
text_columns = ['product_title', 'review_body', 'review_headline']

X_train = train[text_columns]
y_train = train['product_category']

X_test = test[text_columns]
y_test = test['product_category']

# Release the full frame; the cells below only use the train/test partitions.
del df

In [6]:
from sklearn.base import BaseEstimator, TransformerMixin

class TextSelector(BaseEstimator, TransformerMixin):
    """
    Stateless column picker intended as the first step of a text pipeline.

    ``transform`` returns ``X[key]``, i.e. the single column named ``key``
    when ``X`` is a data frame, so that a text vectorizer placed after it
    receives just that column.
    """
    def __init__(self, key):
        # Name of the column (or, more generally, the index) to extract.
        self.key = key

    def fit(self, X, y=None):
        # Nothing is learned from the data; return self so calls can be chained.
        return self

    def transform(self, X):
        selected = X[self.key]
        return selected

In [52]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder


def _make_text_pipeline(column):
    """Build the per-column text pipeline: select ``column``, then TF-IDF it.

    Args:
        column: name of the text column to select from the input frame. It is
            also used as the name of the selector step, so every pipeline's
            first step is labelled with the column it actually reads.

    Returns:
        An unfitted ``Pipeline`` made of ``TextSelector(key=column)`` followed
        by a ``TfidfVectorizer`` configured with ``stop_words='english'``.
    """
    return Pipeline([
        (column, TextSelector(key=column)),
        ('tfidf', TfidfVectorizer(stop_words='english')),
    ])


# One independent TF-IDF vectorizer (and therefore vocabulary) per text column.
product_title_pipeline = _make_text_pipeline('product_title')
review_body_pipeline = _make_text_pipeline('review_body')
# The selector step is named 'review_headline' to match the column it selects.
review_headline_pipeline = _make_text_pipeline('review_headline')

In [53]:
from sklearn.pipeline import FeatureUnion

# Each branch vectorizes one text column; FeatureUnion concatenates the
# resulting feature matrices side by side.
# NOTE(review): the labels 'text' / 'length' / 'words' are only identifiers and
# do not describe the branches (title / body / headline TF-IDF). They are kept
# as-is because they are part of the nested parameter names.
column_branches = [
    ('text', product_title_pipeline),
    ('length', review_body_pipeline),
    ('words', review_headline_pipeline),
]
feats = FeatureUnion(transformer_list=column_branches)

## Naive Bayes

In [61]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes on the combined TF-IDF features.
# fit_prior=False makes the classifier use a uniform class prior instead of
# one learned from the class frequencies in the training data.
nb_steps = [
    ('features', feats),
    ('classifier', MultinomialNB(fit_prior=False)),
]
pipeline = Pipeline(steps=nb_steps)

pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('features',
                 FeatureUnion(n_jobs=None,
                              transformer_list=[('text',
                                                 Pipeline(memory=None,
                                                          steps=[('product_title',
                                                                  TextSelector(key='product_title')),
                                                                 ('tfidf',
                                                                  TfidfVectorizer(analyzer='word',
                                                                                  binary=False,
                                                                                  decode_error='strict',
                                                                                  dtype=<class 'numpy.float64'>,
                                                                                  encoding='utf-8',
     

In [62]:
from sklearn.metrics import accuracy_score
# Score the Naive Bayes pipeline on the held-out test partition.
predicted = pipeline.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=predicted)
print("Accuracy: %.2f%%" % (100.0 * accuracy))

Accuracy: 87.16%


## Support Vector Machines (SVM)

In [13]:
from sklearn.linear_model import SGDClassifier

In [17]:
# Hinge loss with an L2 penalty gives a linear SVM trained by stochastic
# gradient descent; random_state pins the shuffling so runs are repeatable.
svm_classifier = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    n_iter_no_change=5,
    random_state=42,
)

# NOTE(review): `feats` is the same FeatureUnion object used by the Naive Bayes
# pipeline, so fitting here refits those shared vectorizers on X_train.
pipeline2 = Pipeline(steps=[
    ('features', feats),
    ('classifier', svm_classifier),
])

pipeline2.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('features',
                 FeatureUnion(n_jobs=None,
                              transformer_list=[('text',
                                                 Pipeline(memory=None,
                                                          steps=[('product_title',
                                                                  TextSelector(key='product_title')),
                                                                 ('tfidf',
                                                                  TfidfVectorizer(analyzer='word',
                                                                                  binary=False,
                                                                                  decode_error='strict',
                                                                                  dtype=<class 'numpy.float64'>,
                                                                                  encoding='utf-8',
     

In [56]:
# Score the linear-SVM pipeline on the held-out test partition.
# Relies on accuracy_score having been imported in the Naive Bayes evaluation cell.
predicted = pipeline2.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=predicted)
print("Accuracy: %.2f%%" % (100.0 * accuracy))

Accuracy: 84.78%


## References
- https://www.kaggle.com/baghern/a-deep-dive-into-sklearn-pipelines
- https://towardsdatascience.com/fine-tuning-xgboost-in-python-like-a-boss-b4543ed8b1e
- https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
- https://towardsdatascience.com/machine-learning-nlp-text-classification-using-scikit-learn-python-and-nltk-c52b92a7c73a