In [2]:
import re
import string
import json
import os
import shutil
from pprint import pprint
from functools import partial
from operator import itemgetter, attrgetter
from os.path import normpath
from typing import Callable

import joblib
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from gensim.parsing.preprocessing import STOPWORDS
from sacremoses import MosesTokenizer, MosesTruecaser
from sklearn.base import clone
from sklearn.compose import (
    ColumnTransformer,
    make_column_selector,
    make_column_transformer,
)
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import (
    VarianceThreshold,
    SelectKBest,
    SelectPercentile,
    GenericUnivariateSelect,
    f_classif,
    mutual_info_classif,
    chi2,
)
from sklearn.ensemble import StackingClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier, NearestCentroid, NearestNeighbors
from sklearn.linear_model import (
    LogisticRegression,
    LogisticRegressionCV,
    PassiveAggressiveClassifier,
    Perceptron,
    RidgeClassifier,
    RidgeClassifierCV,
    SGDClassifier,
)
from sklearn.naive_bayes import (
    BernoulliNB,
    CategoricalNB,
    ComplementNB,
    GaussianNB,
    MultinomialNB,
)
from sklearn.svm import LinearSVC, NuSVC, OneClassSVM, SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    RepeatedStratifiedKFold,
    StratifiedKFold,
    train_test_split,
)
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.impute import KNNImputer
# from sklearn.pipeline import FeatureUnion, Pipeline, make_pipeline
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE, SMOTENC
from sklearn.preprocessing import (
    OneHotEncoder,
    Binarizer,
    FunctionTransformer,
    MaxAbsScaler,
    MinMaxScaler,
    minmax_scale,
    Normalizer,
    normalize,
    PowerTransformer,
    QuantileTransformer,
    RobustScaler,
    StandardScaler,
    PolynomialFeatures,
    MultiLabelBinarizer,
)
import sklearn.utils as skl_utils
from sklearn.metrics import accuracy_score, balanced_accuracy_score, classification_report

import dask
import dask.dataframe as dd

# Apply the global Seaborn theme (dark grid, fonts scaled by 1.25) and make a
# slightly desaturated "deep" palette the default for subsequent plots
sns.set_theme(font_scale=1.25, style="darkgrid")
sns.set_palette("deep", desat=0.85, color_codes=True)

# Render Matplotlib figures inline in the notebook output
%matplotlib inline

# Load the nb_black extension (Black auto-formatter for code cells)
%load_ext nb_black



<IPython.core.display.Javascript object>

In [3]:
# Import my modules
from tools import cleaning, plotting, language as lang, outliers, utils
from tools.sklearn.vectorizers import FreqVectorizer, Doc2Vectorizer, AverageVectorizer
from tools.sklearn.classification import diagnostics as diag
from tools.sklearn import selection

# Toggle for the time-consuming grid searches: the sweep cell further down only
# calls `selection.sweep` when this is True
RUN_SWEEPS = True

# Apply my default Matplotlib settings (defined in `tools.plotting`)
plt.rcParams.update(plotting.MPL_DEFAULTS)

# Enable automatic reloading of imported modules, so edits to the local
# `tools` package are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

<IPython.core.display.Javascript object>

In [4]:
# Load the modeling dataset from the "data/model_text" Parquet path (pyarrow engine)
df = pd.read_parquet("data/model_text", engine="pyarrow")
# Preview the first rows, then report the (rows, columns) shape
display(df.head())
df.shape

Unnamed: 0_level_0,text,title,brand,sub_cat,main_cat
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
6090113980,prayer rug carpet mat gebetsteppich islamic mu...,prayer rug carpet mat gebetsteppich islamic mu...,bonballoon,"[home & kitchen, home dcor, area rugs, runners...",all beauty
8867326759,moleskine payne's grey card wallet (moleskine ...,moleskine payne's grey card wallet (moleskine ...,sunatoria,,all beauty
9623402791,hello kitty taupe embossed face wallet loungef...,hello kitty taupe embossed face wallet loungef...,hello_kitty,,all beauty
9742121109,estee lauder resilience lift night firming/scu...,estee lauder resilience lift night firming/scu...,chom,,all beauty
B00004TUBN,better living the dispenser classic chrome iii...,better living the dispenser classic chrome iii,classic,"[home & kitchen, bathroom accessories, holders...",all beauty


(2814882, 5)

<IPython.core.display.Javascript object>

In [5]:
# Drop rows with a missing 'sub_cat'; that column becomes the modeling target
# further down, so rows without it cannot be used
df = df.dropna(subset=["sub_cat"])
df

Unnamed: 0_level_0,text,title,brand,sub_cat,main_cat
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
6090113980,prayer rug carpet mat gebetsteppich islamic mu...,prayer rug carpet mat gebetsteppich islamic mu...,bonballoon,"[home & kitchen, home dcor, area rugs, runners...",all beauty
B00004TUBN,better living the dispenser classic chrome iii...,better living the dispenser classic chrome iii,classic,"[home & kitchen, bathroom accessories, holders...",all beauty
B000050AP3,hamilton beach 05521 trueair 3.5 gal. cool mis...,hamilton beach 05521 trueair 3.5 gal. cool mis...,hamilton_beach,"[home & kitchen, heating, cooling & air qualit...",all beauty
B000050AUD,sonicare pl-4 (4700) sonic toothbrush. you cou...,sonicare pl-4 (4700) sonic toothbrush,philips,"[industrial & scientific, professional dental ...",all beauty
B000050FDE,oral-b professional care 1000 power toothbrush...,oral-b professional care 1000 power toothbrush,oral_b,"[industrial & scientific, professional dental ...",all beauty
...,...,...,...,...,...
B01HJ149LI,god eater resurrection - ps vita [digital code...,god eater resurrection - ps vita [digital code],bandai,"[video games, playstation vita, digital games ...",video games
B01HJ14FDA,jojo eyes of heaven complete bundle - ps4 [dig...,jojo eyes of heaven complete bundle - ps4 [dig...,bandai,"[video games, playstation 4, digital games & d...",video games
B01HJ14OT0,the technomancer - ps4 [digital code]. the tec...,the technomancer - ps4 [digital code],focus_home_interactive,"[video games, playstation 4, digital games & d...",video games
B01HJ14TTA,lego star wars: the force awakens season pass ...,lego star wars: the force awakens season pass ...,warner_bros,"[video games, playstation 4, digital games & d...",video games


<IPython.core.display.Javascript object>

In [6]:
# Count rows per main category and render the counts with in-cell bars
df.main_cat.value_counts().to_frame().style.bar()

Unnamed: 0,main_cat
amazon home,530265
amazon fashion,415093
sports & outdoors,276602
toys & games,225656
automotive,220492
tools & home improvement,211979
cell phones & accessories,110820
computers,85353
grocery,84664
office products,82073


<IPython.core.display.Javascript object>

In [7]:
# Prune rare main categories with a threshold of 1000. The helper prints a
# "Dropped" report of what it removed.
# NOTE(review): presumably the threshold is a minimum row count per category --
# confirm against `tools.utils.prune_categories`.
df = utils.prune_categories(df, "main_cat", 1000)
df

                         Dropped
collectibles & fine art      884
prime pantry                   7


Unnamed: 0_level_0,text,title,brand,sub_cat,main_cat
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
6090113980,prayer rug carpet mat gebetsteppich islamic mu...,prayer rug carpet mat gebetsteppich islamic mu...,bonballoon,"[home & kitchen, home dcor, area rugs, runners...",all beauty
B00004TUBN,better living the dispenser classic chrome iii...,better living the dispenser classic chrome iii,classic,"[home & kitchen, bathroom accessories, holders...",all beauty
B000050AP3,hamilton beach 05521 trueair 3.5 gal. cool mis...,hamilton beach 05521 trueair 3.5 gal. cool mis...,hamilton_beach,"[home & kitchen, heating, cooling & air qualit...",all beauty
B000050AUD,sonicare pl-4 (4700) sonic toothbrush. you cou...,sonicare pl-4 (4700) sonic toothbrush,philips,"[industrial & scientific, professional dental ...",all beauty
B000050FDE,oral-b professional care 1000 power toothbrush...,oral-b professional care 1000 power toothbrush,oral_b,"[industrial & scientific, professional dental ...",all beauty
...,...,...,...,...,...
B01HJ149LI,god eater resurrection - ps vita [digital code...,god eater resurrection - ps vita [digital code],bandai,"[video games, playstation vita, digital games ...",video games
B01HJ14FDA,jojo eyes of heaven complete bundle - ps4 [dig...,jojo eyes of heaven complete bundle - ps4 [dig...,bandai,"[video games, playstation 4, digital games & d...",video games
B01HJ14OT0,the technomancer - ps4 [digital code]. the tec...,the technomancer - ps4 [digital code],focus_home_interactive,"[video games, playstation 4, digital games & d...",video games
B01HJ14TTA,lego star wars: the force awakens season pass ...,lego star wars: the force awakens season pass ...,warner_bros,"[video games, playstation 4, digital games & d...",video games


<IPython.core.display.Javascript object>

In [8]:
# Threshold handed to `utils.prune_categories` for sub-categories.
# NOTE(review): presumably a minimum fraction of rows within each main
# category -- confirm against `tools.utils.prune_categories`.
min_support = 0.001

# Per-category frames are collected here and concatenated once at the end.
# A separate name keeps `pruned_df` from being a list first and a DataFrame later.
pruned_parts = []

for i, (main_cat, cat_df) in enumerate(df.groupby("main_cat")):
    # One row per (product, sub-category) so individual labels can be pruned
    cat_df = cat_df.explode("sub_cat")
    cat_df = utils.prune_categories(
        cat_df,
        "sub_cat",
        min_support,
        show_report=False,
    )
    # Collapse back to one row per product with its surviving sub-categories
    cat_df = utils.implode(cat_df, "sub_cat")

    # Only the held-out half of the split is needed to flag test rows. The
    # training half is not kept, so no stray `train`/`test` names leak into
    # later cells where they mean boolean masks. The seed is the group's
    # position, so each category gets a different but repeatable split.
    holdout = train_test_split(cat_df, random_state=i)[1]
    cat_df["test_set"] = cat_df.index.isin(holdout.index)
    pruned_parts.append(cat_df)

pruned_df = pd.concat(pruned_parts, axis=0)
pruned_df

Unnamed: 0_level_0,text,title,brand,sub_cat,main_cat,test_set
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
6090113980,prayer rug carpet mat gebetsteppich islamic mu...,prayer rug carpet mat gebetsteppich islamic mu...,bonballoon,"[home & kitchen, home dcor]",all beauty,False
B00004TUBN,better living the dispenser classic chrome iii...,better living the dispenser classic chrome iii,classic,"[home & kitchen, bathroom accessories, holders...",all beauty,True
B000050AP3,hamilton beach 05521 trueair 3.5 gal. cool mis...,hamilton beach 05521 trueair 3.5 gal. cool mis...,hamilton_beach,"[home & kitchen, heating, cooling & air quality]",all beauty,False
B000050AUD,sonicare pl-4 (4700) sonic toothbrush. you cou...,sonicare pl-4 (4700) sonic toothbrush,philips,"[industrial & scientific, professional dental ...",all beauty,False
B000050FDE,oral-b professional care 1000 power toothbrush...,oral-b professional care 1000 power toothbrush,oral_b,"[industrial & scientific, professional dental ...",all beauty,True
...,...,...,...,...,...,...
B01HJ149LI,god eater resurrection - ps vita [digital code...,god eater resurrection - ps vita [digital code],bandai,"[video games, playstation vita, digital games ...",video games,False
B01HJ14FDA,jojo eyes of heaven complete bundle - ps4 [dig...,jojo eyes of heaven complete bundle - ps4 [dig...,bandai,"[video games, playstation 4, digital games & d...",video games,True
B01HJ14OT0,the technomancer - ps4 [digital code]. the tec...,the technomancer - ps4 [digital code],focus_home_interactive,"[video games, playstation 4, digital games & d...",video games,False
B01HJ14TTA,lego star wars: the force awakens season pass ...,lego star wars: the force awakens season pass ...,warner_bros,"[video games, playstation 4, digital games & d...",video games,False


<IPython.core.display.Javascript object>


# Modeling
## Train-Test Split

In [9]:
# Binarizer for the multi-label target: it is fitted on the 'sub_cat' lists in
# the next cell to produce one indicator column per sub-category
mlb = MultiLabelBinarizer()

<IPython.core.display.Javascript object>

In [10]:
main_category = "video games"

# Make 'main_cat' the index; the guard lets this cell be re-run safely
if pruned_df.index.name != "main_cat":
    pruned_df.set_index("main_cat", inplace=True)

# Boolean masks for the held-out and training rows of the chosen category
test = pruned_df.loc[main_category, "test_set"]
train = ~test

# Features are the raw text; targets are the binarized sub-category lists
X = pruned_df.loc[main_category, "text"]
y = mlb.fit_transform(pruned_df.loc[main_category, "sub_cat"])

# Apply the masks to features and targets alike
X_train, X_test = X[train], X[test]
y_train, y_test = y[train], y[test]

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((16830,), (5611,), (16830, 50), (5611, 50))

<IPython.core.display.Javascript object>

In [11]:
# Sub-category labels learned by the binarizer (one per column of `y`)
mlb.classes_

array(['40 objects per level', 'accessories', 'accessory kits',
       'batteries & chargers', 'cables', 'cables & adapters',
       'cases & storage', 'chargers', "children's",
       'clothing, shoes & jewelry', 'computers & accessories', 'consoles',
       'controllers', 'currency & subscription cards', 'digital games',
       'digital games & dlc', 'downloadable content', 'electronics',
       'faceplates, protectors & skins', 'game boy advance',
       'game boy color', 'gamecube', 'gamepads & standard controllers',
       'games', 'headsets', 'interactive gaming figures', 'kids & family',
       'nintendo 3ds', 'nintendo 3ds & 2ds', 'nintendo 64', 'nintendo ds',
       'nintendo nes', 'playstation', 'playstation 2', 'playstation 3',
       'playstation 4', 'playstation vita', 'remotes',
       'retro gaming & microconsoles', 'sega dreamcast', 'sega genesis',
       'skins', 'software', 'sony psp', 'super nintendo', 'toys & games',
       'video games', 'wii u', 'xbox 360', 'xbox 

<IPython.core.display.Javascript object>

In [12]:
# Text vectorizer with lowercasing and HTML-entity decoding switched off.
# NOTE(review): presumably the text was already normalized upstream (the
# previews above show lowercase text) -- confirm.
fv = FreqVectorizer(
    lowercase=False,
    decode_html_entities=False,
)

fv

FreqVectorizer(decode_html_entities=False, lowercase=False)

<IPython.core.display.Javascript object>

In [13]:
# Two-step pipeline: the vectorizer followed by a classifier slot. 'cls' starts
# as a "passthrough" placeholder and is filled in later via `pipe.set_params`.
# `memory` points the pipeline's transformer cache at the "pipeline_cache" directory.
pipe = Pipeline(
    [
        ("vec", fv),
        ("cls", "passthrough"),
    ],
    memory="pipeline_cache",
)
pipe

Pipeline(memory='pipeline_cache',
         steps=[('vec',
                 FreqVectorizer(decode_html_entities=False, lowercase=False)),
                ('cls', 'passthrough')])

<IPython.core.display.Javascript object>

In [14]:
# Sanity check: with 'cls' still set to "passthrough", this returns the
# vectorized training matrix
pipe.fit_transform(X_train, y_train)

<16830x55470 sparse matrix of type '<class 'numpy.float64'>'
	with 1889121 stored elements in Compressed Sparse Row format>

<IPython.core.display.Javascript object>

# Baseline RF

In [15]:
def print_report(pipe, X_test=None, y_test=None):
    """Print a per-label classification report for a fitted pipeline.

    Parameters
    ----------
    pipe : estimator
        Fitted estimator exposing `predict`.
    X_test : optional
        Inputs to predict on. When None (the default), the notebook-level
        `X_test` is looked up at call time.
    y_test : optional
        True labels to score against. When None (the default), the
        notebook-level `y_test` is looked up at call time.

    Notes
    -----
    Label names are taken from the notebook-level `mlb.classes_`.
    """
    # Resolve the defaults at call time. Binding them in the signature would
    # freeze whatever `X_test`/`y_test` existed when this cell was run, so
    # re-running the split for another category would silently report on
    # stale data.
    if X_test is None:
        X_test = globals()["X_test"]
    if y_test is None:
        y_test = globals()["y_test"]
    print(
        classification_report(y_test, pipe.predict(X_test), target_names=mlb.classes_)
    )

<IPython.core.display.Javascript object>

In [16]:
# Fix the seed so the baseline numbers are reproducible across kernel restarts
rf = RandomForestClassifier(n_jobs=-1, random_state=0)

# Put the forest into the pipeline's 'cls' slot, fit on the training split,
# and report per-label metrics on the held-out split
pipe.set_params(cls=rf)
pipe.fit(X_train, y_train)
print_report(pipe)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                                 precision    recall  f1-score   support

           40 objects per level       1.00      0.93      0.96        27
                    accessories       0.94      0.92      0.93      1062
                 accessory kits       1.00      0.17      0.29        81
           batteries & chargers       0.97      0.32      0.48        92
                         cables       1.00      0.15      0.26        54
              cables & adapters       1.00      0.13      0.22        71
                cases & storage       1.00      0.41      0.59       116
                       chargers       0.90      0.13      0.23        67
                     children's       1.00      0.43      0.60       109
      clothing, shoes & jewelry       1.00      0.83      0.90        23
        computers & accessories       0.00      0.00      0.00        33
                       consoles       0.94      0.18      0.30        83
                    controllers       0.95      0.

<IPython.core.display.Javascript object>

In [17]:
# Candidate estimators to swap into the pipeline's 'cls' step
candidate_classifiers = [
    ExtraTreesClassifier(),
    RandomForestClassifier(),
    DecisionTreeClassifier(),
    RidgeClassifierCV(),
    KNeighborsClassifier(),
    RadiusNeighborsClassifier(),
    DummyClassifier(strategy="stratified"),
]

# Single-element tuple holding one grid: every classifier is crossed with
# every combination of the vectorizer weighting options
cls_grid = (
    {
        "cls": candidate_classifiers,
        "vec__use_idf": [True, False],
        "vec__binary": [True, False],
        "vec__norm": ["l1", "l2", None],
    },
)


cls_grid

({'cls': [ExtraTreesClassifier(),
   RandomForestClassifier(),
   DecisionTreeClassifier(),
   RidgeClassifierCV(alphas=array([ 0.1,  1. , 10. ])),
   KNeighborsClassifier(),
   RadiusNeighborsClassifier(),
   DummyClassifier(strategy='stratified')],
  'vec__use_idf': [True, False],
  'vec__binary': [True, False],
  'vec__norm': ['l1', 'l2', None]},)

<IPython.core.display.Javascript object>

In [18]:
# Destination stem handed to `selection.sweep`; the recorded run reported the
# saved file as this stem plus ".joblib"
sweep_dst = "sweeps/multilabel_classifier"

if RUN_SWEEPS:
    gs = selection.sweep(
        pipe,
        cls_grid,
        X=X_train,
        y=y_train,
        dst=sweep_dst,
        n_jobs=-1,
        scoring="f1_weighted",
    )
else:
    # Without this branch `gs` is undefined when sweeps are disabled, and this
    # cell plus everything after it fails with a NameError.
    # NOTE(review): assumes the saved file holds the fitted search object --
    # confirm against `selection.sweep`.
    gs = joblib.load(f"{sweep_dst}.joblib")

gs

Fitting 5 folds for each of 84 candidates, totalling 420 fits


 0.66708968 0.66716895 0.68136358 0.67380991 0.69363692 0.69621121
 0.7153277  0.71577855 0.71718018 0.71863208 0.71879868 0.72216398
 0.71292077 0.71845844 0.71748793 0.71681718 0.7222084  0.72208574
 0.91177796 0.91171831 0.91283546 0.91330332 0.91584351 0.91871352
 0.91394281 0.91431842 0.91771399 0.91226247 0.92049479 0.92239214
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
 0.62504635 0.70080573 0.75301039 0.79445163 0.60381919 0.72575729
 0.67361248 0.68342516 0.71495063 0.70637199 0.66150614 0.67880568
 0.50586904 0.50586904        nan        nan        nan        nan
 0.50586904 0.50586904        nan        nan        nan        nan
 0.50520114 0.50608696 0.50568603 0.50779702 0.50686636 0.50762224
 0.50755745 0.50555794 0.50570379 0.50669675 0.50606832 0.50544745]


'sweeps\\multilabel_classifier.joblib'

GridSearchCV(estimator=Pipeline(memory='pipeline_cache',
                                steps=[('vec',
                                        FreqVectorizer(decode_html_entities=False,
                                                       lowercase=False)),
                                       ('cls',
                                        RandomForestClassifier(n_jobs=-1))]),
             n_jobs=-1,
             param_grid=[{'cls': [ExtraTreesClassifier(),
                                  RandomForestClassifier(),
                                  DecisionTreeClassifier(),
                                  RidgeClassifierCV(alphas=array([ 0.1,  1. , 10. ])),
                                  KNeighborsClassifier(),
                                  RadiusNeighborsClassifier(),
                                  DummyClassifier(strategy='stratified')],
                          'vec__binary': [True, False],
                          'vec__norm': ['l1', 'l2', None],
              

<IPython.core.display.Javascript object>

In [19]:
# Condense the raw grid-search results with `selection.prune_cv` and show the
# first 15 rows.
# NOTE(review): the displayed rows appear ordered by rank -- confirm that
# `prune_cv` sorts them.
cv_results = selection.prune_cv(gs.cv_results_)
cv_results.head(15)

Unnamed: 0,param_cls,binary,norm,use_idf,params,mean_fit_time,mean_score,rank_score
0,DecisionTreeClassifier(),False,,False,"{'cls': DecisionTreeClassifier(), 'vec__binary...",26.677168,0.922392,1
1,DecisionTreeClassifier(),False,,True,"{'cls': DecisionTreeClassifier(), 'vec__binary...",25.249241,0.920495,2
2,DecisionTreeClassifier(),True,,False,"{'cls': DecisionTreeClassifier(), 'vec__binary...",28.083394,0.918714,3
3,DecisionTreeClassifier(),False,l2,True,"{'cls': DecisionTreeClassifier(), 'vec__binary...",52.778937,0.917714,4
4,DecisionTreeClassifier(),True,,True,"{'cls': DecisionTreeClassifier(), 'vec__binary...",28.0745,0.915844,5
5,DecisionTreeClassifier(),False,l1,False,"{'cls': DecisionTreeClassifier(), 'vec__binary...",60.487655,0.914318,6
6,DecisionTreeClassifier(),False,l1,True,"{'cls': DecisionTreeClassifier(), 'vec__binary...",67.687819,0.913943,7
7,DecisionTreeClassifier(),True,l2,False,"{'cls': DecisionTreeClassifier(), 'vec__binary...",56.575219,0.913303,8
8,DecisionTreeClassifier(),True,l2,True,"{'cls': DecisionTreeClassifier(), 'vec__binary...",67.734212,0.912835,9
9,DecisionTreeClassifier(),False,l2,False,"{'cls': DecisionTreeClassifier(), 'vec__binary...",59.38002,0.912262,10


<IPython.core.display.Javascript object>