# AVG Model

In this notebook, we perform our first exploration of the combination of different models previously trained


In [8]:
import os
os.chdir('/home/app/src/')

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from xgboost import XGBClassifier
import time
from model.text_normalizer import normalize_corpus, stopword_list
from model import evaluation
from model import utils 
from utils.build_df import build_df
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from utils import tree_utils
from sklearn.metrics import top_k_accuracy_score
from sklearn.base import BaseEstimator, TransformerMixin
import joblib

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 0. Labels

In [10]:
cat = build_df(json_path='data/products.json', 
             threshold=100, 
             preprocessed_csv='data/normalized_data.csv'
            ) 

In [11]:
y = cat['leaf']

In [12]:
tree_dict = tree_utils.make_tree(cat, cat['category'], 'Categories', display_tree= True)

Categories
├── pcmcat312300050015
│   ├── pcmcat248700050021
│   │   ├── pcmcat303600050001
│   │   └── pcmcat179100050006
│   │       ├── pcmcat179200050003
│   │       ├── pcmcat179200050008
│   │       │   └── pcmcat748300322875
│   │       └── pcmcat179200050013
│   ├── abcat0802000
│   │   ├── abcat0811011
│   │   └── abcat0802001
│   │       └── pcmcat159300050002
│   ├── abcat0805000
│   │   └── abcat0511001
│   │       └── pcmcat266500050030
│   ├── pcmcat275600050000
│   │   └── abcat0807000
│   │       ├── abcat0807001
│   │       ├── pcmcat335400050008
│   │       └── abcat0807009
│   ├── abcat0809000
│   │   ├── abcat0809004
│   │   └── abcat0809002
│   ├── pcmcat249700050006
│   │   ├── pcmcat219100050010
│   │   ├── pcmcat286300050020
│   │   └── pcmcat272800050000
│   ├── pcmcat254000050002
│   │   └── pcmcat308100050020
│   │       └── pcmcat340500050007
│   └── pcmcat341100050005
│       └── pcmcat253700050018
│           └── pcmcat248300050003
├── other
├── abcat03000

## 1. Model A (BL0): Trained with tittle

### Features

In [13]:
df = pd.read_csv('data/normalized_data.csv')

In [14]:
df

Unnamed: 0,name,description,nm_and_desc,category,image,name_and_description
0,duracel aaa batteri 4pack,compat select electron devic aaa size duralock...,Duracell - AAA Batteries (4-Pack) Compatible w...,"[{'id': 'pcmcat312300050015', 'name': 'Connect...",http://www.bestbuy.com/site/duracell-aaa-batte...,duracel aaa batteri 4pack compat select electr...
1,duracel aa 15v coppertop batteri 4pack,longlast energi duralock power preserv technol...,Duracell - AA 1.5V CopperTop Batteries (4-Pack...,"[{'id': 'pcmcat312300050015', 'name': 'Connect...",http://www.bestbuy.com/site/duracell-aa-1-5v-c...,duracel aa 15v coppertop batteri 4pack longlas...
2,duracel aa batteri 8pack,compat select electron devic aa size duralock ...,Duracell - AA Batteries (8-Pack) Compatible wi...,"[{'id': 'pcmcat312300050015', 'name': 'Connect...",http://www.bestbuy.com/site/duracell-aa-batter...,duracel aa batteri 8pack compat select electro...
3,energ max batteri aa 4pack,4pack aa alkalin batteri batteri tester includ,Energizer - MAX Batteries AA (4-Pack) 4-pack A...,"[{'id': 'pcmcat312300050015', 'name': 'Connect...",http://www.bestbuy.com/site/energizer-max-batt...,energ max batteri aa 4pack 4pack aa alkalin ba...
4,duracel c batteri 4pack,compat select electron devic c size duralock p...,Duracell - C Batteries (4-Pack) Compatible wit...,"[{'id': 'pcmcat312300050015', 'name': 'Connect...",http://www.bestbuy.com/site/duracell-c-batteri...,duracel c batteri 4pack compat select electron...
...,...,...,...,...,...,...
51641,honeywel true hepa replac filter select honeyw...,compat select honeywel air purifi model captur...,Honeywell - True HEPA Replacement Filters for ...,"[{'id': 'abcat0900000', 'name': 'Appliances'},...",http://www.bestbuy.com/site/honeywell-true-hep...,honeywel true hepa replac filter select honeyw...
51642,dyson hard floor wipe dyson hard dc56 vacuum 1...,remov dirt grime hard floor cloth materi 12 wipe,Dyson - Hard Floor Wipes for Dyson Hard DC56 V...,"[{'id': 'abcat0900000', 'name': 'Appliances'},...",http://www.bestbuy.com/site/dyson-hard-floor-w...,dyson hard floor wipe dyson hard dc56 vacuum 1...
51643,aleratec drive enclosur intern black,1 x total bay 1 x 25 bay,Aleratec - Drive Enclosure - Internal - Black ...,"[{'id': 'abcat0500000', 'name': 'Computers & T...",http://www.bestbuy.com/site/aleratec-drive-enc...,aleratec drive enclosur intern black 1 x total...
51644,amazon fire tv stick,stream 1080p content dualband dualantenna wifi...,Amazon - Fire TV Stick Streams 1080p content; ...,"[{'id': 'abcat0100000', 'name': 'TV & Home The...",http://www.bestbuy.com/site/amazon-fire-tv-sti...,amazon fire tv stick stream 1080p content dual...


In [15]:
name = df['name'].apply(str)
description = df['description'].apply(str)
name_and_description = df['name_and_description'].apply(str)
image = df['image']

In [16]:
X_a = name

### Train/Test split

In [17]:
X_a_train, X_a_test, y_train, y_test = train_test_split(
    X_a, y,
    test_size=0.20, 
    random_state=42,
    stratify = y
)

In [18]:
X_a_train.head()

7029                conair suprem 2in1 hot air brush white
26164    hp slimlin desktop intel pentium 4gb memori 50...
46217    mb quart discu 1200w class sq ab bridgeabl 2ch...
13187                  sabr window glass alarm 2pack white
41483                   elit beat agent preown nintendo ds
Name: name, dtype: object

### Feature engineering

In [19]:
tfid_vectorizer_BL0 = TfidfVectorizer(max_features=5000,
                                      ngram_range=(1, 3),
                                      use_idf=False,
                                     ) 
tfid_vectorizer_BL0.fit(X_a_train)
#joblib.dump(tfid_vectorizer_BL0, '/home/app/src/model/vect_BL0')

In [20]:
X_a_train = tfid_vectorizer_BL0.transform(X_a_train)
X_a_test = tfid_vectorizer_BL0.transform(X_a_test)

In [21]:
X_a_train[0]

<1x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 6 stored elements in Compressed Sparse Row format>

### Modelling

In [22]:
logreg_BL0 = LogisticRegression(max_iter=7000, 
                            n_jobs=-1, 
                            multi_class='multinomial', 
                            solver='newton-cg',
                            random_state=42)

In [23]:
logreg_BL0.fit(X_a_train, y_train)

In [24]:
# save the model 
# filename = '/home/app/src/model/model_BL0'
# joblib.dump(logreg_BL0, filename)

In [25]:
y_pred_a = logreg_BL0.predict(X_a_test)
y_pred_a.shape

(10330,)

In [26]:
y_pred_a_prob = logreg_BL0.predict_proba(X_a_test)

In [27]:
y_pred_a_prob.shape

(10330, 213)

In [28]:
len(y_pred_a_prob[0])

213

### Evaluation

In [29]:
evaluation.get_performance(model=logreg_BL0,
                           pred_labels=y_pred_a, 
                           true_labels=y_test,
                           vectorizer=tfid_vectorizer_BL0,
                           probs=y_pred_a_prob,
                           average='micro',
                           tree= tree_dict)

Model Performance metrics:
------------------------------
Accuracy: 0.8191674733785091
Precision: 0.8191674733785091
Recall: 0.8191674733785091
F1 Score: 0.8191674733785091
Average distance between nodes categories: 0.40609874152952563
Top 5 Score: 0.9613746369796708

Model Classification report:
------------------------------
                                           precision    recall  f1-score   support

                      3D Printer Filament       0.89      1.00      0.94        47
                  A/V Cables & Connectors       0.68      0.81      0.74        90
                  Action Camcorder Mounts       0.59      0.61      0.60        28
           Activity Trackers & Pedometers       0.94      0.85      0.89        39
              Adapters, Cables & Chargers       0.66      0.72      0.69        71
                         Air Conditioners       1.00      0.96      0.98        28
             Air Purifier Filters & Parts       0.94      0.81      0.87        21
      

## 2. Model B: Trained with name and descriptions

In [30]:
X_b = name_and_description
X_b.head()

0    duracel aaa batteri 4pack compat select electr...
1    duracel aa 15v coppertop batteri 4pack longlas...
2    duracel aa batteri 8pack compat select electro...
3    energ max batteri aa 4pack 4pack aa alkalin ba...
4    duracel c batteri 4pack compat select electron...
Name: name_and_description, dtype: object

In [31]:
X_b_train, X_b_test, y_train, y_test = train_test_split(
    X_b, y,
    test_size=0.20, 
    random_state=42,
    stratify = y
)

In [32]:
X_b_train.head()

7029     conair suprem 2in1 hot air brush white 150 wat...
26164    hp slimlin desktop intel pentium 4gb memori 50...
46217    mb quart discu 1200w class sq ab bridgeabl 2ch...
13187    sabr window glass alarm 2pack white sabr windo...
41483    elit beat agent preown nintendo ds prepar rock...
Name: name_and_description, dtype: object

Feature engeneetring

In [33]:
tfid_vectorizer_BL1 = TfidfVectorizer(max_features=3000,
                                  ngram_range=(1, 2),
                                  use_idf=False,
                                  min_df=1,
                                  norm='l2',
                                  smooth_idf=True
                                 )

In [34]:
tfid_vectorizer_BL1.fit(X_b_train)
#joblib.dump(tfid_vectorizer_BL1, '/home/app/src/model/vect_BL1')

In [35]:
X_b_train = tfid_vectorizer_BL1.transform(X_b_train)
X_b_test = tfid_vectorizer_BL1.transform(X_b_test)

In [36]:
X_b_train[0]

<1x3000 sparse matrix of type '<class 'numpy.float64'>'
	with 22 stored elements in Compressed Sparse Row format>

### Modelling

In [37]:
logreg_BL1 = LogisticRegression(max_iter=2500, 
                            n_jobs=-1, 
                            multi_class='multinomial', 
                            solver='newton-cg',
                            random_state=42)

In [38]:
logreg_BL1.fit(X_b_train, y_train)

In [39]:
# save the model to disk
# filename = '/home/app/src/model/model_BL1'
# joblib.dump(logreg_BL1, filename)

In [40]:
y_pred_b = logreg_BL1.predict(X_b_test)

In [41]:
y_pred_b_prob = logreg_BL1.predict_proba(X_b_test)

### Evaluation

In [42]:
evaluation.get_performance(model=logreg_BL1,
                           pred_labels=y_pred_b, 
                           true_labels=y_test,
                           vectorizer=tfid_vectorizer_BL1,
                           probs=y_pred_b_prob,
                           average='micro',
                           tree= tree_dict)

Model Performance metrics:
------------------------------
Accuracy: 0.8207163601161666
Precision: 0.8207163601161666
Recall: 0.8207163601161666
F1 Score: 0.8207163601161666
Average distance between nodes categories: 0.3940948693126815
Top 5 Score: 0.965053242981607

Model Classification report:
------------------------------
                                           precision    recall  f1-score   support

                      3D Printer Filament       0.87      1.00      0.93        47
                  A/V Cables & Connectors       0.68      0.84      0.76        90
                  Action Camcorder Mounts       0.64      0.75      0.69        28
           Activity Trackers & Pedometers       0.95      0.95      0.95        39
              Adapters, Cables & Chargers       0.67      0.73      0.70        71
                         Air Conditioners       0.96      0.96      0.96        28
             Air Purifier Filters & Parts       1.00      0.86      0.92        21
        

## 3. Model C: Trained with predicted probabilities given by models A and B

In [43]:
y_pred_prob_C = np.array([(prob1 + prob2) * 0.5 for prob1, prob2 in zip(y_pred_a_prob, y_pred_b_prob)])

In [44]:
y_pred_prob_C.shape

(10330, 213)

In [45]:
tk_c = top_k_accuracy_score(y_test, y_pred_prob_C, k=5)
tk_a = top_k_accuracy_score(y_test,y_pred_a_prob, k=5)
tk_b = top_k_accuracy_score(y_test,y_pred_b_prob, k=5)
print(f""" 
Top K=5 accuracy score:
----------------------
Model A: {tk_a}
Model B: {tk_b}
Model C(avg): {tk_c}
""")


 
Top K=5 accuracy score:
----------------------
Model A: 0.9613746369796708
Model B: 0.965053242981607
Model C(avg): 0.9706679574056147



## Extracting categories for the API

In [46]:
# all models were trained on the same label, so if indifferent the classes of which model we take 
labels = logreg_BL1.classes_


In [60]:
def get_feat_max(cat_prob, prod_idx, max_k_feat, classes):
    """Given a array of predicted probability of classes for one product returns a dictionary with the names of the k classes with the highest probability"""
    most_prob_cat_idx = np.argsort(-cat_prob[prod_idx])[:max_k_feat]
    name_cat_max= []
    
    for idx in most_prob_cat_idx:
      nm_cat = classes[idx]
      name_cat_max.append(nm_cat)

    dict_max_feat = {}
    for items in range(len(name_cat_max)):
        dict_max_feat[str(items+1)] = utils.decode_id_path(name_cat_max[items])

    return dict_max_feat 

In [61]:
#Model A
prob_cat_max_a = np.sort(-y_pred_a_prob[0])[:5]
most_prob_cat_idx_a = np.argsort(-y_pred_a_prob[0])[:5]
print(most_prob_cat_idx_a)
print(prob_cat_max_a)

[101  92  16  39  60]
[-0.75610682 -0.0212806  -0.00728905 -0.00705707 -0.00622776]


In [64]:
#Model B
prob_cat_max_b = np.sort(-y_pred_b_prob[0])[:5]
most_prob_cat_idx_b = np.argsort(-y_pred_b_prob[0])[:5]
print(most_prob_cat_idx_b)
print(prob_cat_max_b)

[101  39  16  13  92]
[-0.8483222  -0.04935319 -0.01089907 -0.00571699 -0.00557107]


In [63]:
most_prob_cat_idx_C = np.argsort(-y_pred_prob_C[0])[:5]
print(most_prob_cat_idx_C)

[101  39  92  16  13]


In [65]:
# categories model A
dict_a = get_feat_max(cat_prob= y_pred_a_prob,
                      prod_idx= 0,
                      max_k_feat=5,
                      classes= labels)
dict_a

{'1': ['Musical Instruments', 'Keyboards'],
 '2': ['other'],
 '3': ['Musical Instruments', 'Musical Instrument Accessories'],
 '4': ['Computers & Tablets',
  'Computer Accessories & Peripherals',
  'Mice & Keyboards',
  'Computer Keyboards'],
 '5': ['Cell Phones',
  'Cell Phone Accessories',
  'Cell Phone Batteries & Power']}

In [66]:
### categories model B
dict_b = get_feat_max(cat_prob= y_pred_b_prob,
                      prod_idx= 0,
                      max_k_feat=5,
                      classes= labels)
dict_b

{'1': ['Musical Instruments', 'Keyboards'],
 '2': ['Computers & Tablets',
  'Computer Accessories & Peripherals',
  'Mice & Keyboards',
  'Computer Keyboards'],
 '3': ['Musical Instruments', 'Musical Instrument Accessories'],
 '4': ['Musical Instruments'],
 '5': ['other']}

In [67]:
#categories model C
dict_c = get_feat_max(cat_prob= y_pred_prob_C,
                      prod_idx= 0,
                      max_k_feat=5,
                      classes= labels)
dict_c

{'1': ['Musical Instruments', 'Keyboards'],
 '2': ['Computers & Tablets',
  'Computer Accessories & Peripherals',
  'Mice & Keyboards',
  'Computer Keyboards'],
 '3': ['other'],
 '4': ['Musical Instruments', 'Musical Instrument Accessories'],
 '5': ['Musical Instruments']}

In [49]:
name_sample = "Casio - Portable Keyboard with 61 Touch-Sensitive Keys - Black/Silver "
descr_sample = "CASIO Portable Keyboard with 61 Touch-Sensitive Keys: MIDI and USB connectivity; 600 AHL keyboard voices; 180 rhythms; 152 songs; auto accompaniment"
true_label_sample = 'Keyboards'
