# TRAINING
### DO NOT PUSH CHANGES IN THIS NOTEBOOK
- EXPERIMENTS RESULTS WILL BE SAVED IN STEP 6.
- MODELS WILL BE SAVED IN STEP 9.

In [659]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
import time
from model.text_normalizer import normalize_corpus, stopword_list
from model import evaluation
from model.utils import decoder
from scripts.build_df import build_df
from scripts.tree_utils import make_tree, dist_nodes
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from lightgbm import LGBMClassifier
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from joblib import dump, load


%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 1. Features normalization
Arguments of 'normalization' function can be modified between experiments.
Normalization is perfomed before train/test split to save time when trying different models. To fit the pipeline this step will be applied in each prediction so it is added in it as a wrapped transform function in first place/

In [662]:
df = pd.read_csv('data/products_v1.csv', index_col=0)

def normalization(text):
    output = normalize_corpus(
        text,
        html_stripping=True,
        contraction_expansion=True,
        accented_char_removal=True,
        text_lower_case=True,
        text_stemming=True,
        text_lemmatization=False,
        special_char_removal=True,
        remove_digits=False,
        stopword_removal=True,
        stopwords=stopword_list
    )       
    return output

df['name'] = normalization(df['name'].apply(str))
df['description'] = normalization(df['description'].apply(str))
df['name_and_description'] = [' '.join(i) for i in zip(df['name'], df['description'])]
normalized_data = df.to_csv('data/normalized_data.csv', index=False)

## 2. Labels and features selection
'build_df' function returns a new dataset with custom leaf (label) according to the threshold of min. products selected per category. X will vary depending if we choose name, description or name_and_description as feature.

In [669]:
df_100 = build_df(json_path='data/products.json', 
             threshold=100, 
             preprocessed_csv='data/normalized_data.csv'
            )

In [670]:
df_0 = build_df(json_path='data/products.json', 
             threshold=0, 
             preprocessed_csv='data/normalized_data.csv'
            ) 

In [671]:
name = df['name']
description = df['description']
name_and_description = df['name_and_description']
y_0 = df_0['leaf']
y_100 = df_100['leaf']

## 3. Train/test split

In [672]:
X_train_0, X_test_0, y_train_0, y_test_0 = train_test_split(
    name, y_0,
    test_size=0.20, 
    random_state=42,
    stratify = y 
)

In [673]:
X_train_100, X_test_100, y_train_100, y_test_100 = train_test_split(
    name, y_100,
    test_size=0.20, 
    random_state=42,
    stratify = y
)

## 4. Feature engineering
Try different values for max_features and ngram_range in TF-IDF. Also experimenting with and without IDF and min and max idf values.

In [714]:
tfid_vectorizer = TfidfVectorizer(max_features = 5000, 
                                 ngram_range = (1,3), 
                                 use_idf=False)

X_train_0_vec = tfid_vectorizer.fit_transform(X_train_0)
X_test_0_vec = tfid_vectorizer.transform(X_test_0)

In [716]:
tfid_vectorizer = TfidfVectorizer(max_features = 5000, 
                                 ngram_range = (1,3), 
                                 use_idf=False)

X_train_100_vec = tfid_vectorizer.fit_transform(X_train_100)
X_test_100_vec = tfid_vectorizer.transform(X_test_100)

## 5. Modeling
Try different classifiers and compare results.

In [98]:
logreg = LogisticRegression(max_iter=7000, 
                            n_jobs=-1, 
                            multi_class='multinomial', 
                            solver='newton-cg')
svc = SVC()
lgbm = LGBMClassifier(objective='multiclass')

In [89]:
xgb = XGBClassifier()

param_grid={'colsample_bytree': [0.5],
                    'gamma': [0.25, 1], 
                    'gpu_id': [0],
                    'learning_rate': [0.1],
                    'max_depth': [7, 9, 15, 20, 30, 50],
                    'min_child_weight': [1, 3, 5],
                    'num_classes': [105],
                    'objective': ['multi:softmax'],
                    'predictor': ['gpu_predictor'],
                    'reg_lambda': [0, 1, 3],
                    'scale_pos_weight': [1],
                    'subsample': [0.9],
                    'tree_method': ['gpu_hist'],
                    'booster': ['gbtree', 'dart'],
                    'verbosity': [2],
                    'num_parallel_tree': [4]
                    }

rand = RandomizedSearchCV(xgb, param_grid, n_jobs=-1, cv=3, scoring="accuracy")

In [717]:
logreg.fit(X_train_0_vec, y_train_0)

In [725]:
logreg.fit(X_train_100_vec, y_train_100)

## 6. Evaluation
Predicting on train to check overfitting when comparing with test.

In [720]:
y_pred_train_0 = logreg.predict(X_train_0_vec)
y_pred_test_0 = logreg.predict(X_test_0_vec)

In [746]:
y_pred_train_100 = logreg.predict(X_train_100_vec)
y_pred_test_100 = logreg.predict(X_test_100_vec)

In [751]:
probs = logreg.predict_proba(X_test_100_vec)

In [None]:
tree_dict = make_tree(df_100, df_100['category'], 'Categories', display_tree= True)

In [750]:
evaluation.get_performance(logreg,
                           y_pred_train_100, 
                           y_train_100,
                           probs,
                           'micro',
                           tree_dict,
                           tfid_vectorizer
                           )

Model Performance metrics:
------------------------------
Accuracy: 0.8526962919934166
Precision: 0.8526962919934166
Recall: 0.8526962919934166
F1 Score: 0.8526962919934166
Average distance between nodes categories: 0.32919450091974056
Top 5 Score: 0.03439345531997289

Model Classification report:
------------------------------
                                           precision    recall  f1-score   support

                      3D Printer Filament       0.94      0.97      0.96       190
                  A/V Cables & Connectors       0.75      0.87      0.81       361
                  Action Camcorder Mounts       0.67      0.75      0.71       114
           Activity Trackers & Pedometers       0.83      0.88      0.85       154
              Adapters, Cables & Chargers       0.77      0.81      0.79       284
                         Air Conditioners       0.98      0.96      0.97       114
             Air Purifier Filters & Parts       0.90      0.87      0.88        83
     

In [752]:
evaluation.get_performance(logreg,
                           y_pred_test_100, 
                           y_test_100,
                           probs,
                           'micro',
                           tree_dict,
                           tfid_vectorizer
                           )

Model Performance metrics:
------------------------------
Accuracy: 0.8191674733785091
Precision: 0.8191674733785091
Recall: 0.8191674733785091
F1 Score: 0.8191674733785091
Average distance between nodes categories: 0.40609874152952563
Top 5 Score: 0.031945788964181994

Model Classification report:
------------------------------
                                           precision    recall  f1-score   support

                      3D Printer Filament       0.89      1.00      0.94        47
                  A/V Cables & Connectors       0.68      0.81      0.74        90
                  Action Camcorder Mounts       0.59      0.61      0.60        28
           Activity Trackers & Pedometers       0.94      0.85      0.89        39
              Adapters, Cables & Chargers       0.66      0.72      0.69        71
                         Air Conditioners       1.00      0.96      0.98        28
             Air Purifier Filters & Parts       0.94      0.81      0.87        21
    

In [290]:
evaluation.get_performance(y_pred_test, 
                           y_test, 
                           y_0,
                           tfid_vectorizer,
                           logreg, 
                           average='micro')

Model Performance metrics:
------------------------------
Accuracy: 0.7820909970958374
Precision: 0.7820909970958374
Recall: 0.7820909970958374
F1 Score: 0.7820909970958374

Model Classification report:
------------------------------
                                           precision    recall  f1-score   support

                      3D Printer Filament       0.85      1.00      0.92        47
                  A/V Cables & Connectors       0.67      0.78      0.72        90
                  Action Camcorder Mounts       0.52      0.57      0.54        28
           Activity Trackers & Pedometers       0.89      0.85      0.87        39
              Adapters, Cables & Chargers       0.63      0.73      0.68        71
                         Air Conditioners       0.96      0.96      0.96        28
             Air Purifier Filters & Parts       1.00      0.76      0.86        21
                            Air Purifiers       0.88      0.88      0.88        24
               All

In [None]:
rand.predict(X_train, y_train)

Analyzing results from baseline logreg model with 'names'.

In [393]:
results_train = pd.read_csv('model/experiments/exp2022-12-12 12:28:42.619746/results.csv')
results_train.iloc[:213,0] = results_train.iloc[:213,0]

In [408]:
bad_score = results_train.iloc[:213,:].sort_values(by='f1-score').head(10)

In [457]:
comparison = pd.DataFrame()
comparison['label'], comparison['prediction'] = y_train, y_pred_train

In [458]:
comparison['features'] = tfid_vectorizer.inverse_transform(X_train)

In [467]:
check = comparison[comparison.prediction != comparison.label]

In [474]:
check

Unnamed: 0,label,prediction,features
13187,pcmcat254000050002,abcat0712000,"[white, window, glass, 2pack]"
32127,abcat0507000,pcmcat186100050005,"[drive, black, lg, ultra, slim, dvd, extern, u..."
41193,pcmcat226900050013,pcmcat748301695443,"[la, cuisin, pan, cream, la cuisin]"
43863,abcat0700000,other,"[soni, one, video]"
41789,pcmcat191200050015,pcmcat214700050000,"[case, appl, case appl, speck, candyshel, grip..."
...,...,...,...
45430,abcat0700000,pcmcat296300050018,[playstat]
9879,abcat0807009,other,"[black, cartridg, cartridg black, panason]"
43763,pcmcat195200050001,pcmcat233000050008,"[black, len, len black, nikon, af]"
45987,abcat0515042,abcat0107000,"[black, digit]"


In [None]:
check.label = decoder(check.label)
check.prediction = decoder(check.prediction)

In [661]:
make_tree(df2, df2.category, 'Categories')

Categories
├── pcmcat312300050015
│   ├── pcmcat248700050021
│   │   ├── pcmcat303600050001
│   │   └── pcmcat179100050006
│   │       ├── pcmcat179200050003
│   │       ├── pcmcat179200050008
│   │       │   └── pcmcat748300322875
│   │       └── pcmcat179200050013
│   ├── abcat0802000
│   │   ├── abcat0811011
│   │   └── abcat0802001
│   │       └── pcmcat159300050002
│   ├── abcat0805000
│   │   └── abcat0511001
│   │       └── pcmcat266500050030
│   ├── pcmcat275600050000
│   │   └── abcat0807000
│   │       ├── abcat0807001
│   │       ├── pcmcat335400050008
│   │       └── abcat0807009
│   ├── abcat0809000
│   │   ├── abcat0809004
│   │   └── abcat0809002
│   ├── pcmcat249700050006
│   │   ├── pcmcat219100050010
│   │   ├── pcmcat286300050020
│   │   └── pcmcat272800050000
│   ├── pcmcat254000050002
│   │   └── pcmcat308100050020
│   │       └── pcmcat340500050007
│   └── pcmcat341100050005
│       └── pcmcat253700050018
│           └── pcmcat248300050003
├── other
├── abcat03000

269

In [521]:
check.head(50)

Unnamed: 0,label,prediction,features
13187,pcmcat254000050002,abcat0712000,"[white, window, glass, 2pack]"
32127,abcat0507000,pcmcat186100050005,"[drive, black, lg, ultra, slim, dvd, extern, u..."
41193,pcmcat226900050013,pcmcat748301695443,"[la, cuisin, pan, cream, la cuisin]"
43863,abcat0700000,other,"[soni, one, video]"
41789,pcmcat191200050015,pcmcat214700050000,"[case, appl, case appl, speck, candyshel, grip..."
22542,pcmcat344400050007,pcmcat242000050002,"[air, appl, ipad, appl ipad, ipad air, appl ip..."
14751,pcmcat152100050027,pcmcat309300050003,"[audio, red, music, turntabl]"
41997,pcmcat367400050001,abcat0916000,"[black, kit, select, kit select, delux, ge, pr..."
32789,abcat0107000,abcat0107015,"[cabl, audioquest, inwal, ethernet, blackr]"
50308,abcat0200000,abcat0100000,"[black, refurbish, portabl, cd, player, player..."


## 7. Predict on new data
Try the model with new data, this will happen in the frontend ready UI in 'production'

In [131]:
text = 'speaker phone'
normalized = normalize_corpus(
        [text],
        html_stripping=True,
        contraction_expansion=True,
        accented_char_removal=True,
        text_lower_case=True,
        text_stemming=True,
        text_lemmatization=False,
        special_char_removal=True,
        remove_digits=False,
        stopword_removal=True,
        stopwords=stopword_list,
    )
vec = tfid_vectorizer.transform(normalized)
pred = logreg.predict(vec)
decoder(pred)

array(['Audio'], dtype='<U5')

Getting predict probabilities for every category to get the top-k predictions.

In [183]:
preds = logreg.predict_proba(vec)[0]
classes = logreg.classes_
top_3 = classes[np.argsort(preds)[::-1]][:3]
decoder(top_3)

array(['Audio', 'Home Audio Accessories', 'Speakers'], dtype='<U22')

Finding the path for every leaf detected in the previous step

In [220]:
categories = pd.read_json('https://raw.githubusercontent.com/BestBuyAPIs/open-data-set/master/categories.json')

In [262]:
paths = []
for pred in top_3:
    paths.append(categories.path[categories.id == pred].values[0])
    
str_paths = []
for path in paths:
    partial = []
    for cat in path:
        partial.append(cat['name'])
    str_paths.append(' > '.join(partial))

In [265]:
str_paths

['Audio', 'Audio > Home Audio Accessories', 'Audio > Home Audio > Speakers']

## 8. Creating a Pipeline
Export best models to call them from the API with one line.

In [266]:
class Normalizer(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y=None):
        return self
        
    def transform(self, X, y=None):
        X = X.copy()
        normalized_desc = normalize_corpus(
        X,
        html_stripping=True,
        contraction_expansion=True,
        accented_char_removal=True,
        text_lower_case=True,
        text_stemming=True,
        text_lemmatization=False,
        special_char_removal=True,
        remove_digits=False,
        stopword_removal=True,
        stopwords=stopword_list,
    )
        return normalized_desc
        

In [268]:
pipe = Pipeline([('normalizer', Normalizer()), 
                 ('tfidf', TfidfVectorizer(max_features=1000, 
                                  ngram_range=(1, 2),
                                  use_idf=True,
                                  min_df=1,
                                  norm='l2',
                                  smooth_idf=True
                                  )),
                 ('logreg', LogisticRegression(max_iter=7000, 
                                               n_jobs=-1, 
                                               multi_class='multinomial', 
                                               solver='newton-cg'))
                ])

pipe.fit(X_train,y_train)

## 9. Saving fitted pipeline to file.
This will allow to call predict in the API when the ml_service receives input data, transform it and return the top-k output full category path for the user to see it in the UI.

In [None]:
dump(pipe, 'pipe.joblib') 