# TRAINING
### DO NOT PUSH CHANGES IN THIS NOTEBOOK
- EXPERIMENT RESULTS WILL BE SAVED IN STEP 6.
- MODELS WILL BE SAVED IN STEP 9.

In [65]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
import time
from model.text_normalizer import normalize_corpus, stopword_list
from model import evaluation
from model.utils import decoder
from scripts.build_df import build_df
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from lightgbm import LGBMClassifier
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from joblib import dump, load

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 1. Features normalization
Arguments of 'normalization' function can be modified between experiments.
Normalization is performed before the train/test split to save time when trying different models. Because the same step must be applied to every input at prediction time, it is also wrapped as a transformer and placed first in the pipeline (see step 8).

In [66]:
# Load the product table; the first CSV column becomes the DataFrame index.
df = pd.read_csv('data/products_v1.csv', index_col=0)

def normalization(input):
    """Run `normalize_corpus` over `input` with this experiment's settings.

    The flags below are the knobs to change between experiments. Whatever
    `normalize_corpus` returns is handed back unchanged.
    """
    settings = dict(
        html_stripping=True,
        contraction_expansion=True,
        accented_char_removal=True,
        text_lower_case=True,
        text_stemming=True,
        text_lemmatization=False,
        special_char_removal=True,
        remove_digits=False,
        stopword_removal=True,
        stopwords=stopword_list,
    )
    return normalize_corpus(input, **settings)

# Normalize both text columns in place (values are cast to str first).
# NOTE(review): this overwrites the raw text, so re-running the cell feeds
# already-normalized text through the normalizer again.
df['name'] = normalization(df['name'].apply(str))
df['description'] = normalization(df['description'].apply(str))

# Combined feature: normalized name and description joined by a single space.
df['name_and_description'] = [' '.join(i) for i in zip(df['name'], df['description'])]

# Persist for build_df in step 2. to_csv returns None when given a path, so
# the result is no longer bound to a misleading `normalized_data` name.
df.to_csv('data/normalized_data.csv', index=False)

## 2. Labels and features selection
The `build_df` function returns a new dataset with a custom leaf (label) per product, chosen according to the threshold of minimum products per category. X varies depending on whether we choose name, description or name_and_description as the feature.

In [67]:
# Build the labelled dataset from the category JSON and the normalized CSV
# written in step 1, then keep only its 'leaf' column as the target.
labelled = build_df(
    json_path='data/products.json',
    threshold=100,
    preprocessed_csv='data/normalized_data.csv',
    force_leafs_under_threshold=True,
)
y = labelled['leaf']

In [70]:
# Candidate feature columns, each cast to str element-wise.
name, description, name_and_description = (
    df[column].apply(str)
    for column in ('name', 'description', 'name_and_description')
)

# Feature used for this experiment; swap in one of the other two to compare.
X = name

## 3. Train/test split

In [96]:
# 80/20 split, stratified on the label, with a fixed seed.
split = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split

## 4. Feature engineering
Try different values for max_features and ngram_range in TF-IDF. Also experiment with and without IDF, and with different min_df and max_df (document-frequency) bounds.

In [97]:
# TF-IDF settings for this experiment.
tfidf_settings = dict(
    max_features=1000,
    ngram_range=(1, 2),
    use_idf=True,
    min_df=1,
    norm='l2',
    smooth_idf=True,
)
tfid_vectorizer = TfidfVectorizer(**tfidf_settings)

# Fit the vocabulary on the training text only, then reuse it for the test
# text. From here on X_train / X_test hold the vectorized matrices, not text.
X_train = tfid_vectorizer.fit_transform(X_train)
X_test = tfid_vectorizer.transform(X_test)

In [None]:
# Candidate TF-IDF settings; the 'tfidf__' prefix addresses a step named
# 'tfidf' inside a Pipeline.
parameters = dict(
    tfidf__ngram_range=[(1, 1), (1, 2)],
    tfidf__use_idf=(True, False),
    tfidf__max_df=[0.25, 0.5, 0.75, 1.0],
    tfidf__max_features=[10, 50, 100, 250, 500, 1000, None],
    tfidf__stop_words=('english', None),
    tfidf__smooth_idf=(True, False),
    tfidf__norm=('l1', 'l2', None),
)

## 5. Modeling
Try different classifiers and compare results.

In [98]:
# Candidate classifiers to compare.
logreg_settings = dict(
    max_iter=7000,
    n_jobs=-1,
    multi_class='multinomial',
    solver='newton-cg',
)
logreg = LogisticRegression(**logreg_settings)
svc = SVC()
lgbm = LGBMClassifier(objective='multiclass')

In [89]:
xgb = XGBClassifier()

# Search space for the XGBoost random search; single-element lists pin a value.
# The former 'num_classes' entry is removed: XGBoost spells that parameter
# 'num_class', and XGBClassifier derives it from the labels during fit.
param_grid = {
    'colsample_bytree': [0.5],
    'gamma': [0.25, 1],
    'gpu_id': [0],
    'learning_rate': [0.1],
    'max_depth': [7, 9, 15, 20, 30, 50],
    'min_child_weight': [1, 3, 5],
    'objective': ['multi:softmax'],
    'predictor': ['gpu_predictor'],
    'reg_lambda': [0, 1, 3],
    'scale_pos_weight': [1],
    'subsample': [0.9],
    'tree_method': ['gpu_hist'],
    'booster': ['gbtree', 'dart'],
    'verbosity': [2],
    'num_parallel_tree': [4],
}

# random_state makes the sampled candidates repeatable across runs (same seed
# as the train/test split).
# NOTE(review): n_jobs=-1 runs fits in parallel while every candidate targets
# gpu_id 0 — confirm the GPU can host concurrent fits.
rand = RandomizedSearchCV(
    xgb,
    param_grid,
    n_jobs=-1,
    cv=3,
    scoring="accuracy",
    random_state=42,
)

In [99]:
# Fit the logistic regression on the TF-IDF training matrix from step 4.
logreg.fit(X_train, y_train)

In [None]:
# Run the XGBoost random search on the same training matrix.
rand.fit(X_train, y_train)

In [88]:
# Only available once rand.fit(...) above has completed.
rand.best_params_

AttributeError: 'RandomizedSearchCV' object has no attribute 'best_params_'

## 6. Evaluation
Predicting on train to check overfitting when comparing with test.

In [100]:
# Predict on both splits so the train and test metrics can be compared.
y_pred_train, y_pred_test = (
    logreg.predict(features) for features in (X_train, X_test)
)

In [81]:
# Train-set metrics. The logged model must be the estimator that produced
# y_pred_train (logreg), not the XGBoost search object `rand`.
evaluation.get_performance(predictions=y_pred_train, 
                           y_test=y_train,
                           labels=y,
                           vectorizer=tfid_vectorizer,
                           model=logreg, 
                           timestamp=time.time(), 
                           average='micro')

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Model Performance metrics:
------------------------------
Accuracy: 0.352793106786717
Precision: 0.352793106786717
Recall: 0.352793106786717
F1 Score: 0.352793106786717

Model Classification report:
------------------------------
                                      precision    recall  f1-score   support

        Action Camcorder Accessories       0.00      0.00      0.00       114
         Adapters, Cables & Chargers       0.53      0.72      0.61       171
                         All Laptops       0.59      0.14      0.22       274
             Apple Watch Accessories       0.00      0.00      0.00       104
       Appliance Parts & Accessories       0.30      0.16      0.21       234
                          Appliances       0.50      0.06      0.10      1154
                               Audio       0.76      0.30      0.43       934
     Binoculars, Telescopes & Optics       0.00      0.00      0.00       149
                            Blenders       0.00      0.00      0.00

  _warn_prf(average, modifier, msg_start, len(result))


In [82]:
# Test-set metrics. The logged model must be the estimator that produced
# y_pred_test (logreg), not the XGBoost search object `rand`.
evaluation.get_performance(y_pred_test, 
                           y_test, 
                           y,
                           tfid_vectorizer,
                           logreg, 
                           timestamp=time.time(), 
                           average='micro')

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Model Performance metrics:
------------------------------
Accuracy: 0.48751210067763795
Precision: 0.48751210067763795
Recall: 0.48751210067763795
F1 Score: 0.48751210067763795

Model Classification report:
------------------------------
                                      precision    recall  f1-score   support

        Action Camcorder Accessories       0.00      0.00      0.00        28
         Adapters, Cables & Chargers       0.49      0.74      0.59        43
                         All Laptops       0.67      0.93      0.78        69
             Apple Watch Accessories       0.00      0.00      0.00        26
       Appliance Parts & Accessories       0.42      0.37      0.39        59
                          Appliances       0.68      0.09      0.15       289
                               Audio       0.73      0.57      0.64       233
     Binoculars, Telescopes & Optics       0.00      0.00      0.00        37
                            Blenders       0.00      0.00  

  _warn_prf(average, modifier, msg_start, len(result))


## 7. Predict on new data
Try the model with new data. In 'production' this input will come from the frontend UI.

In [131]:
text = 'speaker phone'

# Reuse the step-1 helper so new text always gets the same normalization
# flags as the training data, even after those flags are changed.
normalized = normalization([text])

# Vectorize with the fitted TF-IDF vocabulary, predict, and decode the label.
vec = tfid_vectorizer.transform(normalized)
pred = logreg.predict(vec)
decoder(pred)

array(['Audio'], dtype='<U5')

Get the predicted probabilities for every category to obtain the top-k predictions.

In [183]:
# Class probabilities for the single input row.
preds = logreg.predict_proba(vec)[0]
classes = logreg.classes_

# Class positions ordered from most to least probable; keep the first three.
ranking = np.argsort(preds)[::-1]
top_3 = classes[ranking][:3]
decoder(top_3)

array(['Audio', 'Home Audio Accessories', 'Speakers'], dtype='<U22')

Finding the path for every leaf detected in the previous step

In [220]:
# Category table fetched over the network from the BestBuy open data set.
categories = pd.read_json('https://raw.githubusercontent.com/BestBuyAPIs/open-data-set/master/categories.json')

In [262]:
# Path stored in the category table for each predicted category id.
# Comprehensions keep the loop variables local, so the step-7 `pred` result
# is no longer overwritten by this cell.
paths = [
    categories.path[categories.id == leaf_id].values[0]
    for leaf_id in top_3
]

# Render each path as a 'Parent > Child > Leaf' breadcrumb.
str_paths = [
    ' > '.join(node['name'] for node in chain)
    for chain in paths
]

In [265]:
# Show the breadcrumb string built for each of the top-3 predictions.
str_paths

['Audio', 'Audio > Home Audio Accessories', 'Audio > Home Audio > Speakers']

## 8. Creating a Pipeline
Export best models to call them from the API with one line.

In [None]:
class Normalizer(BaseEstimator, TransformerMixin):
    """Stateless scikit-learn transformer that applies `normalize_corpus`.

    Meant to sit first in a Pipeline so incoming text is normalized with
    these flags before it is vectorized.

    NOTE(review): a pipeline dumped with joblib refers to this class by
    import path — confirm the loading service can import or define it.
    """

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return `normalize_corpus` applied to a copy of `X`."""
        corpus = X.copy()
        return normalize_corpus(
            corpus,
            html_stripping=True,
            contraction_expansion=True,
            accented_char_removal=True,
            text_lower_case=True,
            text_stemming=True,
            text_lemmatization=False,
            special_char_removal=True,
            remove_digits=False,
            stopword_removal=True,
            stopwords=stopword_list,
        )
        

In [None]:
pipe = Pipeline([('normalizer', Normalizer()), 
                 ('tfidf', TfidfVectorizer(max_features=1000, 
                                           ngram_range=(1, 1))),
                 ('logreg', LogisticRegression(max_iter=7000, 
                                               n_jobs=-1, 
                                               multi_class='multinomial', 
                                               solver='newton-cg'))
                ])

# The pipeline normalizes and vectorizes internally, so it must be fitted on
# text. X_train was rebound to its TF-IDF matrix in step 4, so rebuild the
# step-3 split from X with the same seed and stratification.
# NOTE(review): X was already normalized in step 1, so training text passes
# through the normalizer twice — confirm normalize_corpus is idempotent, or
# fit on the raw text instead.
X_train_text, _, y_train_text, _ = train_test_split(
    X, y,
    test_size=0.20,
    random_state=42,
    stratify=y
)

pipe.fit(X_train_text, y_train_text)

## 9. Saving fitted pipeline to file.
This allows the API to call predict when the ml_service receives input data: the pipeline transforms the input and returns the top-k full category paths for the user to see in the UI.

In [None]:
# Serialize the fitted pipeline to 'pipe.joblib' in the working directory.
dump(pipe, 'pipe.joblib') 