# TRAINING
### DO NOT PUSH CHANGES IN THIS NOTEBOOK
- EXPERIMENTS RESULTS WILL BE SAVED IN STEP 6.
- MODELS WILL BE SAVED IN STEP 9.

In [65]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
import time
from model.text_normalizer import normalize_corpus, stopword_list
from model import evaluation
from model.utils import decoder
from scripts.build_df import build_df
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from lightgbm import LGBMClassifier
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from joblib import dump, load

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 1. Features normalization
Arguments of 'normalization' function can be modified between experiments.
Normalization is performed before the train/test split to save time when trying different models. Because the same step must also run on every prediction, it is added to the pipeline (step 8) as its first stage, wrapped in a transformer class.

In [66]:
# Load the product table; the first CSV column is used as the DataFrame index.
df = pd.read_csv('data/products_v1.csv', index_col=0)

def normalization(input):
    """Normalize a collection of texts with this experiment's settings.

    Forwards `input` to `normalize_corpus` with stemming enabled,
    lemmatization disabled, digits kept, and stopwords taken from
    `stopword_list`. Change the flags below between experiments.

    Returns whatever `normalize_corpus` returns for `input`.
    """
    settings = dict(
        html_stripping=True,
        contraction_expansion=True,
        accented_char_removal=True,
        text_lower_case=True,
        text_stemming=True,
        text_lemmatization=False,
        special_char_removal=True,
        remove_digits=False,
        stopword_removal=True,
        stopwords=stopword_list,
    )
    return normalize_corpus(input, **settings)

# Normalize both text columns in place (values are cast to str first).
for column in ('name', 'description'):
    df[column] = normalization(df[column].apply(str))

# Combined feature: normalized name and description separated by one space.
df['name_and_description'] = [
    ' '.join(pair) for pair in zip(df['name'], df['description'])
]

# Persist the normalized table (without the index) for build_df in step 2.
normalized_data = df.to_csv('data/normalized_data.csv', index=False)

## 2. Labels and features selection
The 'build_df' function returns a new dataset with a custom leaf (label) according to the threshold of minimum products selected per category. X will vary depending on whether we choose name, description or name_and_description as the feature.

In [281]:
# Build the labelled dataset from the category JSON and the normalized CSV,
# then keep its 'leaf' column as the target.
labelled = build_df(
    json_path='data/products.json',
    threshold=100,
    preprocessed_csv='data/normalized_data.csv',
)
y = labelled['leaf']

In [70]:
# Candidate text features, each cast to str.
name, description, name_and_description = (
    df[column].apply(str)
    for column in ('name', 'description', 'name_and_description')
)

# Feature used for this run; swap for `description` or `name_and_description`.
X = name

## 3. Train/test split

In [283]:
# 80/20 split, stratified on the label, with a fixed seed.
split = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split

## 4. Feature engineering
Try different values for max_features and ngram_range in TF-IDF. Also experiment with and without IDF, and with different min_df and max_df values.

In [284]:
# Vectorizer settings for this experiment (uni- to tri-grams, IDF weighting off).
tfidf_options = dict(
    max_features=1500,
    ngram_range=(1, 3),
    use_idf=False,
    min_df=1,
    norm='l2',
    smooth_idf=True,
)
tfid_vectorizer = TfidfVectorizer(**tfidf_options)

# Fit the vocabulary on the training texts only, then apply it to the test texts.
# Note: X_train / X_test are rebound from text to the vectorized matrices here.
X_train = tfid_vectorizer.fit_transform(X_train)
X_test = tfid_vectorizer.transform(X_test)

In [303]:
# Terms kept by the fitted vectorizer (iterating the mapping yields its keys).
list(tfid_vectorizer.vocabulary_)

['conair',
 '2in1',
 'hot',
 'air',
 'brush',
 'white',
 'hp',
 'desktop',
 'intel',
 '4gb',
 'memori',
 '500gb',
 'hard',
 'drive',
 'desktop intel',
 '4gb memori',
 'memori 500gb',
 '500gb hard',
 'hard drive',
 '4gb memori 500gb',
 'memori 500gb hard',
 '500gb hard drive',
 'mb',
 'class',
 'ab',
 'amplifi',
 'black',
 'amplifi black',
 'window',
 'glass',
 '2pack',
 'elit',
 'beat',
 'preown',
 'nintendo',
 'ds',
 'preown nintendo',
 'nintendo ds',
 'preown nintendo ds',
 'la',
 'playstat',
 'preown playstat',
 'canon',
 'dslr',
 'camera',
 'len',
 'dslr camera',
 'len black',
 'mini',
 'purifi',
 'blue',
 'air purifi',
 'amp',
 'decker',
 'classic',
 '4slice',
 'toaster',
 'oven',
 'black amp',
 'amp decker',
 'black amp decker',
 'lg',
 'ultra',
 'slim',
 'dvd',
 'extern',
 'usb',
 'extern usb',
 'drive black',
 '40',
 'diag',
 'led',
 '1080p',
 'smart',
 'hdtv',
 'tv',
 'diag led',
 'led 1080p',
 'tv black',
 'diag led 1080p',
 'modal',
 'folio',
 'case',
 'appl',
 'ipad',
 'fol

In [None]:
# Search space for the pipeline step named 'tfidf' (keys use the
# `<step>__<param>` convention).
parameters = dict(
    tfidf__ngram_range=[(1, 1), (1, 2)],
    tfidf__use_idf=(True, False),
    tfidf__max_df=[0.25, 0.5, 0.75, 1.0],
    tfidf__max_features=[10, 50, 100, 250, 500, 1000, None],
    tfidf__stop_words=('english', None),
    tfidf__smooth_idf=(True, False),
    tfidf__norm=('l1', 'l2', None),
)

## 5. Modeling
Try different classifiers and compare results.

In [98]:
# Candidate classifiers to compare.
logreg = LogisticRegression(
    solver='newton-cg',
    multi_class='multinomial',
    max_iter=7000,
    n_jobs=-1,
)
svc = SVC()
lgbm = LGBMClassifier(objective='multiclass')

In [89]:
xgb = XGBClassifier()

# Hyper-parameter distributions sampled by the randomized search below.
# Single-element lists pin a value; multi-element lists are searched.
param_grid = {
    'colsample_bytree': [0.5],
    'gamma': [0.25, 1],
    'gpu_id': [0],
    'learning_rate': [0.1],
    'max_depth': [7, 9, 15, 20, 30, 50],
    'min_child_weight': [1, 3, 5],
    # XGBoost's multiclass parameter is `num_class`; the previous key
    # `num_classes` is not an XGBoost parameter.
    'num_class': [105],
    'objective': ['multi:softmax'],
    'predictor': ['gpu_predictor'],
    'reg_lambda': [0, 1, 3],
    'scale_pos_weight': [1],
    'subsample': [0.9],
    'tree_method': ['gpu_hist'],
    'booster': ['gbtree', 'dart'],
    'verbosity': [2],
    'num_parallel_tree': [4],
}

# random_state fixes which candidates are sampled, so reruns are comparable
# (same seed as the train/test split).
rand = RandomizedSearchCV(
    xgb,
    param_grid,
    n_jobs=-1,
    cv=3,
    scoring="accuracy",
    random_state=42,
)

In [286]:
# Fit the logistic regression on the training features and labels.
logreg.fit(X_train, y_train)

In [294]:
# Column index of the largest coefficient in row 10 of the fitted coef_ matrix.
logreg.coef_[10].argmax()

263

## 6. Evaluation
Predicting on train to check overfitting when comparing with test.

In [287]:
# Predict on both splits so train and test scores can be compared.
y_pred_train, y_pred_test = (
    logreg.predict(features) for features in (X_train, X_test)
)

In [288]:
# Report metrics on the training split (true labels are passed as `y_test`).
train_report_args = dict(
    predictions=y_pred_train,
    y_test=y_train,
    labels=y,
    vectorizer=tfid_vectorizer,
    model=logreg,
    average='micro',
)
evaluation.get_performance(**train_report_args)

Model Performance metrics:
------------------------------
Accuracy: 0.806515635589118
Precision: 0.806515635589118
Recall: 0.806515635589118
F1 Score: 0.806515635589118

Model Classification report:
------------------------------
                                           precision    recall  f1-score   support

                      3D Printer Filament       0.89      0.99      0.94       190
                  A/V Cables & Connectors       0.73      0.84      0.78       361
                  Action Camcorder Mounts       0.60      0.65      0.62       114
           Activity Trackers & Pedometers       0.81      0.86      0.84       154
              Adapters, Cables & Chargers       0.76      0.80      0.78       284
                         Air Conditioners       0.98      0.94      0.96       114
             Air Purifier Filters & Parts       0.95      0.86      0.90        83
                            Air Purifiers       0.84      0.90      0.87        94
               All Cel

In [290]:
# Same report for the held-out test split. Arguments stay positional, in the
# original order.
evaluation.get_performance(
    y_pred_test, y_test, y, tfid_vectorizer, logreg,
    average='micro',
)

Model Performance metrics:
------------------------------
Accuracy: 0.7820909970958374
Precision: 0.7820909970958374
Recall: 0.7820909970958374
F1 Score: 0.7820909970958374

Model Classification report:
------------------------------
                                           precision    recall  f1-score   support

                      3D Printer Filament       0.85      1.00      0.92        47
                  A/V Cables & Connectors       0.67      0.78      0.72        90
                  Action Camcorder Mounts       0.52      0.57      0.54        28
           Activity Trackers & Pedometers       0.89      0.85      0.87        39
              Adapters, Cables & Chargers       0.63      0.73      0.68        71
                         Air Conditioners       0.96      0.96      0.96        28
             Air Purifier Filters & Parts       1.00      0.76      0.86        21
                            Air Purifiers       0.88      0.88      0.88        24
               All

## 7. Predict on new data
Try the model with new data; in 'production' this will happen through the frontend UI.

In [131]:
# Free-text query to classify.
text = 'speaker phone'

# Reuse the notebook's `normalization` helper so the query is preprocessed with
# exactly the same flags as the training data (no duplicated flag list to keep
# in sync). The helper receives a one-element list, as before.
normalized = normalization([text])

# Vectorize with the fitted TF-IDF vocabulary and predict the leaf category.
vec = tfid_vectorizer.transform(normalized)
pred = logreg.predict(vec)
decoder(pred)

array(['Audio'], dtype='<U5')

Getting predict probabilities for every category to get the top-k predictions.

In [183]:
# Class probabilities for the single query row.
preds = logreg.predict_proba(vec)[0]
classes = logreg.classes_

# Indices sorted by descending probability; keep the three most likely classes.
ranking = np.argsort(preds)[::-1]
top_3 = classes[ranking[:3]]
decoder(top_3)

array(['Audio', 'Home Audio Accessories', 'Speakers'], dtype='<U22')

Finding the path for every leaf detected in the previous step

In [220]:
# Category tree published in the Best Buy open data set (fetched over HTTP).
categories_url = 'https://raw.githubusercontent.com/BestBuyAPIs/open-data-set/master/categories.json'
categories = pd.read_json(categories_url)

In [262]:
# For each predicted category id, look up its path entry and render it as
# "Parent > Child > Leaf" from the 'name' of each element of the path.
paths = []
str_paths = []
for pred in top_3:
    path = categories.path[categories.id == pred].values[0]
    paths.append(path)
    str_paths.append(' > '.join(cat['name'] for cat in path))

In [265]:
# Display the human-readable category paths built above.
str_paths

['Audio', 'Audio > Home Audio Accessories', 'Audio > Home Audio > Speakers']

## 8. Creating a Pipeline
Export best models to call them from the API with one line.

In [266]:
class Normalizer(BaseEstimator, TransformerMixin):
    """Stateless pipeline step that applies `normalize_corpus` to its input."""

    def fit(self, X, y=None):
        """Nothing is learned; return self so the step can be used in a Pipeline."""
        return self

    def transform(self, X, y=None):
        """Return `normalize_corpus` applied to a copy of `X`.

        `X` must provide a `.copy()` method. `y` is accepted and ignored.
        """
        settings = dict(
            html_stripping=True,
            contraction_expansion=True,
            accented_char_removal=True,
            text_lower_case=True,
            text_stemming=True,
            text_lemmatization=False,
            special_char_removal=True,
            remove_digits=False,
            stopword_removal=True,
            stopwords=stopword_list,
        )
        # Work on a copy, as the original code did before normalizing.
        documents = X.copy()
        return normalize_corpus(documents, **settings)
        

In [268]:
# End-to-end pipeline: text normalization -> TF-IDF -> logistic regression.
pipe = Pipeline([
    ('normalizer', Normalizer()),
    ('tfidf', TfidfVectorizer(max_features=1000,
                              ngram_range=(1, 2),
                              use_idf=True,
                              min_df=1,
                              norm='l2',
                              smooth_idf=True)),
    ('logreg', LogisticRegression(max_iter=7000,
                                  n_jobs=-1,
                                  multi_class='multinomial',
                                  solver='newton-cg')),
])

# X_train was rebound to a TF-IDF matrix in step 4, but this pipeline starts
# from text (it has its own normalizer and vectorizer). Rebuild the text
# training split from X / y with the same split settings as step 3 so the
# pipeline is fitted on the same rows.
X_train_text, _, y_train_text, _ = train_test_split(
    X, y,
    test_size=0.20,
    random_state=42,
    stratify=y,
)

# NOTE(review): X holds text that was already normalized in step 1, so the
# Normalizer step processes it a second time here - confirm that is intended,
# or fit on the raw text column instead.
pipe.fit(X_train_text, y_train_text)

## 9. Saving fitted pipeline to file.
This allows the API to call predict when the ml_service receives input data, transform it, and return the top-k full category paths for the user to see in the UI.

In [None]:
# Serialize the fitted pipeline to disk.
dump(pipe, filename='pipe.joblib')