# TRAINING
### DO NOT PUSH CHANGES IN THIS NOTEBOOK
- EXPERIMENTS RESULTS WILL BE SAVED IN STEP 6.
- MODELS WILL BE SAVED IN STEP 9.

In [49]:
import pandas as pd
import numpy as np
import xgboost as xgb
import time
from model.text_normalizer import normalize_corpus, stopword_list
from model import evaluation
from model.utils import decoder
from scripts.build_df import build_df
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from joblib import dump, load

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 1. Features normalization
Arguments of 'normalization' function can be modified between experiments.
Normalization is performed before the train/test split to save time when trying different models. Because the same step must be applied to every input at prediction time, it is also wrapped as a transform and placed first in the pipeline.

In [10]:
# Load data/products_v1.csv, using its first column as the DataFrame index.
df = pd.read_csv('data/products_v1.csv', index_col=0)

def normalization(input):
    """Normalize a corpus of texts with the settings of the current experiment.

    Thin wrapper around ``normalize_corpus`` that keeps every normalization
    flag in one place so it can be tweaked between experiments.

    Parameters
    ----------
    input : iterable of str
        Texts to normalize (the notebook passes a pandas Series of strings).
        NOTE: the name shadows the ``input`` builtin inside this function; it
        is kept so existing callers are unaffected.

    Returns
    -------
    The value returned by ``normalize_corpus`` for ``input``.
    """
    settings = dict(
        html_stripping=True,
        contraction_expansion=True,
        accented_char_removal=True,
        text_lower_case=True,
        text_stemming=True,          # stemming on, lemmatization off
        text_lemmatization=False,
        special_char_removal=True,
        remove_digits=False,
        stopword_removal=True,
        stopwords=stopword_list,
    )
    return normalize_corpus(input, **settings)

# Normalize the two text columns (cells are coerced to str first).
# NOTE: this overwrites the text columns of `df`, so re-running this cell
# normalizes already-normalized text; re-load `df` before re-running it.
df['name'] = normalization(df['name'].apply(str))
df['description'] = normalization(df['description'].apply(str))

# Combined feature: normalized name and description joined by a single space.
df['name_and_description'] = [
    ' '.join(pair) for pair in zip(df['name'], df['description'])
]

# Persist the normalized frame without its index. DataFrame.to_csv returns
# None when given a path, so the result is intentionally not assigned.
df.to_csv('data/normalized_data.csv', index=False)

## 2. Labels and features selection
The `build_df` function returns a new dataset with a custom leaf (label) per product, based on the chosen threshold of minimum products per category. X varies depending on whether we choose name, description or name_and_description as the feature.

In [58]:
# Build the labelled dataset from the category tree and the normalized CSV,
# then keep only the 'leaf' column as the target.
# NOTE(review): assumes the rows returned by build_df line up with `df`,
# which the features are taken from — confirm.
labeled_products = build_df(
    json_path='data/products.json',
    threshold=100,
    preprocessed_csv='data/normalized_data.csv',
    force_leafs_under_threshold=True,
)
y = labeled_products['leaf']

In [59]:
# Candidate feature columns, each coerced to str.
name, description, name_and_description = (
    df[column].apply(str)
    for column in ('name', 'description', 'name_and_description')
)

# Feature used by the current experiment; swap for one of the other two to compare.
X = name

## 3. Train/test split

In [65]:
# Hold out 20% of the rows for testing; the split is stratified on the label
# and seeded so it is reproducible.
split = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split

## 4. Feature engineering
Try different values for max_features and ngram_range in TF-IDF. Also experiment with and without IDF, and with different min_df and max_df values.

In [66]:
# Vectorizer settings for the current experiment.
# With use_idf=False only term frequencies are used, so smooth_idf has no effect here.
tfidf_settings = dict(
    max_features=1000,
    ngram_range=(1, 2),
    use_idf=False,
    min_df=1,
    norm='l2',
    smooth_idf=True,
)
tfid_vectorizer = TfidfVectorizer(**tfidf_settings)

# Fit the vocabulary on the training split only, then apply it to the test split.
# NOTE: X_train / X_test are rebound from text to TF-IDF matrices from here on.
X_train = tfid_vectorizer.fit_transform(X_train)
X_test = tfid_vectorizer.transform(X_test)

In [None]:
# Search space for the TF-IDF step; keys use the '<step>__<param>' form, so
# they target a pipeline step named 'tfidf'.
# NOTE: the grid is only defined here; no search is run in this notebook.
parameters = dict(
    tfidf__ngram_range=[(1, 1), (1, 2)],
    tfidf__use_idf=(True, False),
    tfidf__max_df=[0.25, 0.5, 0.75, 1.0],
    tfidf__max_features=[10, 50, 100, 250, 500, 1000, None],
    tfidf__stop_words=('english', None),
    tfidf__smooth_idf=(True, False),
    tfidf__norm=('l1', 'l2', None),
)

## 5. Modeling
Try different classifiers and compare results.

In [41]:
# Candidate classifiers to compare; only `logreg` is fitted further down.
logreg = LogisticRegression(
    solver='newton-cg',
    multi_class='multinomial',
    max_iter=7000,
    n_jobs=-1,
)
svc = SVC()
lgbm = LGBMClassifier(objective='multiclass')

In [67]:
# Fit the logistic regression on the TF-IDF training matrix and its labels.
logreg.fit(X_train, y_train)

## 6. Evaluation
Predicting on train to check overfitting when comparing with test.

In [68]:
# Predict on both splits: train predictions are only used to gauge
# overfitting against the test predictions.
y_pred_train, y_pred_test = (
    logreg.predict(features) for features in (X_train, X_test)
)

In [70]:
# Report on the TRAIN split (the train labels are passed as `y_test`) to
# compare against the test-split report and spot overfitting.
evaluation.get_performance(
    predictions=y_pred_train,
    y_test=y_train,
    labels=y,
    vectorizer=tfid_vectorizer,
    model=logreg,
    timestamp=time.time(),
    average='micro',
)

Model Performance metrics:
------------------------------
Accuracy: 0.8016748959240972
Precision: 0.8016748959240972
Recall: 0.8016748959240972
F1 Score: 0.8016748959240972

Model Classification report:
------------------------------
                                      precision    recall  f1-score   support

        Action Camcorder Accessories       0.54      0.54      0.54       114
         Adapters, Cables & Chargers       0.72      0.78      0.75       171
                         All Laptops       0.94      0.99      0.96       274
             Apple Watch Accessories       0.93      0.79      0.85       104
       Appliance Parts & Accessories       0.85      0.66      0.74       234
                          Appliances       0.79      0.65      0.71      1154
                               Audio       0.76      0.77      0.77       934
     Binoculars, Telescopes & Optics       0.97      0.97      0.97       149
                            Blenders       0.83      0.90      

In [69]:
# Report on the held-out TEST split. Arguments are passed by keyword, using
# the same parameter names as the train-split report, so both calls read the
# same way and do not depend on the positional order of get_performance.
evaluation.get_performance(
    predictions=y_pred_test,
    y_test=y_test,
    labels=y,
    vectorizer=tfid_vectorizer,
    model=logreg,
    timestamp=time.time(),
    average='micro',
)

Model Performance metrics:
------------------------------
Accuracy: 0.7774443368828654
Precision: 0.7774443368828654
Recall: 0.7774443368828654
F1 Score: 0.7774443368828654

Model Classification report:
------------------------------
                                      precision    recall  f1-score   support

        Action Camcorder Accessories       0.64      0.64      0.64        28
         Adapters, Cables & Chargers       0.71      0.74      0.73        43
                         All Laptops       0.88      0.96      0.92        69
             Apple Watch Accessories       0.86      0.73      0.79        26
       Appliance Parts & Accessories       0.72      0.44      0.55        59
                          Appliances       0.75      0.63      0.68       289
                               Audio       0.77      0.74      0.76       233
     Binoculars, Telescopes & Optics       0.95      1.00      0.97        37
                            Blenders       0.68      0.83      

## 7. Predict on new data
Try the model on new data; in 'production' this will happen through the frontend UI.

In [112]:
# Free-text query to classify.
text = 'storage'

# Reuse the notebook's `normalization` helper instead of repeating every
# normalize_corpus flag, so inference always uses the training-time settings.
normalized = normalization([text])

# Vectorize with the fitted TF-IDF vocabulary, predict, and decode the label.
vec = tfid_vectorizer.transform(normalized)
preds = logreg.predict(vec)
decoder(preds)

array(['other'], dtype='<U5')

## 8. Creating a Pipeline
Export best models to call them from the API with one line.

In [None]:
class Normalizer(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer that applies ``normalize_corpus`` to its input.

    It has no fitted state, so it can be used as the first step of a Pipeline
    to normalize text at prediction time.
    """

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer unchanged."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` normalized by ``normalize_corpus``.

        ``X`` must provide ``.copy()``; the copy is what gets normalized.
        ``y`` is accepted for API compatibility and ignored.
        """
        corpus = X.copy()
        return normalize_corpus(
            corpus,
            html_stripping=True,
            contraction_expansion=True,
            accented_char_removal=True,
            text_lower_case=True,
            text_stemming=True,
            text_lemmatization=False,
            special_char_removal=True,
            remove_digits=False,
            stopword_removal=True,
            stopwords=stopword_list,
        )
        

In [None]:
# End-to-end pipeline: text normalization -> TF-IDF -> logistic regression.
# NOTE(review): the TF-IDF settings here (unigrams, default use_idf) differ
# from the experiment vectorizer in step 4 — confirm this is intended.
pipe = Pipeline([
    ('normalizer', Normalizer()),
    ('tfidf', TfidfVectorizer(max_features=1000,
                              ngram_range=(1, 1))),
    ('logreg', LogisticRegression(max_iter=7000,
                                  n_jobs=-1,
                                  multi_class='multinomial',
                                  solver='newton-cg')),
])

# The pipeline expects text, but X_train was rebound to the TF-IDF matrix in
# step 4. Re-create the text split from X with the same parameters as step 3
# (same seed and stratification, so the same rows land in each split).
# NOTE(review): X holds text already normalized in step 1, so the Normalizer
# step runs on it a second time; fitting on the un-normalized column would
# mirror prediction-time input more closely — confirm which is wanted.
X_train_text, X_test_text, y_train_text, y_test_text = train_test_split(
    X, y,
    test_size=0.20,
    random_state=42,
    stratify=y,
)

pipe.fit(X_train_text, y_train_text)

## 9. Saving fitted pipeline to file.
This allows the API to call predict when the ml_service receives input data: the pipeline transforms the input and returns the top-k full category paths for the user to see in the UI.

In [None]:
# Serialize the pipeline to 'pipe.joblib' (path relative to the working directory).
# NOTE(review): Normalizer is defined in this notebook, so whatever loads this
# file presumably needs the same class importable — confirm.
dump(pipe, 'pipe.joblib') 