In [1]:
import time
from pathlib import Path

import numpy as np
import pandas as pd
from joblib import dump, load
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

from model import evaluation
from model.text_normalizer import normalize_corpus, stopword_list
from model.utils import decoder
from scripts import tree_utils
from scripts.build_df import build_df

%load_ext autoreload
%autoreload 2

  from pandas import MultiIndex, Int64Index
[nltk_data] Downloading package stopwords to /home/app/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## 1. The dataset 



### 1.1. 

In [2]:
# Build the product dataframe straight from the raw JSON: threshold of 0 and
# no preprocessed CSV supplied.
df = build_df(
    json_path='data/products.json',
    threshold=0,
    preprocessed_csv=None,
)

In [3]:
# Preview the first five rows of the freshly built dataframe.
df.head(n=5)

Unnamed: 0,name,description,leaf,max_depth,path,image,nm_and_desc,category,category_0,category_1,category_2,category_3,category_4,category_5,category_6
0,Duracell - AAA Batteries (4-Pack),Compatible with select electronic devices; AAA...,abcat0208002,4,"[pcmcat312300050015, pcmcat248700050021, pcmca...",http://www.bestbuy.com/site/duracell-aaa-batte...,Duracell - AAA Batteries (4-Pack) Compatible w...,"[{'id': 'pcmcat312300050015', 'name': 'Connect...",pcmcat312300050015,pcmcat248700050021,pcmcat303600050001,abcat0208002,,,
1,Duracell - AA 1.5V CopperTop Batteries (4-Pack),Long-lasting energy; DURALOCK Power Preserve t...,abcat0208002,4,"[pcmcat312300050015, pcmcat248700050021, pcmca...",http://www.bestbuy.com/site/duracell-aa-1-5v-c...,Duracell - AA 1.5V CopperTop Batteries (4-Pack...,"[{'id': 'pcmcat312300050015', 'name': 'Connect...",pcmcat312300050015,pcmcat248700050021,pcmcat303600050001,abcat0208002,,,
2,Duracell - AA Batteries (8-Pack),Compatible with select electronic devices; AA ...,abcat0208002,4,"[pcmcat312300050015, pcmcat248700050021, pcmca...",http://www.bestbuy.com/site/duracell-aa-batter...,Duracell - AA Batteries (8-Pack) Compatible wi...,"[{'id': 'pcmcat312300050015', 'name': 'Connect...",pcmcat312300050015,pcmcat248700050021,pcmcat303600050001,abcat0208002,,,
3,Energizer - MAX Batteries AA (4-Pack),4-pack AA alkaline batteries; battery tester i...,abcat0208002,4,"[pcmcat312300050015, pcmcat248700050021, pcmca...",http://www.bestbuy.com/site/energizer-max-batt...,Energizer - MAX Batteries AA (4-Pack) 4-pack A...,"[{'id': 'pcmcat312300050015', 'name': 'Connect...",pcmcat312300050015,pcmcat248700050021,pcmcat303600050001,abcat0208002,,,
4,Duracell - C Batteries (4-Pack),Compatible with select electronic devices; C s...,abcat0208002,4,"[pcmcat312300050015, pcmcat248700050021, pcmca...",http://www.bestbuy.com/site/duracell-c-batteri...,Duracell - C Batteries (4-Pack) Compatible wit...,"[{'id': 'pcmcat312300050015', 'name': 'Connect...",pcmcat312300050015,pcmcat248700050021,pcmcat303600050001,abcat0208002,,,


`build_df()` adds columns with the category path, depth level, and leaf to the original data. We keep only the original columns plus the added `nm_and_desc` column and save them in a CSV file called `data/products_v1.csv`.

**Note**: the new columns created by the function but not saved in the file relate to the target label; we deal with them in later steps.

## 2. Features Normalization 

In [4]:
# Persist the text columns together with category and image; the remaining
# columns of `df` are not written to this file.
columns_to_save = ['name', 'description', 'nm_and_desc', 'category', 'image']
df[columns_to_save].to_csv('data/products_v1.csv', index=False)

`name`, `description`, and their concatenation are the columns we use to train our different models, so they must be in normalized form. To do so, we apply our `normalization()` function to `name` and `description`, join the results into a new `name_and_description` column, and save the normalized data in another CSV file: `data/normalized_data.csv`.

In this way, the normalized data is available for different training processes avoiding unnecessary runs of the `normalization()` function. Re-running is reserved only for those cases when we want to try a different kind of normalization.

In [5]:
# Reload the columns saved in `data/products_v1.csv`; this replaces the
# dataframe previously bound to `df`.
df = pd.read_csv(filepath_or_buffer='data/products_v1.csv')

def normalization(input):
    """Normalize a text corpus with this notebook's fixed set of options.

    Thin wrapper around `normalize_corpus` that pins every option so that all
    text columns are processed with the same settings.

    Parameters
    ----------
    input
        Corpus to normalize; in this notebook a pandas Series of strings.
        NOTE(review): the name shadows the built-in `input`; it is kept so
        existing callers are unaffected.

    Returns
    -------
    The value returned by `normalize_corpus` for the given corpus.
    """
    # Stemming is switched on and lemmatization off; digits are kept.
    options = dict(
        html_stripping=True,
        contraction_expansion=True,
        accented_char_removal=True,
        text_lower_case=True,
        text_stemming=True,
        text_lemmatization=False,
        special_char_removal=True,
        remove_digits=False,
        stopword_removal=True,
        stopwords=stopword_list,
    )
    return normalize_corpus(input, **options)

# Normalize the features.
# NOTE(review): `.apply(str)` turns missing values into the literal string
# 'nan' before normalization — confirm no name/description is missing.
df['name'] = normalization(df['name'].apply(str))
df['description'] = normalization(df['description'].apply(str))
# Join the normalized name and description with a single space.
df['name_and_description'] = [
    ' '.join(name_desc_pair)
    for name_desc_pair in zip(df['name'], df['description'])
]

# Save the normalized data in a csv file
#### WARNING ####
# IF YOU ARE GOING TO TRY ANOTHER KIND OF NORMALIZATION, PLEASE CHANGE THE NAME OF THE OUTPUT FILE IN ORDER TO AVOID OVERWRITING

# `DataFrame.to_csv` returns None when it is given a path, so the result is
# deliberately not assigned to a variable.
df.to_csv('data/normalized_data.csv', index=False)

In future training runs you can skip the normalization step and load the saved file directly:

```python

df = pd.read_csv('data/normalized_data.csv')

name = df['name'].apply(str)
description = df['description'].apply(str)
name_and_description = df['name_and_description'].apply(str)

# Select X depending on which data you are going to use to train your model
X = name
```

## 3. Labels and features selection
`X` will vary depending on whether we choose `name`, `description` or `name_and_description` as the feature.

**Features:**

In [6]:
# Cast each candidate feature column to `str`.
name, description, name_and_description = (
    df[column].apply(str)
    for column in ('name', 'description', 'name_and_description')
)
# Feature used for training in this notebook.
X = name

**Labels**

`build_df()` function returns a new dataset with custom leaf (label) according to the threshold of min. products selected per category.

Call `build_df()` to extract the labels 

In [7]:
# Rebuild the dataframe with a threshold of 100, this time passing the
# normalized CSV as the preprocessed input.
cat = build_df(
    json_path='data/products.json',
    threshold=100,
    preprocessed_csv='data/normalized_data.csv',
)

In [8]:
# Target labels: the `leaf` column of the thresholded dataframe.
# NOTE(review): `X` is taken from `df` while `y` is taken from `cat`; the split
# below needs both to have the same rows in the same order — confirm that
# `build_df(threshold=100)` does not drop or reorder rows.
y = cat.loc[:, 'leaf']

Recreating the hierarchical structure of our categories applying our `make_tree()` function

We extracted the nodes from the same dataframe generated by `build_df()`

In [10]:
# Build the category tree from the same dataframe the labels come from and
# print it (`display_tree=True`).
tree_dict = tree_utils.make_tree(
    cat,
    cat['category'],
    'Categories',
    display_tree=True,
)

Categories
├── pcmcat312300050015
│   ├── pcmcat248700050021
│   │   ├── pcmcat303600050001
│   │   └── pcmcat179100050006
│   │       ├── pcmcat179200050003
│   │       ├── pcmcat179200050008
│   │       │   └── pcmcat748300322875
│   │       └── pcmcat179200050013
│   ├── abcat0802000
│   │   ├── abcat0811011
│   │   └── abcat0802001
│   │       └── pcmcat159300050002
│   ├── abcat0805000
│   │   └── abcat0511001
│   │       └── pcmcat266500050030
│   ├── pcmcat275600050000
│   │   └── abcat0807000
│   │       ├── abcat0807001
│   │       ├── pcmcat335400050008
│   │       └── abcat0807009
│   ├── abcat0809000
│   │   ├── abcat0809004
│   │   └── abcat0809002
│   ├── pcmcat249700050006
│   │   ├── pcmcat219100050010
│   │   ├── pcmcat286300050020
│   │   └── pcmcat272800050000
│   ├── pcmcat254000050002
│   │   └── pcmcat308100050020
│   │       └── pcmcat340500050007
│   └── pcmcat341100050005
│       └── pcmcat253700050018
│           └── pcmcat248300050003
├── other
├── abcat03000

**IMPORTANT**:
- Generate the labels and the tree in the **same step**. Otherwise you will not be able to get the distance between predicted and true categories when applying the `get_performance()` function.

- `make_tree()` prints the tree if you set `display_tree=True`. With `display_tree=False` it only generates the tree structure (without printing it) and the dictionary of nodes.

In [23]:
# Same call as for `tree_dict`, but with printing disabled
# (`display_tree=False`).
tree_dict2 = tree_utils.make_tree(
    cat,
    cat['category'],
    'Categories',
    display_tree=False,
)

## 4. Train/test split

In [11]:
# Hold out 20% of the samples for testing, stratified by label, with a fixed
# seed so the split is reproducible.
split = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split

## 5. Feature engineering
Try different values for `max_features` and `ngram_range` in TF-IDF. Also experiment with and without IDF (`use_idf`) and with different minimum and maximum document-frequency values (`min_df`, `max_df`).

In [12]:
# Term-frequency features over 1- to 3-grams, capped at 1500 features.
# With `use_idf=False` no IDF weighting is applied, so `smooth_idf` has no
# effect; it is kept explicit to make experiments with `use_idf=True` easy.
tfid_vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    max_features=1500,
    min_df=1,
    use_idf=False,
    smooth_idf=True,
    norm='l2',
)

# Fit on the training texts only, then reuse the fitted vectorizer for the
# test texts.
# NOTE: `X_train`/`X_test` are overwritten with their vectorized versions, so
# re-run the train/test split cell before re-running this one.
X_train = tfid_vectorizer.fit_transform(X_train)
X_test = tfid_vectorizer.transform(X_test)

## 6. Modeling
In this sample notebook we train a logistic regressor 

In [13]:
# Multinomial logistic regression.
# `multi_class='multinomial'` is no longer passed explicitly: the argument is
# deprecated in recent scikit-learn releases, and with the `newton-cg` solver
# the default already uses the multinomial formulation for multi-class targets.
logreg = LogisticRegression(
    solver='newton-cg',
    max_iter=2000,
    n_jobs=-1,
)

In [14]:
# Train the classifier on the vectorized training set.
logreg.fit(X=X_train, y=y_train)

## 7. Evaluation

In [15]:
# Predict a label for every sample in the vectorized test set.
y_pred_test = logreg.predict(X=X_test)

We now apply `get_performance()` function, from which we can obtain all the relevant metrics. All the information is saved in a new folder containing different files in `model/experiments`.

The name of each experiment folder corresponds to the date and time on which the experiment was performed

**Note**: 13/12/22 version of this function adds tree functionality giving the distance between predicted and true labels. It also calculates the average value of such distance.

In [28]:
# Report the metrics for the test predictions. `tree_dict` is passed so the
# distance between predicted and true categories can be computed.
# Note that with `average='micro'` the precision, recall and F1 printed below
# are all equal to the accuracy.
evaluation.get_performance(
    model=logreg,
    vectorizer=tfid_vectorizer,
    true_labels=y_test,
    pred_labels=y_pred_test,
    average='micro',
    tree=tree_dict,
)

Model Performance metrics:
------------------------------
Accuracy: 0.7820909970958374
Precision: 0.7820909970958374
Recall: 0.7820909970958374
F1 Score: 0.7820909970958374
Average distance between nodes categories: 0.4915779283639884

Model Classification report:
------------------------------
                                           precision    recall  f1-score   support

                      3D Printer Filament       0.85      1.00      0.92        47
                  A/V Cables & Connectors       0.67      0.78      0.72        90
                  Action Camcorder Mounts       0.52      0.57      0.54        28
           Activity Trackers & Pedometers       0.89      0.85      0.87        39
              Adapters, Cables & Chargers       0.63      0.73      0.68        71
                         Air Conditioners       0.96      0.96      0.96        28
             Air Purifier Filters & Parts       1.00      0.76      0.86        21
                            Air Purifie

## 8. EDA post prediction

In [29]:
# Load the labels file written by the most recent `get_performance()` run.
# Each run creates a new `exp<timestamp>` folder, so a hard-coded folder name
# goes stale as soon as the notebook is re-run; pick the newest file instead.
EXPERIMENTS_DIR = Path('/home/app/src/model/experiments')
label_files = sorted(
    EXPERIMENTS_DIR.glob('exp*/labels.csv'),
    key=lambda path: path.stat().st_mtime,
)
if not label_files:
    raise FileNotFoundError(
        "No 'labels.csv' found under {}; run get_performance() first.".format(EXPERIMENTS_DIR)
    )
df_labels = pd.read_csv(label_files[-1])

In [30]:
# Preview the first five predicted/true label pairs and their tree distance.
df_labels.head(n=5)

Unnamed: 0,pred_cat,true_cat,pred_cat_dec,true_cat_dec,dist
0,pcmcat151600050037,pcmcat151600050037,Keyboards,Keyboards,0
1,abcat0912008,abcat0912008,Coffee Pods,Coffee Pods,0
2,abcat0507000,abcat0507000,Computer Cards & Components,Computer Cards & Components,0
3,pcmcat183800050006,pcmcat183800050006,Laptop Batteries,Laptop Batteries,0
4,pcmcat152100050038,pcmcat152100050020,Microphones,Recording Equipment,3


In [31]:
# Count how many test samples fall at each distance value.
df_labels.loc[:, 'dist'].value_counts()

0    8079
1     698
2     671
3     534
4     304
5      44
Name: dist, dtype: int64