In [86]:
import pandas as pd
import numpy as np
import time
from xgboost import XGBClassifier
from model.text_normalizer import normalize_corpus, stopword_list
from model import evaluation
from model.utils import decoder
from scripts.build_df import build_df
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from scripts import tree_utils
from sklearn.base import BaseEstimator, TransformerMixin
from joblib import dump, load
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import RandomizedSearchCV

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
# Build the product frame from data/products.json; no preprocessed CSV is
# supplied on this pass.
# NOTE(review): the meaning of threshold=100 is defined inside build_df — confirm.
df = build_df(
    json_path='data/products.json',
    threshold=100,
    preprocessed_csv=None,
)

In [11]:
# Snapshot the four selected columns to CSV, leaving the row index out.
df[['name', 'description', 'category', 'image']].to_csv(
    'data/products_v1.csv',
    index=False,
)

In [12]:
# Start this cell from the CSV snapshot on disk rather than from whatever
# `df` currently holds in the kernel.
df = pd.read_csv(filepath_or_buffer='data/products_v1.csv')

def normalization(input):
    """Run ``normalize_corpus`` over ``input`` with this notebook's fixed settings.

    Enabled steps: HTML stripping, contraction expansion, accented-character
    removal, lower-casing, stemming, special-character removal and stopword
    removal (using ``stopword_list``).  Lemmatization and digit removal are
    switched off.

    Parameters
    ----------
    input
        The corpus handed to ``normalize_corpus`` as its first argument.

    Returns
    -------
    Whatever ``normalize_corpus`` returns for that corpus.
    """
    settings = dict(
        html_stripping=True,
        contraction_expansion=True,
        accented_char_removal=True,
        text_lower_case=True,
        text_stemming=True,
        text_lemmatization=False,
        special_char_removal=True,
        remove_digits=False,
        stopword_removal=True,
        stopwords=stopword_list,
    )
    return normalize_corpus(input, **settings)

# Normalize both text fields. str() is applied to every value first so the
# normalizer only ever receives text.
df['name'] = normalization(df['name'].apply(str))
df['description'] = normalization(df['description'].apply(str))

# One combined text field per product: "<name> <description>".
df['name_and_description'] = [
    ' '.join(pair) for pair in zip(df['name'], df['description'])
]

# DataFrame.to_csv returns None when it is given a path, so the former
# `normalized_data = df.to_csv(...)` always bound None. Write the file first,
# then keep the name pointing at the frame that was actually saved.
df.to_csv('data/normalized_data.csv', index=False)
normalized_data = df

In [47]:
# Pull the three text columns out as string-valued Series.
name, description, name_and_description = (
    df[column].apply(str)
    for column in ('name', 'description', 'name_and_description')
)

# The combined name + description text is the model input.
X = name_and_description

In [88]:
# Second build_df pass, this time pointing it at the normalized CSV that was
# written to data/normalized_data.csv.
cat = build_df(
    json_path='data/products.json',
    threshold=100,
    preprocessed_csv='data/normalized_data.csv',
)

In [153]:
# Targets are the columns from position 6 onward (rendered in the recorded
# output as category_0 … category_6); missing levels get the 'Unknown' sentinel.
y = cat.iloc[:, 6:].fillna(value='Unknown')

In [155]:
# Show the target frame (pandas truncates the rendered rows).
y

Unnamed: 0,category_0,category_1,category_2,category_3,category_4,category_5,category_6
0,pcmcat312300050015,pcmcat248700050021,pcmcat303600050001,Unknown,Unknown,Unknown,Unknown
1,pcmcat312300050015,pcmcat248700050021,pcmcat303600050001,Unknown,Unknown,Unknown,Unknown
2,pcmcat312300050015,pcmcat248700050021,pcmcat303600050001,Unknown,Unknown,Unknown,Unknown
3,pcmcat312300050015,pcmcat248700050021,pcmcat303600050001,Unknown,Unknown,Unknown,Unknown
4,pcmcat312300050015,pcmcat248700050021,pcmcat303600050001,Unknown,Unknown,Unknown,Unknown
...,...,...,...,...,...,...,...
51641,abcat0900000,abcat0916000,pcmcat303700050016,Unknown,Unknown,Unknown,Unknown
51642,abcat0900000,abcat0916000,abcat0916008,Unknown,Unknown,Unknown,Unknown
51643,abcat0500000,abcat0515000,abcat0504001,pcmcat186100050005,Unknown,Unknown,Unknown
51644,abcat0100000,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown


In [143]:
# Build the category tree under the root label 'Categories' and print it
# (display_tree=True); the returned structure is kept for evaluation.
tree_dict = tree_utils.make_tree(
    cat,
    cat['category'],
    'Categories',
    display_tree=True,
)

Categories
├── pcmcat312300050015
│   ├── pcmcat248700050021
│   │   ├── pcmcat303600050001
│   │   └── pcmcat179100050006
│   │       ├── pcmcat179200050003
│   │       ├── pcmcat179200050008
│   │       │   └── pcmcat748300322875
│   │       └── pcmcat179200050013
│   ├── abcat0802000
│   │   ├── abcat0811011
│   │   └── abcat0802001
│   │       └── pcmcat159300050002
│   ├── abcat0805000
│   │   └── abcat0511001
│   │       └── pcmcat266500050030
│   ├── pcmcat275600050000
│   │   └── abcat0807000
│   │       ├── abcat0807001
│   │       ├── pcmcat335400050008
│   │       └── abcat0807009
│   ├── abcat0809000
│   │   ├── abcat0809004
│   │   └── abcat0809002
│   ├── pcmcat249700050006
│   │   ├── pcmcat219100050010
│   │   ├── pcmcat286300050020
│   │   └── pcmcat272800050000
│   ├── pcmcat254000050002
│   │   └── pcmcat308100050020
│   │       └── pcmcat340500050007
│   └── pcmcat341100050005
│       └── pcmcat253700050018
│           └── pcmcat248300050003
├── other
├── abcat03000

In [156]:
# Hold out 20% of the rows for testing. The split is seeded for repeatability
# and stratified on the label frame.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
    stratify=y,
)

In [157]:
# L2-normalised term-frequency features over unigrams and bigrams.
# use_idf=False disables IDF re-weighting, so smooth_idf has no effect here.
# NOTE(review): a float min_df is a document-frequency proportion, so 0.1
# keeps only terms present in at least 10% of documents — confirm this
# vocabulary cut is intended.
tfid_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=0.1,
    max_features=None,
    use_idf=False,
    smooth_idf=True,
    norm='l2',
)

# The vocabulary is learned from the training split only and then reused,
# unchanged, to encode the test split.
X_train = tfid_vectorizer.fit_transform(X_train)
X_test = tfid_vectorizer.transform(X_test)

In [13]:
# Multinomial logistic-regression baseline (newton-cg solver, up to 2000
# iterations, all available cores).
logreg = LogisticRegression(
    solver='newton-cg',
    multi_class='multinomial',
    max_iter=2000,
    n_jobs=-1,
)

In [136]:
# Stand-alone gradient-boosted tree classifier with hand-picked settings.
xgb = XGBClassifier(
    booster='gbtree',
    # tree shape / split control
    max_depth=5,
    min_child_weight=1,
    gamma=0.25,
    # learning rate and regularisation
    learning_rate=0.2,
    reg_lambda=3,
    scale_pos_weight=0.5,
    # row / column subsampling
    subsample=0.7,
    colsample_bytree=0.7,
    # runtime
    verbosity=1,
    n_jobs=-1,
)

In [159]:
# One-vs-rest wrapper: one XGBoost model per label column.
# NOTE(review): n_jobs=-1 is set on both the wrapper and the inner model;
# confirm the nested parallelism is intended.
clf = OneVsRestClassifier(
    XGBClassifier(
        booster='gbtree',
        max_depth=5,
        min_child_weight=1,
        gamma=0.25,
        learning_rate=0.2,
        reg_lambda=3,
        scale_pos_weight=0.5,
        subsample=0.7,
        colsample_bytree=0.7,
        verbosity=1,
        n_jobs=-1,
    ),
    n_jobs=-1,
)

# Search space for the one-vs-rest XGBoost model.
# OneVsRestClassifier exposes its wrapped model under the parameter name
# `estimator`, so nested hyper-parameters must be addressed as
# `estimator__<param>`. The former `xgb__` prefix matches no parameter of
# `clf` and would be rejected by set_params during the search.
params = {
    'estimator__colsample_bytree': [0.7],
    'estimator__gamma': [0.25, 0.5, 1],
    'estimator__learning_rate': [0.2],
    'estimator__max_depth': [3, 5],
    'estimator__min_child_weight': [1, 5],
    'estimator__reg_lambda': [0, 1, 3],
    'estimator__scale_pos_weight': [0.5],
    'estimator__subsample': [0.7],
    'estimator__booster': ['gbtree'],
    'estimator__verbosity': [2],
    'estimator__num_parallel_tree': [4],
    'estimator__n_jobs': [-1],
}

In [115]:
# Randomised hyper-parameter search over `params` with 3-fold CV, scored by
# accuracy. random_state pins which candidates get sampled so reruns are
# comparable (same seed as the train/test split).
rand = RandomizedSearchCV(
    clf,
    params,
    n_jobs=-1,
    cv=3,
    scoring="accuracy",
    random_state=42,
)

In [158]:
# `mlb` was used here without ever being created in the notebook, so the cell
# only worked with leftover kernel state. Instantiate the binarizer before
# fitting it.
mlb = MultiLabelBinarizer()

# Each row's list of category codes becomes one row of a binary indicator
# matrix; the label set is learned from the training rows.
y_train = mlb.fit_transform(y_train.values.tolist())

In [171]:
# Encode the test labels with the binarizer that was fitted on the training
# labels (no refitting here).
y_test = mlb.transform(y_test.to_numpy().tolist())

In [160]:
# Train the one-vs-rest model on the vectorised training text.
clf.fit(X=X_train, y=y_train)

In [192]:
# Training-set score.
# NOTE(review): with multi-label indicator targets, scikit-learn's classifier
# score is subset accuracy (every label of a row must match) — presumably why
# the recorded value is so low; confirm.
clf.score(X=X_train, y=y_train)

0.08355116661825927

In [161]:
# Hard label predictions: held-out split first, then the training split.
y_pred_test = clf.predict(X=X_test)
y_pred_train = clf.predict(X=X_train)

In [162]:
# Probability estimates for the same two splits.
probs_test = clf.predict_proba(X=X_test)
probs_train = clf.predict_proba(X=X_train)

In [193]:
# Evaluate the one-vs-rest model on the held-out split, mapping the indicator
# matrices back to label tuples first.
# NOTE(review): the recorded run of this cell ended in
# "ValueError: unknown is not supported". Presumably get_performance expects
# one label per sample, while inverse_transform yields a variable-length
# tuple per sample — confirm against model/evaluation.py.
evaluation.get_performance(
    model=clf,
    pred_labels=mlb.inverse_transform(y_pred_test),
    true_labels=mlb.inverse_transform(y_test),
    probs=probs_test,
    vectorizer=tfid_vectorizer,
    average='micro',
    tree=tree_dict,
)

  true_labels = np.array(true_labels)
  args = [asarray(arg) for arg in args]


ValueError: unknown is not supported

In [55]:
# Training-split evaluation for the stand-alone `xgb` model.
# NOTE(review): `le` is not defined in any visible cell and `xgb` is never
# fitted in the visible cells, so this cell cannot run from a fresh kernel.
# The recorded output presumably comes from an earlier single-label run —
# confirm before relying on it.
evaluation.get_performance(
    model=xgb,
    pred_labels=le.inverse_transform(y_pred_train),
    true_labels=le.inverse_transform(y_train),
    probs=probs_train,
    vectorizer=tfid_vectorizer,
    average='micro',
    tree=tree_dict,
)

Model Performance metrics:
------------------------------
Accuracy: 0.9452028269919643
Precision: 0.9452028269919643
Recall: 0.9452028269919643
F1 Score: 0.9452028269919643
Average distance between nodes categories: 0.10823893890986543
Top 5 Score: 0.02369542066027689

Model Classification report:
------------------------------
                                           precision    recall  f1-score   support

                      3D Printer Filament       0.99      0.96      0.98       190
                  A/V Cables & Connectors       0.97      0.99      0.98       361
                  Action Camcorder Mounts       1.00      0.99      1.00       114
           Activity Trackers & Pedometers       0.98      0.99      0.99       154
              Adapters, Cables & Chargers       0.99      0.98      0.99       284
                         Air Conditioners       1.00      0.97      0.99       114
             Air Purifier Filters & Parts       0.99      1.00      0.99        83
     

In [74]:
# Load the per-sample labels saved by one experiment run.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
df_labels = pd.read_csv(
    '/home/app/src/model/experiments/exp2022-12-15 05:48:58.634262/labels.csv'
)

# Attach, per training row, the vocabulary terms with non-zero weight.
# NOTE(review): assumes labels.csv has one row per X_train row, in the same
# order — confirm.
df_labels['features'] = tfid_vectorizer.inverse_transform(X_train)

# Keep only the rows where the predicted category differs from the true one.
check = df_labels.loc[df_labels['pred_cat'].ne(df_labels['true_cat'])]