# EDA Prediction Results

In [22]:
import os
os.chdir("/home/app/src")
import pandas as pd
import numpy as np
import text_normalizer
from model import evaluation
from sklearn import metrics
from scripts.build_df import build_df
from scripts.decode_id import decode_id
from scripts.tree_utils import make_tree
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

In [6]:
df = build_df(json_path='data/products.json', threshold=100)

In [32]:
no_threshold_df = build_df(json_path='data/products.json', threshold=0)

In [7]:
def normalization(input):
    output = text_normalizer.normalize_corpus(
        input,
        html_stripping=True,
        contraction_expansion=True,
        accented_char_removal=True,
        text_lower_case=True,
        text_stemming=True,
        text_lemmatization=False,
        special_char_removal=True,
        remove_digits=False,
        stopword_removal=True,
        stopwords=text_normalizer.stopword_list
    )       
    return output

In [13]:
X = df['name'].copy()
X = normalization(X.apply(str))
y = df['leaf'].copy()

In [17]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20, 
    random_state=42,
    stratify=y
)

In [18]:
tfid_vectorizer = TfidfVectorizer(max_features=1500, ngram_range=(1, 2)) 
X_train = tfid_vectorizer.fit_transform(X_train)
X_test = tfid_vectorizer.transform(X_test)

### Hierarchical classification

In [19]:
# Initialize model
random_forest = RandomForestClassifier(n_jobs=-1)
random_forest.fit(X_train, y_train)

# Predict on train and test
y_pred_train = random_forest.predict(X_train)
y_pred_test = random_forest.predict(X_test)

In [62]:
tree = make_tree(df, df["category"], "root", display_tree=False)

In [63]:
evaluation.get_performance(random_forest, y_pred_test, np.array(y_test), tfid_vectorizer, "micro", tree)

Model Performance metrics:
------------------------------
Accuracy: 0.815972894482091
Precision: 0.815972894482091
Recall: 0.815972894482091
F1 Score: 0.815972894482091
Average distance between nodes categories: 0.4089060987415295

Model Classification report:
------------------------------
                                           precision    recall  f1-score   support

                      3D Printer Filament       0.94      1.00      0.97        47
                  A/V Cables & Connectors       0.73      0.77      0.75        90
                  Action Camcorder Mounts       0.50      0.71      0.59        28
           Activity Trackers & Pedometers       0.82      0.82      0.82        39
              Adapters, Cables & Chargers       0.75      0.75      0.75        71
                         Air Conditioners       0.96      0.93      0.95        28
             Air Purifier Filters & Parts       1.00      0.81      0.89        21
                            Air Purifiers  

In [52]:
results = pd.read_csv("model/experiments/exp2022-12-14 11:09:07.414935/results.csv")
results.rename(columns={"Unnamed: 0":"leaf"}, inplace=True)
# Drop compound columns
results = results[:-3]
results["leaf"] = results["leaf"].apply(lambda x: decode_id(x))

In [59]:
results.sort_values(by="support", ascending=False).head(40)

Unnamed: 0,leaf,precision,recall,f1-score,support
148,Pre-Owned Games,0.994406,1.0,0.997195,711.0
92,other,0.608247,0.636691,0.622144,556.0
61,Cell Phone Cases & Clips,0.912234,0.927027,0.919571,370.0
142,iPhone Cases & Clips,0.867257,0.945338,0.904615,311.0
114,Dash Installation Kits,0.928105,0.959459,0.943522,148.0
185,Bluetooth & Wireless Speakers,0.850649,0.922535,0.885135,142.0
202,All Refrigerators,0.773973,0.818841,0.795775,138.0
134,Sheet Music,0.838235,0.919355,0.876923,124.0
155,"Cases, Covers & Keyboard Folios",0.819672,0.892857,0.854701,112.0
54,Printer Ink,0.923077,0.96,0.941176,100.0
