# Hierarchical classification

In [1]:
import pandas as pd
import numpy as np
import text_normalizer
import hiclass.metrics
from sklearn import metrics
from scripts.build_df import build_df
from scripts.decode_id import decode_id
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from os import cpu_count
from hiclass import LocalClassifierPerNode, LocalClassifierPerParentNode
import seaborn as sns
import matplotlib.pyplot as plt

[nltk_data] Downloading package stopwords to /home/app/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Build the working DataFrame from the raw product dump.
# threshold=0 presumably disables any category-size filtering inside build_df — TODO confirm.
no_threshold_df = build_df(json_path="data/products.json", threshold=0)

: 

: 

In [None]:
# df = build_df(json_path='data/products.json', threshold=100)

In [None]:
def normalization(input):
    """Normalize a text corpus with this notebook's fixed preprocessing settings.

    ``input`` is forwarded unchanged as the corpus argument of
    ``text_normalizer.normalize_corpus``; the notebook passes a pandas Series
    of strings.

    Enabled steps: HTML stripping, contraction expansion, accented-character
    removal, lower-casing, stemming, special-character removal and stop-word
    removal (using ``text_normalizer.stopword_list``).  Lemmatization and
    digit removal are switched off.

    Returns whatever ``normalize_corpus`` returns, untouched.
    """
    # NOTE: the parameter name shadows the ``input`` builtin inside this
    # function; it is kept as-is so existing callers are unaffected.
    settings = dict(
        html_stripping=True,
        contraction_expansion=True,
        accented_char_removal=True,
        text_lower_case=True,
        text_stemming=True,
        text_lemmatization=False,
        special_char_removal=True,
        remove_digits=False,
        stopword_removal=True,
        stopwords=text_normalizer.stopword_list,
    )
    return text_normalizer.normalize_corpus(input, **settings)

In [None]:
# Features: the 'name' column coerced to str and run through normalization().
# .apply(str) already yields a fresh Series, so the source column is never modified.
X = normalization(no_threshold_df['name'].apply(str))
# Targets: an independent copy of the 'path' column.
y = no_threshold_df['path'].copy()

In [None]:
# Hold out 20% of the samples for evaluation. The seed is fixed so the split is
# reproducible — the hard-coded row labels inspected later in this notebook
# must be present in the resulting test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [None]:
# TF-IDF over unigrams and bigrams, capped at 10,000 features. The vectorizer is
# fitted on the training split only and then reused on the test split.
# NOTE: X_train / X_test are overwritten with the vectorized output, so re-running
# this cell without first re-running the split cell feeds already-vectorized data
# back into the vectorizer.
tfid_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=10000)
X_train = tfid_vectorizer.fit_transform(X_train)
X_test = tfid_vectorizer.transform(X_test)

### Hierarchical classification

In [None]:
# Base estimator handed to the hierarchical wrapper as its local classifier
logistic_regression = LogisticRegression(max_iter=7000)

# Local-classifier-per-parent-node wrapper; n_jobs is set to the machine's CPU count
hierarchical_classifier = LocalClassifierPerParentNode(
    local_classifier=logistic_regression,
    n_jobs=cpu_count(),
)

# Fit on the vectorized training split
hierarchical_classifier.fit(X_train, y_train)

# Predictions for both splits (train predictions are kept to compare against test)
y_pred_train = hierarchical_classifier.predict(X_train)
y_pred_test = hierarchical_classifier.predict(X_test)

In [None]:
# Report hiclass precision / recall / F1 for the train and test splits.
# The leading "\n" in the second label reproduces the blank line between the two reports.
for split_label, split_true, split_pred in (
    ("Train", y_train, y_pred_train),
    ("\nTest", y_test, y_pred_test),
):
    print(split_label)
    print("Precision:", hiclass.metrics.precision(split_true, split_pred))
    print("Recall:", hiclass.metrics.recall(split_true, split_pred))
    print("F1:", hiclass.metrics.f1(split_true, split_pred))

### Analyze predictions

In [None]:
# Results frame: the true test paths plus a 'prediction' column.
# The predictions are wrapped in a Series carrying y_test's index so that each
# predicted path lines up with its original row label.
test_df = pd.DataFrame(y_test).assign(
    prediction=pd.Series(y_pred_test.tolist(), index=y_test.index)
)

# Auxiliary function for creating new columns
def assign_precision(x):
    """Return ``hiclass.metrics.precision`` for a single row.

    ``x`` is one row; its first and second positions are sliced out as
    length-1 sequences and passed as ``(y_true, y_pred)``.  In ``test_df``
    these positions hold the ``path`` and ``prediction`` columns.
    """
    true_part, predicted_part = x[0:1], x[1:2]
    return hiclass.metrics.precision(true_part, predicted_part)
def assign_recall(x):
    """Return ``hiclass.metrics.recall`` for a single row.

    ``x`` is one row; its first and second positions are sliced out as
    length-1 sequences and passed as ``(y_true, y_pred)``.  In ``test_df``
    these positions hold the ``path`` and ``prediction`` columns.
    """
    true_part, predicted_part = x[0:1], x[1:2]
    return hiclass.metrics.recall(true_part, predicted_part)
def assign_f1(x):
    """Return the harmonic mean of ``x["precision"]`` and ``x["recall"]``.

    Yields 0.0 whenever either value is zero.
    """
    precision, recall = x["precision"], x["recall"]
    # A zero product means at least one term is zero; returning early also
    # avoids a ZeroDivisionError when both are zero.
    if precision * recall == 0.0:
        return 0.0
    return 2 * precision * recall / (precision + recall)

# Create the per-row metric columns.
# Order matters: f1 is computed from the precision and recall columns added just before it.
test_df["precision"] = test_df.apply(assign_precision, axis=1)
test_df["recall"] = test_df.apply(assign_recall, axis=1)
test_df["f1"] = test_df.apply(assign_f1, axis=1)

In [None]:
# Preview the first rows of the per-sample results frame
test_df.head()

In [None]:
# Count the test rows whose F1 is exactly zero, then display their F1 entries
print("Total failures:", len(test_df.loc[test_df["f1"] == 0.0, "f1"]), "of", len(test_df))
test_df.loc[test_df["f1"] == 0.0, "f1"]

In [None]:
def print_path_metrics(index, df_):
    """Print the name, decoded paths and per-row metrics of one product.

    Parameters
    ----------
    index : label
        Row label, looked up with ``.loc`` in both the notebook-level
        ``no_threshold_df`` and in ``df_``.
    df_ : pandas.DataFrame
        Results frame providing the ``path``, ``prediction``, ``precision``,
        ``recall`` and ``f1`` columns (e.g. ``test_df``).

    Every path is passed through ``decode_id`` before printing.
    Returns nothing; the function only prints.
    """
    product_name = no_threshold_df["name"].loc[index]
    print("Product name:", product_name, "\n")
    print("Original path:", decode_id(no_threshold_df["path"].loc[index]))

    # Paths stored in the results frame: the labelled one and the predicted one
    for label, column in (("Threshold path:", "path"), ("Predicted path:", "prediction")):
        print(label, decode_id(df_[column].loc[index]))

    print("\nMetrics (on threshold and predicted path)")
    for label, column in (("Precision:", "precision"), ("Recall:", "recall"), ("F1:", "f1")):
        print(label, df_[column].loc[index])

### Correct prediction

In [None]:
# Hard-coded row label shown as the correct-prediction example; it must be present in test_df
print_path_metrics(13005, test_df)

### Kind of correct prediction

In [None]:
# Hard-coded row label shown as a partially correct example; it must be present in test_df
print_path_metrics(47899, test_df)

In [None]:
# Hard-coded row label shown as a second partially correct example; it must be present in test_df
print_path_metrics(4702, test_df)

### Complete failures 

In [None]:
# Same zero-F1 summary as computed earlier in the notebook, repeated to open this section
print("Total failures:", len(test_df.loc[test_df["f1"] == 0.0, "f1"]), "of", len(test_df))
test_df.loc[test_df["f1"] == 0.0, "f1"]

#### Failures (?)

In [None]:
# Hard-coded row label listed under the debatable failures; it must be present in test_df
print_path_metrics(49777, test_df)

In [None]:
# Hard-coded row label listed under the debatable failures; it must be present in test_df
print_path_metrics(11541, test_df)

In [None]:
# Hard-coded row label listed under the debatable failures; it must be present in test_df
print_path_metrics(31535, test_df)

#### Failures

In [None]:
# Hard-coded row label listed under the clear failures; it must be present in test_df
print_path_metrics(13167, test_df)

In [None]:
# Hard-coded row label listed under the clear failures; it must be present in test_df
print_path_metrics(24334, test_df)

In [None]:
# NOTE(review): 13167 is already inspected two cells earlier in this section — this looks like a
# copy-paste duplicate; confirm which product index was intended here.
print_path_metrics(13167, test_df)

In [None]:
# Hard-coded row label listed under the clear failures; it must be present in test_df
print_path_metrics(51294, test_df)

#### Plots

In [None]:
# Distribution of the per-sample metrics on the test split, one panel per metric.
# sns.set_theme() is the preferred spelling of the legacy sns.set() alias.
sns.set_theme()
fig, axes = plt.subplots(1, 3, figsize=(18, 5), sharey=True)
for ax, metric in zip(axes, ("precision", "recall", "f1")):
    sns.histplot(test_df[metric], stat="percent", bins=6, ax=ax)
    # Title each panel so the figure can be read on its own when skimming the notebook
    ax.set(title=f"Per-sample {metric} (test split)", xlabel=metric)
fig.tight_layout()
# Render explicitly so the cell does not echo the repr of the last Axes object
plt.show()

#### Predict on new data

In [None]:
string_to_predict = "Neoprene Dumbbell Hand Weights"

# Same pipeline as for the training data: normalize -> TF-IDF transform -> predict -> decode
query_features = tfid_vectorizer.transform(normalization(pd.Series(string_to_predict)))
# predict() is given a single row, so take the first (only) predicted path
predicted_path = hierarchical_classifier.predict(query_features).tolist()[0]
decode_id(predicted_path)

### Descriptions