# HiClass baseline - descriptions

In [1]:
import os

# Switch to the project source directory so that relative paths used further
# down (e.g. 'data/products.json') resolve against it.
# The default is the original container path; set APP_SRC_DIR to run the
# notebook from a different checkout without editing this cell.
os.chdir(os.environ.get("APP_SRC_DIR", "/home/app/src"))

'/home/app/src'

In [1]:
import pandas as pd
import numpy as np
import text_normalizer
import hiclass.metrics
from sklearn import metrics
from scripts.build_df import build_df
from scripts.decode_id import decode_id
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from os import cpu_count
from hiclass import LocalClassifierPerParentNode
import seaborn as sns
import matplotlib.pyplot as plt

[nltk_data] Downloading package stopwords to /home/app/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


FileNotFoundError: [Errno 2] No such file or directory: 'data/products.json'

In [3]:
# Build the working DataFrame from the products JSON with threshold=0
# (presumably "no filtering", as the variable name suggests - confirm in scripts/build_df).
no_threshold_df = build_df(
    json_path="data/products.json",
    threshold=0,
)

In [5]:
# Auxiliary function for text normalization
def normalization(input):
    """Run `text_normalizer.normalize_corpus` on `input` with this notebook's fixed settings.

    Switched on: HTML stripping, contraction expansion, accented-character
    removal, lower-casing, stemming, special-character removal and stop-word
    removal (with `text_normalizer.stopword_list`).
    Switched off: lemmatization and digit removal.

    Parameters
    ----------
    input
        Corpus forwarded unchanged as the first positional argument.
        (The name shadows the builtin `input`; it is kept so existing
        keyword callers keep working.)

    Returns
    -------
    Whatever `normalize_corpus` returns for the given corpus.
    """
    settings = {
        "html_stripping": True,
        "contraction_expansion": True,
        "accented_char_removal": True,
        "text_lower_case": True,
        "text_stemming": True,
        "text_lemmatization": False,
        "special_char_removal": True,
        "remove_digits": False,
        "stopword_removal": True,
        "stopwords": text_normalizer.stopword_list,
    }
    return text_normalizer.normalize_corpus(input, **settings)

# Auxiliary function for creating new columns
def assign_precision(x):
    """Return `hiclass.metrics.precision` for a single row.

    The first positional element of `x` is used as the ground truth and the
    second as the prediction; each is kept as a length-1 slice (not a scalar)
    before being handed to the metric.
    Presumably `x` is a results-DataFrame row whose first two columns are the
    true path and the predicted path - verify against the caller.
    """
    y_true, y_pred = x[0:1], x[1:2]
    return hiclass.metrics.precision(y_true, y_pred)
def assign_recall(x):
    """Return `hiclass.metrics.recall` for a single row.

    The first positional element of `x` is used as the ground truth and the
    second as the prediction; each is kept as a length-1 slice (not a scalar)
    before being handed to the metric.
    Presumably `x` is a results-DataFrame row whose first two columns are the
    true path and the predicted path - verify against the caller.
    """
    y_true, y_pred = x[0:1], x[1:2]
    return hiclass.metrics.recall(y_true, y_pred)
def assign_f1(x):
    """Return the harmonic mean of `x["precision"]` and `x["recall"]`.

    Returns 0.0 as soon as either value is zero, which also prevents the
    0 / 0 division that would occur when both are zero.
    """
    precision, recall = x["precision"], x["recall"]
    # Guard clause: a zero product means at least one of the two scores is zero
    if precision * recall == 0.0:
        return 0.0
    return 2 * precision * recall / (precision + recall)

In [6]:
# Target: the 'path' column, copied so the source frame is left untouched
y = no_threshold_df["path"].copy()
# Features: descriptions cast to str element-wise, then run through the text normalizer
X = normalization(no_threshold_df["description"].copy().apply(str))

In [7]:
# 80/20 split of features and targets; the fixed random_state keeps it repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# TF-IDF over unigrams and bigrams, capped at 3000 features.
tfid_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=3000,
)
# The vocabulary is fitted on the training texts only and then applied to the test texts.
# NOTE: X_train / X_test are overwritten with the vectorized matrices, so
# re-running this cell without re-running the split would feed those matrices
# back into the vectorizer.
X_train, X_test = (
    tfid_vectorizer.fit_transform(X_train),
    tfid_vectorizer.transform(X_test),
)

### Hierarchical classification

In [9]:
# Base estimator handed to the hierarchical model as its `local_classifier`
logistic_regression = LogisticRegression(max_iter=7000)

# Hierarchical classifier; n_jobs is set to the CPU count reported by the OS
hierarchical_classifier = hiclass.LocalClassifierPerParentNode(
    local_classifier=logistic_regression,
    n_jobs=cpu_count(),
)
hierarchical_classifier.fit(X_train, y_train)

# Predictions for both splits
y_pred_train = hierarchical_classifier.predict(X_train)
y_pred_test = hierarchical_classifier.predict(X_test)

# Per-sample results: true path, predicted path, then the row-wise metrics.
# Column order matters here: assign_precision / assign_recall read the first
# two columns positionally, and assign_f1 needs both score columns to exist.
test_df = pd.DataFrame(y_test)
test_df["prediction"] = pd.Series(y_pred_test.tolist(), index=test_df.index)
test_df["precision"] = test_df.apply(assign_precision, axis=1)
test_df["recall"] = test_df.apply(assign_recall, axis=1)
test_df["f1"] = test_df.apply(assign_f1, axis=1)

In [10]:
# Aggregate hierarchical scores on the training split
print("Train")
print("Precision:", hiclass.metrics.precision(y_train, y_pred_train))
print("Recall:", hiclass.metrics.recall(y_train, y_pred_train))
print("F1:", hiclass.metrics.f1(y_train, y_pred_train))

# Same scores on the held-out split
print("\nTest")
print("Precision:", hiclass.metrics.precision(y_test, y_pred_test))
print("Recall:", hiclass.metrics.recall(y_test, y_pred_test))
print("F1:", hiclass.metrics.f1(y_test, y_pred_test))

# Number of test rows whose row-level F1 is exactly zero
print("Total failures:", int((test_df["f1"] == 0.0).sum()), "of", len(test_df))

Train
Precision: 0.8564198171532543
Recall: 0.9311247945167312
F1: 0.8922112724469466

Test
Precision: 0.81171686364214
Recall: 0.884020261854015
F1: 0.8463271146915412


In [None]:
# Distribution of the row-level scores on the test split, one panel per metric.
# `sns.set()` is only a legacy alias of `sns.set_theme()`, so the preferred name is used.
sns.set_theme()
fig, axes = plt.subplots(1, 3, figsize=(18, 5), sharey=True)
sns.histplot(test_df["precision"], stat="percent", ax=axes[0], bins=6)
sns.histplot(test_df["recall"], stat="percent", ax=axes[1], bins=6)
sns.histplot(test_df["f1"], stat="percent", ax=axes[2], bins=6)

# Give every panel a title so the figure can be read on its own
axes[0].set_title("Precision per test sample")
axes[1].set_title("Recall per test sample")
axes[2].set_title("F1 per test sample")
plt.show()