# HiClass Policies Comparison

In [1]:
import os
# Make the project source directory the working directory; later cells use a
# relative path ('data/products.json') and import local modules, which
# presumably resolve from here.
# NOTE(review): hard-coded absolute path — only valid in an environment that
# has this exact layout; confirm before running elsewhere.
os.chdir("/home/app/src")

In [2]:
import time
import pandas as pd
import numpy as np
import text_normalizer
import hiclass.metrics
from sklearn import metrics
from scripts.build_df import build_df
from scripts.decode_id import decode_id
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from os import cpu_count
from hiclass import LocalClassifierPerNode, LocalClassifierPerParentNode

[nltk_data] Downloading package stopwords to /home/app/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### Base df and auxiliary functions

In [3]:
def normalization(input):
    """Normalize a text corpus with `text_normalizer.normalize_corpus`.

    The corpus is forwarded unchanged together with a fixed set of options:
    HTML stripping, contraction expansion, accented-character removal,
    lower-casing, stemming, special-character removal and stopword removal
    (using `text_normalizer.stopword_list`) are switched on; lemmatization and
    digit removal are switched off.

    Args:
        input: The corpus to normalize. In this notebook it is a pandas Series
            of product names already converted to `str`.

    Returns:
        Whatever `text_normalizer.normalize_corpus` returns for that corpus.
    """
    normalizer_options = dict(
        html_stripping=True,
        contraction_expansion=True,
        accented_char_removal=True,
        text_lower_case=True,
        text_stemming=True,
        text_lemmatization=False,
        special_char_removal=True,
        remove_digits=False,
        stopword_removal=True,
        stopwords=text_normalizer.stopword_list,
    )
    return text_normalizer.normalize_corpus(input, **normalizer_options)

In [4]:
# Row-wise helpers used to create the per-sample metric columns
def assign_precision(x):
    """Return the hierarchical precision of a single row.

    Args:
        x: A row whose first element is the true label and whose second
            element is the prediction (the column order the callers in this
            notebook build). Both are taken as one-element slices so that
            `hiclass.metrics.precision` receives sequences, not scalars.

    Returns:
        The value returned by `hiclass.metrics.precision` for that one sample.
    """
    true_slice, predicted_slice = x[0:1], x[1:2]
    return hiclass.metrics.precision(true_slice, predicted_slice)
def assign_recall(x):
    """Return the hierarchical recall of a single row.

    Args:
        x: A row whose first element is the true label and whose second
            element is the prediction (the column order the callers in this
            notebook build). Both are taken as one-element slices so that
            `hiclass.metrics.recall` receives sequences, not scalars.

    Returns:
        The value returned by `hiclass.metrics.recall` for that one sample.
    """
    true_slice, predicted_slice = x[0:1], x[1:2]
    return hiclass.metrics.recall(true_slice, predicted_slice)
def assign_f1(x):
    """Return the F1 score computed from a row's precision and recall.

    Args:
        x: A mapping-like row exposing the keys "precision" and "recall".

    Returns:
        The harmonic mean `2 * p * r / (p + r)`, or `0.0` when the product
        `p * r` is zero.
    """
    precision, recall = x["precision"], x["recall"]
    # A zero product is short-circuited so the division below can never hit
    # a zero denominator (ZeroDivisionError).
    if precision * recall != 0.0:
        return 2 * precision * recall / (precision + recall)
    return 0.0

In [5]:
# Base dataframe, built from the products JSON with threshold=0
no_threshold_df = build_df(threshold=0, json_path='data/products.json')

# Labels are the 'path' column; features are the 'name' column, cast to str
# and passed through the text normalization helper
y = no_threshold_df['path'].copy()
X = normalization(no_threshold_df['name'].copy().apply(str))

# Hold out 20% of the samples for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.20)

# TF-IDF over unigrams and bigrams: the vocabulary is fitted on the training
# texts only and then reused to transform the test texts
tfid_vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X_train = tfid_vectorizer.fit_transform(X_train)
X_test = tfid_vectorizer.transform(X_test)

In [6]:
# Empty summary tables (one for train, one for test); later cells add one row
# of aggregate metrics per trained classifier.
train_comparison_df, test_comparison_df = (
    pd.DataFrame(columns=[f"{split}_{metric}" for metric in ("failures", "precision", "recall", "f1")])
    for split in ("train", "test")
)

# Model training

In [7]:
# Binary policies to compare with LocalClassifierPerNode, keyed by the row
# index their results occupy in the comparison tables.
policies_dict = {
    0 : "exclusive",
    1 : "less_exclusive",
    2 : "less_inclusive",
    3 : "inclusive",
    4 : "siblings",
    5 : "exclusive_siblings"
}

# Row labels for the comparison tables: every binary policy plus the
# LocalClassifierPerParentNode run under key 6.
# This must be a *separate* dict: a plain `rename_dict = policies_dict` would
# alias the same object, so adding key 6 would also add "parent_node" to
# policies_dict and the training loop would try to use it as a binary policy.
rename_dict = {**policies_dict, 6: "parent_node"}

In [8]:
# Train and evaluate one LocalClassifierPerNode per binary policy.
# Each policy's aggregate metrics are stored under the policy's own key, so
# re-running this cell (e.g. after an interruption) overwrites the same rows
# instead of appending duplicates that the later index rename cannot label.
for policy_id, policy_name in policies_dict.items():
    # Initialize timer
    starting_time = time.time()

    # Initialize model
    random_forest = RandomForestClassifier(random_state=42)
    # Initialize hierarchical classifier with model
    hierarchical_classifier = hiclass.LocalClassifierPerNode(
        binary_policy=policy_name,
        n_jobs=cpu_count(),
        local_classifier=random_forest,
    )
    # Train hierarchical classifier
    hierarchical_classifier.fit(X_train, y_train)

    # Store training time (formatted as HH:MM:SS)
    training_time = time.time() - starting_time
    training_time = time.strftime("%H:%M:%S", time.gmtime(training_time))

    # Predict on train and test
    y_pred_train = hierarchical_classifier.predict(X_train)
    y_pred_test = hierarchical_classifier.predict(X_test)

    # Train_df: per-sample metrics. The helpers read the first two columns
    # positionally, so the true label must stay first and "prediction" second.
    train_df = pd.DataFrame(y_train)
    train_df["prediction"] = pd.Series(y_pred_train.tolist(), index=train_df.index)
    train_df["precision"] = train_df.apply(assign_precision, axis=1)
    train_df["recall"] = train_df.apply(assign_recall, axis=1)
    train_df["f1"] = train_df.apply(assign_f1, axis=1)
    train_df["training_time"] = training_time

    # Test_df: same layout as train_df
    test_df = pd.DataFrame(y_test)
    test_df["prediction"] = pd.Series(y_pred_test.tolist(), index=test_df.index)
    test_df["precision"] = test_df.apply(assign_precision, axis=1)
    test_df["recall"] = test_df.apply(assign_recall, axis=1)
    test_df["f1"] = test_df.apply(assign_f1, axis=1)
    test_df["training_time"] = training_time

    # Save results: number of samples with f1 == 0 ("failures") followed by
    # the aggregate hierarchical precision, recall and f1
    results_train = [
        int((train_df["f1"] == 0.0).sum()),
        hiclass.metrics.precision(y_train, y_pred_train),
        hiclass.metrics.recall(y_train, y_pred_train),
        hiclass.metrics.f1(y_train, y_pred_train)
        ]

    train_comparison_df.loc[policy_id] = results_train

    results_test = [
        int((test_df["f1"] == 0.0).sum()),
        hiclass.metrics.precision(y_test, y_pred_test),
        hiclass.metrics.recall(y_test, y_pred_test),
        hiclass.metrics.f1(y_test, y_pred_test)
        ]

    test_comparison_df.loc[policy_id] = results_test
    print(training_time)

00:44:21


KeyboardInterrupt: 

In [None]:
# Train and evaluate a LocalClassifierPerParentNode with the same base model,
# and store its aggregate metrics next to the per-node policy results.

# Row index reserved for this run; rename_dict maps key 6 to "parent_node".
# Writing to this fixed key (rather than to len(df)) keeps the row correctly
# labelled even when not all six binary policies were trained beforehand, and
# makes re-running this cell overwrite the row instead of appending another.
PARENT_NODE_ROW = 6

# Initialize timer
starting_time = time.time()

# Initialize model
random_forest = RandomForestClassifier(random_state=42)
# Initialize hierarchical classifier with model
hierarchical_classifier = hiclass.LocalClassifierPerParentNode(n_jobs=cpu_count(), local_classifier=random_forest)
# Train hierarchical classifier
hierarchical_classifier.fit(X_train, y_train)

# Store training time (formatted as HH:MM:SS)
training_time = time.time() - starting_time
training_time = time.strftime("%H:%M:%S", time.gmtime(training_time))

# Predict on train and test
y_pred_train = hierarchical_classifier.predict(X_train)
y_pred_test = hierarchical_classifier.predict(X_test)

# Train_df: per-sample metrics. The helpers read the first two columns
# positionally, so the true label must stay first and "prediction" second.
train_df = pd.DataFrame(y_train)
train_df["prediction"] = pd.Series(y_pred_train.tolist(), index=train_df.index)
train_df["precision"] = train_df.apply(assign_precision, axis=1)
train_df["recall"] = train_df.apply(assign_recall, axis=1)
train_df["f1"] = train_df.apply(assign_f1, axis=1)
train_df["training_time"] = training_time

# Test_df: same layout as train_df
test_df = pd.DataFrame(y_test)
test_df["prediction"] = pd.Series(y_pred_test.tolist(), index=test_df.index)
test_df["precision"] = test_df.apply(assign_precision, axis=1)
test_df["recall"] = test_df.apply(assign_recall, axis=1)
test_df["f1"] = test_df.apply(assign_f1, axis=1)
test_df["training_time"] = training_time

# Save results: number of samples with f1 == 0 ("failures") followed by the
# aggregate hierarchical precision, recall and f1
results_train = [
    int((train_df["f1"] == 0.0).sum()),
    hiclass.metrics.precision(y_train, y_pred_train),
    hiclass.metrics.recall(y_train, y_pred_train),
    hiclass.metrics.f1(y_train, y_pred_train)
    ]

train_comparison_df.loc[PARENT_NODE_ROW] = results_train

results_test = [
    int((test_df["f1"] == 0.0).sum()),
    hiclass.metrics.precision(y_test, y_pred_test),
    hiclass.metrics.recall(y_test, y_pred_test),
    hiclass.metrics.f1(y_test, y_pred_test)
    ]

test_comparison_df.loc[PARENT_NODE_ROW] = results_test

In [9]:
# Show the train summary with the numeric row keys replaced by classifier names
train_comparison_df.rename(rename_dict, axis="index")

Unnamed: 0,train_failures,train_precision,train_recall,train_f1
exclusive,28.0,0.932178,0.996824,0.963418


In [10]:
# Show the test summary with the numeric row keys replaced by classifier names
test_comparison_df.rename(rename_dict, axis="index")

Unnamed: 0,test_failures,test_precision,test_recall,test_f1
exclusive,400.0,0.843235,0.904365,0.872731
