# Hierarchical classification

In [3]:
import pandas as pd
import numpy as np
import text_normalizer
import hiclass
from sklearn import metrics
from utils import vectorizer
from scripts.build_df import build_df
from scripts.decode_id import decode_id
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from text_normalizer import tokenizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from os import cpu_count
from hiclass import LocalClassifierPerNode

In [4]:
y = build_df(json_path='data/products.json', threshold=0, preprocessed_csv='data/normalized_data.csv')['path']

In [5]:
df = pd.read_csv('data/products_v1.csv', index_col=0)

def normalization(input):
    output = text_normalizer.normalize_corpus(
        input,
        html_stripping=True,
        contraction_expansion=True,
        accented_char_removal=True,
        text_lower_case=True,
        text_stemming=True,
        text_lemmatization=False,
        special_char_removal=True,
        remove_digits=False,
        stopword_removal=True,
        stopwords=text_normalizer.stopword_list
    )       
    return output

df['name'] = normalization(df['name'].apply(str))

In [6]:
X = df['name']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20, 
    random_state=42,
    #stratify = y
)

In [8]:
tfid_vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 1)) 
X_train = tfid_vectorizer.fit_transform(X_train)
X_test = tfid_vectorizer.transform(X_test)

### Hierarchical classification

Classifier per parent node

In [9]:
logreg = LogisticRegression(max_iter=1000)
classifier = hiclass.LocalClassifierPerParentNode(n_jobs=cpu_count(), local_classifier=logreg)

In [10]:
classifier.fit(X_train, y_train)

In [11]:
from hiclass.metrics import precision, recall, f1

In [24]:
y_pred_train = classifier.predict(X_train)
print("Train")
print(precision(y_train, y_pred_train))
print(recall(y_train, y_pred_train))
print(f1(y_train, y_pred_train))

Train
0.6796279069767441
0.8409567205410726
0.751734096868159


In [23]:
y_pred_test = classifier.predict(X_test)
print("Test")
print(precision(y_test, y_pred_test))
print(recall(y_test, y_pred_test))
print(f1(y_test, y_pred_test))

Test
0.6708156914757946
0.8296529968454258
0.7418272078526501


Classifier parent node threshold 100

In [14]:
y_100 = build_df(json_path='data/products.json', threshold=100, preprocessed_csv='data/normalized_data.csv')['path']     

In [15]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y_100,
    test_size=0.20, 
    random_state=42,
    #stratify = y
)

In [16]:
tfid_vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 1)) 
X_train = tfid_vectorizer.fit_transform(X_train)
X_test = tfid_vectorizer.transform(X_test)

In [17]:
logreg = LogisticRegression(max_iter=1000)
classifier_100 = hiclass.LocalClassifierPerParentNode(n_jobs=cpu_count(), local_classifier=logreg)
classifier_100.fit(X_train, y_train)

In [18]:
from hiclass.metrics import precision, recall, f1
y_pred_train = classifier_100.predict(X_train)
print("Train")
print(precision(y_train, y_pred_train))
print(recall(y_train, y_pred_train))
print(f1(y_train, y_pred_train))
y_pred_test = classifier_100.predict(X_test)
print("Test")
print(precision(y_test, y_pred_test))
print(recall(y_test, y_pred_test))
print(f1(y_test, y_pred_test))

Train
0.7807301473128971
0.8788699075985139
0.8268983004641887
Test
0.7640377136782027
0.8614826498422713
0.8098394199546283


Classifier per node threshold 100

In [19]:
classifier_per_node_100 = hiclass.LocalClassifierPerNode(n_jobs=cpu_count(), local_classifier=logreg)
classifier_per_node_100.fit(X_train, y_train)

In [20]:
y_pred_train_per_node = classifier_per_node_100.predict(X_train)
print("Train")
print(precision(y_train, y_pred_train))
print(recall(y_train, y_pred_train))
print(f1(y_train, y_pred_train))

Train
0.7807301473128971
0.8788699075985139
0.8268983004641887


In [21]:
y_pred_test_per_node = classifier_per_node_100.predict(X_test)
print("Test")
print(precision(y_test, y_pred_test))
print(recall(y_test, y_pred_test))
print(f1(y_test, y_pred_test))

Test
0.7640377136782027
0.8614826498422713
0.8098394199546283
