# Notebook showing the usage of HMLC

## Preliminaries

In [1]:
import pandas as pd, time, os
from hmlc import HMLC

In [2]:
# Read the raw Amazon reviews training data.
# The dataset folder is taken from the HMLC_DATA_DIR environment variable when it is set;
# otherwise a path relative to the current working directory is used, so the notebook
# does not depend on any one user's absolute local path.
# NOTE(review): the relative default assumes the notebook runs from the repository root
# (next to the `datasets` folder) - adjust it or set HMLC_DATA_DIR if that is not the case.
data_path = os.environ.get("HMLC_DATA_DIR", os.path.join("datasets", "Amazon_reviews"))
_data = os.path.join(data_path, "amazon_reviews_train.csv")
if not os.path.isfile(_data):
    # Fail early with a hint on how to point the notebook at the data.
    raise FileNotFoundError(
        f"Training data not found at '{_data}'. "
        "Set the HMLC_DATA_DIR environment variable to the folder containing amazon_reviews_train.csv."
    )
dt = pd.read_csv(_data)
dt.head(3)

Unnamed: 0,x_productId,x_Title,x_userId,x_Helpfulness,x_Score,x_Time,x_Text,y_Cat1,y_Cat2,y_Cat3
0,B000E46LYG,Golden Valley Natural Buffalo Jerky,A3MQDNGHDJU4MK,0/0,3.0,-1,The description and photo on this product need...,grocery gourmet food,meat poultry,jerky
1,B000GRA6N8,Westing Game,unknown,0/0,5.0,860630400,This was a great book!!!! It is well thought t...,toys games,games,unknown
2,B000GRA6N8,Westing Game,unknown,0/0,5.0,883008000,"I am a first year teacher, teaching 5th grade....",toys games,games,unknown


In [3]:
# Report how many rows are removed by discarding every row that has a missing value.
print(f'Rows before dropping null rows: {len(dt):,}')
dt = dt.dropna(axis=0, how='any')
print(f'Rows after dropping null rows: {len(dt):,}')

# Keep the demo fast by working on positional slices of the cleaned frame:
# the first 5000 rows for training and the following 1001 rows for validation.
dt_train = dt.iloc[:5000]
dt_val = dt.iloc[5000:6001]

# Columns passed to the model as inputs (x_*) and as targets (y_*).
input_col_list = [
    'x_productId',
    'x_Title',
    'x_userId',
    'x_Helpfulness',
    'x_Score',
    'x_Time',
    'x_Text',
]
output_col_list = [
    'y_Cat1',
    'y_Cat2',
    'y_Cat3',
]

Rows before dropping null rows: 40,000
Rows after dropping null rows: 39,984


## Find the best approach

In [4]:
# Fit HMLC on the training slice and keep whatever `fit` returns as `best_approach`.
hmlc_obj = HMLC()

# Start the clock before selecting the columns so the timed window covers
# both the column selection and the fit call.
t0 = time.time()
train_inputs = dt_train[input_col_list]
train_targets = dt_train[output_col_list]
best_approach = hmlc_obj.fit(train_inputs, train_targets)

# `tl` turns the start timestamp into an elapsed-time value for display.
time_elapsed = best_approach.tl(t0)
print(f"Total time taken to train models : {time_elapsed}")

INFO:root:Number of features in vectorizer = 4663
INFO:root:Number of features in vectorizer = 5000
INFO:root:Shape of training data set = (3999, 7678);test_data_set = (1000, 7678)
INFO:root:Accuracy of model =  0.5380
INFO:root:Shape of training data set = (3999, 7679);test_data_set = (1000, 7679)
INFO:root:Accuracy of model =  0.1850
INFO:root:Shape of training data set = (3999, 7680);test_data_set = (1000, 7680)
INFO:root:Accuracy of model =  0.1880



Accuracy with Chained Model Approach using RandomForestClassifier(max_depth=3, random_state=13) = 0.0450
Hamming loss in Chained Model Approach using RandomForestClassifier(max_depth=3, random_state=13) = 0.7023


INFO:root:Shape of training data set = (3999, 7678);test_data_set = (1000, 7678)
INFO:root:Accuracy of model =  0.5380
INFO:root:Shape of training data set = (3999, 7678);test_data_set = (1000, 7678)
INFO:root:Accuracy of model =  0.1810
INFO:root:Shape of training data set = (3999, 7678);test_data_set = (1000, 7678)
INFO:root:Accuracy of model =  0.1580



Accuracy with Independent Model Approach using RandomForestClassifier(max_depth=3, random_state=13) = 0.0330
Hamming loss in Independent Model Approach using RandomForestClassifier(max_depth=3, random_state=13) =  0.7077


INFO:root:Shape of training data set = (3999, 7678);test_data_set = (1000, 7678)
INFO:root:Accuracy of model =  0.1540



Accuracy with Powerset Model using RandomForestClassifier(max_depth=3, random_state=13) =  0.1540
Hamming loss in Powerset Model using RandomForestClassifier(max_depth=3, random_state=13) =  0.6777


INFO:root:Shape of training data set = (3999, 7678);test_data_set = (1000, 7678)
INFO:root:Accuracy of model =  0.7280
INFO:root:Shape of training data set = (3999, 7679);test_data_set = (1000, 7679)
INFO:root:Accuracy of model =  0.5240
INFO:root:Shape of training data set = (3999, 7680);test_data_set = (1000, 7680)
INFO:root:Accuracy of model =  0.5350



Accuracy with Chained Model Approach using ExtraTreesClassifier(n_estimators=50, random_state=13) = 0.3530
Hamming loss in Chained Model Approach using ExtraTreesClassifier(n_estimators=50, random_state=13) = 0.4777


INFO:root:Shape of training data set = (3999, 7678);test_data_set = (1000, 7678)
INFO:root:Accuracy of model =  0.7280
INFO:root:Shape of training data set = (3999, 7678);test_data_set = (1000, 7678)
INFO:root:Accuracy of model =  0.4540
INFO:root:Shape of training data set = (3999, 7678);test_data_set = (1000, 7678)
INFO:root:Accuracy of model =  0.4050



Accuracy with Independent Model Approach using ExtraTreesClassifier(n_estimators=50, random_state=13) = 0.3250
Hamming loss in Independent Model Approach using ExtraTreesClassifier(n_estimators=50, random_state=13) =  0.4710


INFO:root:Shape of training data set = (3999, 7678);test_data_set = (1000, 7678)
INFO:root:Accuracy of model =  0.4030



Accuracy with Powerset Model using ExtraTreesClassifier(n_estimators=50, random_state=13) =  0.4030
Hamming loss in Powerset Model using ExtraTreesClassifier(n_estimators=50, random_state=13) =  0.4640


INFO:root:Execution terminated due to time limit at 129.4166557788849, 159.4166557788849


Total time taken to train models : 0:02:47


## Inferring with the model

In [5]:
# Predict on the validation slice and show the predictions next to the true labels.
predictions = best_approach.predict(dt_val[input_col_list])

# Name each predicted column after its target column with a '_pred' suffix.
pred_out_cols = [f'{col}_pred' for col in output_col_list]
y_pred = predictions.df_pred
y_pred.columns = pred_out_cols  # same object as predictions.df_pred, so it is renamed too

# Replace the validation frame's row labels with 0..n-1 before concatenating column-wise.
# NOTE(review): presumably y_pred carries a default 0..n-1 index, which is what makes the
# rows line up - confirm against HMLC.predict.
dt_val = dt_val.reset_index(drop=True)
dataset_pred = pd.concat([dt_val, y_pred], axis='columns')
dataset_pred.head(3)

Unnamed: 0,x_productId,x_Title,x_userId,x_Helpfulness,x_Score,x_Time,x_Text,y_Cat1,y_Cat2,y_Cat3,y_Cat1_pred,y_Cat2_pred,y_Cat3_pred
0,B000056OWT,"Safety 1st Crystal Clear Baby Monitor, White",AGB7JDXAH08GR,6/6,4.0,1128816000,I bought this specifically because it DID NOT ...,baby products,safety,monitors,toys games,games,board games
1,B0003069ZC,Batman Begins Batmobile Vehicle,A271PPZMLZU454,9/10,5.0,1128816000,All of the kids love this batmobile. Unfortuna...,toys games,action toy figures,accessories,toys games,action toy figures,figures
2,B00006K110,Conair QWCS Quick Wrap Hair Art Styling Kit,A3LB2K5G257DNU,7/7,5.0,1128816000,"I had seen this advertised on tv, and thought ...",beauty,hair care,styling tools,pet supplies,cats,litter housebreaking


In [6]:
# Look up the winning estimator and approach recorded on the prediction result.
best_info = predictions.best_approach_dict
print(f"The best estimator is: {best_info['model']}")
print(f"The best approach is: {best_info['approach']}")

The best estimator is: ExtraTreesClassifier(n_estimators=50, random_state=13)
The best approach is: Powerset Models


## Explore methods and attributes

In [7]:
# Print the attributes exposed on the fitted object, using the self-documenting
# `{expr = }` f-string form so every line shows both the attribute name and its value.
print(f'{best_approach.abbr_dict = }')
print(f'{best_approach.additional_colms = }')
# Adjacent string literals are concatenated without the indentation that a backslash
# continuation inside a single literal would embed in the printed message.
print('The above information is useful in cases where one uses HMLC() to quickly identify the best '
      'baseline model and then improve it by tuning hyper-parameters.')
print(f'{best_approach.estimators_ = }')
print(f'{best_approach.max_features = }')
print(f'{best_approach.ngram = }')
print(f'{best_approach.stop_words = }')
print(f'{best_approach.time_limit = }')
print(f'{best_approach.token_pattern = }')
print(f'{best_approach.validation_split = }')

best_approach.abbr_dict = {}
[]
The above information is useful in cases where one uses HMLC() to quickly identify the best        baseline model and then improve it by tuning hyper-parameters.
best_approach.estimators_ = [RandomForestClassifier(max_depth=3, random_state=13), ExtraTreesClassifier(n_estimators=50, random_state=13), GaussianNB()]
best_approach.max_features = 5000
best_approach.ngram = (1, 1)
best_approach.stop_words = 'english'
best_approach.time_limit = 30
best_approach.token_pattern = '([a-zA-Z0-9/+-]{1,})'
best_approach.validation_split = 0.2


In [8]:
# Demonstrate the time logger: record a start time, wait a moment,
# then ask `tl` for the time elapsed since that start.
t0 = time.time()
time.sleep(1.25)
elapsed = best_approach.tl(t0)
print(f'Elapsed: {elapsed}')

Elapsed: 0:00:01


## Predict probability
For independent models, this works exactly like `model.predict_proba` of any sklearn model.
In the case of the other two approaches, a dictionary is returned.

In [9]:
# Class probabilities for the validation inputs, printed in plain-text form.
val_inputs = dt_val[input_col_list]
proba = best_approach.predict_proba(val_inputs)
print(proba)

[[0.   0.   0.   ... 0.08 0.   0.  ]
 [0.   0.   0.   ... 0.12 0.02 0.04]
 [0.06 0.   0.   ... 0.08 0.   0.  ]
 ...
 [0.02 0.   0.   ... 0.1  0.02 0.  ]
 [0.02 0.   0.   ... 0.   0.02 0.  ]
 [0.   0.   0.   ... 0.02 0.   0.  ]]


In [10]:
# Dimensions of the probability output returned by predict_proba.
proba.shape

(1001, 275)

In [11]:
# Number of validation rows, for comparison with the first dimension of `proba`.
dt_val.shape[0]

1001

`proba` gives, for each row of `dt_val`, the probability of belonging to each of the 275 classes (all possible combinations in the given data).