# Multi-label classification using `AutoGluon`

- https://auto.gluon.ai/stable/index.html
- [Documentation - Text Prediction - Customisation](https://auto.gluon.ai/stable/tutorials/text_prediction/customization.html)

## (0) Import libraries

In [None]:
import numpy as np
import pandas as pd
import warnings
import os
import matplotlib.pyplot as plt

# Notebook-wide setup: pin NumPy's global RNG seed and silence every warning.
np.random.seed(123)
warnings.filterwarnings(action='ignore')

## (1) Data preparation

In [None]:
# Load the raw labelled data, discard the first column of the CSV, and preview.
df = pd.read_csv('data/raw_train_test_data.csv')
df = df.drop(columns=df.columns[0])
df.head()

### `Sector`

In [None]:
# Keep only rows that have a Sector value. The explicit copy makes the
# assignment below act on an independent frame rather than on a slice of `df`.
df_sector = df.dropna(subset=['Sector']).copy()
# Fold the misspelled label into the correctly spelled one.
df_sector['Sector'] = df_sector['Sector'].replace('Educaiton', 'Education')

In [None]:
# Remove the Category, Subcategory and Tag columns, then preview the result.
df_sector = df_sector.drop(columns=['Category', 'Subcategory', 'Tag'])
df_sector.head()

In [None]:
# Bar chart of the number of rows per Sector value.
df_sector['Sector'].value_counts().plot(kind='bar')
plt.show()

In [None]:
# Keep a single row per distinct 'Case Details' text, then re-plot the
# per-Sector row counts.
df_sector = df_sector.drop_duplicates(subset=['Case Details'])
df_sector['Sector'].value_counts().plot.bar()
# Render the figure explicitly, matching the other plotting cell.
plt.show()

In [None]:
# Show the (rows, columns) size of the Sector frame.
df_sector.shape

In [None]:
# prepare for training
# Rename by column name rather than by position so the mapping does not depend
# on the column order of the CSV, then put the label column first.
df_sector = df_sector.rename(columns={'Case Details': 'sentence', 'Sector': 'label'})
df_sector = df_sector[['label', 'sentence']]
# Encode each distinct label as an integer code; code i refers to uniques[i].
codes, uniques = pd.factorize(df_sector['label'])
df_sector['label'] = codes

# print labels and codes
for code, name in enumerate(uniques):
    print(f'{code} => "{name}"')

# save to disk
df_sector.to_csv('data/data_sector.csv', index=False)

### `Category`

In [None]:
# Keep only the rows that have a Category value.
df_category = df[df['Category'].notna()]

In [None]:
# Remove the Sector, Subcategory and Tag columns, then preview the result.
df_category = df_category.drop(columns=['Sector', 'Subcategory', 'Tag'])
df_category.head()

In [None]:
# prepare for training
# Rename by column name rather than by position so the mapping does not depend
# on the column order of the CSV, then put the label column first.
df_category = df_category.rename(columns={'Case Details': 'sentence', 'Category': 'label'})
df_category = df_category[['label', 'sentence']]
# Encode each distinct label as an integer code; code i refers to uniques[i].
codes, uniques = pd.factorize(df_category['label'])
df_category['label'] = codes

# print labels and codes
for code, name in enumerate(uniques):
    print(f'{code} => "{name}"')

# save to disk
df_category.to_csv('data/data_category.csv', index=False)

In [None]:
# Preview the first rows of the encoded Category frame.
df_category.head()

### `Subcategory`

In [None]:
# Keep rows that have a Subcategory value and remove the Sector and Tag columns.
df_subcategory = df[df['Subcategory'].notna()].drop(columns=['Sector', 'Tag'])
df_subcategory.head()

In [None]:
# prepare for training
# Rename by column name rather than by position so the mapping does not depend
# on the column order of the CSV: the case text and the Category become the two
# input columns and the Subcategory becomes the label, which is placed first.
df_subcategory = df_subcategory.rename(
    columns={
        'Case Details': 'sentence1',
        'Category': 'sentence2',
        'Subcategory': 'label',
    }
)
df_subcategory = df_subcategory[['label', 'sentence1', 'sentence2']]
# Encode each distinct label as an integer code; code i refers to uniques[i].
codes, uniques = pd.factorize(df_subcategory['label'])
df_subcategory['label'] = codes

# print labels and codes
for code, name in enumerate(uniques):
    print(f'{code} => "{name}"')

# save to disk
df_subcategory.to_csv('data/data_subcategory.csv', index=False)

In [None]:
# Preview the first rows of the encoded Subcategory frame.
df_subcategory.head()

## (2) Model training (`Sector`)

### Approach A: Using `TabularPredictor`
- https://auto.gluon.ai/scoredebugweight/api/autogluon.task.html#autogluon.tabular.TabularPredictor

In [None]:
from autogluon.tabular import TabularPredictor

# One-hour training budget, expressed in seconds.
time_limit = 60 * 60
# Fit on the prepared Sector frame with the 'multimodal' hyperparameter preset.
pred_sector = TabularPredictor(path='pred_sector', label='label')
pred_sector.fit(
    train_data=df_sector,
    time_limit=time_limit,
    hyperparameters='multimodal',
)

In [None]:
# Build the per-model leaderboard, save it to disk and preview it.
# NOTE(review): `df_sector` is the same frame that was passed to `fit`, so these
# scores are measured on the training rows — consider scoring a held-out split.
leaderboard = pred_sector.leaderboard(data=df_sector)
leaderboard.to_csv('data/leaderboard.csv', index=False)
leaderboard.head()

### (Optional) Use `StackEnsemble`
- https://auto.gluon.ai/stable/tutorials/tabular_prediction/tabular-multimodal-text-others.html#improve-the-performance-with-stack-ensemble

In [None]:
# time_limit = 1 * 60 * 60
# pred_sector_ensemble = TabularPredictor(label='label', path='predict_sector_stack_ensemble')
# pred_sector_ensemble.fit(df_sector, hyperparameters='multimodal', presets='best_quality', time_limit=time_limit)

### Approach B: Using `TextPredictor`
- https://auto.gluon.ai/stable/tutorials/tabular_prediction/tabular-multimodal-text-others.html#improve-the-performance-with-stack-ensemble
- https://auto.gluon.ai/stable/tutorials/text_prediction/customization.html

`TextPredictor` provides several simple preset configurations. Let’s take a look at the available presets.

In [None]:
# List the preset configurations available for TextPredictor (verbose output).
from autogluon.text.text_prediction.presets import list_text_presets
list_text_presets(verbose=True)

In [None]:
# split train/test data
label = "label"
# Sample 90% of the rows for training (fixed seed); every remaining row goes to
# the test set.
train_data = df_sector.sample(frac=0.9, random_state=42)
test_data = df_sector[~df_sector.index.isin(train_data.index)]
# Separate the test features from the test labels.
X_test = test_data.drop(label, axis=1)
y_test = test_data[label]

In [None]:
# Print the (rows, columns) size of each split, one per line.
print(train_data.shape, test_data.shape, sep='\n')

### Model training

In [None]:
from autogluon.text import TextPredictor

# One-hour training budget, expressed in seconds.
time_limit = 60 * 60
# Fit a text model on the training split, optimising accuracy.
pred_sector_textpred = TextPredictor(label="label", eval_metric="acc")
pred_sector_textpred.fit(train_data, time_limit=time_limit, presets="medium_quality_faster_train")

### Evaluate against `test_data`

In [None]:
# Report macro-averaged F1 and accuracy on the held-out rows. Plain "f1" is the
# binary F1 score and is only defined for two-class targets, whereas the macro
# average works for any number of classes.
pred_sector_textpred.evaluate(test_data, metrics=["f1_macro", "acc"])

## (3) Model inference against `validation` data

In [None]:
# Read the validation workbook, remove the metadata columns, and name the one
# remaining column 'sentence'.
validation_df = pd.read_excel('data/validation_set.xlsx')
val_data = validation_df.drop(['Case Category', 'Gender', 'District'], axis=1)
val_data.columns = ['sentence']

In [None]:
# Predict labels for the validation text with `pred_sector`, the
# TabularPredictor from Approach A, and preview the first predictions.
output = pred_sector.predict(val_data)
output.head()

In [None]:
# Attach the predictions as a 'predicted' column on a copy of the validation
# frame, leaving `val_data` itself unchanged.
output_df = val_data.assign(predicted=output)
output_df.head()

In [None]:
# Write the validation text and its predicted label to disk, without the index.
output_df.to_csv('data/output.csv',index=False)