In [None]:
from IPython.core.display import display, HTML, Javascript

# ----- Notebook Theme -----
# Teal palette: the accent colour first, followed by ten shades running from
# the lightest ('#e8f6f3') to the darkest ('#0b5345').
color_map = [
    '#16a085',
    '#e8f6f3', '#d0ece7', '#a2d9ce', '#73c6b6', '#45b39d',
    '#16a085', '#138d75', '#117a65', '#0e6655', '#0b5345',
]

# Named roles used when the CSS template is filled in.
# Note: despite its name, strong_main_color is the second palette entry,
# i.e. the lightest shade.
main_color, strong_main_color = color_map[:2]
prompt = color_map[-1]
custom_colors = [strong_main_color, main_color]

css_file = ''' 

div #notebook {
background-color: white;
line-height: 20px;
}

#notebook-container {
%s
margin-top: 2em;
padding-top: 2em;
border-top: 4px solid %s; /* light orange */
-webkit-box-shadow: 0px 0px 8px 2px rgba(224, 212, 226, 0.5); /* pink */
    box-shadow: 0px 0px 8px 2px rgba(224, 212, 226, 0.5); /* pink */
}

div .input {
margin-bottom: 1em;
}

.rendered_html h1, .rendered_html h2, .rendered_html h3, .rendered_html h4, .rendered_html h5, .rendered_html h6 {
color: %s; /* light orange */
font-weight: 600;
}

div.input_area {
border: none;
    background-color: %s; /* rgba(229, 143, 101, 0.1); light orange [exactly #E58F65] */
    border-top: 2px solid %s; /* light orange */
}

div.input_prompt {
color: %s; /* light blue */
}

div.output_prompt {
color: %s; /* strong orange */
}

div.cell.selected:before, div.cell.selected.jupyter-soft-selected:before {
background: %s; /* light orange */
}

div.cell.selected, div.cell.selected.jupyter-soft-selected {
    border-color: %s; /* light orange */
}

.edit_mode div.cell.selected:before {
background: %s; /* light orange */
}

.edit_mode div.cell.selected {
border-color: %s; /* light orange */

}
'''
def to_rgb(h):
    """Convert a six-digit hex colour string into an ``(r, g, b)`` tuple.

    Parameters
    ----------
    h : str
        Hex colour such as ``'16a085'``. A single leading ``'#'``
        (``'#16a085'``) is accepted and ignored, so callers no longer have
        to slice it off themselves.

    Returns
    -------
    tuple of int
        The red, green and blue components parsed from the three
        two-character pairs.

    Raises
    ------
    ValueError
        If one of the pairs is not valid hexadecimal (raised by ``int``).
    """
    digits = h[1:] if h.startswith('#') else h
    return tuple(int(digits[i:i + 2], 16) for i in (0, 2, 4))

# Accent colour as a CSS rgba() value with 0.1 alpha; used as the background
# of code input areas.
main_color_rgba = 'rgba(%s, %s, %s, 0.1)' % to_rgb(main_color[1:])

# One value per ``%s`` placeholder of css_file, in template order.
css_values = (
    'width: 95%;',    # #notebook-container: extra declaration
    main_color,       # #notebook-container: border-top
    main_color,       # .rendered_html headings: color
    main_color_rgba,  # div.input_area: background-color
    main_color,       # div.input_area: border-top
    main_color,       # div.input_prompt: color
    prompt,           # div.output_prompt: color
    main_color,       # selected cell marker: background
    main_color,       # selected cell: border-color
    main_color,       # edit-mode selected cell marker: background
    main_color,       # edit-mode selected cell: border-color
)

# The context manager flushes and closes the file before the stylesheet is
# read back; the previous bare open(...).write(...) left the handle open.
with open('notebook.css', 'w') as css_out:
    css_out.write(css_file % css_values)

def nb():
    """Return an ``HTML`` display object that inlines ``notebook.css``.

    The stylesheet is read from ``notebook.css`` in the current working
    directory on every call and wrapped in a ``<style>`` element.
    """
    # Read through a context manager so the file handle is closed right away
    # instead of being left to the garbage collector.
    with open("notebook.css", "r") as css_in:
        return HTML("<style>" + css_in.read() + "</style>")
nb()

<img src="https://raw.githubusercontent.com/AILab-MLTools/LightAutoML/master/imgs/LightAutoML_logo_big.png" alt="LightAutoML logo" style="width:70%;"/>

# LightAutoML baseline

Official LightAutoML github repository is [here](https://github.com/AILab-MLTools/LightAutoML). 

### If you like the notebook, do not forget to upvote it and give the GitHub repo a ⭐️ using the button below - one click for you, great pleasure for us ☺️

In [None]:
# GitHub "star" button for the LightAutoML repository, served by ghbtns.com and
# embedded as an iframe.
# NOTE(review): this button targets user=AILab-MLTools, while the iframe in the
# last cell of the notebook targets user=sb-ai-lab - confirm which one is current.
s = '<iframe src="https://ghbtns.com/github-btn.html?user=AILab-MLTools&repo=LightAutoML&type=star&count=true&size=large" frameborder="0" scrolling="0" width="170" height="30" title="LightAutoML GitHub"></iframe>'
HTML(s)

## 0. Prerequisites

### 0.0. install LightAutoML

In [None]:
%%capture
!pip install -U lightautoml

### 0.1. Import libraries

Here we will import the libraries we use in this kernel:
- Standard python libraries for timing, working with OS etc.
- Essential python DS libraries like numpy, pandas, scikit-learn and torch (the last we will use in the next cell)
- LightAutoML modules: presets for AutoML, task and report generation module

In [None]:
# Essential DS libraries
import numpy as np
import pandas as pd
from pathlib import Path
import torch

from sklearn.preprocessing import RobustScaler
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# LightAutoML presets, task and report generation
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.tasks import Task
from lightautoml.report.report_deco import ReportDeco

pd.set_option('display.max_columns', None)

### 0.2. Constants

Here we setup the constants to use in the kernel:
- `N_THREADS` - number of vCPUs for LightAutoML model creation
- `RANDOM_STATE` - random seed for better reproducibility
- `TEST_SIZE` - holdout data part size
- `TIMEOUT` - limit in seconds for model to train
- `TARGET_NAME` - target column name in dataset
- `N_FOLDS` - number of folds for training

In [None]:
# Compute budget
N_THREADS = 4        # vCPUs used for model creation (also the Torch thread cap)
TIMEOUT = 3600 / 2   # training time limit in seconds (half an hour)

# Reproducibility and validation
RANDOM_STATE = 21    # random seed
N_FOLDS = 5          # number of cross-validation folds
# TEST_SIZE = 0.2    # holdout part size - currently commented out

# Dataset
TARGET_NAME = 'quality'   # target column name

### 0.3. Imported models setup

For better reproducibility, fix the numpy random seed and limit the maximum number of threads for Torch (which usually tries to use all the threads on the server):

In [None]:
# Seed NumPy's global random generator with RANDOM_STATE.
np.random.seed(RANDOM_STATE)
# Limit Torch to N_THREADS threads.
torch.set_num_threads(N_THREADS)

### 0.4. Data loading
Let's check the data we have:

In [None]:
# Directory that train.csv, test.csv and sample_submission.csv are read from.
INPUT_DIR = Path('/kaggle/input/playground-series-s3e5/')

In [None]:
# Competition training set; every row is flagged with is_generated = 1.
train_data = pd.read_csv(INPUT_DIR / 'train.csv').assign(is_generated=1)
print(train_data.shape)
train_data.head()

In [None]:
# Short summary of the raw training frame (verbose=False omits the per-column listing).
train_data.info(verbose=False)

In [None]:
# Competition test set, flagged with is_generated = 1 just like the training rows.
test_data = pd.read_csv(INPUT_DIR / 'test.csv').assign(is_generated=1)
print(test_data.shape)
test_data.head()

In [None]:
# Short summary of the raw test frame (verbose=False omits the per-column listing).
test_data.info(verbose=False)

In [None]:
# Sample submission file; its 'Id' column is reused later to build the final submission.
submission = pd.read_csv(INPUT_DIR / 'sample_submission.csv')
print(submission.shape)
submission.head()

### 0.5. Feature engineering
Let's make some new features and/or data:

In [None]:
# Additional wine-quality rows from a separate dataset; flagged with
# is_generated = 0 to tell them apart from the competition rows (flagged 1).
extra_data = pd.read_csv('/kaggle/input/wine-quality-dataset/WineQT.csv').assign(is_generated=0)
print(extra_data.shape)
extra_data.head()

In [None]:
# Stack the competition rows and the extra rows, then drop exact duplicate rows.
# ignore_index=True renumbers the rows 0..n-1: both frames come straight from
# read_csv with their own default index, so without it the combined frame
# would carry repeated index labels.
train_data = (
    pd.concat([train_data, extra_data], axis=0, ignore_index=True)
    .drop_duplicates(ignore_index=True)
)
print(train_data.shape)
train_data.head()

In [None]:
# Short summary of the training frame after the extra rows were appended.
train_data.info(verbose=False)

Thanks to [kotrying](https://www.kaggle.com/code/kotrying/ps-s3e5-using-polars/notebook)'s notebook for the new features:

In [None]:
def feature_engineering(data, reference=None):
    """Add the engineered wine features to ``data`` in place and return it.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the raw wine columns read below ('fixed acidity',
        'volatile acidity', 'citric acid', 'residual sugar', 'chlorides',
        'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH',
        'sulphates', 'alcohol'). New columns are written into this object.
    reference : pandas.DataFrame, optional
        Frame whose raw 'pH' and 'alcohol' columns supply the per-pH-bucket
        mean alcohol. Defaults to the notebook-level ``train_data``, which
        keeps the one-argument call working. ``reference`` is not modified.

    Returns
    -------
    pandas.DataFrame
        The same object as ``data``, with the new columns added.
    """
    if reference is None:
        # Backward-compatible default: statistics come from the training frame.
        reference = train_data

    # From https://www.kaggle.com/code/kotrying/ps-s3e5-using-polars/notebook
    data['log1p residual sugar'] = np.log1p(data['residual sugar'])
    data['citric acid per alcohol'] = data['citric acid'] / data['alcohol']
    # Three buckets: exactly 0 -> 0, exactly 0.49 -> 1, anything else -> 2.
    data['citric acid type'] = data['citric acid'].apply(lambda x: 0 if x==0 else (1 if x==0.49 else 2))
    data['pH round1'] = data['pH'].round(1)
    # Bucket the reference by rounded pH on a temporary copy, so the lookup no
    # longer depends on 'pH round1' having been added to the reference earlier
    # (i.e. on the order in which the frames are processed).
    ph_dict = (
        reference.assign(**{'pH round1': reference['pH'].round(1)})
        .groupby(by='pH round1')['alcohol']
        .mean()
        .to_dict()
    )
    data['alcohol mean groupby pH'] = data['pH round1'].map(ph_dict)

    # From https://www.kaggle.com/competitions/playground-series-s3e5/discussion/383685
    data['acidity_ratio'] = data['fixed acidity'] / data['volatile acidity']
    data['free_sulfur/total_sulfur'] = data['free sulfur dioxide'] / data['total sulfur dioxide']
    data['sugar/alcohol'] = data['residual sugar'] / data['alcohol']
    data['alcohol/density'] = data['alcohol'] / data['density']
    data['total_acid'] = data['fixed acidity'] + data['volatile acidity'] + data['citric acid']
    data['sulphates/chlorides'] = data['sulphates'] / data['chlorides']
    data['bound_sulfur'] = data['total sulfur dioxide'] - data['free sulfur dioxide']
    data['alcohol/pH'] = data['alcohol'] / data['pH']
    data['alcohol/acidity'] = data['alcohol'] / data['total_acid']
    data['alkalinity'] = data['pH'] + data['alcohol']
    data['mineral'] = data['chlorides'] + data['sulphates'] + data['residual sugar']
    data['density/pH'] = data['density'] / data['pH']
    data['total_alcohol'] = data['alcohol'] + data['residual sugar']

    # From https://www.kaggle.com/competitions/playground-series-s3e5/discussion/382698
    data['acid/density'] = data['total_acid'] / data['density']
    data['sulphate/density'] = data['sulphates'] / data['density']
    data['sulphates/acid'] = data['sulphates'] / data['volatile acidity']
    data['sulphates*alcohol'] = data['sulphates'] * data['alcohol']

    return data

In [None]:
%%time 

# feature_engineering writes the new columns into the frame it receives, so
# the returned reference does not need to be rebound. The training frame is
# processed before the test frame.
for data in (train_data, test_data):
    feature_engineering(data)

In [None]:
# Every column of test_data except the row identifier is scaled.
# NOTE(review): presumably test_data has no target column, so the target stays
# unscaled - confirm.
sc_features = [column for column in test_data.columns if column != 'Id']

# Fit on the training rows only, then apply the fitted scaler to the test rows.
sc = RobustScaler()
train_data[sc_features] = sc.fit_transform(train_data[sc_features])
test_data[sc_features] = sc.transform(test_data[sc_features])

In [None]:
# First rows of the training frame after feature engineering and scaling.
train_data.head()

In [None]:
# Short summary of the training frame after feature engineering and scaling.
train_data.info(verbose=False)

# 1. Task definition

### 1.1. Task type

In the cell below we create the Task object - the class that sets up which task the LightAutoML model should solve, with a specific loss and metric if necessary (more info can be found [here](https://lightautoml.readthedocs.io/en/latest/generated/lightautoml.tasks.base.Task.html#lightautoml.tasks.base.Task) in our documentation):

In [None]:
# Multiclass classification scored by accuracy; no loss is passed explicitly.
task = Task(name='multiclass', metric='accuracy')

### 1.2. Feature roles setup
To solve the task, we need to setup columns roles. The **only role you must setup is target role**, everything else (drop, numeric, categorical, group, weights etc.) is up to user - LightAutoML models have automatic columns typization inside:

In [None]:
# Column roles: the target column, plus the row identifier to be dropped.
roles = dict(target=TARGET_NAME, drop=['Id'])

### 1.3. LightAutoML model creation - TabularAutoML preset

In the next cell we are going to create the LightAutoML model with the `TabularAutoML` class - a preset with the default model structure shown in the image below:

<img src="https://github.com/AILab-MLTools/LightAutoML/raw/master/imgs/tutorial_blackbox_pipeline.png" alt="TabularAutoML preset pipeline" style="width:75%;"/>

in just several lines. Let's discuss the params we can setup:
- `task` - the type of the ML task (the only **must have** parameter)
- `timeout` - time limit in seconds for model to train
- `cpu_limit` - vCPU count for model to use
- `reader_params` - parameter change for Reader object inside preset, which works on the first step of data preparation: automatic feature typization, preliminary almost-constant features, correct CV setup etc. For example, we setup `n_jobs` threads for typization algo, `cv` folds and `random_state` as inside CV seed.

**Important note**: `reader_params` is one of the YAML config keys used inside the `TabularAutoML` preset. [More details](https://github.com/AILab-MLTools/LightAutoML/blob/master/lightautoml/automl/presets/tabular_config.yml) on its structure, with explanatory comments, can be found at the attached link. Each key from this config can be overridden with user settings when the preset object is initialized. For more information about the different parameter settings (for example, the ML algorithms that can be used in `general_params->use_algos`), please take a look at our [article on TowardsDataScience](https://towardsdatascience.com/lightautoml-preset-usage-tutorial-2cce7da6f936).

Moreover, to receive the automatic report for our model we will use `ReportDeco` decorator and work with the decorated version in the same way as we do with usual one. 

In [None]:
# TabularAutoML preset limited to a linear model plus default and tuned LightGBM.
automl = TabularAutoML(
    task=task,
    timeout=TIMEOUT,
    cpu_limit=N_THREADS,
    reader_params=dict(n_jobs=N_THREADS, cv=N_FOLDS, random_state=RANDOM_STATE),
    general_params=dict(use_algos=[['linear_l2', 'lgb', 'lgb_tuned']]),
)

# 2. AutoML training

To run autoML training use fit_predict method:
- `train_data` - Dataset to train.
- `roles` - Roles dict.
- `verbose` - Controls the verbosity: the higher, the more messages.
        <1  : messages are not displayed;
        >=1 : the computation process for layers is displayed;
        >=2 : the information about folds processing is also displayed;
        >=3 : the hyperparameters optimization process is also displayed;
        >=4 : the training process for every algorithm is displayed;

Note: out-of-fold prediction is calculated during training and returned from the fit_predict method

In [None]:
%%time

# Train the AutoML model; fit_predict returns the out-of-fold predictions
# computed on train_data during training.
oof_pred = automl.fit_predict(train_data, roles=roles, verbose=3)
print(f'oof_pred:\n{oof_pred}\nShape = {oof_pred.shape}')

In [None]:
%%time

# 'fast' feature scores reported by the fitted model, drawn as a bar chart
# (one bar per 'Feature', bar height = 'Importance').
fast_fi = automl.get_feature_scores('fast')
fast_fi.set_index('Feature')['Importance'].plot.bar(figsize=(20, 10), grid=True)

In [None]:
# Work on an independent copy: later cells add prediction columns to preds, and
# adding columns to a plain slice of train_data triggers pandas'
# SettingWithCopyWarning.
preds = train_data[['Id', TARGET_NAME]].copy()
preds.head()

In [None]:
# One out-of-fold prediction column per class. The class count is taken from
# the prediction array itself rather than being hard-coded.
for i in range(oof_pred.data.shape[1]):
    preds['pred_' + str(i)] = oof_pred.data[:, i]
preds

Assign classes by maximum class probability:

In [None]:
# Index of the highest-scoring class for every training row. Taking the argmax
# over the whole out-of-fold array (the source of the pred_* columns) works for
# any number of classes.
OOFs = np.argmax(oof_pred.data, axis=1)
OOFs

Let’s see classification accuracy on train:

In [None]:
# Translate the true labels through the reader's class mapping so they can be
# compared with the class indices stored in OOFs.
true_class_idx = preds[TARGET_NAME].map(automl.reader.class_mapping)
accuracy = (OOFs == true_class_idx).mean()
print(f'Out-of-fold accuracy: {accuracy}')

Also to estimate the quality of classification, we can use the confusion matrix:

In [None]:
# Confusion matrix of mapped true classes (first argument) against
# out-of-fold predicted classes (second argument).
cf_matrix = confusion_matrix(
    preds[TARGET_NAME].map(automl.reader.class_mapping),
    OOFs,
)

# Class index -> original label, listed in class-index order for the tick labels.
inverse_class_mapping = {idx: label for label, idx in automl.reader.class_mapping.items()}
labels = [inverse_class_mapping[i] for i in range(len(inverse_class_mapping))]

fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(cf_matrix, annot=True, cmap='Blues', fmt='d', ax=ax)

ax.set_title('Seaborn Confusion Matrix with labels\n\n')
ax.set_xlabel('\nPredicted Values')
ax.set_ylabel('Actual Values ')
ax.set_xticklabels(labels, rotation=90)
ax.set_yticklabels(labels, rotation=0)

plt.show()

# 3. Predict and save
Predict and save submissions to .csv

In [None]:
%%time

# Predictions of the fitted AutoML model for the test rows (one column per
# class, as consumed by the cells below).
test_pred = automl.predict(test_data)
print(f'Prediction for test data:\n{test_pred}\nShape = {test_pred.shape}')

In [None]:
# Start from an independent copy of the Id column: adding columns to a plain
# slice of submission triggers pandas' SettingWithCopyWarning.
sub = submission[['Id']].copy()
# One prediction column per class; the class count comes from the prediction
# array itself rather than being hard-coded.
for i in range(test_pred.data.shape[1]):
    sub['pred_' + str(i)] = test_pred.data[:, i]
sub

In [None]:
# Highest-scoring class per test row, translated back to the original labels.
# The argmax runs over the whole prediction array, so it works for any number
# of classes. The result carries sub's index, so a later column assignment
# into sub aligns row for row.
TEs = pd.Series(np.argmax(test_pred.data, axis=1), index=sub.index).map(inverse_class_mapping)
TEs

In [None]:
# Attach the predicted labels, then write only the identifier and the
# prediction to the submission file.
sub[TARGET_NAME] = TEs
submission_columns = ['Id', TARGET_NAME]
sub[submission_columns].to_csv('LightAutoML.csv', index=False)
sub[submission_columns].head()

# Additional materials

- [Official LightAutoML github repo](https://github.com/AILab-MLTools/LightAutoML)
- [LightAutoML documentation](https://lightautoml.readthedocs.io/en/latest)
- [LightAutoML tutorials](https://github.com/AILab-MLTools/LightAutoML/tree/master/examples/tutorials)
- LightAutoML course:
    - [Part 1 - general overview](https://ods.ai/tracks/automl-course-part1) 
    - [Part 2 - LightAutoML specific applications](https://ods.ai/tracks/automl-course-part2)
    - [Part 3 - LightAutoML customization](https://ods.ai/tracks/automl-course-part3)
- [OpenDataScience AutoML benchmark leaderboard](https://ods.ai/competitions/automl-benchmark/leaderboard)

### If you still like the notebook, do not forget to put upvote for the notebook and the ⭐️ for github repo if you like it using the button below - one click for you, great pleasure for us ☺️

In [None]:
# GitHub "star" button for the LightAutoML repository, served by ghbtns.com and
# embedded as an iframe.
# NOTE(review): this button targets user=sb-ai-lab, while the iframe near the top
# of the notebook targets user=AILab-MLTools - confirm which one is current.
s = '<iframe src="https://ghbtns.com/github-btn.html?user=sb-ai-lab&repo=LightAutoML&type=star&count=true&size=large" frameborder="0" scrolling="0" width="170" height="30" title="LightAutoML GitHub"></iframe>'
HTML(s)