In [None]:
import pandas as pd 
import numpy as np

import os

import shutil
from pprint import pprint
import logging

from ludwig.api import LudwigModel

## Load data for training

In [None]:
# Load the training data from a CSV file addressed relative to the working directory.
train_df = pd.read_csv('./data/winequalityN.csv')
# Convert every 'quality' value to its string form; the column is declared as
# a 'category' output feature in the Ludwig config built further down.
train_df['quality'] = train_df['quality'].apply(str)
# Last expression of the cell: displays (n_rows, n_columns).
train_df.shape

In [None]:
# Normalise the column labels: every space becomes an underscore.
new_col = [label.replace(' ', '_') for label in train_df.columns]
train_df.columns = new_col

In [None]:
# Preview the first five rows (the default row count, spelled out).
train_df.head(n=5)

In [None]:
# Summary statistics, transposed so that each described column becomes a row.
train_df.describe().transpose()

In [None]:
# Show the dtype pandas assigned to each column; the feature-typing cell
# below treats columns whose dtype is 'object' as categorical.
train_df.dtypes

In [None]:
# Number of rows per quality label, ordered by label rather than by count.
# The labels were cast to strings when the data was loaded, so the index is
# sorted as text.
(
    train_df['quality']
    .value_counts()
    .sort_index()
)

In [None]:
# Candidate input columns: everything except the target, kept in the
# DataFrame's own column order.  A set difference would return the names in
# an arbitrary order that can change between kernel restarts (string hashes
# are randomised per process), which would silently reorder the input
# features of the Ludwig config from one run to the next.
cols = [col for col in train_df.columns if col != 'quality']
features = train_df[cols]

# extract categorical features: columns stored with the 'object' dtype
categorical_features = [col for col in cols if train_df[col].dtype == 'object']

print("categorical features:", categorical_features, '\n')

# get numerical features: every remaining column, again in column order
numerical_features = [col for col in cols if col not in categorical_features]

print("numerical features:", numerical_features, "\n")

In [None]:
# Report how many distinct values each categorical column holds.
for feature in categorical_features:
    distinct_count = train_df[feature].nunique()
    print(f"# of distinct values in categorical feature '{feature}' : {distinct_count}")

## Create Ludwig Config

In [None]:
# Build the whole Ludwig config as a single literal.
config = {
    # Inputs: categorical columns first, then numerical ones, each group in
    # the order of its feature list.  Spaces in names become underscores.
    'input_features': (
        [
            {'name': col.replace(' ', '_'), 'type': 'category'}
            for col in categorical_features
        ]
        + [
            {'name': col.replace(' ', '_'), 'type': 'number'}
            for col in numerical_features
        ]
    ),
    # Output: the 'quality' column, declared as a category feature.
    'output_features': [{'name': 'quality', 'type': 'category'}],
    # Trainer settings.
    'trainer': {'epochs': 5},
    # Per-type defaults: preprocessing and encoder for number features;
    # encoder, decoder and loss settings for category features.
    'defaults': {
        'number': {
            'preprocessing': {
                'missing_value_strategy': 'fill_with_mean',
                'normalization': 'zscore',
            },
            'encoder': {
                'type': 'dense',
                'num_layers': 2,
            },
        },
        'category': {
            'encoder': {'type': 'sparse'},
            'decoder': {'top_k': 2},
            'loss': {'confidence_penalty': 0.1},
        },
    },
}

In [None]:
# Pretty-print the assembled config dict for a visual check.
pprint(config, indent=2)

## Initialize and Train LudwigModel

In [None]:
# Build the model object from the config, using the 'local' backend and
# INFO-level logging.
model = LudwigModel(
    config,
    backend='local',
    logging_level=logging.INFO,
)

### Inspecting Config After Model Initialization

In [None]:
# Input-feature section of the config as held by the model object after construction.
pprint(model.config['input_features'], indent=2)

In [None]:
# Output-feature section of the config as held by the model object after construction.
pprint(model.config['output_features'], indent=2)

In [None]:
# Run the experiment on the full DataFrame under the name 'wine_quality'.
# The call returns four values; the first two are bound to eval_stats and
# train_stats, the last two are discarded.
eval_stats, train_stats, _, _ = model.experiment(dataset=train_df, experiment_name='wine_quality')

## Cleanup

In [None]:
# Best-effort cleanup of the './results' directory and of leftover files in
# the working directory.  Each step is attempted independently, so a missing
# './results' directory no longer prevents the leftover files from being
# deleted, and a failure on one file does not stop the others.
# NOTE: this removes every *.hdf5 / *.json entry in the working directory,
# not only files created by this notebook.
shutil.rmtree('./results', ignore_errors=True)

for item in os.listdir('./'):
    if item.endswith((".hdf5", ".json")) or item == '.lock_preprocessing':
        try:
            os.remove(os.path.join('./', item))
        except OSError as err:
            # Report instead of silently swallowing, then keep going.
            print(f"Could not remove {item!r}: {err}")