In [1]:
import pandas as pd 
import numpy as np

import os

import shutil
from pprint import pprint
import logging

from ludwig.api import LudwigModel

  from .autonotebook import tqdm as notebook_tqdm


torchtext>=0.13.0 is not installed, so the following tokenizers are not available: {'bert'}


## Receive data for training

In [3]:
train_df = pd.read_csv('./data/winequalityN.csv')
train_df['quality'] = train_df['quality'].apply(str)
train_df.shape

(6497, 13)

In [4]:
# Replace white space in column names with underscore
new_col = []
for i in range(len(train_df.columns)):
    new_col.append(train_df.columns[i].replace(' ', '_'))
    
train_df.columns = new_col

In [5]:
train_df.head()

Unnamed: 0,type,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,white,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,white,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [6]:
train_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fixed_acidity,6487.0,7.216579,1.29675,3.8,6.4,7.0,7.7,15.9
volatile_acidity,6489.0,0.339691,0.164649,0.08,0.23,0.29,0.4,1.58
citric_acid,6494.0,0.318722,0.145265,0.0,0.25,0.31,0.39,1.66
residual_sugar,6495.0,5.444326,4.758125,0.6,1.8,3.0,8.1,65.8
chlorides,6495.0,0.056042,0.035036,0.009,0.038,0.047,0.065,0.611
free_sulfur_dioxide,6497.0,30.525319,17.7494,1.0,17.0,29.0,41.0,289.0
total_sulfur_dioxide,6497.0,115.744574,56.521855,6.0,77.0,118.0,156.0,440.0
density,6497.0,0.994697,0.002999,0.98711,0.99234,0.99489,0.99699,1.03898
pH,6488.0,3.218395,0.160748,2.72,3.11,3.21,3.32,4.01
sulphates,6493.0,0.531215,0.148814,0.22,0.43,0.51,0.6,2.0


In [7]:
train_df.dtypes

type                     object
fixed_acidity           float64
volatile_acidity        float64
citric_acid             float64
residual_sugar          float64
chlorides               float64
free_sulfur_dioxide     float64
total_sulfur_dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                 float64
quality                  object
dtype: object

In [8]:
train_df['quality'].value_counts().sort_index()

3      30
4     216
5    2138
6    2836
7    1079
8     193
9       5
Name: quality, dtype: int64

In [9]:
cols = list(set(train_df.columns) - set(['quality']))
features = train_df[cols]

#extract categorical features
categorical_features = []
for p in features:
    if train_df[p].dtype == 'object':
        categorical_features.append(p)
        
print("categorical features:", categorical_features, '\n')

# get numerical features
numerical_features = list(set(features) - set(categorical_features))

print("numerical features:", numerical_features, "\n")

categorical features: ['type'] 

numerical features: ['residual_sugar', 'fixed_acidity', 'chlorides', 'alcohol', 'sulphates', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'citric_acid', 'pH', 'volatile_acidity', 'density'] 



In [10]:
for feature in categorical_features:
    print(f"# of distinct values in categorical feature '{feature}' : {train_df[feature].nunique()}")

# of distinct values in categorical feature 'type' : 2


## Create Ludwig Config

In [11]:
# template for config
config = {'input_features':[], 'output_features': [], 'trainer':{}}

# setup input features for categorical features
for p in categorical_features:
    a_feature = {
        'name': p.replace(' ','_'), 
        'type': 'category'
    }
    config['input_features'].append(a_feature)

# setup input features for numerical features
for p in numerical_features:
    a_feature = {
        'name': p.replace(' ', '_'), 
        'type': 'number'
    }
    config['input_features'].append(a_feature)

# set up output variable
config['output_features'].append({'name': 'quality', 'type':'category'})

# set default preprocessing and encoder for numerical features
config['defaults'] = {
    'number': {
        'preprocessing': {
            'missing_value_strategy': 'fill_with_mean', 
            'normalization': 'zscore'
        },
        'encoder': {
            'type': 'dense',
            'num_layers': 2
        },
    },
    'category': {
        'encoder': {
            'type': 'sparse'
        },
        'decoder': {
            'top_k': 2
        },
        'loss': {
            'confidence_penalty': 0.1  
        }
    }
}

# set up trainer
config['trainer'] = {'epochs': 5}

In [12]:
pprint(config, indent=2)

{ 'defaults': { 'category': { 'decoder': {'top_k': 2},
                              'encoder': {'type': 'sparse'},
                              'loss': {'confidence_penalty': 0.1}},
                'number': { 'encoder': {'num_layers': 2, 'type': 'dense'},
                            'preprocessing': { 'missing_value_strategy': 'fill_with_mean',
                                               'normalization': 'zscore'}}},
  'input_features': [ {'name': 'type', 'type': 'category'},
                      {'name': 'residual_sugar', 'type': 'number'},
                      {'name': 'fixed_acidity', 'type': 'number'},
                      {'name': 'chlorides', 'type': 'number'},
                      {'name': 'alcohol', 'type': 'number'},
                      {'name': 'sulphates', 'type': 'number'},
                      {'name': 'free_sulfur_dioxide', 'type': 'number'},
                      {'name': 'total_sulfur_dioxide', 'type': 'number'},
                      {'name': 'citric_acid'

## Initialize and Train LudwigModel

In [13]:
model = LudwigModel(config, backend = 'local', logging_level = logging.INFO)

### Inspecting Config After Model Initialization

In [14]:
pprint(model.config['input_features'], indent=2)

[ { 'column': 'type',
    'encoder': 'sparse',
    'name': 'type',
    'proc_column': 'type_mZFLky',
    'tied': None,
    'type': 'category'},
  { 'column': 'residual_sugar',
    'encoder': 'dense',
    'name': 'residual_sugar',
    'num_layers': 2,
    'proc_column': 'residual_sugar_mZFLky',
    'tied': None,
    'type': 'number'},
  { 'column': 'fixed_acidity',
    'encoder': 'dense',
    'name': 'fixed_acidity',
    'num_layers': 2,
    'proc_column': 'fixed_acidity_mZFLky',
    'tied': None,
    'type': 'number'},
  { 'column': 'chlorides',
    'encoder': 'dense',
    'name': 'chlorides',
    'num_layers': 2,
    'proc_column': 'chlorides_mZFLky',
    'tied': None,
    'type': 'number'},
  { 'column': 'alcohol',
    'encoder': 'dense',
    'name': 'alcohol',
    'num_layers': 2,
    'proc_column': 'alcohol_mZFLky',
    'tied': None,
    'type': 'number'},
  { 'column': 'sulphates',
    'encoder': 'dense',
    'name': 'sulphates',
    'num_layers': 2,
    'proc_column': 'sulphates_

In [15]:
pprint(model.config['output_features'], indent=2)

[ { 'column': 'quality',
    'dependencies': [],
    'loss': { 'class_similarities_temperature': 0,
              'class_weights': 1,
              'confidence_penalty': 0.1,
              'robust_lambda': 0,
              'type': 'softmax_cross_entropy',
              'weight': 1},
    'name': 'quality',
    'preprocessing': {'missing_value_strategy': 'drop_row'},
    'proc_column': 'quality_mZFLky',
    'reduce_dependencies': 'sum',
    'reduce_input': 'sum',
    'top_k': 2,
    'type': 'category'}]


In [16]:
eval_stats, train_stats, _, _ = model.experiment(
    dataset = train_df,
    experiment_name = 'wine_quality'
)


╒════════════════════════╕
│ EXPERIMENT DESCRIPTION │
╘════════════════════════╛

╒══════════════════╤══════════════════════════════════════════════════════════════════════════════════╕
│ Experiment name  │ wine_quality                                                                     │
├──────────────────┼──────────────────────────────────────────────────────────────────────────────────┤
│ Model name       │ run                                                                              │
├──────────────────┼──────────────────────────────────────────────────────────────────────────────────┤
│ Output directory │ /workspaces/ludwig/examples/wine_quality/results/wine_quality_run                │
├──────────────────┼──────────────────────────────────────────────────────────────────────────────────┤
│ ludwig_version   │ '0.6.dev'                                                                        │
├──────────────────┼─────────────────────────────────────────────────────────────────

  _warn_prf(average, modifier, msg_start, len(result))



===== quality =====
accuracy: 0.531898558139801
hits_at_k: 0.8593389987945557
loss: 1.1008591651916504
overall_stats: { 'avg_f1_score_macro': 0.21084484521387975,
  'avg_f1_score_micro': 0.5318985395849347,
  'avg_f1_score_weighted': 0.49965464391497777,
  'avg_precision_macro': 0.216352148703475,
  'avg_precision_micro': 0.5318985395849347,
  'avg_precision_weighted': 0.5318985395849347,
  'avg_recall_macro': 0.21655309024127298,
  'avg_recall_micro': 0.5318985395849347,
  'avg_recall_weighted': 0.5318985395849347,
  'kappa_score': 0.2505462635572683,
  'token_accuracy': 0.5318985395849347}
per_class_stats: {<UNK>: {   'accuracy': 1.0,
    'f1_score': 0,
    'fall_out': 0.0,
    'false_discovery_rate': 1.0,
    'false_negative_rate': 1.0,
    'false_negatives': 0,
    'false_omission_rate': 0.0,
    'false_positive_rate': 0.0,
    'false_positives': 0,
    'hit_rate': 0,
    'informedness': 0.0,
    'markedness': 0.0,
    'matthews_correlation_coefficient': 0,
    'miss_rate': 1.0,
 

## Cleanup

In [17]:
try:
    shutil.rmtree('./results')
    items = os.listdir('./')
    for item in items:
        if item.endswith(".hdf5") or item.endswith(".json") or item == '.lock_preprocessing':
            os.remove(os.path.join('./', item))
except Exception as e:
    pass 