In [1]:
# Prerequisite: installs Ludwig straight from its GitHub repository (no branch,
# tag, or version is pinned, so this tracks the repository's current default
# branch) into the Colab environment. --quiet trims pip's output.
# The outputs saved in this notebook were produced with ludwig v0.5rc2.
!python -m pip install git+https://github.com/ludwig-ai/ludwig.git --quiet

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
[K     |████████████████████████████████| 398 kB 8.5 MB/s 
[K     |████████████████████████████████| 47 kB 4.4 MB/s 
[K     |████████████████████████████████| 136 kB 37.1 MB/s 
[K     |████████████████████████████████| 1.1 MB 42.5 MB/s 
[K     |████████████████████████████████| 94 kB 3.8 MB/s 
[K     |████████████████████████████████| 144 kB 72.7 MB/s 
[K     |████████████████████████████████| 271 kB 55.7 MB/s 
[?25h  Building wheel for ludwig (PEP 517) ... [?25l[?25hdone


In [2]:
# Fetches the Adult Census Income dataset with Ludwig's dataset CLI.
# The following cell expects the result at ./adult_census_income.csv.
!ludwig datasets download adult_census_income

NumExpr defaulting to 2 threads.
███████████████████████
█ █ █ █  ▜█ █ █ █ █   █
█ █ █ █ █ █ █ █ █ █ ███
█ █   █ █ █ █ █ █ █ ▌ █
█ █████ █ █ █ █ █ █ █ █
█     █  ▟█     █ █   █
███████████████████████
ludwig v0.5rc2 - Datasets download



In [3]:
import pandas as pd
import numpy as np

# Load the downloaded dataset. The CSV carries an unnamed leading index column
# (pandas displays it as "Unnamed: 0") and a `split` column next to the
# feature columns and the `income` target.
raw_df = pd.read_csv('./adult_census_income.csv')
# Preview the first five rows.
raw_df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income,split
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,0


In [8]:
# Ludwig configuration, kept as a YAML string so it can be written to disk and
# handed to the `ludwig` CLI. Sections, top to bottom:
#   preprocessing   - defaults for every `number` feature (z-score
#                     normalization, missing values filled with the mean)
#   input_features  - 14 census columns, typed as number or category
#   combiner        - concat combiner with 3 fully connected layers
#   output_features - binary `income` target with 4 fully connected layers
#   trainer         - 5 epochs using the SGD optimizer
config_yaml = """
preprocessing:
  number:
    normalization: zscore
    missing_value_strategy: fill_with_mean

input_features:
  - name: age
    type: number
  - name: workclass
    type: category
  - name: fnlwgt
    type: number
  - name: education
    type: category
  - name: education-num
    type: number
  - name: marital-status
    type: category
  - name: occupation
    type: category
  - name: relationship
    type: category
  - name: race
    type: category
  - name: sex
    type: category
  - name: capital-gain
    type: number
  - name: capital-loss
    type: number
  - name: hours-per-week
    type: number
  - name: native-country
    type: category

combiner:
  type: concat
  num_fc_layers: 3
  output_size: 128
  dropout: 0.2

output_features:
  - name: income
    type: binary
    preprocessing:
      fallback_true_label: " >50K"
    num_fc_layers: 4
    output_size: 32

trainer:
  epochs: 5
  optimizer: 
    type: sgd
"""

# Persist the configuration as "config.yaml" in the working directory
# (any existing file of that name is overwritten).
with open("config.yaml", mode="w") as config_file:
    config_file.write(config_yaml)

In [9]:
# Trains the model described by config.yaml on adult_census_income.csv.
# This cell might take a few minutes.
# In the recorded run the output directory was /content/results/experiment_run;
# the evaluation cell loads the model from results/experiment_run/model.
# NOTE(review): --skip_save_processed_input presumably stops Ludwig from
# writing its preprocessed copy of the input to disk — confirm against the
# Ludwig CLI documentation.
!ludwig train --dataset adult_census_income.csv \
              --config config.yaml \
              --skip_save_processed_input

NumExpr defaulting to 2 threads.
import ray failed with exception: No module named 'ray'
███████████████████████
█ █ █ █  ▜█ █ █ █ █   █
█ █ █ █ █ █ █ █ █ █ ███
█ █   █ █ █ █ █ █ █ ▌ █
█ █████ █ █ █ █ █ █ █ █
█     █  ▟█     █ █   █
███████████████████████
ludwig v0.5rc2 - Train


╒════════════════════════╕
│ EXPERIMENT DESCRIPTION │
╘════════════════════════╛

╒══════════════════╤════════════════════════════════════════════════════════════════════════════╕
│ Experiment name  │ experiment                                                                 │
├──────────────────┼────────────────────────────────────────────────────────────────────────────┤
│ Model name       │ run                                                                        │
├──────────────────┼────────────────────────────────────────────────────────────────────────────┤
│ Output directory │ /content/results/experiment_run                                            │
├──────────────────┼────────────────────────────

In [10]:
# A Colab notebook can only display a limited amount of output, so evaluate on
# a fixed 200-row sample of the test data (rows whose `split` value is 2)
# instead of the whole test split.
# Seeding NumPy's global RNG makes the draw repeatable: DataFrame.sample falls
# back to that RNG when no random_state is passed.
np.random.seed(13)
(
    raw_df[raw_df["split"] == 2]
    .sample(n=200)
    .to_csv('evaluation_dataset.csv', index=False)
)

In [11]:
# Generates predictions and performance statistics for evaluation_dataset.csv
# (the 200-row sample drawn from the test split) using the model saved under
# results/experiment_run/model. Results are written to test_results.
# NOTE(review): --split full presumably evaluates every row of the given file
# instead of filtering on its `split` column — confirm against the Ludwig CLI
# documentation.
!ludwig evaluate --model_path results/experiment_run/model \
                 --dataset evaluation_dataset.csv \
                 --split full \
                 --output_directory test_results

NumExpr defaulting to 2 threads.
import ray failed with exception: No module named 'ray'
███████████████████████
█ █ █ █  ▜█ █ █ █ █   █
█ █ █ █ █ █ █ █ █ █ ███
█ █   █ █ █ █ █ █ █ ▌ █
█ █████ █ █ █ █ █ █ █ █
█     █  ▟█     █ █   █
███████████████████████
ludwig v0.5rc2 - Evaluate

Dataset path: evaluation_dataset.csv
Model path: results/experiment_run/model

  embedding_size (50) is greater than vocab_size (10). Setting embedding size to be equal to vocab_size.
  embedding_size (50) is greater than vocab_size (17). Setting embedding size to be equal to vocab_size.
  embedding_size (50) is greater than vocab_size (8). Setting embedding size to be equal to vocab_size.
  embedding_size (50) is greater than vocab_size (16). Setting embedding size to be equal to vocab_size.
  embedding_size (50) is greater than vocab_size (7). Setting embedding size to be equal to vocab_size.
  embedding_size (50) is greater than vocab_size (6). Setting embedding size to be equal to vocab_size.
  embeddin