In [None]:
# =============================================================
# Copyright © 2020 Intel Corporation
# 
# SPDX-License-Identifier: MIT
# =============================================================

# End-to-end Census workload with Intel® Distribution of Modin and Intel® Extension for Scikit-learn

In this example we will be running an end-to-end machine learning workload with US census data from 1970 to 2010.
It uses Intel® Distribution of Modin with HDK (Heterogeneous Data Kernels) as the backend compute engine for ETL, and the Ridge Regression algorithm from the Intel® Extension for Scikit-learn library to train a model and predict the correlation between US total income and education levels.

Let's start by downloading census data to your local disk.

In [None]:
!wget https://storage.googleapis.com/intel-optimized-tensorflow/datasets/ipums_education2income_1970-2010.csv.gz

Import basic Python modules and disable warnings to avoid cluttering the output.

In [None]:
# NOTE(review): `os` does not appear to be referenced by any cell in this
# notebook — confirm before relying on it.
import os
import numpy as np
import warnings

# Silence every warning for the rest of the session so that cell output stays
# readable; no category or module filter is given, so nothing is exempt.
warnings.filterwarnings("ignore")

Import Modin and set HDK as the compute engine. This engine provides a set of components for federating analytic queries to an execution backend based on OmniSciDB to obtain high single-node scalability for a specific set of dataframe operations.

In [None]:
# `pd` is bound to Modin's pandas-compatible module rather than to pandas.
# For a stock-pandas comparison run, use instead:  import pandas as pd
import modin.pandas as pd

import modin.config as cfg
# Select HDK as Modin's storage format for the dataframes created below.
cfg.StorageFormat.put('hdk')


Import Intel(R) Extension for Scikit-learn which dynamically patches scikit-learn estimators to use Intel(R) oneAPI Data Analytics Library as the underlying solver, while getting the same solution faster.

In [None]:
from sklearnex import patch_sklearn
# Apply the patch before anything is imported from sklearn: the sklearnex
# documentation requires patching to precede the scikit-learn imports, so the
# order of the statements in this cell must be kept.
patch_sklearn()

# config_context is used later to skip finiteness validation during training.
from sklearn import config_context
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import sklearn.linear_model as lm

Read and load the data into a dataframe from the downloaded archive file

In [None]:
# Load the archive fetched by the download cell; the file name must match the
# basename of the URL used there. `pd` is modin.pandas, not pandas.
df = pd.read_csv('ipums_education2income_1970-2010.csv.gz')

Run ETL operations to prepare and transform the ingested dataset into a form that can be readily consumed by the ridge regression algorithm: keep only the relevant columns, remove the samples with invalid income or education values, and normalize the income to account for yearly inflation.

In [None]:
# --- feature selection ---------------------------------------------------
# Only these columns are kept, in exactly this order.
keep_cols = [
    "YEAR", "DATANUM", "SERIAL", "CBSERIAL",
    "HHWT", "CPI99", "GQ", "PERNUM",
    "SEX", "AGE", "INCTOT", "EDUC",
    "EDUCD", "EDUC_HEAD", "EDUC_POP", "EDUC_MOM",
    "EDUCD_MOM2", "EDUCD_POP2", "INCTOT_MOM", "INCTOT_POP",
    "INCTOT_MOM2", "INCTOT_POP2", "INCTOT_HEAD", "SEX_HEAD",
]
df = df[keep_cols]

# --- row filtering -------------------------------------------------------
# Column -> value that marks an invalid sample. Rows carrying the marker are
# dropped, one column after the other (income first, then education).
invalid_markers = {"INCTOT": 9999999, "EDUC": -1, "EDUCD": -1}
for column, marker in invalid_markers.items():
    df = df[df[column] != marker]

# --- inflation adjustment ------------------------------------------------
# Scale total income by the per-row CPI99 factor.
df["INCTOT"] = df["INCTOT"] * df["CPI99"]

# --- missing values and dtype --------------------------------------------
# Replace missing entries with -1 and store every kept column as float64.
for column in keep_cols:
    df[column] = df[column].fillna(-1).astype("float64")

# --- model inputs --------------------------------------------------------
# EDUC is the regression target; CPI99 was only needed for the adjustment
# above, so it is excluded from the features together with the target.
X = df.drop(columns=["EDUC", "CPI99"])
y = df["EDUC"]

Train the model and run prediction. The train/test split is repeated 50 times with a different random seed each time, so that the reported metrics are averaged over many splits and are not biased by one particular split that happens to suit the model unusually well.

In [None]:
# ML - training and inference
N_RUNS = 50        # number of independent train/test splits
TRAIN_SIZE = 0.9   # fraction of the samples used for training in each split
BASE_SEED = 777    # run k (0-based) uses the seed BASE_SEED * (k + 1)

clf = lm.Ridge()

# One entry per run; the next cell aggregates these lists.
mse_values, cod_values = [], []

# Convert features and target to C-contiguous float64 NumPy arrays once,
# before the loop, so that every split works on plain arrays.
X = np.ascontiguousarray(X, dtype=np.float64)
y = np.ascontiguousarray(y, dtype=np.float64)

# Repeated random hold-out: each run draws a fresh split and refits the model.
for run in range(N_RUNS):
    seed = BASE_SEED * (run + 1)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=TRAIN_SIZE, random_state=seed
    )

    # training (finiteness validation is skipped inside this context)
    with config_context(assume_finite=True):
        fitted = clf.fit(X_train, y_train)

    # inference
    y_pred = fitted.predict(X_test)

    # per-run quality metrics
    mse_values.append(mean_squared_error(y_test, y_pred))
    cod_values.append(r2_score(y_test, y_pred))

Check the regression results by measuring the prediction quality with the mean squared error (MSE) and the R² score (coefficient of determination, COD).

In [None]:
def _mean_and_sample_dev(values):
    """Return (mean, sample standard deviation) of a sequence of numbers.

    The deviation divides by len(values) - 1, so at least two values are
    required; fewer raise ZeroDivisionError.
    """
    count = len(values)
    mean = sum(values) / count
    squared_error_sum = sum((value - mean) ** 2 for value in values)
    return mean, (squared_error_sum / (count - 1)) ** 0.5


# Aggregate the per-run metrics collected by the training cell.
mean_mse, mse_dev = _mean_and_sample_dev(mse_values)
mean_cod, cod_dev = _mean_and_sample_dev(cod_values)

print("mean MSE ± deviation: {:.9f} ± {:.9f}".format(mean_mse, mse_dev))
print("mean COD ± deviation: {:.9f} ± {:.9f}".format(mean_cod, cod_dev))

Verify the accuracy. The values printed above should be close to these reference results:
mean MSE ± deviation: 0.032564569 ± 0.000041799
mean COD ± deviation: 0.995367533 ± 0.000005869

In [None]:
# Release resources: %reset -f clears every user-defined name from the
# interactive namespace without asking for confirmation, so the dataframe,
# arrays and model created above are no longer referenced. Imports are
# cleared as well, so the notebook must be run from the top to be used again.
%reset -f