In [1]:
import pandas as pd
import numpy as np
import textwrap
from sklearn.model_selection import KFold, train_test_split
from sklearn.linear_model import Ridge, LinearRegression, Lasso
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.base import clone

In [2]:
# This is where we'll put constants
FOLD_COUNT          = 5
TRAIN_DATA_PATH     = "./data/Concrete_Data.xls"

In [3]:
def get_data_xy(data):
    X = data.iloc[:, 0:7]
    y = data.iloc[:,8]
    return X, y

def clean_data(data):
    data = data.dropna()
    return data

def read_data(path):
    dataset = pd.read_excel(path, header = 0)
    return dataset

In [4]:
dataset = read_data(TRAIN_DATA_PATH)
dataset.head()

Unnamed: 0,Cement (component 1)(kg in a m^3 mixture),Blast Furnace Slag (component 2)(kg in a m^3 mixture),Fly Ash (component 3)(kg in a m^3 mixture),Water (component 4)(kg in a m^3 mixture),Superplasticizer (component 5)(kg in a m^3 mixture),Coarse Aggregate (component 6)(kg in a m^3 mixture),Fine Aggregate (component 7)(kg in a m^3 mixture),Age (day),"Concrete compressive strength(MPa, megapascals)"
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.986111
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.887366
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.269535
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05278
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.296075


In [5]:
print (dataset.columns)

Index(['Cement (component 1)(kg in a m^3 mixture)',
       'Blast Furnace Slag (component 2)(kg in a m^3 mixture)',
       'Fly Ash (component 3)(kg in a m^3 mixture)',
       'Water  (component 4)(kg in a m^3 mixture)',
       'Superplasticizer (component 5)(kg in a m^3 mixture)',
       'Coarse Aggregate  (component 6)(kg in a m^3 mixture)',
       'Fine Aggregate (component 7)(kg in a m^3 mixture)', 'Age (day)',
       'Concrete compressive strength(MPa, megapascals) '],
      dtype='object')


In [6]:
def test_model_kfold(model, X, y, fold_count = FOLD_COUNT):
    """Run a kfold test on the given model. It works off of a clone of the given model."""
    kf = KFold(n_splits=fold_count, random_state=None, shuffle=True)
    avg_metrics = None
    
    # For each fold...
    # 1) clone the model to get a fresh copy
    # 2) train and test the model on the split
    # 3) aggregate the test results
    for train_index, test_index in kf.split(X):       
        train_X, test_X = X.iloc[train_index], X.iloc[test_index]
        train_y, test_y = y.iloc[train_index], y.iloc[test_index]
        
        model = clone(model)
        model.fit(train_X, train_y)
        metrics = test_model(model, test_X, test_y)
        
        if avg_metrics is None:
            avg_metrics = metrics
        else:
            avg_metrics = tuple(map(lambda x, y: x + y, avg_metrics, metrics))
        
    avg_metrics = tuple(map(lambda x: x / fold_count, avg_metrics))
    
    return avg_metrics

def test_model_split(model, train_split_xy):
    """Run a test on the given split. It works off of a clone of the given model."""
    train_X, test_X, train_y, test_y = train_split_xy (test_size=0.25)
    model = clone(model)
    model.fit(train_X, train_y)
    return test_model(model, test_X, test_y)
    
def test_model (model, test_X, test_y):
    """Get performance metrics based on the model's prediction results."""
    prediction = model.predict(test_X)
    
    mse = mean_squared_error(test_y, prediction)
    var_score = r2_score(test_y, prediction)
    y_bar_squared = (sum(test_y)/float(len(test_y)))**2
    mse_per = mse / y_bar_squared
    
    return (mse, mse_per, var_score)

def print_test_results (results):
    mse, mse_per, var_score = results
    print("MSE:")
    print(textwrap.indent(str(mse), " " * 4))
    
    print("")
    print("MSE%:")
    print(textwrap.indent(str(mse_per), " " * 4))
    
    print("")
    print("Variance Score:")
    print(textwrap.indent(str(var_score), " " * 4))

def prepare_models(models, X, y):
    """Prepare the given models and print training results"""
    for model_name, model in models:
        print("'{0}' classifier".format(model_name))
        print("--------------------------------------")

        print("Testing against training data with {0} folds...".format(FOLD_COUNT))
        print("")

        test_results = test_model_kfold(model, X, y, fold_count=FOLD_COUNT)
        print_test_results(test_results)
        print("")

        print("Training against training data...")
        model.fit(X, y)

        print("--------------------------------------")
        print("")
    return models

def test_models(models, test_X, test_y):
    """Test the given models against the testing data"""
    for model_name, model in models:
        print("Testing '{0}' classifier against testing data".format(model_name))
        print("--------------------------------------")
        test_results = test_model(model, test_X, test_y)
        print_test_results(test_results)
        print("--------------------------------------")
        print("")

In [7]:
X, y = get_data_xy(dataset)

In [8]:
# Create linear regression object
models = prepare_models(
    [
        ("Linear Regression", Ridge(alpha = 0.5)),
    ],
    X,
    y
)

'Linear Regression' classifier
--------------------------------------
Testing against training data with 5 folds...

MSE:
    155.575322177

MSE%:
    0.121294630259

Variance Score:
    0.441582313156

Training against training data...
--------------------------------------

