<a href="https://colab.research.google.com/github/mdkamrulhasan/data_mining_kdd/blob/main/notebooks/Regression_Diabetes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

What will we cover today?


1.   Four models:

 *   Linear Regression (LR)
 *   Random Forest (RF)
 *   Support Vector Regression (SVR)
 *   Boosting

2.   Overfitting
3.   K-fold cross-validation



In [38]:
import numpy as np
import pandas as pd
# Models (Sklearn)
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
# Data and Evaluation packages
from sklearn import datasets
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split
# visualization
import plotly.express as px

[Data description](https://www4.stat.ncsu.edu/~boos/var.select/diabetes.html)

In [39]:
# Load the diabetes dataset. With as_frame=True the loader returns a dict-like
# bundle (not a DataFrame, despite the name `df`); its fields are listed below.
df = datasets.load_diabetes(as_frame=True)
df.keys()

dict_keys(['data', 'target', 'frame', 'DESCR', 'feature_names', 'data_filename', 'target_filename', 'data_module'])

In [40]:
# Peek at the first five rows of the feature table (ten numeric columns)
df.data.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641


In [41]:
# Load the same dataset a second time, unpacked directly into the
# feature matrix X and the target vector y; display both shapes as a check.
X, y = datasets.load_diabetes(return_X_y=True)
X.shape, y.shape

((442, 10), (442,))

In [42]:
# Scatter of a single feature (bmi) against the regression target
fig = px.scatter(x=df.data.bmi, y=y)
# Title and axis labels so the figure is readable on its own when skimming
fig.update_layout(
    title="Diabetes dataset: BMI vs. target",
    xaxis_title="bmi (scaled feature)",
    yaxis_title="target (disease progression measure)",
)
fig.show()

50%-50% train/test split (first half / second half, no shuffling)

In [43]:
# Deterministic 50/50 hold-out: rows before the midpoint train the model,
# rows from the midpoint onward are kept for testing (no shuffling).
diabetes_X_sample_features = X
train_test_split_point = len(X) // 2
# Features: first half / second half
X_train, X_test = (diabetes_X_sample_features[:train_test_split_point],
                   diabetes_X_sample_features[train_test_split_point:])
# Targets: cut at the same index so rows stay aligned with their features
y_train, y_test = y[:train_test_split_point], y[train_test_split_point:]
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((221, 10), (221, 10), (221,), (221,))

Training and Testing a LR model

In [45]:
# Linear regression estimator with scikit-learn's default settings
regr = LinearRegression()
# Fit on the training half only; the held-out half is reserved for the error estimate
regr.fit(X_train, y_train)

Regression model parameters

In [46]:
# Fitted parameters: one coefficient per input feature, plus the intercept
regr.coef_, regr.intercept_

(array([ -28.53268149, -286.52147633,  511.57440375,  251.45287434,
        -829.25045448,  385.83237137,  208.81135215,  293.74699642,
         756.35610142,  137.2482078 ]),
 150.5785322213259)

In [47]:
# Predict on the held-out rows
y_pred = regr.predict(X_test)
# Report the test-set mean squared error, rounded to two decimals
print(f"Mean squared error: {mean_squared_error(y_test, y_pred):.2f}")

Mean squared error: 2944.32


Random splitting

In [49]:
# Shuffled hold-out split: 20% of the rows go to the test set.
# random_state is fixed so the same partition is produced on every rerun.
# (train_test_split is imported in the notebook's top import cell.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((353, 10), (353,), (89, 10), (89,))

Our wrapper class (accepts any regression model that provides `fit` and `predict`)

In [52]:
class myRegressionModel:
  """Uniform train / evaluate / cross-validate interface around a regressor.

  Parameters
  ----------
  model : estimator
      Any object exposing ``fit(X, y)`` and ``predict(X)``.
  nb_cv_splits : int, default 3
      Forwarded as ``cv`` to ``cross_val_score`` in :meth:`cv_error`.
  evaluation_metrics : str, default 'neg_mean_squared_error'
      Forwarded as ``scoring`` to ``cross_val_score`` in :meth:`cv_error`.
  """

  def __init__(self, model, nb_cv_splits=3,
               evaluation_metrics='neg_mean_squared_error'):
    self.model = model
    # Defaults reproduce the previously hard-coded configuration
    self.nb_cv_splits = nb_cv_splits
    self.evaluation_metrics = evaluation_metrics

  def train(self, X, y):
    """Fit the wrapped model on features ``X`` and targets ``y``."""
    self.model.fit(X, y)

  def evaluate(self, X, y):
    """Return the mean squared error of the model's predictions for ``X`` against ``y``."""
    y_predict = self.model.predict(X)
    return mean_squared_error(y, y_predict)

  def cv_error(self, X, y):
    """Return the per-fold scores from ``cross_val_score`` on ``(X, y)``.

    The scores follow ``self.evaluation_metrics``; with the default
    'neg_mean_squared_error' they are negated MSEs, so values closer to
    zero are better.
    """
    return cross_val_score(self.model,
                           X,
                           y,
                           scoring=self.evaluation_metrics,
                           cv=self.nb_cv_splits)



Linear Regression (LR)

In [53]:
# Wrap a linear regression, fit it on the training split, then measure
# the MSE on the data it was fitted on and on the held-out data.
my_model = myRegressionModel(LinearRegression())
my_model.train(X_train, y_train)
train_error = my_model.evaluate(X_train, y_train)
test_error = my_model.evaluate(X_test, y_test)
print('train-error', train_error)
print('test-error', test_error)

train-error 2734.7508990757424
test-error 3424.259334298692


In [54]:
# Cross-validate on the training split only (the test split stays untouched).
# The wrapper scores with 'neg_mean_squared_error', so each fold's value is a
# negated MSE: closer to zero is better.
cv_scores = my_model.cv_error(X_train, y_train)
print('cross validation scores:', cv_scores)

cross validation scores: [-2896.05055713 -3175.79323527 -2821.18419637]


In [55]:
# Summarise the per-fold scores: their average and their spread
print('cross validation score (mean):', cv_scores.mean())
print('cross validation score (std):', cv_scores.std())

cross validation score (mean): -2964.3426629208825
cross validation score (std): 152.61007263273223


Random Forest (RF)

In [58]:
# Random forests are stochastic; seed the estimator so the train/test/CV
# numbers are reproducible across reruns (same seed convention as the split).
my_model = myRegressionModel(RandomForestRegressor(random_state=0))
my_model.train(X_train, y_train)
# Compare error on the fitted data with error on the held-out data
print('train-error', my_model.evaluate(X_train, y_train))
print('test-error', my_model.evaluate(X_test, y_test))

train-error 449.9323427762039
test-error 3898.6297955056175


In [59]:
# Cross-validation scores for the random forest on the training split.
# Values are negated MSEs (scoring is 'neg_mean_squared_error'), one per fold.
cv_scores = my_model.cv_error(X_train, y_train)
print('cross validation scores:', cv_scores)

cross validation scores: [-3155.51555254 -3423.36206695 -3389.71347094]


In [60]:
# Average and spread of the random-forest fold scores
print('cross validation score (mean):', cv_scores.mean())
print('cross validation score (std):', cv_scores.std())

cross validation score (mean): -3322.863696810565
cross validation score (std): 119.12768512776512


Support Vector Regression (SVR)

In [61]:
# Wrap a support vector regressor with default settings, fit it on the
# training split, and report MSE on both the training and the test data.
my_model = myRegressionModel(SVR())
my_model.train(X_train, y_train)
train_error = my_model.evaluate(X_train, y_train)
test_error = my_model.evaluate(X_test, y_test)
print('train-error', train_error)
print('test-error', test_error)

train-error 4984.556554935674
test-error 4470.939682846807


In [63]:
# Cross-validation scores for the SVR on the training split.
# Values are negated MSEs (scoring is 'neg_mean_squared_error'), one per fold.
cv_scores = my_model.cv_error(X_train, y_train)
print('cross validation scores:', cv_scores)

cross validation scores: [-4888.31984537 -5322.89861106 -6050.53492911]


In [64]:
# Average and spread of the SVR fold scores
print('cross validation score (mean):', cv_scores.mean())
print('cross validation score (std):', cv_scores.std())

cross validation score (mean): -5420.5844618468955
cross validation score (std): 479.4739265108507


Boosting

In [65]:
# Gradient boosting accepts a random_state; fix it so the train/test/CV
# numbers are reproducible across reruns (same seed convention as the split).
my_model = myRegressionModel(GradientBoostingRegressor(random_state=0))
my_model.train(X_train, y_train)
# Compare error on the fitted data with error on the held-out data
print('train-error', my_model.evaluate(X_train, y_train))
print('test-error', my_model.evaluate(X_test, y_test))

train-error 871.4631754339739
test-error 4070.6987035094744


In [67]:
# Cross-validation scores for gradient boosting on the training split.
# Values are negated MSEs (scoring is 'neg_mean_squared_error'), one per fold.
# This cell must run before the mean/std summary cell that follows it.
cv_scores = my_model.cv_error(X_train, y_train)
print('cross validation scores:', cv_scores)

cross validation scores: [-3507.59717866 -3248.91761138 -3366.14196629]


In [66]:
# Average and spread of the gradient-boosting fold scores
print('cross validation score (mean):', cv_scores.mean())
print('cross validation score (std):', cv_scores.std())

cross validation score (mean): -3374.2189
cross validation score (std): 105.7598
