In [1]:
import pandas as pd
from sklearn import datasets 

In [2]:
data = datasets.load_diabetes()
df = pd.DataFrame(data.data, columns=data.feature_names)

X = data.data #input 

y = data.target #output

df.to_csv('diabetes_data.csv', index = False)
df.columns

Index(['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6'], dtype='object')

### data preprocessing : clean the data, convert the categorical (text) to numerical values
### input to ML model must be number. (X - input, y- output) 

#### Step 1: split the data into training and testing set (to avoid overfit)

#### Step 2: train ML model using the available training set

#### Step 3: evaluate the performance for the training set, testing set 

#### Step 4: fine-tune parameters to balance the performance 

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline


In [4]:
def my_linear_regression(X_train,y_train):
    
    steps = [("imp_mean", SimpleImputer()), # cleaning the data - replace the missing data with average
            ("scale", StandardScaler()),  #standardizing the data 
            ("linear", LinearRegression()) ] # linear model - ML 

    pipeline = Pipeline(steps)

    model = pipeline.fit(X_train, y_train)
 

    linear_regressor = pipeline.named_steps['linear']

    return model, linear_regressor.intercept_, linear_regressor.coef_

In [5]:
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

df_slr = {}
for input in df.columns:
    X = df[[input]].values
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.4, random_state = 42)
    model, a, b = my_linear_regression(X_train,y_train) 
    y_pred = model.predict(X_train)    
    mae_tr = mean_absolute_error(y_train, y_pred)
    r2_tr = r2_score(y_train,y_pred)
    y_pred = model.predict(X_train)    
    mae = mean_absolute_error(y_train, y_pred)    
    r2 = r2_score(y_train,y_pred)
   # plt.scatter(X_train, y_train, label='Data Points')
   # plt.plot(X_train, y_pred, color='red', label='Linear Regression')
   # plt.show()   
    df_slr[input] = [b[0], mae_tr, mae,  r2_tr, r2]

df_simple = pd.DataFrame.from_dict(df_slr, orient ='index')
df_simple.columns = ['coef', 'mae_tr', 'mae_test', 'r2_tr', 'r2_test']

df_simple

Unnamed: 0,coef,mae_tr,mae_test,r2_tr,r2_test
age,14.674768,63.749998,63.749998,0.036011,0.036011
sex,3.284119,64.634123,64.634123,0.001804,0.001804
bmi,46.949899,50.952191,50.952191,0.368608,0.368608
bp,33.599098,56.148453,56.148453,0.188778,0.188778
s1,13.814511,63.539433,63.539433,0.031913,0.031913
s2,11.95029,63.950134,63.950134,0.023881,0.023881
s3,-30.810016,59.305884,59.305884,0.158737,0.158737
s4,32.419228,57.813049,57.813049,0.175752,0.175752
s5,39.933342,53.885601,53.885601,0.266665,0.266665
s6,29.223907,59.602078,59.602078,0.142814,0.142814


In [6]:
X = df.values
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.4, random_state = 42)

model, a, b = my_linear_regression(X_train,y_train) 
y_pred = model.predict(X_train) 
    
mae = mean_absolute_error(y_train, y_pred)
mse = mean_squared_error(y_train, y_pred)
r2 = r2_score(y_train,y_pred)
print('training')
print('mae:', round(mae), 'mse:', round(mse), 'r2 score:', round(r2, 2))

y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test,y_pred)
print('testing')
print('mae:', round(mae), 'mse:', round(mse), 'r2 score:', round(r2, 2))

training
mae: 44 mse: 2947 r2 score: 0.51
testing
mae: 42 mse: 2833 r2 score: 0.52
