In [None]:
import pandas as pd
from sklearn import datasets 

In [None]:
# Load the diabetes regression dataset that ships with scikit-learn.
data = datasets.load_diabetes()

# Model input (features) and output (target).
X, y = data.data, data.target

# The same features as a labelled DataFrame, also written to disk for inspection.
df = pd.DataFrame(X, columns=data.feature_names)
df.to_csv('diabetes_data.csv', index=False)

# Show the available feature names.
df.columns

### Data preprocessing: clean the data and convert categorical (text) values to numerical values
### The input to an ML model must be numeric (X = input, y = output).

#### Step 1: split the data into training and testing sets (to detect overfitting)

#### Step 2: train the ML model on the training set

#### Step 3: evaluate the performance on both the training set and the testing set

#### Step 4: fine-tune the parameters to balance training and testing performance

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline


In [None]:
def my_linear_regression(X_train,y_train):
    """Fit an impute -> standardize -> linear-regression pipeline.

    Parameters
    ----------
    X_train : training inputs passed to ``Pipeline.fit``.
    y_train : training targets passed to ``Pipeline.fit``.

    Returns
    -------
    tuple
        ``(model, intercept, coef)`` where ``model`` is the fitted pipeline
        and ``intercept`` / ``coef`` are read from its final ``"linear"``
        step. Because the linear step sits after the scaler, the
        coefficients refer to the standardized features, not the raw ones.
    """
    pipeline = Pipeline([
        ("imp_mean", SimpleImputer()),     # fill in missing values (original note: with the average)
        ("scale", StandardScaler()),       # standardize the features
        ("linear", LinearRegression()),    # the actual linear model
    ])

    # fit() returns the pipeline itself, so `model` and `pipeline` are the same object.
    model = pipeline.fit(X_train, y_train)
    fitted_linear = model.named_steps["linear"]

    return model, fitted_linear.intercept_, fitted_linear.coef_

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Simple (one-feature) linear regression: fit a separate model for each column
# and compare its performance on the training split with the held-out test split.
df_slr = {}
for feature in df.columns:  # named `feature` so the builtin `input` is not shadowed
    # Double brackets keep a one-column 2-D array; a separate name avoids
    # clobbering the notebook-level `X` on every iteration.
    X_feature = df[[feature]].values
    X_train, X_test, y_train, y_test = train_test_split(
        X_feature, y, test_size=0.4, random_state=42
    )
    model, a, b = my_linear_regression(X_train, y_train)

    # Training-set metrics.
    y_pred_tr = model.predict(X_train)
    mae_tr = mean_absolute_error(y_train, y_pred_tr)
    r2_tr = r2_score(y_train, y_pred_tr)

    # Test-set metrics: these must come from the held-out split, otherwise the
    # 'mae_test' / 'r2_test' columns merely repeat the training numbers.
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # b holds the coefficients of the linear step; with one feature, b[0] is the slope.
    df_slr[feature] = [b[0], mae_tr, mae, r2_tr, r2]

# One row per feature, one column per reported quantity.
df_simple = pd.DataFrame.from_dict(
    df_slr,
    orient='index',
    columns=['coef', 'mae_tr', 'mae_test', 'r2_tr', 'r2_test'],
)

df_simple

In [None]:
# Multiple linear regression: use every feature at once.
X = df.values
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.4, random_state = 42)
model, a, b = my_linear_regression(X_train,y_train)

# Report the same three metrics for each split, training first and testing second.
# After the loop, y_pred / mae / mse / r2 hold the testing-set values.
for split_name, X_split, y_split in (('training', X_train, y_train),
                                     ('testing', X_test, y_test)):
    y_pred = model.predict(X_split)
    mae = mean_absolute_error(y_split, y_pred)
    mse = mean_squared_error(y_split, y_pred)
    r2 = r2_score(y_split, y_pred)
    print(split_name)
    print('mae:', round(mae), 'mse:', round(mse), 'r2 score:', round(r2, 2))