In [1]:
import pandas as pd
from sklearn import datasets 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [2]:
# Load the diabetes dataset bundled with scikit-learn.
data = datasets.load_diabetes()

# Model inputs (feature matrix) and output (regression target).
X, y = data.data, data.target

# Tabular view of the features only, labelled with the dataset's feature names.
df = pd.DataFrame(X, columns=data.feature_names)

# Persist the feature table; note the target column is not part of this CSV.
df.to_csv('diabetes_data.csv', index=False)

# Last expression of the cell: display the feature column labels.
df.columns

Index(['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6'], dtype='object')

In [3]:
def report_metrics(label, y_true, y_hat):
    """Print and return MAE, MSE and R^2 for one data split.

    Parameters
    ----------
    label : str
        Heading printed on its own line before the metrics
        (e.g. 'training' or 'testing').
    y_true : array-like
        Ground-truth target values.
    y_hat : array-like
        Predictions for the same rows as ``y_true``.

    Returns
    -------
    tuple
        ``(mae, mse, r2)`` as unrounded values; only the printed
        values are rounded.
    """
    mae = mean_absolute_error(y_true, y_hat)
    mse = mean_squared_error(y_true, y_hat)
    r2 = r2_score(y_true, y_hat)
    print(label)
    print('mae:', round(mae), 'mse:', round(mse), 'r2 score:', round(r2, 2))
    return mae, mse, r2


# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

steps = [
    ("imp_mean", SimpleImputer()),                # cleaning: replace missing values with the column average
    ("scale", StandardScaler()),                  # standardize the features
    ("transform", PolynomialFeatures(degree=2)),  # expand to degree-2 polynomial terms
    ("linear", LinearRegression()),               # linear model fitted on the expanded features
]

pipeline = Pipeline(steps)

# All steps are fitted on the training split only, so no test-set statistics leak in.
model = pipeline.fit(X_train, y_train)

# Score the data the model was fitted on ...
y_pred_train = model.predict(X_train)
report_metrics('training', y_train, y_pred_train)

# ... and the held-out data, using the same reporting helper.
y_pred_test = model.predict(X_test)
mae, mse, r2 = report_metrics('testing', y_test, y_pred_test)

# Keep the name the original cell left bound (test-set predictions) for any later cells.
y_pred = y_pred_test

training
mae: 39 mse: 2409 r2 score: 0.61
testing
mae: 44 mse: 3175 r2 score: 0.41
