In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

## Load Dataset and display its attributes

In [None]:
# Fetch scikit-learn's bundled diabetes dataset
diabetes = datasets.load_diabetes()

# Show the available fields first, then the full dataset description
print(diabetes.keys(), diabetes['DESCR'], sep='\n')

## Use Sklearn linear regression to fit a model between BMI and disease progression 

In [None]:
# Keep only the BMI column (index 2) as a single-column 2-D feature matrix,
# which is the layout the estimator's fit/predict calls below are given.
diabetes_X = diabetes.data[:, 2:3]

# Hold out the last 20 samples for testing; train on everything before them
diabetes_X_train, diabetes_X_test = diabetes_X[:-20], diabetes_X[-20:]
diabetes_y_train, diabetes_y_test = diabetes.target[:-20], diabetes.target[-20:]

# Build an ordinary least-squares model and fit it on the training split
regr = linear_model.LinearRegression()
regr.fit(diabetes_X_train, diabetes_y_train)

# Predict disease progression for the held-out samples
diabetes_y_pred = regr.predict(diabetes_X_test)

# Evaluate on the test split
test_mse = mean_squared_error(diabetes_y_test, diabetes_y_pred)
# r2_score is the coefficient of determination (R^2): 1 is perfect prediction
test_r2 = r2_score(diabetes_y_test, diabetes_y_pred)

# Report the fitted slope and the test metrics
print('Coefficients: \n', regr.coef_)
print("Mean squared error: %.2f" % test_mse)
print('Variance score: %.2f' % test_r2)

# Plot the test points (black) with the fitted regression line (blue)
plt.scatter(diabetes_X_test, diabetes_y_test, color='black')
plt.plot(diabetes_X_test, diabetes_y_pred, color='blue', linewidth=3)

# Axis labels are kept, but tick marks are hidden on both axes
plt.xlabel('BMI')
plt.ylabel('Progression')
plt.xticks(())
plt.yticks(())
plt.show()


## Use NumPy to fit the same BMI model with the closed-form least-squares solution

In [None]:
def get_MSE(y,y_pred):
    '''
    Return the mean squared error between ground truth and predictions.

    y      -- ground-truth values
    y_pred -- predicted values, element-wise comparable with y

    The result is the average of the squared element-wise differences.
    '''
    errors = np.subtract(y, y_pred)
    return np.mean(errors * errors)

In [None]:
# Fit a one-feature linear regression (progression ~ BMI) with the
# closed-form least-squares solution, using NumPy only.

# Use one feature (BMI, column index 2) as a 1-D array
diabetes_X = diabetes.data[:, 2]

# Split the data into training/testing sets (last 20 samples held out)
X_train = diabetes_X[:-20]
X_test = diabetes_X[-20:]

# Split the targets into training/testing sets
y_train = diabetes.target[:-20]
y_test = diabetes.target[-20:]

# number of training observations
n_train = len(X_train)

# Cross-deviation of x and y, and deviation of x about its mean
# (both scaled by n, which cancels in the ratio below):
#   SS_xy = n * sum(x*y) - sum(x) * sum(y)
#   SS_xx = n * sum(x*x) - sum(x)**2
SS_xy = n_train * np.sum(X_train * y_train) - np.sum(X_train) * np.sum(y_train)
SS_xx = n_train * np.sum(X_train * X_train) - np.sum(X_train) ** 2

# Regression coefficients: slope m and intercept b
m = SS_xy / SS_xx
b = (np.sum(y_train) - m * np.sum(X_train)) / n_train

print('we end up with the following equation: Y = {}X + {}'.format(m,b))

# test points
plt.scatter(X_test, y_test, color = "m", marker = "o")

# predicted response vector for the held-out samples
y_pred = b + m * X_test

# plotting the regression line
plt.plot(X_test, y_pred, color = "g")

# putting labels
plt.xlabel('BMI')
plt.ylabel('Progression')

plt.show()

# Test-set error of the closed-form fit
MSE = get_MSE(y_test,y_pred)
print('The mean squared error is: ', MSE)
  



## Multivariate Linear regression

In [None]:
# Select two features by column index: BMI (2) and S1 (4)
feature_columns = [2, 4]
diabetes_X = diabetes.data[:, feature_columns]

# Hold out the last 20 samples for testing; train on everything before them
diabetes_X_train, diabetes_X_test = diabetes_X[:-20], diabetes_X[-20:]
diabetes_y_train, diabetes_y_test = diabetes.target[:-20], diabetes.target[-20:]

# Build an ordinary least-squares model and fit it on the training split
regr = linear_model.LinearRegression()
regr.fit(diabetes_X_train, diabetes_y_train)

# Predict disease progression for the held-out samples
diabetes_y_pred = regr.predict(diabetes_X_test)

# Evaluate on the test split
test_mse = mean_squared_error(diabetes_y_test, diabetes_y_pred)
# r2_score is the coefficient of determination (R^2): 1 is perfect prediction
test_r2 = r2_score(diabetes_y_test, diabetes_y_pred)

# Report one coefficient per selected feature, then the test metrics
print('Coefficients: ', regr.coef_)
print("Mean squared error: %.2f" % test_mse)
print('Variance score: %.2f' % test_r2)

# No figure is drawn for the two-feature fit.


## Task: use more features 

## Gradient descent

In [None]:
# Fit progression ~ BMI by batch gradient descent on the mean squared error.

# Use one feature (BMI, column index 2) as a 1-D array
diabetes_X = diabetes.data[:, 2]

# Split the data into training/testing sets (last 20 samples held out)
X_train = diabetes_X[:-20]
X_test = diabetes_X[-20:]

# Split the targets into training/testing sets
y_train = diabetes.target[:-20]
y_test = diabetes.target[-20:]

# Building the model: slope m and intercept c both start at zero
m = 0.0
c = 0.0

L = 0.5  # The learning Rate
epochs = 1500  # The number of iterations to perform gradient descent

n = float(len(X_train)) # Number of elements in X

# Performing Gradient Descent
for i in range(epochs):
    Y_pred = m*X_train + c  # The current predicted value of Y
    residuals = y_train - Y_pred
    # np.sum keeps the reduction vectorized (builtin sum iterates element by element)
    D_m = (-2/n) * np.sum(X_train * residuals)  # Derivative of the MSE wrt m
    D_c = (-2/n) * np.sum(residuals)  # Derivative of the MSE wrt c
    m = m - L * D_m  # Update m
    c = c - L * D_c  # Update c

print ('We end up with the following equation Y = {}X + {}'.format(m, c))

# Evaluate the fitted line on the held-out samples
Y_pred = m*X_test + c
plt.scatter(X_test, y_test)

# Draw the regression line by evaluating the model at the two x end points.
# Pairing min/max of X with min/max of the predictions would draw the wrong
# line whenever the fitted slope is negative.
x_line = np.array([X_test.min(), X_test.max()])
plt.plot(x_line, m*x_line + c, color='red')  # regression line
plt.xlabel('BMI')
plt.ylabel('Progression')
plt.show()
print ('the MSE is {}'.format(get_MSE(y_test,Y_pred)))