In [67]:
# Import the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

In [48]:
diabetes = datasets.load_diabetes()
print(diabetes.data.shape)
print(diabetes.target.shape)
print(diabetes)

(442, 10)
(442,)
{'data': array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
         0.01990749, -0.01764613],
       [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
        -0.06833155, -0.09220405],
       [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
         0.00286131, -0.02593034],
       ...,
       [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
        -0.04688253,  0.01549073],
       [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
         0.04452873, -0.02593034],
       [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
        -0.00422151,  0.00306441]]), 'target': array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
        69., 179., 185., 118., 171., 166., 144.,  97., 168.,  68.,  49.,
        68., 245., 184., 202., 137.,  85., 131., 283., 129.,  59., 341.,
        87.,  65., 102., 265., 276., 252.,  90., 100.,  55.,  61.,  92.,
       259.,  53., 190., 142.,  75., 142., 155., 225.,  59

In [5]:
print(diabetes.DESCR)

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

  :Number of Instances: 442

  :Number of Attributes: First 10 columns are numeric predictive values

  :Target: Column 11 is a quantitative measure of disease progression one year after baseline

  :Attribute Information:
      - age     age in years
      - sex
      - bmi     body mass index
      - bp      average blood pressure
      - s1      tc, total serum cholesterol
      - s2      ldl, low-density lipoproteins
      - s3      hdl, high-density lipoproteins
      - s4      tch, total cholesterol / HDL
      - s5      ltg, possibly log of serum triglycerides level
      - s6      glu, blood sugar level

Note: Each of these 1

In [40]:
# columns
diabetes.feature_names

['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']

In [41]:
# Now we will split the data into the independent and independent variable
X = diabetes.data
Y = diabetes.target
print(X.shape, Y.shape,Y)


(442, 10) (442,) [151.  75. 141. 206. 135.  97. 138.  63. 110. 310. 101.  69. 179. 185.
 118. 171. 166. 144.  97. 168.  68.  49.  68. 245. 184. 202. 137.  85.
 131. 283. 129.  59. 341.  87.  65. 102. 265. 276. 252.  90. 100.  55.
  61.  92. 259.  53. 190. 142.  75. 142. 155. 225.  59. 104. 182. 128.
  52.  37. 170. 170.  61. 144.  52. 128.  71. 163. 150.  97. 160. 178.
  48. 270. 202. 111.  85.  42. 170. 200. 252. 113. 143.  51.  52. 210.
  65. 141.  55. 134.  42. 111.  98. 164.  48.  96.  90. 162. 150. 279.
  92.  83. 128. 102. 302. 198.  95.  53. 134. 144. 232.  81. 104.  59.
 246. 297. 258. 229. 275. 281. 179. 200. 200. 173. 180.  84. 121. 161.
  99. 109. 115. 268. 274. 158. 107.  83. 103. 272.  85. 280. 336. 281.
 118. 317. 235.  60. 174. 259. 178. 128.  96. 126. 288.  88. 292.  71.
 197. 186.  25.  84.  96. 195.  53. 217. 172. 131. 214.  59.  70. 220.
 268. 152.  47.  74. 295. 101. 151. 127. 237. 225.  81. 151. 107.  64.
 138. 185. 265. 101. 137. 143. 141.  79. 292. 178.  91. 116.

In [25]:
# We will split the data into training and testing data
from sklearn.model_selection import train_test_split

In [26]:
train_x, test_x, train_y, test_y = train_test_split(X,Y,test_size=0.2,random_state=40)

In [27]:
train_x.shape, train_y.shape

((353, 10), (353,))

In [28]:
# Linear Regression
from sklearn.linear_model import LinearRegression

In [29]:
le = LinearRegression()

In [30]:
le.fit(train_x,train_y)

In [31]:
y_pred = le.predict(test_x)
y_pred

array([ 89.94412785, 173.60335656, 213.38080171, 217.89661039,
       153.9701675 , 223.8077706 , 151.39524868,  84.96259998,
       176.43549981, 141.07987766, 168.28877947, 119.08278101,
        51.69097061, 166.79152635,  70.17678081, 185.92399084,
       152.72890958, 145.49534759,  96.03321922, 144.3725306 ,
        56.17472668, 261.24044608, 136.78254163, 139.42168644,
        68.80468205, 192.35762476, 141.74727637, 114.23873408,
       179.76147833,  70.52108469, 163.44397673, 148.47202145,
       125.03051387, 161.22976524,  94.6920707 ,  56.84902996,
       114.75682892,  69.11595008, 204.8745997 , 147.9838916 ,
       154.46184318,  70.31800018, 165.22931274, 150.02038975,
       194.58669883, 148.69062893,  93.43939778, 175.82371724,
        91.76481697, 260.12583097, 217.43524908, 166.38334376,
       230.26410402, 212.054592  , 129.84559171, 193.39458414,
       118.39295725, 133.73892854, 119.92575904, 167.95702221,
       175.05415796,  50.64220965, 190.87814031, 166.76

In [32]:
result = pd.DataFrame({'Actual': test_y, 'Predict' : y_pred})
result

Unnamed: 0,Actual,Predict
0,91.0,89.944128
1,91.0,173.603357
2,52.0,213.380802
3,310.0,217.896610
4,259.0,153.970168
...,...,...
84,346.0,276.828508
85,253.0,110.613573
86,90.0,47.133719
87,142.0,143.717678


In [33]:
# we will check the accuracy

print('coefficient', le.coef_)
print('intercept', le.intercept_)

coefficient [   24.30462932  -234.27127176   558.68867082   303.39881853
 -1014.8231407    576.90082476   167.9045176    188.26569788
   866.2504982     18.32902854]
intercept 150.81397169632874


In [34]:
from sklearn.metrics import mean_squared_error, r2_score

In [35]:
# mean_squared_error
mean_squared_error(test_y,y_pred)

4189.696572705125

In [36]:
# r2 score
r2_score(test_y,y_pred)

0.35116056151211583

In [49]:
# columns
diabetes.feature_names
# Now we will split the data into the independent and independent variable
X = diabetes.data
Y = diabetes.target
print(X.shape, Y.shape,Y,X)


(442, 10) (442,) [151.  75. 141. 206. 135.  97. 138.  63. 110. 310. 101.  69. 179. 185.
 118. 171. 166. 144.  97. 168.  68.  49.  68. 245. 184. 202. 137.  85.
 131. 283. 129.  59. 341.  87.  65. 102. 265. 276. 252.  90. 100.  55.
  61.  92. 259.  53. 190. 142.  75. 142. 155. 225.  59. 104. 182. 128.
  52.  37. 170. 170.  61. 144.  52. 128.  71. 163. 150.  97. 160. 178.
  48. 270. 202. 111.  85.  42. 170. 200. 252. 113. 143.  51.  52. 210.
  65. 141.  55. 134.  42. 111.  98. 164.  48.  96.  90. 162. 150. 279.
  92.  83. 128. 102. 302. 198.  95.  53. 134. 144. 232.  81. 104.  59.
 246. 297. 258. 229. 275. 281. 179. 200. 200. 173. 180.  84. 121. 161.
  99. 109. 115. 268. 274. 158. 107.  83. 103. 272.  85. 280. 336. 281.
 118. 317. 235.  60. 174. 259. 178. 128.  96. 126. 288.  88. 292.  71.
 197. 186.  25.  84.  96. 195.  53. 217. 172. 131. 214.  59.  70. 220.
 268. 152.  47.  74. 295. 101. 151. 127. 237. 225.  81. 151. 107.  64.
 138. 185. 265. 101. 137. 143. 141.  79. 292. 178.  91. 116.

In [50]:
X_norm = X / np.linalg.norm(X, axis=1, keepdims=True)
X_norm

array([[ 0.32100638,  0.42726865,  0.52014193, ..., -0.02185457,
         0.16783396, -0.14876911],
       [-0.01166161, -0.27661457, -0.31895057, ..., -0.24471426,
        -0.42340521, -0.57132725],
       [ 0.65740682,  0.39059652,  0.34258975, ..., -0.01997881,
         0.02205238, -0.1998476 ],
       ...,
       [ 0.42498708,  0.51640371, -0.16207644, ..., -0.11289447,
        -0.47770833,  0.15784238],
       [-0.4486938 , -0.44049558,  0.38544075, ...,  0.26207365,
         0.43938148, -0.25586427],
       [-0.19283454, -0.18931121, -0.30969865, ..., -0.16747907,
        -0.01790212,  0.0129952 ]])

In [51]:
# We will split the data into training and testing data
from sklearn.model_selection import train_test_split

In [52]:
train_x_norm, test_x_norm, train_y, test_y = train_test_split(X,Y,test_size=0.2,random_state=40)


In [53]:
print(train_x_norm.shape, train_y.shape)

(353, 10) (353,)


In [54]:
# Linear Regression
from sklearn.linear_model import LinearRegression

In [55]:
le = LinearRegression()

In [57]:
le.fit(train_x_norm,train_y)

In [58]:
y_pred = le.predict(test_x_norm)
y_pred

array([ 89.94412785, 173.60335656, 213.38080171, 217.89661039,
       153.9701675 , 223.8077706 , 151.39524868,  84.96259998,
       176.43549981, 141.07987766, 168.28877947, 119.08278101,
        51.69097061, 166.79152635,  70.17678081, 185.92399084,
       152.72890958, 145.49534759,  96.03321922, 144.3725306 ,
        56.17472668, 261.24044608, 136.78254163, 139.42168644,
        68.80468205, 192.35762476, 141.74727637, 114.23873408,
       179.76147833,  70.52108469, 163.44397673, 148.47202145,
       125.03051387, 161.22976524,  94.6920707 ,  56.84902996,
       114.75682892,  69.11595008, 204.8745997 , 147.9838916 ,
       154.46184318,  70.31800018, 165.22931274, 150.02038975,
       194.58669883, 148.69062893,  93.43939778, 175.82371724,
        91.76481697, 260.12583097, 217.43524908, 166.38334376,
       230.26410402, 212.054592  , 129.84559171, 193.39458414,
       118.39295725, 133.73892854, 119.92575904, 167.95702221,
       175.05415796,  50.64220965, 190.87814031, 166.76

In [59]:
result = pd.DataFrame({'Actual': test_y, 'Predict' : y_pred})
result

Unnamed: 0,Actual,Predict
0,91.0,89.944128
1,91.0,173.603357
2,52.0,213.380802
3,310.0,217.896610
4,259.0,153.970168
...,...,...
84,346.0,276.828508
85,253.0,110.613573
86,90.0,47.133719
87,142.0,143.717678


In [60]:
from sklearn.metrics import mean_squared_error, r2_score
# mean_squared_error
print(mean_squared_error(test_y,y_pred))
# r2 score
print(r2_score(test_y,y_pred))

4189.696572705125
0.35116056151211583


In [68]:
diabetes = datasets.load_diabetes()
print(diabetes.data.shape)
print(diabetes.target.shape)
print(diabetes)

(442, 10)
(442,)
{'data': array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
         0.01990749, -0.01764613],
       [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
        -0.06833155, -0.09220405],
       [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
         0.00286131, -0.02593034],
       ...,
       [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
        -0.04688253,  0.01549073],
       [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
         0.04452873, -0.02593034],
       [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
        -0.00422151,  0.00306441]]), 'target': array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
        69., 179., 185., 118., 171., 166., 144.,  97., 168.,  68.,  49.,
        68., 245., 184., 202., 137.,  85., 131., 283., 129.,  59., 341.,
        87.,  65., 102., 265., 276., 252.,  90., 100.,  55.,  61.,  92.,
       259.,  53., 190., 142.,  75., 142., 155., 225.,  59

In [69]:
# Now we will split the data into the independent and independent variable
X = diabetes.data
Y = diabetes.target
print(X.shape, Y.shape,Y)

(442, 10) (442,) [151.  75. 141. 206. 135.  97. 138.  63. 110. 310. 101.  69. 179. 185.
 118. 171. 166. 144.  97. 168.  68.  49.  68. 245. 184. 202. 137.  85.
 131. 283. 129.  59. 341.  87.  65. 102. 265. 276. 252.  90. 100.  55.
  61.  92. 259.  53. 190. 142.  75. 142. 155. 225.  59. 104. 182. 128.
  52.  37. 170. 170.  61. 144.  52. 128.  71. 163. 150.  97. 160. 178.
  48. 270. 202. 111.  85.  42. 170. 200. 252. 113. 143.  51.  52. 210.
  65. 141.  55. 134.  42. 111.  98. 164.  48.  96.  90. 162. 150. 279.
  92.  83. 128. 102. 302. 198.  95.  53. 134. 144. 232.  81. 104.  59.
 246. 297. 258. 229. 275. 281. 179. 200. 200. 173. 180.  84. 121. 161.
  99. 109. 115. 268. 274. 158. 107.  83. 103. 272.  85. 280. 336. 281.
 118. 317. 235.  60. 174. 259. 178. 128.  96. 126. 288.  88. 292.  71.
 197. 186.  25.  84.  96. 195.  53. 217. 172. 131. 214.  59.  70. 220.
 268. 152.  47.  74. 295. 101. 151. 127. 237. 225.  81. 151. 107.  64.
 138. 185. 265. 101. 137. 143. 141.  79. 292. 178.  91. 116.

In [70]:
X_norm = X / np.linalg.norm(X, axis=1, keepdims=True)
X_norm

array([[ 0.32100638,  0.42726865,  0.52014193, ..., -0.02185457,
         0.16783396, -0.14876911],
       [-0.01166161, -0.27661457, -0.31895057, ..., -0.24471426,
        -0.42340521, -0.57132725],
       [ 0.65740682,  0.39059652,  0.34258975, ..., -0.01997881,
         0.02205238, -0.1998476 ],
       ...,
       [ 0.42498708,  0.51640371, -0.16207644, ..., -0.11289447,
        -0.47770833,  0.15784238],
       [-0.4486938 , -0.44049558,  0.38544075, ...,  0.26207365,
         0.43938148, -0.25586427],
       [-0.19283454, -0.18931121, -0.30969865, ..., -0.16747907,
        -0.01790212,  0.0129952 ]])

In [71]:
Y_norm = Y / np.linalg.norm(Y, keepdims=True)
Y_norm

array([0.04212208, 0.02092156, 0.03933254, 0.05746456, 0.03765881,
       0.02705856, 0.03849568, 0.01757411, 0.03068496, 0.0864758 ,
       0.02817437, 0.01924784, 0.0499328 , 0.05160652, 0.03291659,
       0.04770116, 0.04630639, 0.0401694 , 0.02705856, 0.0468643 ,
       0.01896888, 0.01366875, 0.01896888, 0.06834377, 0.05132757,
       0.05634874, 0.03821672, 0.02371111, 0.036543  , 0.07894403,
       0.03598509, 0.0164583 , 0.09512338, 0.02426901, 0.01813202,
       0.02845333, 0.07392286, 0.07699135, 0.07029645, 0.02510588,
       0.02789542, 0.01534248, 0.0170162 , 0.02566378, 0.07224913,
       0.01478457, 0.05300129, 0.03961149, 0.02092156, 0.03961149,
       0.0432379 , 0.06276469, 0.0164583 , 0.02901123, 0.05076966,
       0.03570614, 0.01450562, 0.0103213 , 0.04742221, 0.04742221,
       0.0170162 , 0.0401694 , 0.01450562, 0.03570614, 0.01980575,
       0.04546953, 0.04184313, 0.02705856, 0.04463267, 0.04965384,
       0.0133898 , 0.07531763, 0.05634874, 0.03096391, 0.02371

In [75]:
# We will split the data into training and testing data
from sklearn.model_selection import train_test_split
train_x_norm, test_x_norm, train_y_norm, test_y_norm = train_test_split(X_norm,Y_norm,test_size=0.2,random_state=40)
print(train_x_norm.shape, train_y_norm.shape)

(353, 10) (353,)


In [76]:
# Linear Regression
from sklearn.linear_model import LinearRegression
le.fit(train_x_norm,train_y_norm)
y_pred = le.predict(test_x_norm)
y_pred

array([0.02058673, 0.051766  , 0.06462855, 0.06245347, 0.04540223,
       0.05780724, 0.04252235, 0.02422784, 0.04893474, 0.03555023,
       0.04864936, 0.02971426, 0.02099056, 0.04991119, 0.02300945,
       0.05086802, 0.04052093, 0.03719368, 0.02994192, 0.03864678,
       0.01404619, 0.0760986 , 0.03833675, 0.03839706, 0.02194782,
       0.05574353, 0.03983898, 0.02510011, 0.0511803 , 0.02497038,
       0.04594898, 0.04323102, 0.02911172, 0.04651734, 0.02726002,
       0.01945309, 0.02742661, 0.01650083, 0.05716988, 0.04004363,
       0.04125044, 0.02213928, 0.04464523, 0.04383325, 0.05708886,
       0.03938784, 0.01780075, 0.04653962, 0.02717172, 0.06583804,
       0.05962489, 0.04765119, 0.06279666, 0.05956896, 0.03879646,
       0.05194006, 0.02882954, 0.03937231, 0.02668992, 0.04561905,
       0.05495423, 0.01861961, 0.05679542, 0.04732315, 0.05740197,
       0.02831455, 0.06064507, 0.06516769, 0.04004177, 0.05776694,
       0.05984296, 0.04554622, 0.03321089, 0.04396991, 0.06882

In [77]:
from sklearn.metrics import mean_squared_error, r2_score
# mean_squared_error
print(mean_squared_error(test_y_norm,y_pred))
# r2 score
print(r2_score(test_y_norm,y_pred))

0.000345481600060323
0.3124348681896677
