In [1]:
# Import the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

In [2]:
# Load the diabetes dataset bundled with scikit-learn and display the returned object.
diabetes = datasets.load_diabetes()
diabetes

{'data': array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
          0.01990842, -0.01764613],
        [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
         -0.06832974, -0.09220405],
        [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
          0.00286377, -0.02593034],
        ...,
        [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
         -0.04687948,  0.01549073],
        [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
          0.04452837, -0.02593034],
        [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
         -0.00421986,  0.00306441]]),
 'target': array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
         69., 179., 185., 118., 171., 166., 144.,  97., 168.,  68.,  49.,
         68., 245., 184., 202., 137.,  85., 131., 283., 129.,  59., 341.,
         87.,  65., 102., 265., 276., 252.,  90., 100.,  55.,  61.,  92.,
        259.,  53., 190., 142.,  75., 142., 155., 225.,  59

In [3]:
# Print the description text that ships with the dataset (its DESCR attribute).
print(diabetes.DESCR)

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

  :Number of Instances: 442

  :Number of Attributes: First 10 columns are numeric predictive values

  :Target: Column 11 is a quantitative measure of disease progression one year after baseline

  :Attribute Information:
      - age     age in years
      - sex
      - bmi     body mass index
      - bp      average blood pressure
      - s1      tc, T-Cells (a type of white blood cells)
      - s2      ldl, low-density lipoproteins
      - s3      hdl, high-density lipoproteins
      - s4      tch, thyroid stimulating hormone
      - s5      ltg, lamotrigine
      - s6      glu, blood sugar level

Note: Each of these 10 feature va

In [4]:
# Names of the feature columns provided with the dataset
diabetes.feature_names

['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']

In [5]:
# Separate the feature matrix (independent variables) from the target (dependent variable).
X, Y = diabetes.data, diabetes.target

In [6]:
# Shapes of the feature matrix and the target vector
X.shape, Y.shape

((442, 10), (442,))

In [7]:
# Display the target values
Y

array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
        69., 179., 185., 118., 171., 166., 144.,  97., 168.,  68.,  49.,
        68., 245., 184., 202., 137.,  85., 131., 283., 129.,  59., 341.,
        87.,  65., 102., 265., 276., 252.,  90., 100.,  55.,  61.,  92.,
       259.,  53., 190., 142.,  75., 142., 155., 225.,  59., 104., 182.,
       128.,  52.,  37., 170., 170.,  61., 144.,  52., 128.,  71., 163.,
       150.,  97., 160., 178.,  48., 270., 202., 111.,  85.,  42., 170.,
       200., 252., 113., 143.,  51.,  52., 210.,  65., 141.,  55., 134.,
        42., 111.,  98., 164.,  48.,  96.,  90., 162., 150., 279.,  92.,
        83., 128., 102., 302., 198.,  95.,  53., 134., 144., 232.,  81.,
       104.,  59., 246., 297., 258., 229., 275., 281., 179., 200., 200.,
       173., 180.,  84., 121., 161.,  99., 109., 115., 268., 274., 158.,
       107.,  83., 103., 272.,  85., 280., 336., 281., 118., 317., 235.,
        60., 174., 259., 178., 128.,  96., 126., 28

In [8]:
# We will split the data into training and testing data
from sklearn.model_selection import train_test_split

In [9]:
# Hold out 30% of the samples for testing; the fixed random_state makes the split repeatable.
train_x, test_x, train_y, test_y = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=99,
)

In [10]:
# Shapes of the training features and the training targets
train_x.shape, train_y.shape

((309, 10), (309,))

Next, we fit a linear regression model to the training data.

In [11]:
# Linear Regression
from sklearn.linear_model import LinearRegression

In [12]:
# Create the linear regression estimator with its default arguments.
le = LinearRegression()

In [13]:
# Fit the regression on the training split only.
le.fit(X=train_x, y=train_y)

LinearRegression()

In [14]:
# Predict targets for the held-out test features and display the predictions.
y_pred = le.predict(test_x)
y_pred

array([ 77.99793282, 170.44431566, 109.039494  , 223.84320605,
        87.38145887, 211.46851413, 223.65995098,  52.81631391,
       149.39219589, 294.99052864, 127.72911792, 182.9049973 ,
       102.6373417 , 144.69404091, 171.51921663, 266.17716148,
       201.88864732, 166.18475146, 103.67405763, 169.01824607,
       187.13974387, 130.10116021, 151.54240239, 156.45632158,
       121.85525279, 304.13863719, 126.54843582, 158.766251  ,
       249.42250153, 154.22177899, 180.85286248, 180.0620116 ,
       182.96338591, 199.99914067,  73.87305575, 146.19256445,
       165.52824755, 160.9337753 , 247.99119911, 210.3722983 ,
        85.70144066, 211.07459023, 188.10304059, 119.60313173,
       151.80679986, 188.31455646, 185.6907796 , 168.92706031,
       291.55359958, 248.59960366, 170.16833469, 208.55225625,
        59.07864944, 195.30667704, 190.19691947, 149.9763055 ,
       114.48049294, 244.83333467, 254.54879995, 138.88842999,
       301.04796681,  57.72046739, 162.93923953, 187.60

In [15]:
# Put each held-out target next to the model's prediction for a side-by-side comparison.
result = pd.DataFrame(dict(Actual=test_y, Predict=y_pred))
result

Unnamed: 0,Actual,Predict
0,75.0,77.997933
1,128.0,170.444316
2,125.0,109.039494
3,332.0,223.843206
4,37.0,87.381459
...,...,...
128,48.0,202.350943
129,172.0,144.660842
130,51.0,82.400610
131,277.0,185.445124


In [16]:
# Inspect the fitted model parameters: one coefficient per feature, plus the intercept.
for label, value in (('coefficient', le.coef_), ('intercept', le.intercept_)):
    print(label, value)

coefficient [  40.66018999 -313.29560706  517.1785363   386.06685795 -604.64498104
  275.32058758    3.91393457  172.38010275  661.95935148   62.25715134]
intercept 155.59114167162846


In [17]:
from sklearn.metrics import mean_squared_error, r2_score

In [18]:
# Mean squared error of the predictions on the held-out test split
mean_squared_error(y_true=test_y, y_pred=y_pred)

3157.9566009965824

In [19]:
# R^2 (coefficient of determination) of the predictions on the held-out test split
r2_score(y_true=test_y, y_pred=y_pred)

0.4545737971700595