# Load data

In [10]:
import pandas as pd

In [11]:
from sklearn.datasets import load_diabetes


In [12]:
# Fetch the diabetes dataset with pandas containers (as_frame=True) and keep
# the feature table under a short name for the exploration cells.
diabetes = load_diabetes(as_frame=True)
df = diabetes.data

In [13]:
# Dimensions of the feature table as a (rows, columns) tuple.
df.shape

(442, 10)

In [14]:
# Preview the first rows of the feature table.
df.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641


In [15]:
# IPython help lookup: shows the signature and docstring of load_diabetes.
# NOTE(review): interactive introspection left over from exploration;
# consider removing it before sharing the notebook.
?load_diabetes

[1;31mSignature:[0m [0mload_diabetes[0m[1;33m([0m[1;33m*[0m[1;33m,[0m [0mreturn_X_y[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m [0mas_frame[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m [0mscaled[0m[1;33m=[0m[1;32mTrue[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Load and return the diabetes dataset (regression).

Samples total    442
Dimensionality   10
Features         real, -.2 < x < .2
Targets          integer 25 - 346

.. note::
   The meaning of each feature (i.e. `feature_names`) might be unclear
   (especially for `ltg`) as the documentation of the original dataset is
   not explicit. We provide information that seems correct in regard with
   the scientific literature in this field of research.

Read more in the :ref:`User Guide <diabetes_dataset>`.

Parameters
----------
return_X_y : bool, default=False
    If True, returns ``(data, target)`` instead of a Bunch object.
    See below for more information about the `data` and `target` object.



In [17]:
# Show the dataset's bundled description; print() is used so the text is
# rendered with real line breaks instead of an escaped string repr.
print(diabetes.DESCR)

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

:Number of Instances: 442

:Number of Attributes: First 10 columns are numeric predictive values

:Target: Column 11 is a quantitative measure of disease progression one year after baseline

:Attribute Information:
    - age     age in years
    - sex
    - bmi     body mass index
    - bp      average blood pressure
    - s1      tc, total serum cholesterol
    - s2      ldl, low-density lipoproteins
    - s3      hdl, high-density lipoproteins
    - s4      tch, total cholesterol / HDL
    - s5      ltg, possibly log of serum triglycerides level
    - s6      glu, blood sugar level

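Note: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times the square root of `n_samples` (i.e. the sum of squares of each column totals 1).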

Attribute Information
-   age     age in years
-   sex
-   bmi     body mass index
-   bp      average blood pressure
-   s1      tc, total serum cholesterol
-   s2      ldl, low-density lipoproteins
-   s3      hdl, high-density lipoproteins
-   s4      tch, total cholesterol / HDL
-   s5      ltg, possibly log of serum triglycerides level
-   s6      glu, blood sugar level

In [20]:
# List the column labels of the feature table.
df.columns

Index(['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6'], dtype='object')

In [23]:
# Summary statistics per column: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
count,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0
mean,-2.511817e-19,1.23079e-17,-2.245564e-16,-4.79757e-17,-1.3814990000000001e-17,3.9184340000000004e-17,-5.777179e-18,-9.04254e-18,9.293722000000001e-17,1.130318e-17
std,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905
min,-0.1072256,-0.04464164,-0.0902753,-0.1123988,-0.1267807,-0.1156131,-0.1023071,-0.0763945,-0.1260971,-0.1377672
25%,-0.03729927,-0.04464164,-0.03422907,-0.03665608,-0.03424784,-0.0303584,-0.03511716,-0.03949338,-0.03324559,-0.03317903
50%,0.00538306,-0.04464164,-0.007283766,-0.005670422,-0.004320866,-0.003819065,-0.006584468,-0.002592262,-0.001947171,-0.001077698
75%,0.03807591,0.05068012,0.03124802,0.03564379,0.02835801,0.02984439,0.0293115,0.03430886,0.03243232,0.02791705
max,0.1107267,0.05068012,0.1705552,0.1320436,0.1539137,0.198788,0.1811791,0.1852344,0.1335973,0.1356118


In [24]:
# Structural overview: index range, per-column non-null counts and dtypes,
# plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     442 non-null    float64
 1   sex     442 non-null    float64
 2   bmi     442 non-null    float64
 3   bp      442 non-null    float64
 4   s1      442 non-null    float64
 5   s2      442 non-null    float64
 6   s3      442 non-null    float64
 7   s4      442 non-null    float64
 8   s5      442 non-null    float64
 9   s6      442 non-null    float64
dtypes: float64(10)
memory usage: 34.7 KB


In [28]:
# Summary statistics with a custom set of percentiles (listed in ascending
# order, which is also the order in which describe() reports them).
df.describe(percentiles=[0.2, 0.3, 0.5, 0.6, 0.75])

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
count,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0
mean,-2.511817e-19,1.23079e-17,-2.245564e-16,-4.79757e-17,-1.3814990000000001e-17,3.9184340000000004e-17,-5.777179e-18,-9.04254e-18,9.293722000000001e-17,1.130318e-17
std,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905
min,-0.1072256,-0.04464164,-0.0902753,-0.1123988,-0.1267807,-0.1156131,-0.1023071,-0.0763945,-0.1260971,-0.1377672
20%,-0.04547248,-0.04464164,-0.04048038,-0.04009893,-0.03871969,-0.03695017,-0.03971921,-0.03949338,-0.04117617,-0.03835666
30%,-0.02730979,-0.04464164,-0.02991782,-0.02873752,-0.02633611,-0.02480001,-0.02867429,-0.03949338,-0.02796492,-0.02468771
50%,0.00538306,-0.04464164,-0.007283766,-0.005670422,-0.004320866,-0.003819065,-0.006584468,-0.002592262,-0.001947171,-0.001077698
60%,0.01628068,0.05068012,0.005218854,0.008100982,0.00806271,0.008706873,0.008142084,-0.002592262,0.01255119,0.007206516
75%,0.03807591,0.05068012,0.03124802,0.03564379,0.02835801,0.02984439,0.0293115,0.03430886,0.03243232,0.02791705
max,0.1107267,0.05068012,0.1705552,0.1320436,0.1539137,0.198788,0.1811791,0.1852344,0.1335973,0.1356118


In [31]:
# Show only the very first row of the feature table.
df.head(1)

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646


In [30]:
# Show the last ten rows of the feature table.
df.tail(10)

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
432,0.009016,-0.044642,0.055229,-0.00567,0.057597,0.044719,-0.002903,0.023239,0.055686,0.106617
433,-0.02731,-0.044642,-0.060097,-0.02977,0.046589,0.01998,0.122273,-0.039493,-0.051404,-0.009362
434,0.016281,-0.044642,0.001339,0.008101,0.005311,0.010899,0.030232,-0.039493,-0.045424,0.032059
435,-0.01278,-0.044642,-0.023451,-0.040099,-0.016704,0.004636,-0.017629,-0.002592,-0.03846,-0.038357
436,-0.05637,-0.044642,-0.074108,-0.050427,-0.02496,-0.047034,0.09282,-0.076395,-0.061176,-0.046641
437,0.041708,0.05068,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207
438,-0.005515,0.05068,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018114,0.044485
439,0.041708,0.05068,-0.015906,0.017293,-0.037344,-0.01384,-0.024993,-0.01108,-0.046883,0.015491
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.02656,0.044529,-0.02593
441,-0.045472,-0.044642,-0.07303,-0.081413,0.08374,0.027809,0.173816,-0.039493,-0.004222,0.003064


In [33]:
# Selecting a single column with [] yields a pandas Series, not a DataFrame.
type(df['age'])

pandas.core.series.Series

In [34]:
# First age value, selected by position. Plain `[0]` on a Series is a *label*
# lookup that only happens to return the first element because the index is
# the default 0..n-1 range; `.iloc[0]` stays correct if the frame is ever
# filtered, sorted or re-indexed.
df['age'].iloc[0]

0.038075906433423026

In [35]:
# First 5 age values, selected by position.
df['age'].iloc[:5]

0    0.038076
1   -0.001882
2    0.085299
3   -0.089063
4    0.005383
Name: age, dtype: float64

In [37]:
# Last 5 age values, selected by position.
df['age'].iloc[-5:]

437    0.041708
438   -0.005515
439    0.041708
440   -0.045472
441   -0.045472
Name: age, dtype: float64

In [38]:
# Age values at positions 100 through 199 (the stop position 200 is excluded,
# so exactly 100 values are returned).
df['age'].iloc[100:200]

100    0.016281
101    0.016281
102   -0.092695
103    0.059871
104   -0.027310
         ...   
195    0.027178
196   -0.023677
197    0.048974
198   -0.052738
199    0.041708
Name: age, Length: 100, dtype: float64

In [44]:
# Select the 'age' and 'sex' columns together; a list of labels yields a
# DataFrame rather than a Series.
df.loc[:, ['age', 'sex']]

Unnamed: 0,age,sex
0,0.038076,0.050680
1,-0.001882,-0.044642
2,0.085299,0.050680
3,-0.089063,-0.044642
4,0.005383,-0.044642
...,...,...
437,0.041708,0.050680
438,-0.005515,0.050680
439,0.041708,0.050680
440,-0.045472,-0.044642


In [48]:
# Row at integer position 1 (the second row), returned as a Series whose
# index holds the column names.
df.iloc[1]

age   -0.001882
sex   -0.044642
bmi   -0.051474
bp    -0.026328
s1    -0.008449
s2    -0.019163
s3     0.074412
s4    -0.039493
s5    -0.068332
s6    -0.092204
Name: 1, dtype: float64