# Samples with Linear Regression

In [54]:
import pandas as pd
from sklearn import datasets

In [5]:
# Load the Boston house-prices dataset bundled with scikit-learn.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 — confirm the pinned scikit-learn version before re-running this cell.
dataset = datasets.load_boston()

In [9]:
# List the fields available on the loaded dataset object.
dataset.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename'])

In [10]:
# Print the dataset's bundled description (instance count, attribute meanings).
print(dataset.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [38]:
# Target values the regression will try to predict (the 'target' field).
variable_objetivo = dataset['target']
# NOTE(review): this bare expression echoes the whole array; a slice would
# keep the recorded output short.
variable_objetivo

array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,
       18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,
       15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,
       13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,
       21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,
       35.4, 24.7, 31.6, 23.3, 19.6, 18.7, 16. , 22.2, 25. , 33. , 23.5,
       19.4, 22. , 17.4, 20.9, 24.2, 21.7, 22.8, 23.4, 24.1, 21.4, 20. ,
       20.8, 21.2, 20.3, 28. , 23.9, 24.8, 22.9, 23.9, 26.6, 22.5, 22.2,
       23.6, 28.7, 22.6, 22. , 22.9, 25. , 20.6, 28.4, 21.4, 38.7, 43.8,
       33.2, 27.5, 26.5, 18.6, 19.3, 20.1, 19.5, 19.5, 20.4, 19.8, 19.4,
       21.7, 22.8, 18.8, 18.7, 18.5, 18.3, 21.2, 19.2, 20.4, 19.3, 22. ,
       20.3, 20.5, 17.3, 18.8, 21.4, 15.7, 16.2, 18. , 14.3, 19.2, 19.6,
       23. , 18.4, 15.6, 18.1, 17.4, 17.1, 13.3, 17.8, 14. , 14.4, 13.4,
       15.6, 11.8, 13.8, 15.6, 14.6, 17.8, 15.4, 21

In [39]:
# Names of the predictor variables; reused later as DataFrame column labels.
nombres_variables_independientes = dataset["feature_names"]

In [40]:
# Predictor values (the 'data' field), passed as X when fitting the model.
variables_independientes = dataset["data"]

In [41]:
from sklearn.linear_model import LinearRegression

In [22]:
# IPython help lookup for LinearRegression. This is interactive-only syntax
# (not plain Python) and is normally removed from a finished notebook.
LinearRegression?

In [23]:
# Linear regression estimator created with its default settings.
modelo = LinearRegression()

In [33]:
# Show the estimator's current hyperparameters.
modelo.get_params()

{'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'normalize': False}

In [42]:
# Fit on every row of the dataset; no train/test split is made here.
modelo.fit(X=variables_independientes, y=variable_objetivo)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [43]:
# Intercept term learned by the fit above.
modelo.intercept_

36.45948838509011

In [44]:
# Learned coefficients, one per predictor column.
modelo.coef_

array([-1.08011358e-01,  4.64204584e-02,  2.05586264e-02,  2.68673382e+00,
       -1.77666112e+01,  3.80986521e+00,  6.92224640e-04, -1.47556685e+00,
        3.06049479e-01, -1.23345939e-02, -9.52747232e-01,  9.31168327e-03,
       -5.24758378e-01])

In [45]:
# Predictions for the same rows the model was fitted on (in-sample).
predicciones = modelo.predict(variables_independientes)

In [63]:
# Compare the first five observed targets with their in-sample predictions.
# Slicing before zipping builds only five pairs instead of materialising a
# list with one pair per row just to discard all but the first five.
for y, y_pred in zip(variable_objetivo[:5], predicciones[:5]):
    print("valor real: {:.3f}   valor estimado: {:.5f}".format(y, y_pred))

valor real: 24.000   valor estimado: 30.00384
valor real: 21.600   valor estimado: 25.02556
valor real: 34.700   valor estimado: 30.56760
valor real: 33.400   valor estimado: 28.60704
valor real: 36.200   valor estimado: 27.94352


In [55]:
# Tabular view of the predictors, labelled with their feature names.
df = pd.DataFrame(
    data=variables_independientes,
    columns=nombres_variables_independientes,
)
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [56]:
# Add the target as a MEDV column next to the predictors (mutates df in place).
df["MEDV"] = variable_objetivo

In [57]:
# Preview the frame again now that MEDV has been added.
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [60]:
# Refit the same estimator, this time feeding DataFrame columns rather than
# the raw arrays.
modelo.fit(X=df[nombres_variables_independientes], y=df["MEDV"])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [61]:
# Store the in-sample predictions next to the observed MEDV values.
df["MEDV_pred"] = modelo.predict(df[nombres_variables_independientes])

In [62]:
# Display the predicted values column.
df["MEDV_pred"]

0      30.003843
1      25.025562
2      30.567597
3      28.607036
4      27.943524
5      25.256284
6      23.001808
7      19.535988
8      11.523637
9      18.920262
10     18.999497
11     21.586796
12     20.906522
13     19.552903
14     19.283482
15     19.297483
16     20.527510
17     16.911401
18     16.178011
19     18.406136
20     12.523858
21     17.671037
22     15.832881
23     13.806285
24     15.678338
25     13.386686
26     15.463977
27     14.708474
28     19.547373
29     20.876428
         ...    
476    20.534816
477    11.542727
478    19.204963
479    21.862764
480    23.468789
481    27.098873
482    28.569943
483    21.083988
484    19.455162
485    22.222259
486    19.655920
487    21.325361
488    11.855837
489     8.223867
490     3.663997
491    13.759085
492    15.931185
493    20.626621
494    20.612494
495    16.885420
496    14.013208
497    19.108541
498    21.298052
499    18.454988
500    20.468708
501    23.533341
502    22.375719
503    27.6274

Normalize the model

In [None]:
# Second estimator with normalize=True, to compare against `modelo`.
# NOTE(review): the `normalize` option was deprecated in scikit-learn 1.0 and
# removed in 1.2 — confirm the pinned scikit-learn version before re-running.
modelo_normalizado = LinearRegression(normalize=True)

Training the model

In [65]:
# Fit the normalizing estimator on the same predictors and target as `modelo`.
modelo_normalizado.fit(X=df[nombres_variables_independientes], y=df["MEDV"])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)

Predict with the model

In [66]:
# Store the normalizing model's in-sample predictions in their own column.
df["MEDV_pred2"] = modelo_normalizado.predict(df[nombres_variables_independientes])

In [71]:
# Side-by-side view of the predictions from both estimators.
df.loc[:, ["MEDV_pred", "MEDV_pred2"]]

Unnamed: 0,MEDV_pred,MEDV_pred2
0,30.003843,30.003843
1,25.025562,25.025562
2,30.567597,30.567597
3,28.607036,28.607036
4,27.943524,27.943524
5,25.256284,25.256284
6,23.001808,23.001808
7,19.535988,19.535988
8,11.523637,11.523637
9,18.920262,18.920262
