##### imports

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, RANSACRegressor, LogisticRegression

# Example: Predict Housing Prices

### read the data and acquaintance with it

In [2]:
housing = pd.read_fwf('data/housing.data', header=None)

In [3]:
housing

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273.0,21.0,393.45,6.48,22.0


In [4]:
housing.columns = ["crime_rate", "zoned_land", "industry", "bounds_river",
"nox_conc", "rooms", "age", "distance", "highways", "tax", "pt_ratio",
"b_estimator", "pop_status", "price"]

In [5]:
housing.head()

Unnamed: 0,crime_rate,zoned_land,industry,bounds_river,nox_conc,rooms,age,distance,highways,tax,pt_ratio,b_estimator,pop_status,price
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [6]:
housing.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
crime_rate,506.0,1.71629,2.65351,0.00632,0.0819,0.250895,2.326717,9.96654
zoned_land,506.0,11.363636,23.322453,0.0,0.0,0.0,12.5,100.0
industry,506.0,11.136779,6.860353,0.46,5.19,9.69,18.1,27.74
bounds_river,506.0,0.06917,0.253994,0.0,0.0,0.0,0.0,1.0
nox_conc,506.0,0.554695,0.115878,0.385,0.449,0.538,0.624,0.871
rooms,506.0,6.284634,0.702617,3.561,5.8855,6.2085,6.6235,8.78
age,506.0,68.574901,28.148861,2.9,45.025,77.5,94.075,100.0
distance,506.0,3.696228,1.999689,0.5857,2.0737,3.1073,5.112625,9.2229
highways,506.0,4.332016,1.417166,1.0,4.0,4.0,5.0,8.0
tax,506.0,408.237154,168.537116,187.0,279.0,330.0,666.0,711.0


In [7]:
housing.dtypes

crime_rate      float64
zoned_land      float64
industry        float64
bounds_river      int64
nox_conc        float64
rooms           float64
age             float64
distance        float64
highways          int64
tax             float64
pt_ratio        float64
b_estimator     float64
pop_status      float64
price           float64
dtype: object

### assign the targets (the prices that we want to predict - the goal) "X"

In [8]:
targets = housing['price']

In [9]:
targets

0      24.0
1      21.6
2      34.7
3      33.4
4      36.2
       ... 
501    22.4
502    20.6
503    23.9
504    22.0
505    11.9
Name: price, Length: 506, dtype: float64

### assign the attributes of the model "y"

In [10]:
attributes = housing.drop('price', axis=1)

In [11]:
attributes

Unnamed: 0,crime_rate,zoned_land,industry,bounds_river,nox_conc,rooms,age,distance,highways,tax,pt_ratio,b_estimator,pop_status
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296.0,15.3,396.90,4.98
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.90,9.14
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.90,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273.0,21.0,391.99,9.67
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273.0,21.0,396.90,9.08
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273.0,21.0,396.90,5.64
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273.0,21.0,393.45,6.48


### scaling the data

In [12]:
scalar = MinMaxScaler()
scalar.fit(attributes)

MinMaxScaler()

In [13]:
attributes_scaled = scalar.transform(attributes)

In [14]:
attributes_scaled

array([[0.        , 0.18      , 0.06781525, ..., 0.28723404, 1.        ,
        0.08967991],
       [0.00210738, 0.        , 0.24230205, ..., 0.55319149, 1.        ,
        0.2044702 ],
       [0.00210538, 0.        , 0.24230205, ..., 0.55319149, 0.98973725,
        0.06346578],
       ...,
       [0.00546574, 0.        , 0.42045455, ..., 0.89361702, 1.        ,
        0.10789183],
       [0.01036824, 0.        , 0.42045455, ..., 0.89361702, 0.99130062,
        0.13107064],
       [0.00412541, 0.        , 0.42045455, ..., 0.89361702, 1.        ,
        0.16970199]])

In [15]:
attributes_scaled.min(axis=0) # here axis = 0 means that we get the columns

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [16]:
attributes_scaled.max(axis=0)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [17]:
attributes_scaled.mean(axis=0)

array([0.17167998, 0.11363636, 0.39137752, 0.06916996, 0.34916679,
       0.52186901, 0.67636355, 0.36013158, 0.47600226, 0.42220831,
       0.62292911, 0.89856783, 0.30140903])

### creat a model for the linear regression

In [18]:
model = LinearRegression()
model.fit(attributes_scaled, targets)

LinearRegression()

In [19]:
model.coef_

array([  2.08448854,   1.49403979,   0.34690497,   3.00565375,
        -7.54441381,  22.43940145,   0.27658754,  -9.35981793,
         1.35281035,  -1.26826011,  -9.07603108,   3.74177288,
       -19.03479847])

In [20]:
model.intercept_

23.689291534676943

In [21]:
model.predict(attributes_scaled[:10])

array([30.10947333, 25.12810976, 31.00785588, 29.04535626, 28.48368175,
       25.44231142, 23.23025545, 20.07399474, 11.87137324, 19.61327434])

In [22]:
targets[:10].values

array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9])

In [23]:
random_sample = pd.DataFrame(attributes_scaled).sample(10)

In [24]:
random_sample

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
68,0.012974,0.125,0.205645,0.0,0.049383,0.389538,0.349125,0.684516,0.428571,0.301527,0.670213,1.0,0.313466
435,0.115869,0.0,0.646628,0.0,0.730453,0.587852,0.944387,0.178183,0.428571,0.914122,0.808511,0.276186,0.594371
397,0.769632,0.0,0.646628,0.0,0.633745,0.418854,0.988671,0.121301,0.428571,0.914122,0.808511,0.990418,0.501932
350,0.005601,0.4,0.028959,0.0,0.090535,0.561219,0.427394,0.950123,0.0,0.282443,0.755319,1.0,0.117274
391,0.530784,0.0,0.646628,0.0,0.648148,0.477103,0.819773,0.183173,0.428571,0.914122,0.808511,0.953301,0.469923
146,0.215731,0.0,0.70088,0.0,1.0,0.396053,1.0,0.107778,0.571429,0.412214,0.223404,0.426017,0.4117
475,0.641231,0.0,0.646628,0.0,0.409465,0.498371,0.973223,0.187596,0.428571,0.914122,0.808511,0.76262,0.617274
148,0.233395,0.0,0.70088,0.0,1.0,0.311362,0.936148,0.109283,0.571429,0.412214,0.223404,0.899365,0.73372
118,0.012476,0.0,0.350073,0.0,0.333333,0.442805,0.722966,0.219029,0.714286,0.467557,0.553191,0.853069,0.37638
16,0.105179,0.0,0.281525,0.0,0.314815,0.454876,0.271885,0.453029,0.428571,0.229008,0.893617,0.974658,0.13383


In [25]:
model.predict(random_sample)

array([17.9219948 , 12.23085729, 15.50771871, 21.7402223 , 16.0528184 ,
       16.97275495, 15.04378548, 10.7189965 , 20.78830913, 20.95155497])

In [26]:
targets.loc[random_sample.index].values

array([17.4, 13.4,  8.5, 22.9, 23.2, 15.6, 13.3, 17.8, 20.4, 23.1])

In [27]:
model.score(attributes_scaled, targets)

0.7198065414937174

### RANSAC - RANdom SAmple Consensus - Regression with Outliers

In [28]:
ransac = RANSACRegressor()
ransac.fit(attributes_scaled, targets)

RANSACRegressor()

In [29]:
ransac.estimator_.coef_

array([-8.78566763e+00,  5.93992061e-01, -7.05177857e+00, -4.17505384e-01,
        2.10710191e+00,  4.74679544e+01, -8.84928407e+00, -4.87356097e+00,
        3.34521937e+00, -2.54931147e+00, -7.51109572e+00, -2.60498612e-02,
        1.06201039e+01])

In [30]:
ransac.estimator_.intercept_

7.9020547112075175

In [31]:
ransac.predict(random_sample)

array([17.51519945, 21.87750122,  6.79301507, 21.13037949, 12.54086283,
       16.14051752, 12.26753014, 15.93049815, 20.59743359, 18.16799356])

In [32]:
ransac.score(attributes_scaled, targets)

0.40171908852923033

In [33]:
ransac.inlier_mask_ # check wich samples use RANSAC

array([ True,  True, False, False, False, False,  True,  True, False,
        True, False,  True, False, False,  True, False, False,  True,
       False, False,  True, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False, False,  True,  True,  True, False,
        True, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False, False, False, False, False, False,
       False, False,

In [34]:
ransac.inlier_mask_.sum()

289

In [35]:
inliars = attributes_scaled[ransac.inlier_mask_]

In [36]:
inliars

array([[0.        , 0.18      , 0.06781525, ..., 0.28723404, 1.        ,
        0.08967991],
       [0.00210738, 0.        , 0.24230205, ..., 0.55319149, 1.        ,
        0.2044702 ],
       [0.00822974, 0.125     , 0.27162757, ..., 0.27659574, 0.99672197,
        0.29525386],
       ...,
       [0.02189309, 0.        , 0.33834311, ..., 0.70212766, 1.        ,
        0.34768212],
       [0.00565349, 0.        , 0.42045455, ..., 0.89361702, 0.98761914,
        0.21909492],
       [0.00412541, 0.        , 0.42045455, ..., 0.89361702, 1.        ,
        0.16970199]])

In [37]:
outliars = attributes_scaled[~ransac.inlier_mask_] # ''~'' tilda will give us the oposite

In [38]:
outliars

array([[0.00210538, 0.        , 0.24230205, ..., 0.55319149, 0.98973725,
        0.06346578],
       [0.0026154 , 0.        , 0.06304985, ..., 0.64893617, 0.99427606,
        0.03338852],
       [0.00629805, 0.        , 0.06304985, ..., 0.64893617, 1.        ,
        0.09933775],
       ...,
       [0.00391056, 0.        , 0.42045455, ..., 0.89361702, 1.        ,
        0.20281457],
       [0.00546574, 0.        , 0.42045455, ..., 0.89361702, 1.        ,
        0.10789183],
       [0.01036824, 0.        , 0.42045455, ..., 0.89361702, 0.99130062,
        0.13107064]])

In [39]:
ransac.score(inliars, targets.loc[ransac.inlier_mask_])

0.9464939716492697

In [40]:
ransac.score(outliars, targets.loc[~ransac.inlier_mask_])

0.05659711105927656

### polynomial regression 

In [41]:
polynomial_features = PolynomialFeatures()

In [42]:
attributes_scaled_poly = polynomial_features.fit_transform(attributes_scaled)

In [43]:
attributes_scaled_poly

array([[1.        , 0.        , 0.18      , ..., 1.        , 0.08967991,
        0.00804249],
       [1.        , 0.00210738, 0.        , ..., 1.        , 0.2044702 ,
        0.04180806],
       [1.        , 0.00210538, 0.        , ..., 0.97957983, 0.06281445,
        0.00402791],
       ...,
       [1.        , 0.00546574, 0.        , ..., 1.        , 0.10789183,
        0.01164065],
       [1.        , 0.01036824, 0.        , ..., 0.98267692, 0.12993041,
        0.01717951],
       [1.        , 0.00412541, 0.        , ..., 1.        , 0.16970199,
        0.02879876]])

In [44]:
attributes_scaled_poly.shape

(506, 105)

### try with quadratic regression

In [45]:
model_quadratic = LinearRegression()
model_quadratic.fit(attributes_scaled_poly, targets)
model_quadratic.score(attributes_scaled_poly, targets)

0.8523438713207738

In [46]:
iris = pd.read_csv('data/iris.data', header=None)
iris.columns = ['special_length', 'special_width', 'petal_length', 'petal_width', 'class']
iris

Unnamed: 0,special_length,special_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [47]:
iris_attributes = iris.drop('class', axis=1)
iris_attributes

Unnamed: 0,special_length,special_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [48]:
iris_class_targets = iris['class']
iris_class_targets

0         Iris-setosa
1         Iris-setosa
2         Iris-setosa
3         Iris-setosa
4         Iris-setosa
            ...      
145    Iris-virginica
146    Iris-virginica
147    Iris-virginica
148    Iris-virginica
149    Iris-virginica
Name: class, Length: 150, dtype: object

In [49]:
iris_attributes_scaled = MinMaxScaler().fit_transform(iris_attributes)
iris_attributes_scaled

array([[0.22222222, 0.625     , 0.06779661, 0.04166667],
       [0.16666667, 0.41666667, 0.06779661, 0.04166667],
       [0.11111111, 0.5       , 0.05084746, 0.04166667],
       [0.08333333, 0.45833333, 0.08474576, 0.04166667],
       [0.19444444, 0.66666667, 0.06779661, 0.04166667],
       [0.30555556, 0.79166667, 0.11864407, 0.125     ],
       [0.08333333, 0.58333333, 0.06779661, 0.08333333],
       [0.19444444, 0.58333333, 0.08474576, 0.04166667],
       [0.02777778, 0.375     , 0.06779661, 0.04166667],
       [0.16666667, 0.45833333, 0.08474576, 0.        ],
       [0.30555556, 0.70833333, 0.08474576, 0.04166667],
       [0.13888889, 0.58333333, 0.10169492, 0.04166667],
       [0.13888889, 0.41666667, 0.06779661, 0.        ],
       [0.        , 0.41666667, 0.01694915, 0.        ],
       [0.41666667, 0.83333333, 0.03389831, 0.04166667],
       [0.38888889, 1.        , 0.08474576, 0.125     ],
       [0.30555556, 0.79166667, 0.05084746, 0.125     ],
       [0.22222222, 0.625     ,

In [50]:
iris_model = LogisticRegression(C=1e9)

In [51]:
iris_model.fit(iris_attributes_scaled, iris_class_targets)

LogisticRegression(C=1000000000.0)

In [52]:
iris_model.coef_

array([[-18.29679129,  29.19813694, -37.89417388, -38.3219437 ],
       [ 13.58611302,  -6.58018128,  -8.87560643,  -2.78717031],
       [  4.71067827, -22.61795566,  46.76978031,  41.10911401]])

In [53]:
iris_model.intercept_

array([ 25.9014194 ,  14.7266206 , -40.62803999])

In [54]:
iris_model.score(iris_attributes_scaled, iris_class_targets)

0.9866666666666667