<a href="https://colab.research.google.com/github/miuceo/ML_intro/blob/main/california_housing_data_preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import sklearn

In [2]:
url = "https://github.com/ageron/handson-ml2/blob/master/datasets/housing/housing.csv?raw=true"
df = pd.read_csv(url)
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [3]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(df, test_size=0.2, random_state=35)

In [4]:
housing = train_set.drop('median_house_value', axis=1)
housing_labels = train_set['median_house_value'].copy()

In [5]:
housing.isnull().sum()

Unnamed: 0,0
longitude,0
latitude,0
housing_median_age,0
total_rooms,0
total_bedrooms,166
population,0
households,0
median_income,0
ocean_proximity,0


In [6]:
# 1. Dropping rows with NaN values
housing_dropped_rows = housing.dropna(subset=['total_bedrooms'])

# 2. Dropping column with NaN values
housing_dropped_columns = housing.drop('total_bedrooms', axis=1)

# 3. Filling NaN values with other values
median = housing['total_bedrooms'].median()
housing['total_bedrooms'].fillna(median, inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  housing['total_bedrooms'].fillna(median, inplace = True)


In [7]:
housing.isnull().sum()

Unnamed: 0,0
longitude,0
latitude,0
housing_median_age,0
total_rooms,0
total_bedrooms,0
population,0
households,0
median_income,0
ocean_proximity,0


**SimpleImputer**

In [8]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='median')

In [9]:
housing_numeric = housing.drop('ocean_proximity', axis=1)
x = imputer.fit_transform(housing_numeric)

In [10]:
imputer.statistics_

array([-118.495 ,   34.26  ,   29.    , 2120.5   ,  434.    , 1168.5   ,
        408.    ,    3.5338])

In [11]:
x = imputer.transform(housing_numeric)
x

array([[-1.2209e+02,  3.8000e+01,  6.0000e+00, ...,  4.3770e+03,
         1.7890e+03,  5.2015e+00],
       [-1.1693e+02,  3.3930e+01,  1.3000e+01, ...,  3.2970e+03,
         1.4690e+03,  2.0549e+00],
       [-1.1825e+02,  3.3970e+01,  3.7000e+01, ...,  8.1400e+02,
         2.1300e+02,  2.2917e+00],
       ...,
       [-1.1796e+02,  3.4480e+01,  3.2000e+01, ...,  8.0600e+02,
         2.9900e+02,  4.5769e+00],
       [-1.2184e+02,  3.7320e+01,  2.2000e+01, ...,  2.4910e+03,
         5.3000e+02,  4.3419e+00],
       [-1.1832e+02,  3.4200e+01,  3.6000e+01, ...,  8.3400e+02,
         3.1100e+02,  3.9866e+00]])

In [12]:
type(x)

numpy.ndarray

In [13]:
housing_tr = pd.DataFrame(x, columns = housing_numeric.columns, index = housing_numeric.index)
housing_tr

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
1380,-122.09,38.00,6.0,10191.0,1882.0,4377.0,1789.0,5.2015
12294,-116.93,33.93,13.0,7804.0,1594.0,3297.0,1469.0,2.0549
7387,-118.25,33.97,37.0,794.0,210.0,814.0,213.0,2.2917
14454,-117.27,32.83,39.0,1877.0,426.0,805.0,409.0,3.8750
2927,-119.01,35.36,24.0,1941.0,484.0,1277.0,435.0,1.0560
...,...,...,...,...,...,...,...,...
19391,-120.85,37.78,25.0,421.0,434.0,303.0,106.0,2.2679
15393,-116.90,33.22,11.0,4132.0,773.0,2012.0,703.0,3.1906
9143,-117.96,34.48,32.0,1896.0,342.0,806.0,299.0,4.5769
17679,-121.84,37.32,22.0,3015.0,581.0,2491.0,530.0,4.3419


**Simple Encoder**

In [14]:
housing_cat = housing[['ocean_proximity']]
housing_cat.head()

Unnamed: 0,ocean_proximity
1380,NEAR BAY
12294,INLAND
7387,<1H OCEAN
14454,NEAR OCEAN
2927,INLAND


In [15]:
from sklearn.preprocessing import OrdinalEncoder
ordinal_encoder = OrdinalEncoder()

housing_cat_encoder = ordinal_encoder.fit_transform(housing_cat)
housing_cat_encoder[:10]

array([[3.],
       [1.],
       [0.],
       [4.],
       [1.],
       [1.],
       [1.],
       [0.],
       [0.],
       [1.]])

In [16]:
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
1380,-122.09,38.0,6.0,10191.0,1882.0,4377.0,1789.0,5.2015,NEAR BAY
12294,-116.93,33.93,13.0,7804.0,1594.0,3297.0,1469.0,2.0549,INLAND
7387,-118.25,33.97,37.0,794.0,210.0,814.0,213.0,2.2917,<1H OCEAN
14454,-117.27,32.83,39.0,1877.0,426.0,805.0,409.0,3.875,NEAR OCEAN
2927,-119.01,35.36,24.0,1941.0,484.0,1277.0,435.0,1.056,INLAND


**One Hot Encoder**

In [17]:
from sklearn.preprocessing import OneHotEncoder

cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot.toarray()

array([[0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       ...,
       [0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.]])

In [18]:
housing_cat.value_counts()

Unnamed: 0_level_0,count
ocean_proximity,Unnamed: 1_level_1
<1H OCEAN,7313
INLAND,5218
NEAR OCEAN,2110
NEAR BAY,1866
ISLAND,5


In [19]:
housing_pd_get_dummies = pd.get_dummies(housing['ocean_proximity'])
housing_pd_get_dummies

Unnamed: 0,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
1380,False,False,False,True,False
12294,False,True,False,False,False
7387,True,False,False,False,False
14454,False,False,False,False,True
2927,False,True,False,False,False
...,...,...,...,...,...
19391,False,True,False,False,False
15393,True,False,False,False,False
9143,False,True,False,False,False
17679,True,False,False,False,False


**Creating own Transformer**

In [20]:
from sklearn.base import BaseEstimator, TransformerMixin

# the indexes of the columns that we need
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinesAttributeAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        return self # the function is only transformer, not estiamtor

    def transform(self, X):
        room_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, room_per_household,  population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, room_per_household, population_per_household]

In [21]:
attr_adder = CombinesAttributeAdder(add_bedrooms_per_room=False)
housing_extra_attr = attr_adder.transform(housing.values)

In [22]:
housing_extra_attr

array([[-122.09, 38.0, 6.0, ..., 'NEAR BAY', 5.69647847959754,
        2.446618222470654],
       [-116.93, 33.93, 13.0, ..., 'INLAND', 5.312457454050374,
        2.2443839346494214],
       [-118.25, 33.97, 37.0, ..., '<1H OCEAN', 3.727699530516432,
        3.8215962441314555],
       ...,
       [-117.96, 34.48, 32.0, ..., 'INLAND', 6.34113712374582,
        2.6956521739130435],
       [-121.84, 37.32, 22.0, ..., '<1H OCEAN', 5.688679245283019, 4.7],
       [-118.32, 34.2, 36.0, ..., '<1H OCEAN', 6.360128617363344,
        2.6816720257234725]], dtype=object)

In [23]:
housing.values[0,:]

array([-122.09, 38.0, 6.0, 10191.0, 1882.0, 4377.0, 1789.0, 5.2015,
       'NEAR BAY'], dtype=object)

In [24]:
housing_extra_attr[0,:]

array([-122.09, 38.0, 6.0, 10191.0, 1882.0, 4377.0, 1789.0, 5.2015,
       'NEAR BAY', 5.69647847959754, 2.446618222470654], dtype=object)

**Normalizatsiya**

**MinMaxScaler**

In [25]:
housing.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
count,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0
mean,-119.572816,35.63518,28.581335,2634.711967,536.299782,1428.578307,499.190286,3.870252
std,2.007532,2.135189,12.599434,2197.525936,421.451129,1144.987836,383.786586,1.906476
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999
25%,-121.8,33.93,18.0,1444.0,296.75,786.0,279.0,2.5625
50%,-118.495,34.26,29.0,2120.5,434.0,1168.5,408.0,3.5338
75%,-118.01,37.72,37.0,3143.0,642.0,1728.0,603.0,4.7387
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001


In [26]:
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
1380,-122.09,38.0,6.0,10191.0,1882.0,4377.0,1789.0,5.2015,NEAR BAY
12294,-116.93,33.93,13.0,7804.0,1594.0,3297.0,1469.0,2.0549,INLAND
7387,-118.25,33.97,37.0,794.0,210.0,814.0,213.0,2.2917,<1H OCEAN
14454,-117.27,32.83,39.0,1877.0,426.0,805.0,409.0,3.875,NEAR OCEAN
2927,-119.01,35.36,24.0,1941.0,484.0,1277.0,435.0,1.056,INLAND


In [29]:
xmin = housing['median_income'].min()
xmax = housing['median_income'].max()
xmin
xmax

15.0001

In [33]:
xnew = (housing['median_income'][0] - xmin) / (xmax - xmin)
xnew

np.float64(0.5396684183666433)

In [34]:
from sklearn.preprocessing import MinMaxScaler

min_max_scaler = MinMaxScaler()

min_max_scaler.fit_transform(housing_numeric)

array([[0.2250996 , 0.58023379, 0.09803922, ..., 0.12259312, 0.29403059,
        0.3242438 ],
       [0.73904382, 0.1477152 , 0.23529412, ..., 0.09232322, 0.24140766,
        0.1072399 ],
       [0.60756972, 0.15196599, 0.70588235, ..., 0.02273046, 0.03486269,
        0.12357071],
       ...,
       [0.63645418, 0.20616366, 0.60784314, ..., 0.02250624, 0.0490051 ,
        0.28116854],
       [0.25      , 0.50797024, 0.41176471, ..., 0.0697329 , 0.08699227,
        0.26496186],
       [0.60059761, 0.17640808, 0.68627451, ..., 0.02329101, 0.05097846,
        0.24045875]])

In [35]:
housing_numeric

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
1380,-122.09,38.00,6.0,10191.0,1882.0,4377.0,1789.0,5.2015
12294,-116.93,33.93,13.0,7804.0,1594.0,3297.0,1469.0,2.0549
7387,-118.25,33.97,37.0,794.0,210.0,814.0,213.0,2.2917
14454,-117.27,32.83,39.0,1877.0,426.0,805.0,409.0,3.8750
2927,-119.01,35.36,24.0,1941.0,484.0,1277.0,435.0,1.0560
...,...,...,...,...,...,...,...,...
19391,-120.85,37.78,25.0,421.0,434.0,303.0,106.0,2.2679
15393,-116.90,33.22,11.0,4132.0,773.0,2012.0,703.0,3.1906
9143,-117.96,34.48,32.0,1896.0,342.0,806.0,299.0,4.5769
17679,-121.84,37.32,22.0,3015.0,581.0,2491.0,530.0,4.3419


**StandartScaler**

In [39]:
from sklearn.preprocessing import StandardScaler

standart_scaler = StandardScaler()

standart_scaler.fit_transform(housing_numeric)

array([[-1.25390838,  1.10757958, -1.79230424, ...,  2.57514612,
         3.36084902,  0.69829788],
       [ 1.31649014, -0.79863258, -1.2367069 , ...,  1.63187609,
         2.52702704, -0.9522322 ],
       [ 0.65894633, -0.77989831,  0.66819829, ..., -0.53677157,
        -0.74572422, -0.82802018],
       ...,
       [ 0.80340671, -0.54103634,  0.27134305, ..., -0.54375875,
        -0.52163456,  0.37066771],
       [-1.12937357,  0.78909696, -0.52236745, ...,  0.92791716,
         0.08028068,  0.24739988],
       [ 0.62407658, -0.67217624,  0.58882724, ..., -0.51930361,
        -0.49036624,  0.0610294 ]])

**Pipeline = Conveyer**