<a href="https://colab.research.google.com/github/muhammadibrohimov-ai/Machine_Learning_Intro_California_housing/blob/main/ML_preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import sklearn # scikit - learn library

In [None]:
# Load the California housing dataset (CSV) straight from the ageron/handson-ml2 GitHub repository;
# the `?raw=true` query asks GitHub for the raw file instead of the HTML page
URL = "https://github.com/ageron/handson-ml2/blob/master/datasets/housing/housing.csv?raw=true"
df = pd.read_csv(URL)
# Preview the first rows to check that the columns were parsed as expected
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [None]:
# Simple random split (80% train / 20% test) with no stratification;
# random_state is fixed so the same split is produced on every run
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(df, test_size = 0.2, random_state = 42)

In [None]:
# Stratified ("balanced") split: bucket median_income into 5 categories and keep
# the proportion of each category the same in the train and test sets
df['income_category'] = pd.cut(df['median_income'], bins = [0., 1.5, 3.0, 4.5, 6.0, np.inf], labels = [1, 2, 3, 4, 5])
from sklearn.model_selection import StratifiedShuffleSplit
strat_split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so there is exactly one (train, test) pair to take from the generator.
# split() yields positional row indices, so rows are selected with .iloc
# (label-based .loc only gives the same rows while df keeps its default 0..n-1 index).
train_index, test_index = next(strat_split.split(df, df['income_category']))
strat_train_set = df.iloc[train_index]
strat_test_set = df.iloc[test_index]
# NOTE(review): 'income_category' is only a helper for stratification but it stays in both
# sets, so it is carried into the feature frames built from them — consider dropping it.
# Report the shapes of the stratified sets created in this cell
# (the original printed train_set/test_set from the previous, non-stratified split).
print(strat_train_set.shape, strat_test_set.shape)

(16512, 10) (4128, 10)


In [None]:
# Separate the features from the target: median_house_value is the label we want to predict
housing = strat_train_set.drop('median_house_value', axis = 1) # features only; drop() returns a new frame
housing_labels = strat_train_set["median_house_value"].copy() # target values of the training set

In [None]:
# Handling missing (NaN) data
# The 'total_bedrooms' column contains NaN values, so we can either:
# 1. Drop only the rows that contain NaN values
# 2. Drop the whole column
# 3. Fill the NaN values with the column median

# We take the third option, using scikit-learn's imputer instead of doing it by hand

from sklearn.impute import SimpleImputer
# imputer is an instance of the SimpleImputer class
imputer = SimpleImputer(strategy="median") # available strategies: "median", "most_frequent", "constant", "mean"

In [None]:
# The median strategy needs numeric input, so the text column is left out before fitting
housing_numeric = housing.drop(columns = ['ocean_proximity'])
imputer.fit(housing_numeric)

In [None]:
# the fitted values (one median per column) are stored in the statistics_ attribute
imputer.statistics_

array([-118.51   ,   34.26   ,   29.     , 2119.     ,  433.     ,
       1164.     ,  408.     ,    3.54155,    3.     ])

In [None]:
# fit() only learns the medians; .transform() applies them and returns a new array
# (housing_numeric itself is not modified)
X = imputer.transform(housing_numeric)

In [None]:
# .transform() returns a NumPy array, not a DataFrame
type(X)

numpy.ndarray

In [None]:
# wrap the transformed array in a new DataFrame, restoring the original column names and row index
housing_tr = pd.DataFrame(X, columns = housing_numeric.columns, index = housing_numeric.index)
housing_tr.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,income_category
12655,-121.46,38.52,29.0,3873.0,797.0,2237.0,706.0,2.1736,2.0
15502,-117.23,33.09,7.0,5320.0,855.0,2015.0,768.0,6.3373,5.0
2908,-119.04,35.37,44.0,1618.0,310.0,667.0,300.0,2.875,2.0
14053,-117.13,32.75,24.0,1877.0,519.0,898.0,483.0,2.2264,2.0
20496,-118.7,34.28,27.0,3536.0,646.0,1837.0,580.0,4.4964,3.0


In [None]:
# next come the text values: they must be converted to numbers before a model can use them
housing_cat = housing[['ocean_proximity']] # double brackets select a one-column DataFrame rather than a Series
housing_cat.value_counts()

Unnamed: 0_level_0,count
ocean_proximity,Unnamed: 1_level_1
<1H OCEAN,7277
INLAND,5262
NEAR OCEAN,2124
NEAR BAY,1847
ISLAND,2


In [None]:
# Machine learning models work with numeric values, so there are two common ways to convert text categories into numbers
# 1. OrdinalEncoder: replaces each category with a single numeric code

from sklearn.preprocessing import OrdinalEncoder
ordinal_encoder = OrdinalEncoder()
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
housing_cat_encoded[:10]

array([[1.],
       [4.],
       [1.],
       [4.],
       [0.],
       [3.],
       [0.],
       [0.],
       [0.],
       [0.]])

In [None]:
# 2. OneHotEncoder: creates one 0/1 column per category

from sklearn.preprocessing import OneHotEncoder
one_hot_encoder = OneHotEncoder()
housing_cat_hot_encoded = one_hot_encoder.fit_transform(housing_cat)
# .toarray() converts the encoder's result into a regular (dense) array for display
housing_cat_hot_encoded.toarray()

array([[0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0.],
       ...,
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.]])

In [None]:
# There are 3 types of objects in sklearn:
# estimators = objects that have a .fit() method; they learn (estimate) parameters from the data
# transformers = estimators that also have .transform() / .fit_transform() methods; they fit to the data and also return new, transformed data
# predictors = estimators that have a .predict() method; they predict values for new data

# We will create our own transformer

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

# Positional indices of the columns used to build the extra attributes
# (total_rooms, total_bedrooms, population, households in the housing column order)
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class MultipleAttributeAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends ratio attributes to a 2-D housing array.

    rooms-per-household and population-per-household are always appended;
    bedrooms-per-room is appended as a third column when requested. The source
    columns are located by position through the module-level indices
    rooms_ix, bedrooms_ix, population_ix and households_ix.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default True
        Whether to also append the bedrooms-per-room column.
    """

    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y = None):
        """Return self unchanged: this transformer learns nothing from the data."""
        return self

    def transform(self, X):
        """Return X with the derived ratio columns appended on the right.

        X may be a NumPy array or any 2-D array-like such as a DataFrame; it is
        converted with np.asarray so that positional column slicing works.
        """
        # DataFrames do not support X[:, i]; an ndarray input passes through unchanged
        X = np.asarray(X)
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room: # adding bedrooms per room is optional
            bedroom_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedroom_per_room]
        return np.c_[X, rooms_per_household, population_per_household]

In [None]:
# Append all three derived attributes (bedrooms-per-room included)
attribute_adder = MultipleAttributeAdder(add_bedrooms_per_room=True)
# transform() selects columns by position, so it is given the underlying NumPy array
housing_extra_attributes = attribute_adder.transform(housing.values)
housing_extra_attributes[0, :] # first row; the new values are appended at the end

array([-121.46, 38.52, 29.0, 3873.0, 797.0, 2237.0, 706.0, 2.1736,
       'INLAND', 2, 5.485835694050992, 3.168555240793201,
       0.20578363026077975], dtype=object)

In [None]:
# Same transformer with the optional bedrooms-per-room column switched off
attribute_adder = MultipleAttributeAdder(add_bedrooms_per_room=False)
# transform() selects columns by position, so it is given the underlying NumPy array
housing_extra_attributes = attribute_adder.transform(housing.values)
housing_extra_attributes[0, :] # first row; only two new values are appended this time

array([-121.46, 38.52, 29.0, 3873.0, 797.0, 2237.0, 706.0, 2.1736,
       'INLAND', 2, 5.485835694050992, 3.168555240793201], dtype=object)

In [None]:
# the columns have values on very different scales (ranges), so we need to standardize them

**MIN MAX SCALING**

![](https://arshpreetsingh.files.wordpress.com/2017/03/normal.png)

In [None]:
from sklearn.preprocessing import MinMaxScaler
# NOTE(review): the name looks like a typo for "min_max_scaler"; left as is in case other cells use it
min_mix_scaler = MinMaxScaler()

# Rescale every column to the 0-1 range (the result is only displayed, not stored).
# NOTE(review): this scales housing_numeric (the frame from before imputation), not housing_tr — confirm this is intended
min_mix_scaler.fit_transform(housing_numeric)

array([[0.28784861, 0.63549416, 0.54901961, ..., 0.13144137, 0.11542599,
        0.25      ],
       [0.70916335, 0.05844846, 0.11764706, ..., 0.14301718, 0.40257376,
        1.        ],
       [0.52888446, 0.30074389, 0.84313725, ..., 0.05563854, 0.16379774,
        0.25      ],
       ...,
       [0.1623506 , 0.62699256, 0.92156863, ..., 0.0317401 , 0.18481124,
        0.5       ],
       [0.16434263, 0.61317747, 0.25490196, ..., 0.09316654, 0.25492752,
        0.5       ],
       [0.22011952, 0.78958555, 0.50980392, ..., 0.03640777, 0.18151474,
        0.5       ]])

**STANDARD SCALER**

![](https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQxeMnm1BSa8qyNkCB6wqiYa790XtNxCdM48k_be-9zESUvivv_nuY6SXk8fpJK_psGiA&usqp=CAU)

In [None]:
from sklearn.preprocessing import StandardScaler
standart_scaler = StandardScaler()
# Standardize each column: subtract the column mean and divide by its standard deviation
# (the result is only displayed, not stored)
standart_scaler.fit_transform(housing_numeric)

array([[-0.94135046,  1.34743822,  0.02756357, ...,  0.55628602,
        -0.8936472 , -0.95468705],
       [ 1.17178212, -1.19243966, -1.72201763, ...,  0.72131799,
         1.292168  ,  1.89007821],
       [ 0.26758118, -0.1259716 ,  1.22045984, ..., -0.52440722,
        -0.52543365, -0.95468705],
       ...,
       [-1.5707942 ,  1.31001828,  1.53856552, ..., -0.86511838,
        -0.36547546, -0.00643196],
       [-1.56080303,  1.2492109 , -1.1653327 , ...,  0.01061579,
         0.16826095, -0.00643196],
       [-1.28105026,  2.02567448, -0.13148926, ..., -0.79857323,
        -0.390569  , -0.00643196]])

In [None]:
# One-hot encoding with pandas

# get_dummies creates one indicator column per category of 'ocean_proximity'
housing_onehot = pd.get_dummies(housing['ocean_proximity'])
# Display the indicators as 0/1 integers. astype(int) gives the same table as
# .replace({False:0, True:1}) but without the pandas FutureWarning about
# downcasting in replace; housing_onehot itself is left unchanged.
housing_onehot.astype(int)

  housing_onehot.replace({False:0, True:1})


Unnamed: 0,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
12655,0,1,0,0,0
15502,0,0,0,0,1
2908,0,1,0,0,0
14053,0,0,0,0,1
20496,1,0,0,0,0
...,...,...,...,...,...
15174,1,0,0,0,0
12661,0,1,0,0,0
19263,1,0,0,0,0
19140,1,0,0,0,0
