<a href="https://colab.research.google.com/github/muhammadibrohimov-ai/Machine_Learning_Intro_California_housing/blob/main/MachineLearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [92]:
 import pandas as pd
 import numpy as np
 import sklearn

In [93]:
# Location of the housing CSV in the handson-ml2 repository (fetched over HTTP).
URL = "https://github.com/ageron/handson-ml2/blob/master/datasets/housing/housing.csv?raw=true"

# Load the full CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=URL)
df.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [94]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows; the fixed seed keeps the split identical across runs.
train_data, test_data = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

# Separate the predictors from the target column of the training split.
X_train = train_data.drop(columns='median_house_value')
y = train_data.loc[:, 'median_house_value'].copy()

In [95]:
from sklearn.base import BaseEstimator, TransformerMixin

# Positional indices of the columns used to build the ratio features.
# They assume the numeric columns keep the order of the loaded data:
# longitude, latitude, housing_median_age, total_rooms, total_bedrooms,
# population, households, median_income.
rooms_ix = 3
bedrooms_ix = 4
population_ix = 5
households_ix = 6

class MulptipleAttributeAdder(BaseEstimator, TransformerMixin):
    """Append per-household ratio features to a 2-D numeric feature matrix.

    The transformer is stateless. It reads the columns at the module-level
    positions ``rooms_ix``, ``bedrooms_ix``, ``population_ix`` and
    ``households_ix`` and appends, in this order:

    * rooms per household
    * bedrooms per household
    * population per household
    * bedrooms per room (only when ``add_bedroom_per_room`` is true)

    Parameters
    ----------
    add_bedroom_per_room : bool, default=True
        Whether to append the bedrooms-per-room column.
    """

    def __init__(self, add_bedroom_per_room = True):
        self.add_bedroom_per_room = add_bedroom_per_room

    def fit(self, X, y = None):
        """Learn nothing; return ``self`` so the step can be chained in a Pipeline."""
        return self

    def transform(self, X):
        """Return ``X`` with the ratio columns appended on the right.

        ``X`` is first coerced with ``np.asarray`` so that inputs which do not
        support ``X[:, i]`` indexing (for example a pandas DataFrame) are
        accepted as well as ndarrays. The result is always a NumPy array.
        """
        # Positional slicing below requires an ndarray; a no-op for ndarray input.
        X = np.asarray(X)

        households = X[:, households_ix]
        rooms = X[:, rooms_ix]
        bedrooms = X[:, bedrooms_ix]

        # Column order matters: downstream steps see these purely by position.
        extra_columns = [
            rooms / households,
            bedrooms / households,
            X[:, population_ix] / households,
        ]
        if self.add_bedroom_per_room:
            extra_columns.append(bedrooms / rooms)

        return np.c_[(X, *extra_columns)]

In [96]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Preprocessing for the numeric columns, applied in this order:
#   1. fill missing values with the column median,
#   2. append the ratio features,
#   3. standardize the resulting columns.
numeric_pipline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy="median")),
        ('attr_adder', MulptipleAttributeAdder(add_bedroom_per_room=True)),
        ('std_scaler', StandardScaler()),
    ],
)

In [97]:
from sklearn.compose import ColumnTransformer

# The single categorical column is one-hot encoded; every other column of
# X_train goes through the numeric pipeline, in its original order.
cat_attr = ['ocean_proximity']
num_attr = [column for column in X_train.columns if column != 'ocean_proximity']

full_pipline = ColumnTransformer(
    transformers=[
        ('num_pipline', numeric_pipline, num_attr),
        ('cat_pipline', OneHotEncoder(), cat_attr),
    ],
)

In [98]:
# Fit the preprocessing steps on the training features and transform them in one call.
X_prepared = full_pipline.fit_transform(X_train)

In [99]:
# Preview the first five prepared rows (all columns).
X_prepared[:5]

array([[ 1.27258656, -1.3728112 ,  0.34849025,  0.22256942,  0.21122752,
         0.76827628,  0.32290591, -0.326196  , -0.17491646, -0.20836543,
         0.05137609, -0.2117846 ,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ],
       [ 0.70916212, -0.87669601,  1.61811813,  0.34029326,  0.59309419,
        -0.09890135,  0.6720272 , -0.03584338, -0.40283542, -0.12853018,
        -0.11736222,  0.34218528,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ],
       [-0.44760309, -0.46014647, -1.95271028, -0.34259695, -0.49522582,
        -0.44981806, -0.43046109,  0.14470145,  0.08821601, -0.25753771,
        -0.03227969, -0.66165785,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ],
       [ 1.23269811, -1.38217186,  0.58654547, -0.56148971, -0.40930582,
        -0.00743434, -0.38058662, -1.01786438, -0.60001532, -0.14515634,
         0.07750687,  0.78303162,  0.        ,  0.        ,  0.        ,
         0.        

In [100]:
from sklearn.linear_model import LinearRegression

# Linear regression estimator with its default settings (not fitted yet).
LR_model = LinearRegression()

In [101]:
# Fit the model on the prepared training features against the training targets.
LR_model.fit(X_prepared, y)

In [102]:
# Draw five rows for a quick sanity check of the fitted model.
# NOTE: the rows come from X_train (data the model was fitted on), and this
# assignment rebinds `test_data`, shadowing the held-out split returned by
# train_test_split earlier — so the comparison that follows is not an estimate
# of generalization error.
# A fixed random_state makes the same five rows come back on every run.
test_data = X_train.sample(5, random_state=42)
test_data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
17368,-120.43,34.93,4.0,2866.0,648.0,1311.0,578.0,2.8649,<1H OCEAN
3582,-118.55,34.24,21.0,5751.0,1082.0,2230.0,1016.0,4.3458,<1H OCEAN
12804,-121.45,38.61,46.0,1758.0,511.0,1094.0,484.0,1.0685,INLAND
7388,-118.25,33.96,43.0,1876.0,454.0,1571.0,458.0,2.0323,<1H OCEAN
4668,-118.3,34.05,36.0,1723.0,569.0,1664.0,501.0,1.9323,<1H OCEAN


In [103]:
# Look up the true target values for the sampled rows by their index labels.
test_label = y.loc[test_data.index]
test_label

Unnamed: 0,median_house_value
17368,186500.0
3582,407500.0
12804,70000.0
7388,112500.0
4668,161100.0


In [104]:
# Apply the already-fitted preprocessing to the sampled rows (transform only, no refit).
test_prepared = full_pipline.transform(test_data)
test_prepared

array([[-0.42267281, -0.33377751, -1.95271028,  0.10300615,  0.26134752,
        -0.10153982,  0.20478216, -0.5334707 , -0.19970552,  0.05637682,
        -0.07158118,  0.2283202 ,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ],
       [ 0.5147059 , -0.65672041, -0.60373066,  1.42969858,  1.29716087,
         0.70671195,  1.35451995,  0.24421641,  0.09433159, -0.0732318 ,
        -0.07791058, -0.42619818,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ],
       [-0.93125062,  1.38858464,  1.3800629 , -0.40651731, -0.06562582,
        -0.29238926, -0.04196522, -1.47684104, -0.75524721, -0.09441249,
        -0.07225792,  1.34175488,  0.        ,  1.        ,  0.        ,
         0.        ,  0.        ],
       [ 0.66428761, -0.78776971,  1.14200767, -0.35225398, -0.20166582,
         0.12712771, -0.11021449, -0.97070637, -0.56095325, -0.24334724,
         0.02877513,  0.50257739,  1.        ,  0.        ,  0.        ,
         0.        

In [105]:
# Predict the target for the prepared sample rows with the fitted model.
test_predicted = LR_model.predict(test_prepared)
test_predicted

array([208298.51272419, 267673.74248434,  86463.97367935, 156073.285959  ,
       168566.88411041])

In [106]:
# Side-by-side table: "bashorat_qilingan" holds the model's predictions and
# "haqiqiy_narx" the true values; rows are labelled by test_label's index.
predicted_test = pd.DataFrame(
    data={
        "bashorat_qilingan": test_predicted,
        "haqiqiy_narx": test_label,
    }
)
predicted_test

Unnamed: 0,bashorat_qilingan,haqiqiy_narx
17368,208298.512724,186500.0
3582,267673.742484,407500.0
12804,86463.973679,70000.0
7388,156073.285959,112500.0
4668,168566.88411,161100.0
