<a href="https://colab.research.google.com/github/muhammadibrohimov-ai/Machine_Learning_Intro_California_housing/blob/main/MachineLearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [127]:
 import pandas as pd
 import numpy as np
 import sklearn

In [128]:
URL = "https://github.com/ageron/handson-ml2/blob/master/datasets/housing/housing.csv?raw=true"
df = pd.read_csv(URL)
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [129]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

X_train = train_set.drop('median_house_value', axis = 1)
y = train_set['median_house_value'].copy()

In [130]:
from sklearn.base import BaseEstimator, TransformerMixin

rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class MulptipleAttributeAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedroom_per_room = True):
        self.add_bedroom_per_room = add_bedroom_per_room

    def fit(self, X, y = None):
        return self

    def transform(self, X):
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        bedrooms_per_household = X[:, bedrooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]

        if self.add_bedroom_per_room:
            bedroom_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, bedrooms_per_household, population_per_household, bedroom_per_room]

        else :
            return np.c_[X, rooms_per_household, bedrooms_per_household, population_per_household]

In [131]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

numeric_pipline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attr_adder', MulptipleAttributeAdder(add_bedroom_per_room=True)),
    ('std_scaler', StandardScaler())
])

In [132]:
from sklearn.compose import ColumnTransformer

num_attr = list(X_train.drop('ocean_proximity', axis = 1).columns)
cat_attr = ['ocean_proximity']

full_pipline = ColumnTransformer([
    ('num_pipline', numeric_pipline, num_attr),
    ('cat_pipline', OneHotEncoder(), cat_attr)
])

In [134]:
X_prepared = full_pipline.fit_transform(X_train)

In [135]:
X_prepared[:5,:]

array([[ 1.27258656, -1.3728112 ,  0.34849025,  0.22256942,  0.21122752,
         0.76827628,  0.32290591, -0.326196  , -0.17491646, -0.20836543,
         0.05137609, -0.2117846 ,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ],
       [ 0.70916212, -0.87669601,  1.61811813,  0.34029326,  0.59309419,
        -0.09890135,  0.6720272 , -0.03584338, -0.40283542, -0.12853018,
        -0.11736222,  0.34218528,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ],
       [-0.44760309, -0.46014647, -1.95271028, -0.34259695, -0.49522582,
        -0.44981806, -0.43046109,  0.14470145,  0.08821601, -0.25753771,
        -0.03227969, -0.66165785,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ],
       [ 1.23269811, -1.38217186,  0.58654547, -0.56148971, -0.40930582,
        -0.00743434, -0.38058662, -1.01786438, -0.60001532, -0.14515634,
         0.07750687,  0.78303162,  0.        ,  0.        ,  0.        ,
         0.        

In [136]:
# Linear Regression

from sklearn.linear_model import LinearRegression

LR_model = LinearRegression()

In [163]:
LR_model.fit(X_prepared, y)

In [143]:
X_test = test_set.drop('median_house_value', axis=1)
X_test

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
20046,-119.01,36.06,25.0,1505.0,,1392.0,359.0,1.6812,INLAND
3024,-119.46,35.14,30.0,2943.0,,1565.0,584.0,2.5313,INLAND
15663,-122.44,37.80,52.0,3830.0,,1310.0,963.0,3.4801,NEAR BAY
20484,-118.72,34.28,17.0,3051.0,,1705.0,495.0,5.7376,<1H OCEAN
9814,-121.93,36.62,34.0,2351.0,,1063.0,428.0,3.7250,NEAR OCEAN
...,...,...,...,...,...,...,...,...,...
15362,-117.22,33.36,16.0,3165.0,482.0,1351.0,452.0,4.6050,<1H OCEAN
16623,-120.83,35.36,28.0,4323.0,886.0,1650.0,705.0,2.7266,NEAR OCEAN
18086,-122.05,37.31,25.0,4111.0,538.0,1585.0,568.0,9.2298,<1H OCEAN
2144,-119.76,36.77,36.0,2507.0,466.0,1227.0,474.0,2.7850,INLAND


In [144]:
y_test = test_set['median_house_value'].copy()
y_test

Unnamed: 0,median_house_value
20046,47700.0
3024,45800.0
15663,500001.0
20484,218600.0
9814,278000.0
...,...
15362,263300.0
16623,266800.0
18086,500001.0
2144,72300.0


In [145]:
X_test_prepared = full_pipline.fit_transform(X_test)
X_test_prepared[:5, :]

array([[ 0.25541734,  0.22194113, -0.30073951, -0.50056608, -0.24196995,
        -0.02654818, -0.3578599 , -1.14237625, -0.43367721,  0.09850613,
         0.52022395,  0.76724738,  0.        ,  1.        ,  0.        ,
         0.        ,  0.        ],
       [ 0.02976613, -0.20947715,  0.098724  ,  0.15038025, -0.24196995,
         0.12876364,  0.22245041, -0.69050482, -0.13050312, -0.50011959,
        -0.16293193, -0.82211069,  0.        ,  1.        ,  0.        ,
         0.        ,  0.        ],
       [-1.46454628,  1.03788441,  1.85636346,  0.55190278, -0.24196995,
        -0.10016419,  1.19995088, -0.18616932, -0.51063381, -0.87602643,
        -0.91557569, -1.20734433,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ],
       [ 0.400837  , -0.61275946, -0.93988113,  0.19926913, -0.24196995,
         0.2544495 , -0.00709456,  1.01380686,  0.27182354, -0.32838724,
         0.27323743, -0.88099247,  1.        ,  0.        ,  0.        ,
         0.        

In [146]:
y_test_predicted = LR_model.predict(X_test_prepared)
predicted_test_set = pd.DataFrame({"Bashorat":y_test_predicted, "Real narx":y_test})
predicted_test_set

Unnamed: 0,Bashorat,Real narx
20046,54273.070361,47700.0
3024,127908.041084,45800.0
15663,273628.345526,500001.0
20484,272614.532867,218600.0
9814,262726.064475,278000.0
...,...,...
15362,224161.863429,263300.0
16623,239835.120795,266800.0
18086,458559.222773,500001.0
2144,121399.450188,72300.0


In [147]:
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, y_test_predicted)
np.sqrt(mse)

np.float64(70602.19532227781)

In [151]:
# Decision Tree

from sklearn.tree import DecisionTreeRegressor
Tree_model = DecisionTreeRegressor()
Tree_model.fit(X_prepared, y)

In [164]:
y_tree_predicted = Tree_model.predict(X_test_prepared)

In [166]:
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, y_tree_predicted)
np.sqrt(mse)

np.float64(105819.43565736526)

In [167]:
# Random Forest

from sklearn.ensemble import RandomForestRegressor
RF_model = RandomForestRegressor()
RF_model.fit(X_prepared, y)

In [168]:
y_rf_predicted = RF_model.predict(X_test_prepared)

In [169]:
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, y_rf_predicted)
np.sqrt(mse)

np.float64(77726.32544481836)

In [170]:
# Cross Validation

X = df.drop('median_house_value', axis = 1)
y = df['median_house_value'].copy()

X_prepared = full_pipline.fit_transform(X, y)

In [176]:
def display_score(scores):
    print("Scores : ", scores)
    print("Mean_score: ", scores.mean())
    print("Standart deviation: ", scores.std())

In [177]:
from sklearn.model_selection import cross_val_score

In [179]:
# Linear Regression
scores = cross_val_score(LR_model, X_prepared, y, scoring = 'neg_mean_squared_error', cv = 10)
LR_rmse_scores = np.sqrt(-scores)
display_score(LR_rmse_scores)

Scores :  [83687.72932001 58860.4379738  85685.309984   61831.71816256
 80922.74499547 68530.02212589 52580.68246784 90747.90044291
 77612.85601609 53561.78051385]
Mean_score:  71402.11820023955
Standart deviation:  13371.910951419752


In [180]:
# Decision Tree
scores = cross_val_score(Tree_model, X_prepared, y, scoring = 'neg_mean_squared_error', cv = 10)
LR_rmse_scores = np.sqrt(-scores)
display_score(LR_rmse_scores)

Scores :  [128449.75971758  75296.45958802  83287.51102479  76155.97406434
  89218.38667784  76221.35365983  67578.17556294  98960.5120692
  93526.0053826   72728.74366883]
Mean_score:  86142.28814159786
Standart deviation:  16931.31208355461


In [181]:
# Random Forest
scores = cross_val_score(RF_model, X_prepared, y, scoring = 'neg_mean_squared_error', cv = 10)
LR_rmse_scores = np.sqrt(-scores)
display_score(LR_rmse_scores)

Scores :  [94773.02103298 47776.29111429 65476.06901612 56369.76425678
 60643.75103469 59772.93988438 46831.85467345 78372.22851125
 74185.29889801 49288.7599678 ]
Mean_score:  63348.99783897669
Standart deviation:  14549.736424429084


In [182]:
# Saving model (pickle)

import pickle

file_name = "RF_model.pkl"

with open(file_name, "wb") as file:
    pickle.dump(RF_model, file)

In [183]:
with open(file_name, 'rb') as file:
    model = pickle.load(file)

In [184]:
scores = cross_val_score(model, X_prepared, y, scoring="neg_mean_squared_error", cv = 10)
RF_rmse_scores = np.sqrt(-scores)
display_score(RF_rmse_scores)

Scores :  [95477.36154495 47582.13908309 65214.85200674 56321.25932456
 60519.39087552 59508.25472047 47096.22007488 78523.09067728
 74328.00371336 49072.73320834]
Mean_score:  63364.330522920194
Standart deviation:  14748.586110268529


In [185]:
# Saving model (joblib)

import joblib
joblib.dump(RF_model, "RF_model.jbl")

['RF_model.jbl']

In [186]:
model = joblib.load("RF_model.jbl")

In [187]:
scores = cross_val_score(model, X_prepared, y, scoring = "neg_mean_squared_error", cv = 10)
RF_model_scores = np.sqrt(-scores)
display_score(RF_model_scores)

Scores :  [96934.62480592 47424.79973311 65992.41405524 56342.29848885
 60842.52920001 59983.17214359 47103.51088845 78344.77981849
 74055.53643677 49344.83428647]
Mean_score:  63636.84998569205
Standart deviation:  15013.242078935484


In [188]:
joblib.dump(full_pipline, "pipline.jbl")

['pipline.jbl']