In [9]:
# Predict median house cost in California Project
import pandas as pd
import numpy as np
import sklearn # scikit-learn

# Dataset
URL = "https://github.com/ageron/handson-ml2/blob/master/datasets/housing/housing.csv?raw=true"
df = pd.read_csv(URL)

# train-test sets
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

X_train = train_set.drop("median_house_value", axis=1)
y = train_set["median_house_value"].copy()

X_num = X_train.drop("ocean_proximity", axis=1)


In [10]:
# Our transfromer
from sklearn.base import BaseEstimator, TransformerMixin

rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self # bizni funksiyamiz faqat transformer. estimator emas
    def transform(self, X):
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room: # add_bedrooms_per_room ustuni ixtiyoriy bo'ladi
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

num_pipeline = Pipeline([
          ('imputer', SimpleImputer(strategy='median')),
          ('attribs_adder', CombinedAttributesAdder(add_bedrooms_per_room = True)),
          ('std_scaler', StandardScaler())
])

In [12]:
num_pipeline.fit_transform(X_num)

array([[ 1.27258656, -1.3728112 ,  0.34849025, ..., -0.17491646,
         0.05137609, -0.2117846 ],
       [ 0.70916212, -0.87669601,  1.61811813, ..., -0.40283542,
        -0.11736222,  0.34218528],
       [-0.44760309, -0.46014647, -1.95271028, ...,  0.08821601,
        -0.03227969, -0.66165785],
       ...,
       [ 0.59946887, -0.75500738,  0.58654547, ..., -0.60675918,
         0.02030568,  0.99951387],
       [-1.18553953,  0.90651045, -1.07984112, ...,  0.40217517,
         0.00707608, -0.79086209],
       [-1.41489815,  0.99543676,  1.85617335, ..., -0.85144571,
        -0.08535429,  1.69520292]])

In [13]:
from sklearn.compose import ColumnTransformer

num_attribs = list(X_num)
cat_attribs = ['ocean_proximity']

full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    ('cat', OneHotEncoder(), cat_attribs)
])

In [14]:
housing_prepared = full_pipeline.fit_transform(X_train)
X_prepared = housing_prepared
housing_prepared[0:5,:]

array([[ 1.27258656, -1.3728112 ,  0.34849025,  0.22256942,  0.21122752,
         0.76827628,  0.32290591, -0.326196  , -0.17491646,  0.05137609,
        -0.2117846 ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [ 0.70916212, -0.87669601,  1.61811813,  0.34029326,  0.59309419,
        -0.09890135,  0.6720272 , -0.03584338, -0.40283542, -0.11736222,
         0.34218528,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [-0.44760309, -0.46014647, -1.95271028, -0.34259695, -0.49522582,
        -0.44981806, -0.43046109,  0.14470145,  0.08821601, -0.03227969,
        -0.66165785,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [ 1.23269811, -1.38217186,  0.58654547, -0.56148971, -0.40930582,
        -0.00743434, -0.38058662, -1.01786438, -0.60001532,  0.07750687,
         0.78303162,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [-0.10855122,  0.5320839 ,  1

Linear Regression

In [16]:
from sklearn.linear_model import LinearRegression

LR_model = LinearRegression()
LR_model.fit(X_prepared, y)

In [17]:
test_data = X_train.sample(5)
test_label = y.loc[test_data.index]
test_data_full = full_pipeline.transform(test_data)
test_data_full

array([[-0.43264492,  1.94554414,  1.45941464, -0.22671255, -0.19689249,
        -0.40584353, -0.2125884 , -1.10167751, -0.12832757, -0.06856379,
        -0.01148846,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ],
       [ 0.75902269, -0.70352373,  0.90395244,  0.00919497, -0.38066582,
        -0.2422583 , -0.29658751,  2.41994433,  0.60458107, -0.01060605,
        -1.21537164,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ],
       [ 0.57952464, -0.63799909,  0.50719373, -0.83694508, -0.80787916,
        -0.20180174, -0.77958238, -0.31206961, -0.58056065,  0.24179438,
         0.52507535,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 0.1208074 , -0.0248756 ,  0.34849025,  0.27499332,  0.27566752,
         0.33644645,  0.3045311 , -0.78107653, -0.07351174, -0.01384241,
        -0.18978156,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ],
       [-1.47473083,  0.99075643,  1

In [19]:
predicted_data = LR_model.predict(test_data_full)
pd.DataFrame({'Prediction': predicted_data, 'Real_value': test_label})

Unnamed: 0,Prediction,Real_value
11853,37980.958263,55900.0
6395,365889.67289,500001.0
3420,177829.5275,133300.0
3053,117391.458749,71900.0
16106,262154.515954,272200.0


# Check Model

In [20]:
test_set

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
20046,-119.01,36.06,25.0,1505.0,,1392.0,359.0,1.6812,47700.0,INLAND
3024,-119.46,35.14,30.0,2943.0,,1565.0,584.0,2.5313,45800.0,INLAND
15663,-122.44,37.80,52.0,3830.0,,1310.0,963.0,3.4801,500001.0,NEAR BAY
20484,-118.72,34.28,17.0,3051.0,,1705.0,495.0,5.7376,218600.0,<1H OCEAN
9814,-121.93,36.62,34.0,2351.0,,1063.0,428.0,3.7250,278000.0,NEAR OCEAN
...,...,...,...,...,...,...,...,...,...,...
15362,-117.22,33.36,16.0,3165.0,482.0,1351.0,452.0,4.6050,263300.0,<1H OCEAN
16623,-120.83,35.36,28.0,4323.0,886.0,1650.0,705.0,2.7266,266800.0,NEAR OCEAN
18086,-122.05,37.31,25.0,4111.0,538.0,1585.0,568.0,9.2298,500001.0,<1H OCEAN
2144,-119.76,36.77,36.0,2507.0,466.0,1227.0,474.0,2.7850,72300.0,INLAND


In [22]:
X_test = test_set.drop('median_house_value', axis=1)
y_test = test_set['median_house_value'].copy()
X_test_prepared = full_pipeline.transform(X_test)
y_predicted = LR_model.predict(X_test_prepared)

In [23]:
# RMSE
from sklearn.metrics import mean_squared_error
lin_mse = mean_squared_error(y_test, y_predicted)
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse) # not bad but not good let's use another algorithm

72701.32600762138


In [25]:
# Model with Decision Tree
from sklearn.tree import DecisionTreeRegressor
tr_model = DecisionTreeRegressor()
tr_model.fit(X_prepared, y)

In [27]:
y_predicted = tr_model.predict(X_test_prepared)

In [29]:
lin_mse = mean_squared_error(y_test, y_predicted)
# RMSE
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)

71971.72980166346


In [30]:
# Model with RandomForest
from sklearn.ensemble import RandomForestRegressor
RF_model = RandomForestRegressor()
RF_model.fit(X_prepared, y)

In [31]:
# check
y_predicted = RF_model.predict(X_test_prepared)
lin_mse = mean_squared_error(y_test, y_predicted)
# RMSE
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse) # very good

49932.75205387538


# Cross-Validation

In [46]:
X = df.drop("median_house_value", axis=1)
y = df["median_house_value"].copy()

X_prepared = full_pipeline.transform(X)

In [34]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Std.dev:", scores.std())

In [47]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(LR_model, X_prepared, y, scoring="neg_mean_squared_error", cv=10)
LR_rmse_scores = np.sqrt(-scores)
display_scores(LR_rmse_scores)

Scores: [84188.51219065 61197.24357613 86752.24346334 62289.14292385
 80540.40041898 68919.39949642 52503.82940087 90910.07884989
 77674.67507925 53941.60539478]
Mean: 71891.71307941683
Std.dev: 13249.525989444988


In [48]:
#### Random Forest
scores = cross_val_score(RF_model, X_prepared, y, scoring="neg_mean_squared_error", cv=10)
LR_rmse_scores = np.sqrt(-scores)
display_scores(LR_rmse_scores)

Scores: [96824.06710231 46906.33395765 65337.19968227 56537.23563435
 60910.29911877 60174.41419292 47333.68461872 79606.45192248
 74327.23264781 49449.24787417]
Mean: 63740.61667514526
Std.dev: 15133.756465823353


# Saving_model

In [56]:
# pickle
import pickle

filename = "california_predict_RF.pkl"
with open(filename, 'wb') as file:
  pickle.dump(RF_model, file)

In [60]:
with open(filename, 'rb') as file:
  model_rf = pickle.load(file)

In [58]:
model_rf

In [59]:
import pickle

filename = "california_predict_LR.pkl"
with open(filename, 'wb') as file:
  pickle.dump(LR_model, file)

In [61]:
with open(filename, 'rb') as file:
  model_lr = pickle.load(file)

In [62]:
model_lr

In [63]:
# joblib
import joblib

filename = 'RF_model.jbl'
joblib.dump(RF_model, filename)

['RF_model.jbl']

In [64]:
import joblib

filename = 'LR_model.jbl'
joblib.dump(LR_model, filename)

['LR_model.jbl']

In [65]:
filename = 'pipeline.jbl'
joblib.dump(full_pipeline, filename)

['pipeline.jbl']

In [None]:
# finished