In [1]:
import pandas as pd
import numpy as np
import sklearn

In [2]:
# Read the housing CSV directly from the handson-ml2 GitHub repository.
url = "https://github.com/ageron/handson-ml2/blob/master/datasets/housing/housing.csv?raw=true"
df = pd.read_csv(filepath_or_buffer=url)

In [3]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

# Separate the target column from the predictors of the training split.
y = train_set['median_house_value'].copy()
x_train = train_set.drop(columns="median_house_value")

# Numeric-only view of the training predictors (the text column is removed).
x_num = x_train.drop(columns='ocean_proximity')

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin

# Column positions needed by CombinedAttributesAdder. They are looked up by
# name in the numeric frame instead of being hard-coded, so they stay correct
# if the column order of the dataset ever changes.
rooms_ix, bedrooms_ix, population_ix, households_ix = (
    x_num.columns.get_loc(col_name)
    for col_name in ("total_rooms", "total_bedrooms", "population", "households")
)

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends ratio features computed from existing columns.

    Columns are addressed by position through the module-level indices
    ``rooms_ix``, ``bedrooms_ix``, ``population_ix`` and ``households_ix``.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default=True
        When true, a ``bedrooms / rooms`` column is appended in addition to
        the two per-household ratios.
    """

    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Learn nothing from the data and return the transformer itself."""
        return self  # this class only transforms; it is not an estimator

    def transform(self, X):
        """Return ``X`` with the ratio columns appended on the right.

        Parameters
        ----------
        X : array-like, 2-D
            Input whose columns at the module-level index positions hold the
            rooms, bedrooms, population and households values.

        Returns
        -------
        numpy.ndarray
            ``X`` followed by rooms-per-household, population-per-household
            and, if ``add_bedrooms_per_room`` is true, bedrooms-per-room.
        """
        # Positional slicing such as X[:, i] only works on a NumPy array, so
        # convert other array-likes (e.g. a DataFrame) first. For an ndarray
        # this is a no-op.
        X = np.asarray(X)
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:  # the bedrooms_per_room column is optional
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

In [6]:
# Pipeline for the numeric columns

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numeric preprocessing: fill missing values with the column median, append
# the engineered ratio columns, then standardise every feature.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])


In [7]:
# Pipeline for the text columns

In [8]:
from sklearn.compose import ColumnTransformer

# Column names routed to each branch of the preprocessing.
cat_attribs = ['ocean_proximity']
num_attribs = x_num.columns.tolist()

# Numeric columns go through num_pipeline; the text column is one-hot encoded.
full_pipeline = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_attribs),
    ('cat', OneHotEncoder(), cat_attribs),
])

In [9]:
# Fit the preprocessing on the training predictors and transform them in one step.
x_prepared = full_pipeline.fit_transform(x_train)

In [10]:
# Linear regression

In [11]:
from sklearn.linear_model import LinearRegression

# Linear regression estimator created with scikit-learn's default settings.
lr_model = LinearRegression()

In [12]:
# Train the linear model on the preprocessed training features and their labels.
# NOTE: x_prepared and y are rebound to the full dataset in the cross-validation
# cell further down, so re-running this cell after that one would fit on all rows.
lr_model.fit(x_prepared, y)

LinearRegression()

In [13]:
# Draw 10 rows from the training predictors for a quick look at the model's
# predictions; the fixed seed makes the same rows come back on every run.
# Despite the name, these rows come from the training split, not from test_set.
test_data = x_train.sample(10, random_state=42)

In [15]:
# Look up the true target values of the sampled rows by their index labels.
test_labels = y.loc[test_data.index]
test_labels

1126      87300.0
8052     219100.0
6707     258300.0
416      335600.0
5974     180500.0
13223    165500.0
12856     72700.0
11465    252900.0
2027      60600.0
913      287600.0
Name: median_house_value, dtype: float64

In [16]:
# Apply the already-fitted preprocessing to the sample (transform only, no refit).
test_data_prepared = full_pipeline.transform(test_data)

In [17]:
# Predict the target for the sampled rows with the linear model.
predicted_labels = lr_model.predict(test_data_prepared)

In [18]:
# Display the raw predictions.
predicted_labels

array([ 66493.34601784, 265342.79456962, 244572.24448134, 304799.4726227 ,
       162352.83683808, 172224.5899674 ,  94926.49077482, 220668.69490137,
        93541.55156377, 288808.70890048])

In [19]:
# Side-by-side table: 'Bashorat' holds the predictions, 'Asl_qiymat' the true values.
pd.DataFrame({'Bashorat':predicted_labels,'Asl_qiymat':test_labels})

Unnamed: 0,Bashorat,Asl_qiymat
1126,66493.346018,87300.0
8052,265342.79457,219100.0
6707,244572.244481,258300.0
416,304799.472623,335600.0
5974,162352.836838,180500.0
13223,172224.589967,165500.0
12856,94926.490775,72700.0
11465,220668.694901,252900.0
2027,93541.551564,60600.0
913,288808.7089,287600.0


In [20]:
# Evaluating the model

In [21]:
# Show the held-out split produced by train_test_split.
test_set

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
20046,-119.01,36.06,25.0,1505.0,,1392.0,359.0,1.6812,47700.0,INLAND
3024,-119.46,35.14,30.0,2943.0,,1565.0,584.0,2.5313,45800.0,INLAND
15663,-122.44,37.80,52.0,3830.0,,1310.0,963.0,3.4801,500001.0,NEAR BAY
20484,-118.72,34.28,17.0,3051.0,,1705.0,495.0,5.7376,218600.0,<1H OCEAN
9814,-121.93,36.62,34.0,2351.0,,1063.0,428.0,3.7250,278000.0,NEAR OCEAN
...,...,...,...,...,...,...,...,...,...,...
15362,-117.22,33.36,16.0,3165.0,482.0,1351.0,452.0,4.6050,263300.0,<1H OCEAN
16623,-120.83,35.36,28.0,4323.0,886.0,1650.0,705.0,2.7266,266800.0,NEAR OCEAN
18086,-122.05,37.31,25.0,4111.0,538.0,1585.0,568.0,9.2298,500001.0,<1H OCEAN
2144,-119.76,36.77,36.0,2507.0,466.0,1227.0,474.0,2.7850,72300.0,INLAND


In [23]:
# Predictors of the held-out split: every column except the target.
x_test = test_set.drop(columns='median_house_value')
x_test

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
20046,-119.01,36.06,25.0,1505.0,,1392.0,359.0,1.6812,INLAND
3024,-119.46,35.14,30.0,2943.0,,1565.0,584.0,2.5313,INLAND
15663,-122.44,37.80,52.0,3830.0,,1310.0,963.0,3.4801,NEAR BAY
20484,-118.72,34.28,17.0,3051.0,,1705.0,495.0,5.7376,<1H OCEAN
9814,-121.93,36.62,34.0,2351.0,,1063.0,428.0,3.7250,NEAR OCEAN
...,...,...,...,...,...,...,...,...,...
15362,-117.22,33.36,16.0,3165.0,482.0,1351.0,452.0,4.6050,<1H OCEAN
16623,-120.83,35.36,28.0,4323.0,886.0,1650.0,705.0,2.7266,NEAR OCEAN
18086,-122.05,37.31,25.0,4111.0,538.0,1585.0,568.0,9.2298,<1H OCEAN
2144,-119.76,36.77,36.0,2507.0,466.0,1227.0,474.0,2.7850,INLAND


In [24]:
# True target values of the held-out split, taken as an independent copy of the column.
y_test = test_set['median_house_value'].copy()
y_test

20046     47700.0
3024      45800.0
15663    500001.0
20484    218600.0
9814     278000.0
           ...   
15362    263300.0
16623    266800.0
18086    500001.0
2144      72300.0
3665     151500.0
Name: median_house_value, Length: 4128, dtype: float64

In [25]:
# First, we need to pass the data through the pipeline

In [26]:
# Transform only: the pipeline was fitted on x_train and is not refit on the test predictors.
x_test_prepared = full_pipeline.transform(x_test)

In [27]:
# Predict the target for the held-out split with the linear model and show the result.
y_predicted = lr_model.predict(x_test_prepared)
y_predicted

array([ 61874.25460143, 121853.52511139, 267770.94368091, ...,
       447837.04647878, 117275.9214608 , 185597.46125194])

In [28]:
# We use model evaluation metrics

In [29]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the linear model on the held-out split.
mae = mean_absolute_error(y_true=y_test, y_pred=y_predicted)

print('MAE =', mae)

MAE = 50898.73953494079


In [31]:
from sklearn.metrics import mean_squared_error

# mean_squared_error returns the MSE; take its square root to obtain the RMSE,
# so that each variable holds what its name says.
mse = mean_squared_error(y_test, y_predicted)
rmse = np.sqrt(mse)

print('RMSE =', rmse)

RMSE = 72701.32600762135


In [32]:
# RandomForest

In [33]:
from sklearn.ensemble import RandomForestRegressor

# Random forests are randomised; fixing the seed makes the fitted model and
# every score computed from it reproducible across runs.
rf_model = RandomForestRegressor(random_state=42)

rf_model.fit(x_prepared, y)

RandomForestRegressor()

In [34]:
# Predict on the held-out split with the random forest.
# NOTE: this rebinds y_predicted, which previously held the linear model's predictions.
y_predicted = rf_model.predict(x_test_prepared)

In [35]:
from sklearn.metrics import mean_squared_error

# Test-set RMSE of the random forest: square root of the mean squared error.
mse = mean_squared_error(y_true=y_test, y_pred=y_predicted)

print('rmse = ',np.sqrt(mse))

rmse =  50266.14227945418


In [36]:
# cross validation

In [40]:
# Build features and labels from the complete dataset for cross-validation.
# NOTE(review): this rebinds x_prepared and y (training-only until now), and the
# full data includes the test_set rows. The pipeline is not refit here, so the
# transform relies on what was learned from x_train.
y = df['median_house_value'].copy()
x = df.drop(columns='median_house_value')

x_prepared = full_pipeline.transform(x)

In [45]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the linear model. The scorer returns negated MSE
# (so that larger is better), which is why the values are negative.
mse_scores = cross_val_score(lr_model, x_prepared, y, scoring='neg_mean_squared_error', cv=5)
mse_scores

array([-5.38681502e+09, -5.59717065e+09, -5.68997624e+09, -5.86890635e+09,
       -4.38197413e+09])

In [58]:
def display_scores(scores):
    """Print the given scores followed by their mean and standard deviation.

    ``scores`` must provide ``mean()`` and ``std()`` methods (for example a
    NumPy array). Nothing is returned.
    """
    print("Scores :", scores)
    # Each summary line is a label plus the result of the named method.
    for label, method_name in (('mean :', 'mean'), ('Std.dev:', 'std')):
        print(label, getattr(scores, method_name)())

In [59]:
# Negate and take the square root to turn the negated MSE values into one RMSE per fold.
display_scores(np.sqrt(-mse_scores))

Scores : [73394.92502922 74814.24096819 75431.93119241 76608.78768825
 66196.48128669]
mean : 73289.27323295095
Std.dev: 3694.7136787223762


In [60]:
# 10-fold cross-validation of the random forest (negated MSE per fold).
scores = cross_val_score(rf_model, x_prepared, y, scoring='neg_mean_squared_error', cv=10)

# These scores belong to the random forest, so the variable is named after
# that model rather than after the linear regression.
rf_rmse_scores = np.sqrt(-scores)

display_scores(rf_rmse_scores)

Scores : [96045.81102524 47567.53372123 64942.93793972 56921.96156666
 61155.54389331 60404.11130239 46835.75624935 79004.2725837
 74417.55817938 49175.46811949]
mean : 63647.0954580476
Std.dev: 14885.41354949942


In [62]:
# joblib

In [64]:
import joblib

filename = 'lr_model.jbl' # any file name may be used here
# Persist the fitted linear model to disk. Only the estimator is saved here,
# not full_pipeline.
joblib.dump(lr_model, filename)

['lr_model.jbl']

In [65]:
# Loading the file

In [66]:
# Restore the estimator from disk. joblib.load unpickles the file, so it should
# only be used on files from a trusted source.
model = joblib.load(filename)

In [67]:
# Repeat the 5-fold cross-validation with the reloaded model; the printed scores
# can be compared with those obtained from lr_model earlier.
scores = cross_val_score(model, x_prepared, y, scoring='neg_mean_squared_error', cv=5)

# Convert the negated MSE of each fold into an RMSE.
lr_rmse_scores = np.sqrt(-scores)

display_scores(lr_rmse_scores)

Scores : [73394.92502922 74814.24096819 75431.93119241 76608.78768825
 66196.48128669]
mean : 73289.27323295095
Std.dev: 3694.7136787223762
