In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LinearRegression

In [4]:
housing = pd.read_csv('datasets/housing/housing.csv')

In [5]:
housing["income_cat"] = pd.cut(housing["median_income"], bins=[0., 1.5, 3.0, 4.5, 6., np.inf], labels= [1, 2, 3, 4, 5])

In [6]:
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split (housing, housing['income_cat']):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

In [7]:
strat_test_set

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,income_cat
5241,-118.39,34.12,29.0,6447.0,1012.0,2184.0,960.0,8.2816,500001.0,<1H OCEAN,5
17352,-120.42,34.89,24.0,2020.0,307.0,855.0,283.0,5.0099,162500.0,<1H OCEAN,4
3505,-118.45,34.25,36.0,1453.0,270.0,808.0,275.0,4.3839,204600.0,<1H OCEAN,3
7777,-118.10,33.91,35.0,1653.0,325.0,1072.0,301.0,3.2708,159700.0,<1H OCEAN,3
14155,-117.07,32.77,38.0,3779.0,614.0,1495.0,614.0,4.3529,184000.0,NEAR OCEAN,3
...,...,...,...,...,...,...,...,...,...,...,...
12182,-117.29,33.72,19.0,2248.0,427.0,1207.0,368.0,2.8170,110000.0,<1H OCEAN,2
7275,-118.24,33.99,33.0,885.0,294.0,1270.0,282.0,2.1615,118800.0,<1H OCEAN,2
17223,-119.72,34.44,43.0,1781.0,342.0,663.0,358.0,4.7000,293800.0,<1H OCEAN,4
10786,-117.91,33.63,30.0,2071.0,412.0,1081.0,412.0,4.9125,335700.0,<1H OCEAN,4


In [8]:
for set_ in (strat_train_set, strat_test_set): 
    set_.drop ("income_cat", axis=1, inplace=True)

In [9]:
housing = strat_train_set.drop("median_house_value", axis = 1)
housing_labels = strat_train_set["median_house_value"].copy()

In [10]:
housing_num = housing.drop("ocean_proximity", axis = 1)

In [11]:
imputer = SimpleImputer(strategy = "median")
imputer.fit(housing_num)
X = imputer.transform(housing_num)

In [12]:
median = housing["total_bedrooms"].median()
housing["total_bedrooms"].fillna(median, inplace = True)
housing.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16512 entries, 12655 to 19773
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           16512 non-null  float64
 1   latitude            16512 non-null  float64
 2   housing_median_age  16512 non-null  float64
 3   total_rooms         16512 non-null  float64
 4   total_bedrooms      16512 non-null  float64
 5   population          16512 non-null  float64
 6   households          16512 non-null  float64
 7   median_income       16512 non-null  float64
 8   ocean_proximity     16512 non-null  object 
dtypes: float64(8), object(1)
memory usage: 1.3+ MB


In [13]:
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room=True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self  # nothing else to do
    def transform(self, X):
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)

In [15]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16512 entries, 12655 to 19773
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           16512 non-null  float64
 1   latitude            16512 non-null  float64
 2   housing_median_age  16512 non-null  float64
 3   total_rooms         16512 non-null  float64
 4   total_bedrooms      16512 non-null  float64
 5   population          16512 non-null  float64
 6   households          16512 non-null  float64
 7   median_income       16512 non-null  float64
 8   ocean_proximity     16512 non-null  object 
dtypes: float64(8), object(1)
memory usage: 1.3+ MB


In [16]:

num_pipeline = Pipeline ([
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()), 
    ('std_scaler', StandardScaler()),
])

num_attribs = list(housing_num) 
cat_attribs = ["ocean_proximity"]

full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs)
])

housing_prepared = full_pipeline.fit_transform(housing)

In [17]:
housing_prepared.shape

(16512, 16)

In [18]:
housing.shape

(16512, 9)

In [19]:
housing_prepared

array([[-0.94135046,  1.34743822,  0.02756357, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.17178212, -1.19243966, -1.72201763, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.26758118, -0.1259716 ,  1.22045984, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-1.5707942 ,  1.31001828,  1.53856552, ...,  0.        ,
         0.        ,  0.        ],
       [-1.56080303,  1.2492109 , -1.1653327 , ...,  0.        ,
         0.        ,  0.        ],
       [-1.28105026,  2.02567448, -0.13148926, ...,  0.        ,
         0.        ,  0.        ]])

In [20]:
housing_labels

12655     72100.0
15502    279600.0
2908      82700.0
14053    112500.0
20496    238300.0
           ...   
15174    268500.0
12661     90400.0
19263    140400.0
19140    258100.0
19773     62700.0
Name: median_house_value, Length: 16512, dtype: float64

In [15]:
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

In [16]:
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)

print("Predictions:", lin_reg.predict(some_data_prepared))

Predictions: [ 85657.90192014 305492.60737488 152056.46122456 186095.70946094
 244550.67966089]


In [17]:
print("Labels:", list(some_labels))

Labels: [72100.0, 279600.0, 82700.0, 112500.0, 238300.0]


In [18]:
# some_data = housing.iloc[:2000]
# some_labels = housing_labels.iloc[:2000]
# X = imputer.transform(some_data.drop("ocean_proximity", axis = 1))  
# some_data_prepared = full_pipeline.fit_transform(some_data)
# some_data_prepared.shape

In [19]:
print("Прогнозы: ", lin_reg.predict(some_data_prepared))
print("Метки: ", list(some_labels))

Прогнозы:  [ 85657.90192014 305492.60737488 152056.46122456 186095.70946094
 244550.67966089]
Метки:  [72100.0, 279600.0, 82700.0, 112500.0, 238300.0]


In [20]:
from sklearn.metrics import mean_squared_error
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

68627.87390018745

In [21]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)


In [22]:
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.0

In [23]:
from sklearn.model_selection import cross_val_score
def display_scores(scores):
    print("Суммы оценок: ", scores)
    print("Среднее: ", scores.mean())
    print("Стандартное отклонение: ", scores.std())

In [24]:
scores = cross_val_score(tree_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv = 10)
tree_rmse_scores = np.sqrt(-scores)

In [25]:
display_scores(tree_rmse_scores)

Суммы оценок:  [72498.40673407 71368.42481157 69017.52383745 71758.34357316
 72989.83574297 77980.52652629 73411.93208072 73934.39251723
 68511.14001678 70376.96507811]
Среднее:  72184.74909183549
Стандартное отклонение:  2579.68644202766


In [26]:
lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv = 10)
lin_rmse_scores = np.sqrt(-lin_scores)

In [27]:
display_scores(lin_rmse_scores)

Суммы оценок:  [71774.15825125 64114.99166359 67771.17124356 68627.7242824
 66848.58938823 72527.24321966 74023.75820358 68800.50640085
 66443.28836884 70147.43163952]
Среднее:  69107.88626614638
Стандартное отклонение:  2886.049299226037


In [28]:
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(housing_prepared, housing_labels)

In [29]:
housing_predictions = forest_reg.predict(housing_prepared)
forest_mse = mean_squared_error(housing_labels, housing_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

18635.49384431948

In [30]:
forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv = 10)
forest_rmse_scores = np.sqrt(-forest_scores)

In [31]:
display_scores(forest_rmse_scores)

Суммы оценок:  [51746.44867054 48988.25397702 46951.21976196 52384.07949388
 47660.26645751 51624.3019513  52391.7790523  50069.11949599
 48752.56867038 53845.87568672]
Среднее:  50441.39132175971
Стандартное отклонение:  2174.6099672354253


In [32]:
import joblib
joblib.dump(forest_reg, "forest_reg.pkl")

['forest_reg.pkl']

In [33]:
from sklearn.model_selection import GridSearchCV
param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3,10], 'max_features': [2, 3, 4]},
]

forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(forest_reg, param_grid, cv = 5, scoring = 'neg_mean_squared_error', return_train_score=True)

grid_search.fit(housing_prepared, housing_labels)

In [34]:
grid_search.best_params_

{'max_features': 6, 'n_estimators': 30}

In [35]:
grid_search.best_estimator_

In [36]:
cvrse = grid_search.cv_results_
for mean_score, params in zip(cvrse["mean_test_score"], cvrse["params"]):
    print(np.sqrt(-mean_score), params)

64068.174055057556 {'max_features': 2, 'n_estimators': 3}
55565.49849467017 {'max_features': 2, 'n_estimators': 10}
52642.46695617762 {'max_features': 2, 'n_estimators': 30}
60284.84932615271 {'max_features': 4, 'n_estimators': 3}
53142.30348416233 {'max_features': 4, 'n_estimators': 10}
50513.81574885257 {'max_features': 4, 'n_estimators': 30}
59704.513966039056 {'max_features': 6, 'n_estimators': 3}
51729.88295862407 {'max_features': 6, 'n_estimators': 10}
49744.957926800686 {'max_features': 6, 'n_estimators': 30}
58599.77242617453 {'max_features': 8, 'n_estimators': 3}
52580.70340567057 {'max_features': 8, 'n_estimators': 10}
49863.11479771287 {'max_features': 8, 'n_estimators': 30}
62633.47090480547 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
54421.047860778075 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
60082.58051508525 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
52368.00802669267 {'bootstrap': False, 'max_features': 3, 'n_estimators'

In [37]:
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

array([7.43691622e-02, 6.67912215e-02, 4.36169665e-02, 1.77909281e-02,
       1.67157464e-02, 1.71021882e-02, 1.57028828e-02, 3.33904550e-01,
       5.22150948e-02, 1.08838828e-01, 8.78134409e-02, 1.16236579e-02,
       1.45697206e-01, 1.00080618e-04, 2.68019258e-03, 5.03785408e-03])

In [38]:
extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedroms_per_room"]
cat_encoder = full_pipeline.named_transformers_["cat"]
cat_one_hot_attribs = list(cat_encoder.categories_[0])
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
sorted(zip(feature_importances, attributes), reverse=True)

[(0.3339045503919998, 'median_income'),
 (0.14569720550124174, 'INLAND'),
 (0.10883882755369981, 'pop_per_hhold'),
 (0.08781344090188153, 'bedroms_per_room'),
 (0.07436916219855068, 'longitude'),
 (0.0667912214853125, 'latitude'),
 (0.05221509481950784, 'rooms_per_hhold'),
 (0.04361696646049262, 'housing_median_age'),
 (0.017790928077070455, 'total_rooms'),
 (0.01710218823305605, 'population'),
 (0.016715746447962372, 'total_bedrooms'),
 (0.015702882767654344, 'households'),
 (0.011623657890922404, '<1H OCEAN'),
 (0.005037854077403172, 'NEAR OCEAN'),
 (0.0026801925756284504, 'NEAR BAY'),
 (0.00010008061761623913, 'ISLAND')]

In [39]:
final_model = grid_search.best_estimator_

X_test = strat_test_set.drop("median_house_value", axis = 1)
Y_test = strat_test_set["median_house_value"].copy()
X = imputer.transform(X_test.drop("ocean_proximity", axis = 1))  
X_test_prepared = full_pipeline.transform(X_test)

In [40]:
final_predictions = final_model.predict(X_test_prepared)
final_mse = mean_squared_error(Y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

In [41]:
final_rmse

47525.38177645935

In [42]:
from scipy import stats
confidence = 0.95
squared_erors = (final_predictions - Y_test) ** 2
np.sqrt(stats.t.interval(confidence, len(squared_erors) - 1, loc = squared_erors.mean(), scale = stats.sem(squared_erors)))


array([45549.96909811, 49421.89940864])

In [43]:
from sklearn.svm import SVR

param_grid = [
        {'kernel': ['linear'], 'C': [10., 30., 100., 300., 1000., 3000., 10000., 30000.0]},
        {'kernel': ['rbf'], 'C': [1.0, 3.0, 10., 30., 100., 300., 1000.0],
         'gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0]},
    ]

svm_reg = SVR()
grid_search = GridSearchCV(svm_reg, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=2)
grid_search.fit(housing_prepared, housing_labels)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] END ..............................C=10.0, kernel=linear; total time=   5.2s
[CV] END ..............................C=10.0, kernel=linear; total time=   5.2s
[CV] END ..............................C=10.0, kernel=linear; total time=   5.2s
[CV] END ..............................C=10.0, kernel=linear; total time=   5.1s
[CV] END ..............................C=10.0, kernel=linear; total time=   5.1s
[CV] END ..............................C=30.0, kernel=linear; total time=   5.1s
[CV] END ..............................C=30.0, kernel=linear; total time=   5.0s
[CV] END ..............................C=30.0, kernel=linear; total time=   5.1s
[CV] END ..............................C=30.0, kernel=linear; total time=   5.0s
[CV] END ..............................C=30.0, kernel=linear; total time=   5.1s
[CV] END .............................C=100.0, kernel=linear; total time=   5.0s
[CV] END .............................C=100.0, 

[CV] END .....................C=10.0, gamma=0.01, kernel=rbf; total time=  10.3s
[CV] END .....................C=10.0, gamma=0.01, kernel=rbf; total time=  10.7s
[CV] END .....................C=10.0, gamma=0.01, kernel=rbf; total time=  10.4s
[CV] END .....................C=10.0, gamma=0.01, kernel=rbf; total time=  10.3s
[CV] END .....................C=10.0, gamma=0.03, kernel=rbf; total time=  10.3s
[CV] END .....................C=10.0, gamma=0.03, kernel=rbf; total time=  10.3s
[CV] END .....................C=10.0, gamma=0.03, kernel=rbf; total time=  10.3s
[CV] END .....................C=10.0, gamma=0.03, kernel=rbf; total time=  10.2s
[CV] END .....................C=10.0, gamma=0.03, kernel=rbf; total time=  10.2s
[CV] END ......................C=10.0, gamma=0.1, kernel=rbf; total time=  10.2s
[CV] END ......................C=10.0, gamma=0.1, kernel=rbf; total time=  10.2s
[CV] END ......................C=10.0, gamma=0.1, kernel=rbf; total time=  10.2s
[CV] END ...................

[CV] END .....................C=300.0, gamma=0.1, kernel=rbf; total time=  10.0s
[CV] END .....................C=300.0, gamma=0.1, kernel=rbf; total time=  10.0s
[CV] END .....................C=300.0, gamma=0.3, kernel=rbf; total time=   9.9s
[CV] END .....................C=300.0, gamma=0.3, kernel=rbf; total time=   9.9s
[CV] END .....................C=300.0, gamma=0.3, kernel=rbf; total time=  10.0s
[CV] END .....................C=300.0, gamma=0.3, kernel=rbf; total time=   9.9s
[CV] END .....................C=300.0, gamma=0.3, kernel=rbf; total time=   9.9s
[CV] END .....................C=300.0, gamma=1.0, kernel=rbf; total time=   9.8s
[CV] END .....................C=300.0, gamma=1.0, kernel=rbf; total time=   9.7s
[CV] END .....................C=300.0, gamma=1.0, kernel=rbf; total time=   9.7s
[CV] END .....................C=300.0, gamma=1.0, kernel=rbf; total time=   9.7s
[CV] END .....................C=300.0, gamma=1.0, kernel=rbf; total time=   9.7s
[CV] END ...................

In [45]:
negative_mse = grid_search.best_score_
rmse = np.sqrt(-negative_mse)
rmse

70286.61835658147

In [46]:
grid_search.best_params_

{'C': 30000.0, 'kernel': 'linear'}