In [80]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sklearn

In [81]:
# Load the raw housing data from the CSV file next to the notebook.
housing = pd.read_csv("housing.csv")

In [82]:
# Preview the first five rows of the raw frame.
housing.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [83]:
# Print column dtypes and non-null counts to spot columns with missing values.
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


W kolumnie total_bedrooms brakuje 207 wartości. Zastąpimy je medianą wyznaczoną z pozostałych, kompletnych rekordów.

In [84]:
from sklearn.impute import SimpleImputer

# Imputer that replaces missing values with the per-column median.
imputer = SimpleImputer(strategy="median")

Jednak najpierw musimy zająć się kolumną z danymi kategorycznymi (ocean_proximity). Zastosujemy tu OneHotEncoding.

In [85]:
# One-hot encode the categorical column, then swap it for the indicator columns
# (the indicators are appended after the remaining original columns).
ocean_proximity_encoded = pd.get_dummies(housing['ocean_proximity'])
housing = pd.concat(
    [housing.drop(columns='ocean_proximity'), ocean_proximity_encoded],
    axis=1,
)

Teraz możemy uzupełnić brakujące wartości total_bedrooms.

In [86]:
# Fit the median imputer and fill the gaps in a single step.
# NOTE(review): the imputer is fitted on the whole frame, i.e. before the
# train/test split performed later in the notebook - confirm this is intended.
result = imputer.fit_transform(housing)

# The imputer returns a bare array, so the column labels are restored here.
transformed_df = pd.DataFrame(result, columns=housing.columns)
transformed_df.info()
housing = transformed_df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20640 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   <1H OCEAN           20640 non-null  float64
 10  INLAND              20640 non-null  float64
 11  ISLAND              20640 non-null  float64
 12  NEAR BAY            20640 non-null  float64
 13  NEAR OCEAN          20640 non-null  float64
dtypes: float64(14)
memory usage: 2.2 MB


Następnie stworzymy dodatkowe kolumny, które aż się proszą o dodanie do ramki danych. Będzie to np. rooms_per_household czy population_per_household.

In [87]:
# Derived ratio features, added as new columns on the existing frame.
housing['rooms_per_household'] = housing['total_rooms'].div(housing['households'])
housing['population_per_household'] = housing['population'].div(housing['households'])
housing['rooms_per_person'] = housing['total_rooms'].div(housing['population'])

housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN,rooms_per_household,population_per_household,rooms_per_person
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,0.0,0.0,0.0,1.0,0.0,6.984127,2.555556,2.732919
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,0.0,0.0,0.0,1.0,0.0,6.238137,2.109842,2.956685
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,0.0,0.0,0.0,1.0,0.0,8.288136,2.80226,2.957661
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,0.0,0.0,0.0,1.0,0.0,5.817352,2.547945,2.283154
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,0.0,0.0,0.0,1.0,0.0,6.281853,2.181467,2.879646


Teraz sprawdźmy, czy nowe kolumny są przydatne. Sprawdźmy jak ich wartości korelują z ceną domu.

In [88]:
# Correlation of every column with the target, sorted from the most positive
# to the most negative.
correlation = (
    housing.corr()['median_house_value']
    .sort_values(ascending=False)
)
correlation

median_house_value          1.000000
median_income               0.688075
<1H OCEAN                   0.256617
rooms_per_person            0.209482
NEAR BAY                    0.160284
rooms_per_household         0.151948
NEAR OCEAN                  0.141862
total_rooms                 0.134153
housing_median_age          0.105623
households                  0.065843
total_bedrooms              0.049457
ISLAND                      0.023416
population_per_household   -0.023737
population                 -0.024650
longitude                  -0.045967
latitude                   -0.144160
INLAND                     -0.484859
Name: median_house_value, dtype: float64

Jak widać, rooms_per_person i rooms_per_household są bardzo przydatnymi wartościami (mają wysoki moduł korelacji). Nie dotyczy to population_per_household, która jest drugą najgorszą (najmniej znaczącą dla ceny) zmienną. Usuńmy więc tę kolumnę. Dla czytelności przenieśmy kolumnę będącą targetem na ostatnie miejsce.

In [89]:
# Discard the derived feature that barely correlates with the target.
housing = housing.drop(columns='population_per_household')

Usuwamy obserwacje, w których zmienna celu jest bliska 500 000 (w kodzie odrzucamy wszystkie wartości od 490 000 wzwyż). Te wartości zostały ucięte i zaburzają model. Konsultowane z prowadzącą.

In [90]:
# Independent snapshot of the current frame (DataFrame.copy is deep by default).
old_housing = housing.copy()

In [91]:
# Keep only rows whose target is strictly below 490,000, then renumber the index.
below_threshold = housing['median_house_value'] < 4.9e5
housing = housing[below_threshold].reset_index(drop=True)

Zostaje nam teraz podzielić nasz zbiór na testowy i treningowy i nauczyć model.

In [93]:
from sklearn.model_selection import train_test_split

# Separate the predictors from the target, then hold out 30% of rows for testing.
features = housing.drop(columns='median_house_value')
target = housing['median_house_value']

X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.30, random_state=42
)

### Model RandomForest - czarna skrzynka
Parametry dobrane metodą `GridSearchCV`.

In [97]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# random_state pins the forest's internal randomness so that the fitted model
# and the RMSE printed below are reproducible on "Restart & Run All"; the value
# matches the seed already used for the train/test split.
random_forest = RandomForestRegressor(
    n_estimators=450,
    max_depth=8,
    random_state=42,
)
random_forest.fit(X_train, y_train)

# Evaluate on the held-out test set.
y_test_hat = random_forest.predict(X_test)

print(f'RMSE : {np.sqrt(mean_squared_error(y_test, y_test_hat)):.3f}')

RMSE : 50921.137


In [99]:
import pickle

# Serialize the fitted model. The context manager guarantees the file handle is
# flushed and closed even if pickling fails (the bare open() call leaked it).
with open("random_forest", 'wb') as model_file:
    pickle.dump(random_forest, model_file)

# Save the preprocessed frame; with pandas defaults the row index is written
# as the first column of the CSV.
housing.to_csv('housing_preprocessed.csv')