In [2]:
!tree ~/Projects/BitSize_ML/

[01;34m/Users/nejat/Projects/BitSize_ML/[0m
└── [01;34mend_to_end_ml[0m
    ├── [01;34mdata[0m
    │   └── [00mhousing.csv[0m
    ├── [01;34mimages[0m
    │   └── [00mENV checking.png[0m
    ├── [01;34mmodels[0m
    └── [01;34mnotebooks[0m
        ├── [00m1_setup.ipynb[0m
        ├── [00m2_fetch_data.ipynb[0m
        ├── [00m3_overview_EDA.ipynb[0m
        ├── [00m4_data_split.ipynb[0m
        ├── [00m5_stratified_split.ipynb[0m
        ├── [00m6_visualizing_geospatial_data.ipynb[0m
        ├── [00m7_correlation_pairs.ipynb[0m
        ├── [00m8_feature_engineering.ipynb[0m
        └── [00m9_handling_missing_data.ipynb[0m

6 directories, 11 files


In [3]:
import pandas as pd, numpy as np
from pathlib import Path
from sklearn.impute import SimpleImputer

def create_income_bins(df, bins=None, labels=None):
    """Return a copy of ``df`` with an ``income_cat`` column binned from ``median_income``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``median_income`` column. It is not modified.
    bins : sequence of numbers, optional
        Ascending bin edges handed to ``pd.cut``. Defaults to
        ``[0, 1.5, 3, 4.5, 6, np.inf]``.
    labels : sequence, optional
        One label per bin. Defaults to ``1 .. len(bins) - 1``, i.e.
        ``[1, 2, 3, 4, 5]`` for the default edges.

    Returns
    -------
    pandas.DataFrame
        A new frame with the additional categorical ``income_cat`` column.
        Intervals are right-closed, so values at or below the lowest edge
        (or above the highest) end up as NaN.

    Raises
    ------
    KeyError
        If ``df`` has no ``median_income`` column.
    """
    if bins is None:
        bins = [0, 1.5, 3, 4.5, 6, np.inf]
    if labels is None:
        # One consecutive integer label per interval between adjacent edges.
        labels = list(range(1, len(bins)))
    income_cat = pd.cut(df['median_income'], bins=bins, labels=labels)
    # assign() works on a copy, so the caller's frame is left untouched.
    return df.assign(income_cat=income_cat)

# The dataset lives in the data/ folder that sits next to the current working
# directory's folder (i.e. <cwd>/../data/housing.csv).
CSV = Path.cwd().parent.joinpath("data", "housing.csv")
df = pd.read_csv(CSV)

# Derive the income_cat column on a copy; df itself keeps the raw columns.
df_binned = create_income_bins(df)

In [4]:
from sklearn.model_selection import StratifiedShuffleSplit

# Single stratified 80/20 split on income_cat; the fixed seed keeps it reproducible.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=27)

# split() yields *positional* row indices, so rows must be selected with .iloc.
# Label-based .loc only returns the same rows while the frame carries a default
# RangeIndex, and would silently pick wrong rows after any filtering/reindexing.
# n_splits=1, so take the one and only split directly instead of looping.
train_idx, test_idx = next(split.split(df_binned, df_binned['income_cat']))

# income_cat was only needed to stratify; drop it from both subsets.
train_strat = df_binned.iloc[train_idx].drop(columns=['income_cat'])
test_strat = df_binned.iloc[test_idx].drop(columns=['income_cat'])
print(f"[Stratified], train_size={len(train_strat)}, test_size={len(test_strat)}")


[Stratified], train_size=16512, test_size=4128


In [5]:
# Work on an independent (deep) copy so the experiments below never touch train_strat.
df_train = train_strat.copy(deep=True)
# Column overview: the non-null counts reveal which features have missing values.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16512 entries, 15114 to 4065
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           16512 non-null  float64
 1   latitude            16512 non-null  float64
 2   housing_median_age  16512 non-null  float64
 3   total_rooms         16512 non-null  float64
 4   total_bedrooms      16351 non-null  float64
 5   population          16512 non-null  float64
 6   households          16512 non-null  float64
 7   median_income       16512 non-null  float64
 8   median_house_value  16512 non-null  float64
 9   ocean_proximity     16512 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.4+ MB


## 1. Drop the entire feature (`total_bedrooms`)

In [6]:
# Option 1: remove the incomplete column altogether (returns a new frame).
df_drop = df_train.drop(columns='total_bedrooms')
df_drop.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16512 entries, 15114 to 4065
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           16512 non-null  float64
 1   latitude            16512 non-null  float64
 2   housing_median_age  16512 non-null  float64
 3   total_rooms         16512 non-null  float64
 4   population          16512 non-null  float64
 5   households          16512 non-null  float64
 6   median_income       16512 non-null  float64
 7   median_house_value  16512 non-null  float64
 8   ocean_proximity     16512 non-null  object 
dtypes: float64(8), object(1)
memory usage: 1.3+ MB


## 2. Drop the rows with NA values

In [7]:
# Option 2: discard every row that has a missing value in any column
# (returns a new frame; df_train is unchanged).
df_na_drop = df_train.dropna(axis="index", how="any")
df_na_drop.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16351 entries, 15114 to 4065
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           16351 non-null  float64
 1   latitude            16351 non-null  float64
 2   housing_median_age  16351 non-null  float64
 3   total_rooms         16351 non-null  float64
 4   total_bedrooms      16351 non-null  float64
 5   population          16351 non-null  float64
 6   households          16351 non-null  float64
 7   median_income       16351 non-null  float64
 8   median_house_value  16351 non-null  float64
 9   ocean_proximity     16351 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.4+ MB


## 3. Fill the NA values (a.k.a. imputation)

#### (1) Impute with `fillna()`

In [8]:
# Option 3a: replace missing total_bedrooms values with the column median.
median = df_train['total_bedrooms'].median()
# assign() builds a new frame, so df_train keeps its original NaNs.
df_filled = df_train.assign(
    total_bedrooms=df_train['total_bedrooms'].fillna(median)
)
df_filled.info()


<class 'pandas.core.frame.DataFrame'>
Index: 16512 entries, 15114 to 4065
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           16512 non-null  float64
 1   latitude            16512 non-null  float64
 2   housing_median_age  16512 non-null  float64
 3   total_rooms         16512 non-null  float64
 4   total_bedrooms      16512 non-null  float64
 5   population          16512 non-null  float64
 6   households          16512 non-null  float64
 7   median_income       16512 non-null  float64
 8   median_house_value  16512 non-null  float64
 9   ocean_proximity     16512 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.4+ MB


#### (2) Impute with `SimpleImputer`

In [9]:
# Option 3b: median-impute all numeric columns at once with scikit-learn.
# The imputer is applied to numeric columns only; non-numeric ones are left out here.
df_num = df_train.select_dtypes(include="number")

imputer = SimpleImputer(strategy='median')
# Learn the per-column medians and fill the gaps in a single step.
X = imputer.fit_transform(df_num)

# Re-wrap the result in a DataFrame, restoring the original column names and row index.
df_num_imputed = pd.DataFrame(X, index=df_num.index, columns=df_num.columns)
df_num_imputed.info()



<class 'pandas.core.frame.DataFrame'>
Index: 16512 entries, 15114 to 4065
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           16512 non-null  float64
 1   latitude            16512 non-null  float64
 2   housing_median_age  16512 non-null  float64
 3   total_rooms         16512 non-null  float64
 4   total_bedrooms      16512 non-null  float64
 5   population          16512 non-null  float64
 6   households          16512 non-null  float64
 7   median_income       16512 non-null  float64
 8   median_house_value  16512 non-null  float64
dtypes: float64(9)
memory usage: 1.3 MB
