In [2]:
import os
import pandas as pd
import numpy as np

def engineer_features(data, threshold=0.7):
    """Return a feature-engineered copy of the housing frame.

    Steps, in order:
      1. Fill missing ``ocean_proximity`` values with the most frequent label.
      2. Keep only columns, then rows, whose share of missing values is
         strictly below ``threshold``.
      3. Add ``water_access`` ('Yes' / 'No' / 'Other') derived from
         ``ocean_proximity``.
      4. One-hot encode ``ocean_proximity`` and drop the source column.
      5. Bin ``housing_median_age`` into ``age_bin``.
      6. Add log transforms of ``latitude`` (``log_1`` and ``log``).
      7. Fill every remaining missing value with 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``ocean_proximity``, ``housing_median_age`` and
        ``latitude``. The input frame is not modified.
    threshold : float, default 0.7
        Missing-value rate at or above which a column / row is dropped.

    Returns
    -------
    pandas.DataFrame
        A new frame with the engineered columns.
    """
    # Work on a copy so the caller's frame stays untouched and the column
    # assignments below never act on a view of it.
    data = data.copy()

    # Impute the categorical column with its most frequent label. The result
    # is assigned back: calling fillna(inplace=True) on a column selection is
    # chained assignment and does not reliably update the parent frame.
    proximity = data['ocean_proximity']
    data['ocean_proximity'] = proximity.fillna(proximity.value_counts().idxmax())

    # Drop columns, then rows, that are mostly missing.
    data = data.loc[:, data.isnull().mean() < threshold]
    data = data.loc[data.isnull().mean(axis=1) < threshold].copy()

    # Categorical: the first matching label wins (np.select semantics).
    water_access_by_label = {
        '<1H OCEAN': 'No',
        'INLAND': 'No',
        'NEAR BAY': 'Yes',
        'ISLAND': 'Yes',
        'NEAR OCEAN': 'Yes',
    }
    # na=False keeps every condition strictly boolean, which np.select needs.
    conditions = [
        data['ocean_proximity'].str.contains(label, regex=False, na=False)
        for label in water_access_by_label
    ]
    choices = list(water_access_by_label.values())
    data['water_access'] = np.select(conditions, choices, default='Other')

    # One-hot encode and give the indicator columns identifier-friendly names.
    encoded_columns = pd.get_dummies(data['ocean_proximity'])
    data = data.join(encoded_columns).drop('ocean_proximity', axis=1)
    data = data.rename(columns={
        '<1H OCEAN': 'OH_OCEAN',
        'NEAR BAY': 'NEAR_BAY',
        'NEAR OCEAN': 'NEAR_OCEAN',
    })

    # Numerical bins; stored as plain strings rather than a categorical dtype.
    data['age_bin'] = pd.cut(
        data['housing_median_age'],
        bins=[0, 20, 50, 120],
        labels=["Young", "Middle-Aged", "Aged"],
    ).astype(str)

    # NOTE(review): 'latitude2' holds eight hard-coded sample values aligned on
    # index labels 0-7; every other row ends up as 0 after the fill below. It
    # looks like leftover example data -- kept so the output schema does not
    # change; confirm whether it can be removed.
    data['latitude2'] = pd.Series([2, 45, -23, 85, 28, 2, 35, -12])

    # Log transforms. 'log' shifts by the column minimum first so that the
    # argument of the logarithm is always >= 1, even for negative latitudes.
    data['log_1'] = np.log(data['latitude'] + 1)
    data['log'] = np.log(data['latitude'] - data['latitude'].min() + 1)

    # Fill all remaining missing values with 0. A median fill placed after this
    # step could never change anything (no NaN is left), and DataFrame.median()
    # raises TypeError on the string columns in recent pandas versions, which
    # would abort the run before the CSV is written -- so it is not called.
    data = data.fillna(0)
    return data


try:
    #dataset from cli
    data = engineer_features(pd.read_csv('./housing.csv'))
    #writing to new file
    data.to_csv('./housing_revised.csv', index=False, na_rep='Unknown')
    # Report success only once the file has actually been written.
    print("Feature engineering tasks completed successfully")
except Exception as e:
    # Best-effort cell: report the failure instead of stopping the notebook.
    print("Error in feature engineering: {0}".format(str(e)))

Feature engineering tasks completed successfully


In [3]:
# Save the engineered frame a second time under a "V2" file name and echo it.
# Relies on `data` having been created by the feature-engineering cell above.
try:
    print("Feature creating second file")
    # Same CSV options as the first export: no index column, and any missing
    # value is written as the text 'Unknown'.
    data.to_csv(
        './housing_revised_V2.csv',
        na_rep='Unknown',
        index=False,
    )
    # Plain-text dump of the frame to the cell output.
    print(data)
except Exception as err:
    # Best-effort: report the problem and let the notebook carry on.
    print("Error in feature engineering: {0}".format(str(err)))

Feature creating second file
       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0        -122.23     37.88                41.0        880.0           129.0   
1        -122.22     37.86                21.0       7099.0          1106.0   
2        -122.24     37.85                52.0       1467.0           190.0   
3        -122.25     37.85                52.0       1274.0           235.0   
4        -122.25     37.85                52.0       1627.0           280.0   
...          ...       ...                 ...          ...             ...   
20635    -121.09     39.48                25.0       1665.0           374.0   
20636    -121.21     39.49                18.0        697.0           150.0   
20637    -121.22     39.43                17.0       2254.0           485.0   
20638    -121.32     39.43                18.0       1860.0           409.0   
20639    -121.24     39.37                16.0       2785.0           616.0   

       population  hou