In [453]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer


In [454]:
PATH = "housing.csv"
#load data and read into dataframe
califonia_data = pd.read_csv(PATH)

#drop down NaN and display dataframe
#califonia_data.describe()

In [455]:
califonia_data.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

In [456]:
#setting prediction target(house value)
y = califonia_data.median_house_value

In [457]:
#features to be considered for prediction
features = ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'population', 'households', 'median_income', 'total_bedrooms', 'ocean_proximity',]

#setting Features
X = califonia_data[features]
X.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,population,households,median_income,total_bedrooms
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20433.0
mean,-119.569704,35.631861,28.639486,2635.763081,1425.476744,499.53968,3.870671,537.870553
std,2.003532,2.135952,12.585558,2181.615252,1132.462122,382.329753,1.899822,421.38507
min,-124.35,32.54,1.0,2.0,3.0,1.0,0.4999,1.0
25%,-121.8,33.93,18.0,1447.75,787.0,280.0,2.5634,296.0
50%,-118.49,34.26,29.0,2127.0,1166.0,409.0,3.5348,435.0
75%,-118.01,37.71,37.0,3148.0,1725.0,605.0,4.74325,647.0
max,-114.31,41.95,52.0,39320.0,35682.0,6082.0,15.0001,6445.0


In [458]:
# split data to get training and validation data
train_X, val_X, train_y, val_y = train_test_split(X,y, random_state=0)

#train_X.shape

In [459]:
# define function for prediting and evaluating our dataset
def score_all(train_X, val_X, train_y, val_y):
    model = RandomForestRegressor(random_state=1)
    model.fit(train_X, train_y)
    preds = model.predict(val_X)
    mae = mean_absolute_error(val_y,preds)
    return mae

In [460]:
# columns with categorical variables 
s = (train_X.dtypes == 'object')
print(s)
object_cols = list(s[s].index)
print("categorical variables")
print(object_cols)

longitude             False
latitude              False
housing_median_age    False
total_rooms           False
population            False
households            False
median_income         False
total_bedrooms        False
ocean_proximity        True
dtype: bool
categorical variables
['ocean_proximity']


In [461]:
# first approach is to drop thes columns 
train_X_dropped = train_X.select_dtypes(exclude=['object'])
val_X_dropped = val_X.select_dtypes(exclude=['object'])

In [462]:
# second approach is to ordinal encoding
from sklearn.preprocessing import OrdinalEncoder

#make copy to avoid changing original data
label_X_train = train_X.copy()
label_X_val = val_X.copy()

#apply ordinal encoder to each column with categorical data
ordinal_encoder = OrdinalEncoder()

label_X_train[object_cols] = ordinal_encoder.fit_transform(train_X[object_cols])
label_X_val[object_cols] = ordinal_encoder.fit_transform(val_X[object_cols])

In [463]:
# 3rd approach for handling categorical variables
from sklearn.preprocessing import OneHotEncoder

#apply one-hot encoder to each column with categorical data
OH_encoder = OneHotEncoder(handle_unknown= 'ignore', sparse= False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(train_X[object_cols]))
OH_cols_val = pd.DataFrame(OH_encoder.fit_transform(val_X[object_cols]))

#one hot encoding removed index. put it back
OH_cols_train.index = train_X.index
OH_cols_val.index = val_X.index

#removed categorical columns (will replace with one-hot encoding)
num_train_X = train_X.drop(object_cols, axis=1)
num_val_X = val_X.drop(object_cols, axis=1)

# add one - hot encoded columns to numerical features
OH_X_train = pd.concat([num_train_X, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_val_X, OH_cols_val], axis=1)

In [464]:
# First (1st) approach to handlling missing data, get columns with missing data
col_with_missing = [col for col in label_X_train.columns
                   if label_X_train[col].isnull().any()]

train_X_reduced = label_X_train.drop(col_with_missing, axis=1)
val_X_reduced = label_X_val.drop(col_with_missing, axis=1)

print ("MAE from dropping columns")
score_all(train_X_reduced, val_X_reduced, train_y, val_y)

MAE from dropping columns


32195.37646511628

In [465]:
# second approach to handling missing data
from sklearn.impute import SimpleImputer

# Fill in the lines below: imputation
my_imputer = SimpleImputer() # Your code here
imputed_train_X = pd.DataFrame(my_imputer.fit_transform(label_X_train))
imputed_val_X = pd.DataFrame(my_imputer.transform(label_X_val))

# Fill in the lines below: imputation removed column names; put them back
imputed_train_X.columns = label_X_train.columns
imputed_val_X.columns = label_X_val.columns

print("MAE from 2nd approach ")
score_all(imputed_train_X, imputed_val_X, train_y, val_y)

MAE from 2nd approach 


32406.613267441862

In [466]:
#thrid (3rd) approach to hadling missing data

# making copy of the data to avoid changing originall data
train_X_plus = OH_X_train.copy()
val_X_plus  = OH_X_valid.copy()

#find columns with missing data
col_with_missing = [col for col in OH_X_train.columns
                   if OH_X_train[col].isnull().any()]

#looping through the missing columns to add extra information
for col in col_with_missing:
    train_X_plus[col + '__was missing'] = train_X_plus[col].isnull()
    val_X_plus[col + '__was missing'] = val_X_plus[col].isnull()
    
#imputer
my_imputer = SimpleImputer()
imputed_train_X_plus = pd.DataFrame(my_imputer.fit_transform(train_X_plus))
imputed_val_X_plus = pd.DataFrame(my_imputer.fit_transform(val_X_plus))

#fix column names
imputed_train_X_plus.columns = train_X_plus.columns
imputed_val_X_plus.columns = val_X_plus.columns

print("MAE from 3rd approach ")
print(score_all(imputed_train_X_plus, imputed_val_X_plus, train_y, val_y))



MAE from 3rd approach 
31781.240424418604




In [467]:
#define the model with a random state equals 1
califonia_model = DecisionTreeRegressor(random_state=1)

#fit data and #make predictions
califonia_model.fit(imputed_train_X, train_y)
preds = califonia_model.predict(imputed_val_X)

def scoreall(val_y,preds):
    mae = mean_absolute_error(val_y,preds)
    return mae

scoreall(val_y, preds)

42881.26162790698

In [468]:
# making a better predictions with RandomForestRegressor and make predictions

califonia_model_2 = RandomForestRegressor(random_state=1)
califonia_model_2.fit(imputed_train_X, train_y)
preds_2 = califonia_model_2.predict(imputed_val_X)


#measuring the quality of the data

def scoreall(val_y,preds_2 ):
    mae = mean_absolute_error(val_y,preds_2)
    return mae

scoreall(val_y, preds_2)

32406.613267441862