In [234]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer


In [235]:
# Relative path of the housing dataset CSV.
PATH = "housing.csv"
# Read the CSV into a DataFrame.
califonia_data = pd.read_csv(PATH)

# Display summary statistics for the numeric columns. Nothing is dropped here:
# describe() only reports on the data, and a "count" lower than the other
# columns' counts points at a column that contains missing values.
califonia_data.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [236]:
# List every column name in the loaded DataFrame (including non-numeric ones).
califonia_data.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

In [237]:
# Prediction target: the median house value column
y = califonia_data["median_house_value"]
y.describe()

count     20640.000000
mean     206855.816909
std      115395.615874
min       14999.000000
25%      119600.000000
50%      179700.000000
75%      264725.000000
max      500001.000000
Name: median_house_value, dtype: float64

In [238]:
# Columns used as predictors.
# NOTE(review): 'total_bedrooms' is not in this list. In the describe() output it is
# the only numeric column whose count (20433) is below the row count (20640), so the
# selected X appears to contain no missing values and the missing-data cells further
# down have nothing to drop or impute -- confirm whether it should be included.
features = [
    'longitude',
    'latitude',
    'housing_median_age',
    'total_rooms',
    'population',
    'households',
    'median_income',
]

# Feature matrix restricted to the predictor columns
X = califonia_data[features]

In [239]:
# Split the features and target into training and validation subsets.
# random_state=0 pins the shuffling seed so the split is the same on every run.
train_X, val_X, train_y, val_y = train_test_split(X,y, random_state=0)

In [240]:
# Helper that trains a model on one split and scores it on the other

def score_all(train_X, val_X, train_y, val_y):
    """Fit a random forest on the training data and return its validation MAE.

    Parameters:
        train_X: feature table used to fit the model.
        val_X: feature table the fitted model predicts on.
        train_y: target values matching train_X.
        val_y: true target values matching val_X.

    Returns:
        The mean absolute error between val_y and the model's predictions.
    """
    # random_state=1 keeps the forest reproducible between calls
    forest = RandomForestRegressor(random_state=1)
    forest.fit(train_X, train_y)
    return mean_absolute_error(val_y, forest.predict(val_X))

In [241]:
# First (1st) approach to handling missing data: drop every column that has a gap.
# A column qualifies as soon as any of its training rows is null.
col_with_missing = [name for name, has_gap in train_X.isnull().any().items()
                    if has_gap]

# Show which columns are about to be removed
for col in col_with_missing:
    print(col)

# Remove the same columns from both splits so their layouts stay identical
train_X_reduced = train_X.drop(columns=col_with_missing)
val_X_reduced = val_X.drop(columns=col_with_missing)

print("MAE from dropping columns")
print(score_all(train_X_reduced, val_X_reduced, train_y, val_y))



MAE from dropping columns
32070.773844961244


In [242]:
# Second (2nd) approach to handling missing data: simple imputation
my_imputer = SimpleImputer()

# Learn the fill values from the training split only, then apply those same
# values to the validation split. Calling fit_transform on the validation data
# would re-fit the imputer and fill it with its own statistics, so the model
# would be evaluated on data prepared differently from what it was trained on.
imputed_train_X = pd.DataFrame(my_imputer.fit_transform(train_X))
imputed_val_X = pd.DataFrame(my_imputer.transform(val_X))

# The imputer output is wrapped in fresh DataFrames, so reattach the column names
imputed_train_X.columns = train_X.columns
imputed_val_X.columns = val_X.columns

print("MAE from 2nd approach ")
print(score_all(imputed_train_X, imputed_val_X, train_y, val_y))


MAE from 2nd approach 
32070.773844961244


In [243]:
# Third (3rd) approach to handling missing data: impute, and also record which
# values were originally missing

# Work on copies so the original splits are left unchanged
train_X_plus = train_X.copy()
val_X_plus = val_X.copy()

# Find the columns that have at least one missing value in the training split
col_with_missing = [col for col in train_X.columns
                    if train_X[col].isnull().any()]

# For each such column add a boolean indicator column marking the missing rows
for col in col_with_missing:
    train_X_plus[col + '__was missing'] = train_X_plus[col].isnull()
    val_X_plus[col + '__was missing'] = val_X_plus[col].isnull()

# Learn the fill values from the training split only, then apply those same
# values to the validation split. Calling fit_transform on the validation data
# would re-fit the imputer and fill it with its own statistics, so the model
# would be evaluated on data prepared differently from what it was trained on.
my_imputer = SimpleImputer()
imputed_train_X_plus = pd.DataFrame(my_imputer.fit_transform(train_X_plus))
imputed_val_X_plus = pd.DataFrame(my_imputer.transform(val_X_plus))

# The imputer output is wrapped in fresh DataFrames, so reattach the column names
imputed_train_X_plus.columns = train_X_plus.columns
imputed_val_X_plus.columns = val_X_plus.columns

print("MAE from 3rd approach ")
print(score_all(imputed_train_X_plus, imputed_val_X_plus, train_y, val_y))

MAE from 3rd approach 
32070.773844961244


In [244]:
# Baseline model: a single decision tree with a fixed seed (random_state=1)
califonia_model = DecisionTreeRegressor(random_state=1)

# Train on the training split, then predict the validation split
califonia_model.fit(train_X, train_y)
preds = califonia_model.predict(val_X)

def scoreall(val_y, preds):
    """Return the mean absolute error between the true values and the predictions.

    Parameters:
        val_y: true target values.
        preds: predicted values for the same rows.
    """
    return mean_absolute_error(val_y, preds)

scoreall(val_y, preds)

43273.4742248062

In [245]:
# A stronger model: a random forest with a fixed seed (random_state=1)

califonia_model_2 = RandomForestRegressor(random_state=1)
califonia_model_2.fit(train_X , train_y)
preds_2 = califonia_model_2.predict(val_X)


# Measure the quality of the predictions (validation MAE).
# scoreall is already defined in the decision-tree cell; it is reused here instead
# of being declared a second time, which would silently shadow the earlier definition.
scoreall(val_y, preds_2)

32070.773844961244