In [485]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer


In [486]:
PATH = "housing.csv"
# Load the housing CSV into a DataFrame.
califonia_data = pd.read_csv(PATH)

# NOTE: no rows with NaN are dropped here; missing values are imputed in later cells.
# Uncomment to inspect summary statistics of the numeric columns:
#califonia_data.describe()

In [487]:
# Show the column labels; the target and the features below are picked from these.
califonia_data.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

In [488]:
# Prediction target: the median_house_value column
y = califonia_data["median_house_value"]

In [489]:
# Predictor columns (everything except the target); order is kept as-is
features = [
    'longitude',
    'latitude',
    'housing_median_age',
    'total_rooms',
    'population',
    'households',
    'median_income',
    'total_bedrooms',
    'ocean_proximity',
]

# Feature matrix restricted to the selected columns
X = califonia_data[features]


In [490]:
# Hold out a validation set; the fixed random_state keeps the split reproducible
(train_X, val_X,
 train_y, val_y) = train_test_split(X, y, random_state=0)

#train_X.shape

In [491]:
# Helper that trains a random forest and scores it on the validation split
def score_all(train_X, val_X, train_y, val_y):
    """Fit a RandomForestRegressor and return its mean absolute error.

    Args:
        train_X: Feature matrix used to fit the model.
        val_X: Feature matrix the fitted model predicts on.
        train_y: Target values matching ``train_X``.
        val_y: True target values matching ``val_X``.

    Returns:
        The mean absolute error between ``val_y`` and the predictions
        made for ``val_X``.
    """
    # random_state=1 keeps the forest reproducible between runs
    forest = RandomForestRegressor(random_state=1)
    forest.fit(train_X, train_y)
    return mean_absolute_error(val_y, forest.predict(val_X))

In [492]:
# Find the categorical columns, i.e. those whose dtype is 'object'
s = train_X.dtypes.eq('object')
object_cols = [col for col, is_object in s.items() if is_object]

print("categorical variables")
print(object_cols)

categorical variables
['ocean_proximity']


In [494]:
# Second approach: ordinal-encode the categorical columns
from sklearn.preprocessing import OrdinalEncoder

# Work on copies so the original train/validation frames stay unchanged
label_X_train = train_X.copy()
label_X_val = val_X.copy()

ordinal_encoder = OrdinalEncoder()

# Learn the category -> integer mapping from the training split only ...
label_X_train[object_cols] = ordinal_encoder.fit_transform(train_X[object_cols])
# ... and reuse that same mapping for the validation split. Refitting here
# (fit_transform) would learn a separate mapping, so a category could get a
# different code whenever the two splits do not contain the same categories.
# NOTE: with the default encoder settings, a category that never appears in
# the training split makes transform() raise instead of being silently recoded.
label_X_val[object_cols] = ordinal_encoder.transform(val_X[object_cols])

In [495]:
# Second approach to missing values: impute them with SimpleImputer's defaults
from sklearn.impute import SimpleImputer

my_imputer = SimpleImputer()

# The imputer output carries no column labels, so restore them while
# rebuilding the frames. Fit on the training split, only transform validation.
imputed_train_X = pd.DataFrame(
    my_imputer.fit_transform(label_X_train),
    columns=label_X_train.columns,
)
imputed_val_X = pd.DataFrame(
    my_imputer.transform(label_X_val),
    columns=label_X_val.columns,
)

print("MAE from 2nd approach ")
score_all(imputed_train_X, imputed_val_X, train_y, val_y)

MAE from 2nd approach 


32406.613267441862

In [496]:
# Baseline: a single decision tree (random_state=1 for reproducibility),
# fitted on the imputed training split; fit() returns the estimator itself
califonia_model = DecisionTreeRegressor(random_state=1).fit(imputed_train_X, train_y)

# Predict the imputed validation split
preds = califonia_model.predict(imputed_val_X)

def scoreall(val_y, preds):
    """Return the mean absolute error between ``val_y`` and ``preds``.

    Args:
        val_y: True target values.
        preds: Predicted values for the same rows.
    """
    return mean_absolute_error(val_y, preds)

# Validation MAE of the decision tree predictions
scoreall(val_y, preds)

42881.26162790698

In [497]:
# Try to improve on the single tree with a random forest (same seed),
# fitted on the imputed training split; fit() returns the estimator itself
califonia_model_2 = RandomForestRegressor(random_state=1).fit(imputed_train_X, train_y)

# Predict the imputed validation split
preds_2 = califonia_model_2.predict(imputed_val_X)


# Measure the quality of the predictions (mean absolute error)

def scoreall(val_y, preds_2):
    """Return the mean absolute error between ``val_y`` and ``preds_2``.

    Args:
        val_y: True target values.
        preds_2: Predicted values for the same rows.
    """
    return mean_absolute_error(val_y, preds_2)

# Validation MAE of the random forest predictions
scoreall(val_y, preds_2)

32406.613267441862

In [507]:
# using pipelines to simplify preprocessing and modelling
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Numerical columns: those stored as int64 or float64
numerical_cols = [col for col in train_X.columns
                  if train_X[col].dtype in ['int64', 'float64']]

# Categorical columns: object dtype AND fewer than 10 distinct values.
# Both tests belong to a single `if` clause joined by `and`; a comprehension
# filter cannot start a second `if` after `and`.
categorical_cols = [col for col in train_X.columns
                    if train_X[col].dtype == 'object'
                    and train_X[col].nunique() < 10]

# Numerical columns: replace missing values with a constant
# (no fill_value is passed, so scikit-learn's default fill value applies)
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored rather than raising
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns through its own transformer
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Random forest with 100 trees and a fixed seed
model = RandomForestRegressor(n_estimators=100, random_state=1)

# Bundle preprocessing and the model so both are fitted and applied together
my_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])

# Fit preprocessing + model on the raw training split, then predict the raw
# validation split; the pipeline applies the fitted preprocessing before the
# model predicts. fit() returns the pipeline itself, so the calls can be chained.
preds = my_pipeline.fit(train_X, train_y).predict(val_X)

# Evaluate with mean absolute error on the validation targets
score = mean_absolute_error(val_y, preds)
print('MAE:', score)

MAE: 31822.118939922486
