<a href="https://colab.research.google.com/github/lolikgiovi/MachineLearning_Project/blob/master/Regression_Basic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Data Exploration using Pandas

In [15]:
import pandas as pd

#save filepath as variable
filepath = '/content/sample_data/california_housing_train.csv'

#load data and save into dataframe
data = pd.read_csv(filepath)

#print data summary
data.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0
mean,-119.562108,35.625225,28.589353,2643.664412,539.410824,1429.573941,501.221941,3.883578,207300.912353
std,2.005166,2.13734,12.586937,2179.947071,421.499452,1147.852959,384.520841,1.908157,115983.764387
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.79,33.93,18.0,1462.0,297.0,790.0,282.0,2.566375,119400.0
50%,-118.49,34.25,29.0,2127.0,434.0,1167.0,409.0,3.5446,180400.0
75%,-118.0,37.72,37.0,3151.25,648.25,1721.0,605.25,4.767,265000.0
max,-114.31,41.95,52.0,37937.0,6445.0,35682.0,6082.0,15.0001,500001.0


## Data Preprocessing

In [16]:
#drop missing values, axis = 0 -> delete row containing missing data
data = data.dropna(axis = 0)
data.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0
mean,-119.562108,35.625225,28.589353,2643.664412,539.410824,1429.573941,501.221941,3.883578,207300.912353
std,2.005166,2.13734,12.586937,2179.947071,421.499452,1147.852959,384.520841,1.908157,115983.764387
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.79,33.93,18.0,1462.0,297.0,790.0,282.0,2.566375,119400.0
50%,-118.49,34.25,29.0,2127.0,434.0,1167.0,409.0,3.5446,180400.0
75%,-118.0,37.72,37.0,3151.25,648.25,1721.0,605.25,4.767,265000.0
max,-114.31,41.95,52.0,37937.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [17]:
#indexing column names
data.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value'],
      dtype='object')

In [18]:
#select a column as the Prediction Target (y)
y = data.median_house_value

#choose features (can be all, can be few), based on column names
data_feature = ['longitude', 'latitude', 'housing_median_age', 'total_rooms',
                'total_bedrooms', 'population', 'households', 'median_income',]

#separate the selected data (X)
X = data[data_feature] 

#check selected data
X.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925


## Build the Model


*  Define: What type of model will it be? A decision tree? Some other type of model? Some other parameters of the model type are specified too.
*  Fit: Capture patterns from provided data. This is the heart of modeling.
*  Predict: Just what it sounds like
*  Evaluate: Determine how accurate the model's predictions are.

### Using Decision Tree

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

#split data into Train and Val data
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)

### Using Decision Tree
#define model
dec_tree_model = DecisionTreeRegressor()

#build model
dec_tree_model.fit(train_X, train_y)

#test predictions on validation data
dec_tree_predictions = dec_tree_model.predict(val_X)

#calculate Mean Absolute Error
print("DT MAE:", mean_absolute_error(val_y, dec_tree_predictions))

### Using Random Forest
#define model
forest_model = RandomForestRegressor()

#build model
forest_model.fit(train_X, train_y)

#test predictions on validation data
forest_model_predictions = forest_model.predict(val_X)

#calculate Mean Absolute Error
print("RF MAE:", mean_absolute_error(val_y, forest_model_predictions))

DT MAE: 44804.79694117647
RF MAE: 31763.62321647059


In [21]:
### Optimizing Decision Tree by Max Leaf Nodes

from re import M
from sklearn.metrics import mean_absolute_error

#function to get MAE from max_leaf_nodes variations
def dt_get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    mae = mean_absolute_error(val_y, preds_val)
    return(mae)

# compare MAE with differing values of max_leaf_nodes
min_mae = 99999999
for max_leaf_nodes in [5, 50, 500, 5000, 7000, 9000]:
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" %(max_leaf_nodes, my_mae))
  
    if my_mae < min_mae:
      min_mae = my_mae
      ideal_nodes = max_leaf_nodes

print("\nDT Lowest MAE:  %d \t\t Max leaf nodes: %d" % (min_mae, ideal_nodes))

#improve model
model = DecisionTreeRegressor(max_leaf_nodes=ideal_nodes, random_state=0)
model.fit(train_X, train_y)
preds_val = model.predict(val_X)
mae = mean_absolute_error(val_y, preds_val)
print("MAE:", mae)

Max leaf nodes: 5  		 Mean Absolute Error:  61931
Max leaf nodes: 50  		 Mean Absolute Error:  42862
Max leaf nodes: 500  		 Mean Absolute Error:  33861
Max leaf nodes: 5000  		 Mean Absolute Error:  31891
Max leaf nodes: 7000  		 Mean Absolute Error:  31891
Max leaf nodes: 9000  		 Mean Absolute Error:  31892

DT Lowest MAE:  31891 		 Max leaf nodes: 5000
MAE: 44298.63452240474
