In [4]:
# Import libraries necessary for this project
import numpy as np
import pandas as pd
from sklearn.cross_validation import ShuffleSplit

# Import supplementary visualizations code visuals.py
import visuals as vs

# Pretty display for notebooks
%matplotlib inline

# Load the Boston housing dataset
data = pd.read_csv('housing.csv')
prices = data['MEDV']
features = data.drop('MEDV', axis = 1)
    
# Success
print("Boston housing dataset has {} data points with {} variables each.".format(*data.shape))
print(prices.head())
print(features.head())

Boston housing dataset has 489 data points with 4 variables each.
0    504000.0
1    453600.0
2    728700.0
3    701400.0
4    760200.0
Name: MEDV, dtype: float64
      RM  LSTAT  PTRATIO
0  6.575   4.98     15.3
1  6.421   9.14     17.8
2  7.185   4.03     17.8
3  6.998   2.94     18.7
4  7.147   5.33     18.7


In [5]:
# TODO: Import 'r2_score'
from sklearn.metrics import r2_score

def performance_metric(y_true, y_predict):
    """ Calculates and returns the performance score between 
        true and predicted values based on the metric chosen. """
    
    # TODO: Calculate the performance score between 'y_true' and 'y_predict'
    score = r2_score(y_true, y_predict) 
    
    # Return the score
    return score

In [6]:
# Calculate the performance of this model
score = performance_metric([3, -0.5, 2, 7, 4.2], [2.5, 0.0, 2.1, 7.8, 5.3])
print("Model has a coefficient of determination, R^2, of {:.3f}.".format(score))

Model has a coefficient of determination, R^2, of 0.923.


In [7]:
# TODO: Import 'train_test_split'
from sklearn.model_selection import train_test_split

# TODO: Shuffle and split the data into training and testing subsets
X_train, X_test, y_train, y_test = train_test_split(features, prices, test_size=0.2, random_state=42)

# Success
print("Training and testing split was successful.")

Training and testing split was successful.


# My using of Linear Regression on the boston housing price

In [13]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train, y_train)

client_data = [[5, 17, 15], # Client 1
               [4, 32, 22], # Client 2
               [8, 3, 12]]  # Client 3

# Show predictions
for i, price in enumerate(model.predict(client_data)):
    print("Predicted selling price for Client {}'s home: ${:,.2f}".format(i+1, price))
    
    
# Calculate the performance of this model
score = performance_metric(y_test, model.predict(X_test))
print("Model has a coefficient of determination, R^2, of {:.3f}.".format(score))

Predicted selling price for Client 1's home: $374,221.68
Predicted selling price for Client 2's home: $-7,680.95
Predicted selling price for Client 3's home: $842,850.45
Model has a coefficient of determination, R^2, of 0.691.


# My using of Decision Tree Regression on boston housing price


In [15]:
# TODO: Import 'make_scorer', 'DecisionTreeRegressor', and 'GridSearchCV'
from sklearn.metrics import make_scorer
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV


model = DecisionTreeRegressor(max_depth = 4)

model.fit(X_train, y_train)

client_data = [[5, 17, 15], # Client 1
               [4, 32, 22], # Client 2
               [8, 3, 12]]  # Client 3

# Show predictions
for i, price in enumerate(model.predict(client_data)):
    print("Predicted selling price for Client {}'s home: ${:,.2f}".format(i+1, price))
    
    
# Calculate the performance of this model
score = performance_metric(y_test, model.predict(X_test))
print("Model has a coefficient of determination, R^2, of {:.3f}.".format(score))

Predicted selling price for Client 1's home: $403,025.00
Predicted selling price for Client 2's home: $237,478.72
Predicted selling price for Client 3's home: $931,636.36
Model has a coefficient of determination, R^2, of 0.844.


In [16]:
# my use of MLP Regressor on boston housing price

In [37]:
from sklearn.neural_network import MLPRegressor


model = MLPRegressor(activation= 'identity')


model.fit(X_train, y_train)

client_data = [[5, 17, 15], # Client 1
               [4, 32, 22], # Client 2
               [8, 3, 12]]  # Client 3

# Show predictions
for i, price in enumerate(model.predict(client_data)):
    print("Predicted selling price for Client {}'s home: ${:,.2f}".format(i+1, price))
    
    
# Calculate the performance of this model
score = performance_metric(y_test, model.predict(X_test))
print("Model has a coefficient of determination, R^2, of {:.3f}.".format(score))

Predicted selling price for Client 1's home: $1,440.91
Predicted selling price for Client 2's home: $2,228.92
Predicted selling price for Client 3's home: $923.82
Model has a coefficient of determination, R^2, of -8.154.




# my use of svr on boston housing

In [32]:
from sklearn.svm import SVR


#model = SVR(C=1.0, epsilon=0.2)
model = SVR(C=10, degree=3, epsilon=0.1, gamma='auto', kernel='poly')


model.fit(X_train, y_train)

client_data = [[5, 17, 15], # Client 1
               [4, 32, 22], # Client 2
               [8, 3, 12]]  # Client 3

# Show predictions
for i, price in enumerate(model.predict(client_data)):
    print("Predicted selling price for Client {}'s home: ${:,.2f}".format(i+1, price))
    
    
# Calculate the performance of this model
score = performance_metric(y_test, model.predict(X_test))
print("Model has a coefficient of determination, R^2, of {:.3f}.".format(score))

Predicted selling price for Client 1's home: $350,537.05
Predicted selling price for Client 2's home: $345,455.44
Predicted selling price for Client 3's home: $934,971.97
Model has a coefficient of determination, R^2, of 0.803.


* The R2 score is the proportion of the variance in the dependent variable that is predictable from the independent variable. In other words:
* R2 score of 0 means that the dependent variable cannot be predicted from the independent variable.
* R2 score of 1 means the dependent variable can be predicted from the independent variable.
* R2 score between 0 and 1 indicates the extent to which the dependent variable is predictable.
* R2 score of 0.40 means that 40 percent of the variance in Y is predictable from X.

* The values for R2 range from 0 to 1, which captures the percentage of squared correlation between the predicted and actual values of the target variable. A model with an R2 of 0 is no better than a model that always predicts the mean of the target variable, whereas a model with an R2 of 1 perfectly predicts the target variable. Any value between 0 and 1 indicates what percentage of the target variable, using this model, can be explained by the features. A model can be given a negative R2 as well, which indicates that the model is arbitrarily worse than one that always predicts the mean of the target variable.

# Grid Search on each algorithm

In [31]:
from sklearn.model_selection import GridSearchCV

#parameters = {'kernel':['poly', 'rbf'],'C':[0.1, 1, 10]}

from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score
#scorer = make_scorer(f1_score)

# TODO: Create a decision tree regressor object
regressor = DecisionTreeRegressor()

# TODO: Create a dictionary for the parameter 'max_depth' with a range from 1 to 10
params = {'max_depth': list(range(1,11))}


# Create cross-validation sets from the training data
# sklearn version 0.18: ShuffleSplit(n_splits=10, test_size=0.1, train_size=None, random_state=None)
# sklearn versiin 0.17: ShuffleSplit(n, n_iter=10, test_size=0.1, train_size=None, random_state=None)
cv_sets = ShuffleSplit(X_train.shape[0], n_iter = 10, test_size = 0.20, random_state = 0)

# TODO: Transform 'performance_metric' into a scoring function using 'make_scorer' 
scoring_fnc = make_scorer(performance_metric)

# Create the object.
#grid_obj = GridSearchCV(regressor, parameters, scoring=scorer)
# TODO: Create the grid search cv object --> GridSearchCV()
# Make sure to include the right parameters in the object:
# (estimator, param_grid, scoring, cv) which have values 'regressor', 'params', 'scoring_fnc', and 'cv_sets' respectively.
grid = GridSearchCV(regressor, params, scoring = scoring_fnc, cv = cv_sets)

# Fit the data
grid_fit = grid.fit(X_train, y_train)

best_clf = grid_fit.best_estimator_

# Produce the value for 'max_depth'
#print("Parameter 'max_depth' is {} for the optimal model.".format(best_clf.get_params()['max_depth']))
print("Parameter is {} for the optimal model.".format(best_clf.get_params()))

Parameter is {'criterion': 'mse', 'max_depth': 4, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'presort': False, 'random_state': None, 'splitter': 'best'} for the optimal model.


C : float, optional (default=1.0)
Penalty parameter C of the error term.
epsilon : float, optional (default=0.1)
Epsilon in the epsilon-SVR model. It specifies the epsilon-tube within which no penalty is associated in the training loss function with points predicted within a distance epsilon from the actual value.
kernel : string, optional (default=’rbf’)
Specifies the kernel type to be used in the algorithm. It must be one of ‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’ or a callable. If none is given, ‘rbf’ will be used. If a callable is given it is used to precompute the kernel matrix.
degree : int, optional (default=3)
Degree of the polynomial kernel function (‘poly’). Ignored by all other kernels.
gamma : float, optional (default=’auto’)
Kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’. If gamma is ‘auto’ then 1/n_features will be used instead.

# grid search for SVR()

In [30]:
from sklearn.model_selection import GridSearchCV

#parameters = {'kernel':['poly', 'rbf'],'C':[0.1, 1, 10]}

from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score
#scorer = make_scorer(f1_score)

# TODO: Create a decision tree regressor object
regressor = SVR()

# TODO: Create a dictionary for the parameter 'max_depth' with a range from 1 to 10
#params = {'max_depth': list(range(1,11))}
params = {'kernel':['poly', 'rbf'],'C':[0.1, 1, 10]}

# Create cross-validation sets from the training data
# sklearn version 0.18: ShuffleSplit(n_splits=10, test_size=0.1, train_size=None, random_state=None)
# sklearn versiin 0.17: ShuffleSplit(n, n_iter=10, test_size=0.1, train_size=None, random_state=None)
cv_sets = ShuffleSplit(X_train.shape[0], n_iter = 10, test_size = 0.20, random_state = 0)

# TODO: Transform 'performance_metric' into a scoring function using 'make_scorer' 
scoring_fnc = make_scorer(performance_metric)

# Create the object.
#grid_obj = GridSearchCV(regressor, parameters, scoring=scorer)
# TODO: Create the grid search cv object --> GridSearchCV()
# Make sure to include the right parameters in the object:
# (estimator, param_grid, scoring, cv) which have values 'regressor', 'params', 'scoring_fnc', and 'cv_sets' respectively.
grid = GridSearchCV(regressor, params, scoring = scoring_fnc, cv = cv_sets)

# Fit the data
grid_fit = grid.fit(X_train, y_train)

best_clf = grid_fit.best_estimator_

# Produce the value for 'max_depth'
print("Parameter  is {} for the optimal model.".format(best_clf.get_params()))

Parameter  is {'C': 10, 'cache_size': 200, 'coef0': 0.0, 'degree': 3, 'epsilon': 0.1, 'gamma': 'auto', 'kernel': 'poly', 'max_iter': -1, 'shrinking': True, 'tol': 0.001, 'verbose': False} for the optimal model.


# grid search for MLPregressor()

In [36]:
from sklearn.model_selection import GridSearchCV

#parameters = {'kernel':['poly', 'rbf'],'C':[0.1, 1, 10]}

from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score
#scorer = make_scorer(f1_score)

# TODO: Create a decision tree regressor object
regressor = MLPRegressor()

# TODO: Create a dictionary for the parameter 'max_depth' with a range from 1 to 10
#params = {'max_depth': list(range(1,11))}
#params = {'kernel':['poly', 'rbf'],'C':[0.1, 1, 10]}
params = {'activation':['identity', 'logistic', 'tanh', 'relu']}
#          , 
#          'solver':['lbfgs', 'sgd', 'adam'], 'alpha' : [0.0001, 0.001, 0.01]

# Create cross-validation sets from the training data
# sklearn version 0.18: ShuffleSplit(n_splits=10, test_size=0.1, train_size=None, random_state=None)
# sklearn versiin 0.17: ShuffleSplit(n, n_iter=10, test_size=0.1, train_size=None, random_state=None)
cv_sets = ShuffleSplit(X_train.shape[0], n_iter = 10, test_size = 0.20, random_state = 0)

# TODO: Transform 'performance_metric' into a scoring function using 'make_scorer' 
scoring_fnc = make_scorer(performance_metric)

# Create the object.
#grid_obj = GridSearchCV(regressor, parameters, scoring=scorer)
# TODO: Create the grid search cv object --> GridSearchCV()
# Make sure to include the right parameters in the object:
# (estimator, param_grid, scoring, cv) which have values 'regressor', 'params', 'scoring_fnc', and 'cv_sets' respectively.
grid = GridSearchCV(regressor, params, scoring = scoring_fnc, cv = cv_sets)

# Fit the data
grid_fit = grid.fit(X_train, y_train)

best_clf = grid_fit.best_estimator_

# Produce the value for 'max_depth'
print("Parameter  is {} for the optimal model.".format(best_clf.get_params()))





Parameter  is {'activation': 'identity', 'alpha': 0.0001, 'batch_size': 'auto', 'beta_1': 0.9, 'beta_2': 0.999, 'early_stopping': False, 'epsilon': 1e-08, 'hidden_layer_sizes': (100,), 'learning_rate': 'constant', 'learning_rate_init': 0.001, 'max_iter': 200, 'momentum': 0.9, 'nesterovs_momentum': True, 'power_t': 0.5, 'random_state': None, 'shuffle': True, 'solver': 'adam', 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': False, 'warm_start': False} for the optimal model.




# using random forest

In [43]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators=18)
model.fit(X_train, y_train)

client_data = [[5, 17, 15], # Client 1
               [4, 32, 22], # Client 2
               [8, 3, 12]]  # Client 3

# Show predictions
for i, price in enumerate(model.predict(client_data)):
    print("Predicted selling price for Client {}'s home: ${:,.2f}".format(i+1, price))
    
    
# Calculate the performance of this model
score = performance_metric(y_test, model.predict(X_test))
print("Model has a coefficient of determination, R^2, of {:.3f}.".format(score))

Predicted selling price for Client 1's home: $320,833.33
Predicted selling price for Client 2's home: $238,116.67
Predicted selling price for Client 3's home: $881,533.33
Model has a coefficient of determination, R^2, of 0.853.


# grid search in Random Forest

In [40]:
from sklearn.model_selection import GridSearchCV

#parameters = {'kernel':['poly', 'rbf'],'C':[0.1, 1, 10]}

from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score
#scorer = make_scorer(f1_score)

# TODO: Create a decision tree regressor object
regressor = RandomForestRegressor()

# TODO: Create a dictionary for the parameter 'max_depth' with a range from 1 to 10
#params = {'max_depth': list(range(1,11))}
#params = {'kernel':['poly', 'rbf'],'C':[0.1, 1, 10]}
params = {'n_estimators':list(range(1,20))}
#          , 
#          'solver':['lbfgs', 'sgd', 'adam'], 'alpha' : [0.0001, 0.001, 0.01]

# Create cross-validation sets from the training data
# sklearn version 0.18: ShuffleSplit(n_splits=10, test_size=0.1, train_size=None, random_state=None)
# sklearn versiin 0.17: ShuffleSplit(n, n_iter=10, test_size=0.1, train_size=None, random_state=None)
cv_sets = ShuffleSplit(X_train.shape[0], n_iter = 10, test_size = 0.20, random_state = 0)

# TODO: Transform 'performance_metric' into a scoring function using 'make_scorer' 
scoring_fnc = make_scorer(performance_metric)

# Create the object.
#grid_obj = GridSearchCV(regressor, parameters, scoring=scorer)
# TODO: Create the grid search cv object --> GridSearchCV()
# Make sure to include the right parameters in the object:
# (estimator, param_grid, scoring, cv) which have values 'regressor', 'params', 'scoring_fnc', and 'cv_sets' respectively.
grid = GridSearchCV(regressor, params, scoring = scoring_fnc, cv = cv_sets)

# Fit the data
grid_fit = grid.fit(X_train, y_train)

best_clf = grid_fit.best_estimator_

# Produce the value for 'max_depth'
print("Parameter  is {} for the optimal model.".format(best_clf.get_params()))

Parameter  is {'bootstrap': True, 'criterion': 'mse', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 18, 'n_jobs': 1, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False} for the optimal model.


In [51]:
from sklearn.ensemble import AdaBoostRegressor
model = AdaBoostRegressor()
model.fit(X_train, y_train)

client_data = [[5, 17, 15], # Client 1
               [4, 32, 22], # Client 2
               [8, 3, 12]]  # Client 3

# Show predictions
for i, price in enumerate(model.predict(client_data)):
    print("Predicted selling price for Client {}'s home: ${:,.2f}".format(i+1, price))
    
    
# Calculate the performance of this model
score = performance_metric(y_test, model.predict(X_test))
print("Model has a coefficient of determination, R^2, of {:.3f}.".format(score))

Predicted selling price for Client 1's home: $397,558.47
Predicted selling price for Client 2's home: $249,029.73
Predicted selling price for Client 3's home: $897,031.58
Model has a coefficient of determination, R^2, of 0.837.


# try Gradient Booster

In [52]:
from sklearn.ensemble import GradientBoostingRegressor
model = GradientBoostingRegressor()
model.fit(X_train, y_train)

client_data = [[5, 17, 15], # Client 1
               [4, 32, 22], # Client 2
               [8, 3, 12]]  # Client 3

# Show predictions
for i, price in enumerate(model.predict(client_data)):
    print("Predicted selling price for Client {}'s home: ${:,.2f}".format(i+1, price))
    
    
# Calculate the performance of this model
score = performance_metric(y_test, model.predict(X_test))
print("Model has a coefficient of determination, R^2, of {:.3f}.".format(score))

Predicted selling price for Client 1's home: $315,015.11
Predicted selling price for Client 2's home: $219,425.49
Predicted selling price for Client 3's home: $843,322.77
Model has a coefficient of determination, R^2, of 0.847.
