# Modeling the Data

## Importing libraries

In [64]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from tqdm import tqdm_notebook as tqdm
import _pickle as pickle

## Loading in the DataFrame

In [65]:
with open("main_df.pickle",'rb') as fp:
    main_df = pickle.load(fp)

## Modeling - Decision Tree

In [67]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import accuracy_score, roc_curve, auc

In [68]:
# Assigning appropriate variables for modeling
X = main_df.drop('price_will_increase?', axis=1)
y = main_df['price_will_increase?']

In [69]:
# Splitting the data 80/20
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

### Instantiating and fitting the Decision Tree model

In [70]:
classifier = DecisionTreeClassifier(random_state=10)
classifier.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=10,
            splitter='best')

In [71]:
# Predictions
y_pred = classifier.predict(X_test)

### Accuracy of the Decision Tree model

In [73]:
acc = accuracy_score(y_test, y_pred) * 100
print("Accuracy is:", acc)

Accuracy is: 56.93756194251735


### Area under the Curve 

In [75]:
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
print("AUC:", round(roc_auc, 4))

AUC: 0.5451


### Confusion Matrix

In [78]:
print("Confusion Matrix \n-----------------")
pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)

Confusion Matrix 
-----------------


Predicted,False,True,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,680,845,1525
True,893,1618,2511
All,1573,2463,4036


### Improving Decision Tree using GridsearchCV

In [79]:
from sklearn.model_selection import GridSearchCV, cross_val_score

In [82]:
# Using Cross Validation
dt_clf = DecisionTreeClassifier()
dt_cv_score = cross_val_score(dt_clf, X, y, cv=3)
mean_dt_cv_score = np.mean(dt_cv_score)

print("Mean Cross Validation Score:", mean_dt_cv_score*100)

Mean Cross Validation Score: 56.46811629287446


In [83]:
dt_param_grid = {'criterion': ['gini', 'entropy'],
                 'max_depth': [None, 2, 3, 4, 5, 6],
                 'min_samples_split': [2, 5, 10],
                 'min_samples_leaf': [1,2,3,4,5,6]}

#### Using GridSearchCV
Finding the best parameters for the DecisionTree Model.

In [84]:
dt_grid_search = GridSearchCV(dt_clf, dt_param_grid, cv=3, return_train_score=True)
dt_grid_search.fit(X, y)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'criterion': ['gini', 'entropy'], 'max_depth': [None, 2, 3, 4, 5, 6], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 3, 4, 5, 6]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [85]:
dt_gs_training_score = np.mean(dt_grid_search.cv_results_['mean_train_score'])
dt_gs_testing_score = dt_grid_search.score(X, y)

print("Mean Training Score:", dt_gs_training_score*100)
print("Mean Testing Score:", dt_gs_testing_score*100)
print("Best Parameter Found:")
dt_grid_search.best_params_

Mean Training Score: 69.9054447761618
Mean Testing Score: 65.0327121332276
Best Parameter Found:


{'criterion': 'entropy',
 'max_depth': 3,
 'min_samples_leaf': 1,
 'min_samples_split': 2}

### Running the Decision Tree model with the best parameters according to GridSearchCV

In [91]:
classifier_gs = DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=1, min_samples_split=2)
classifier_gs.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [92]:
# Predictions with GridSearch Params
y_pred_gs = classifier_gs.predict(X_test)

In [93]:
# Accuracy with GridSearch Params
acc_gs = accuracy_score(y_test, y_pred_gs) * 100
print("Accuracy is:", acc_gs)

Accuracy is: 65.48562933597621


In [94]:
# Area under the Curve with GS Params
false_positive_rate_gs, true_positive_rate_gs, thresholds = roc_curve(y_test, y_pred_gs)
roc_auc_gs = auc(false_positive_rate_gs, true_positive_rate_gs)
print("AUC:", round(roc_auc_gs, 4))

AUC: 0.5706


In [95]:
# Confusion Matrix from Gridsearch Params
print("Confusion Matrix \n-----------------")
pd.crosstab(y_test, y_pred_gs, rownames=['Actual'], colnames=['Predicted'], margins=True)

Confusion Matrix 
-----------------


Predicted,False,True,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,344,1181,1525
True,212,2299,2511
All,556,3480,4036
