# Introduction
> This is the workbook that we will submit.
> All relevant visuals and data analysis for our report should be produced by executing this workbook's cells in order. 

In [None]:
'''Install non-standard libraries'''
!pip install skopt

In [None]:
'''Generic Imports'''
import numpy as np
from scipy import stats

'''Visualization Imports'''
from prettytable import PrettyTable
import seaborn as sns
import matplotlib.pyplot as plt

'''Import Data'''
from sklearn.datasets import load_digits, fetch_california_housing

'''Import Data Processing Utilities'''
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, mean_squared_error
from sklearn.decomposition import PCA

'''Import Predictors'''
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

'''Import Model Tuning Utilities'''
from skopt import BayesSearchCV
from skopt.space import Categorical, Integer, Real


In [None]:
'''Define Global Variables'''
# NOTE(review): placeholder cell - no globals are defined here yet. Later cells
# repeat the same literals inline (random_state=42, test_size=0.3); they are
# candidates for constants in this cell.


In [None]:
'''Import Classifier data'''
# Load the digits dataset via sklearn.datasets.load_digits. Later cells read
# digits.data as the feature matrix and digits.target as the labels.
digits = load_digits()


In [None]:
'''Data Visualization and EDA'''
# NOTE(review): placeholder cell - no visualization or EDA code for the digits
# data is present here yet.


In [None]:
'''Instantiate and Evaluate Default Classifiers'''
# Hold out 30% of the digits for testing; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    digits.data, digits.target, test_size=0.3, random_state=42)

for model in [DecisionTreeClassifier, RandomForestClassifier]:
    pipe = Pipeline([
        ('scale', StandardScaler()),
        # ('pca', PCA()), PCA Unused due to negative impact on model performance
        # Seed the estimator so the baseline is the same on every Run All.
        ('classify', model(random_state=42))],
    verbose=True)
    predicted = pipe.fit(X_train, y_train).predict(X_test)

    # Label the report and the plot so the two models can be told apart.
    print(model.__name__)
    print(classification_report(y_test, predicted))
    sns.heatmap(confusion_matrix(y_test, predicted))
    plt.title(model.__name__ + ' Confusion Matrix')
    plt.show()



In [None]:
'''Instantiate and Tune Classifiers'''

# Search spaces are keyed by pipeline step name ('scaler', 'pca', 'classify')
# followed by '__' and the parameter name, as required by Pipeline.set_params.
classifiers = {
    DecisionTreeClassifier.__name__: {
        'model': DecisionTreeClassifier,
        'paramSpace': {
            'classify__criterion': Categorical(['gini', 'entropy', 'log_loss']),
            'classify__splitter': Categorical(['best', 'random']),
            'classify__max_depth': Integer(1, 1000),
            'classify__min_samples_split': Real(0.01, 0.9),
            'classify__min_samples_leaf': Real(0.01, 0.9),
            'classify__min_weight_fraction_leaf': Real(0.0, 0.5),
            'classify__max_features': Real(0.01, 0.9),
            'classify__max_leaf_nodes': Integer(2, 4000),
            'classify__min_impurity_decrease': Real(0.0, 1.0),
            'classify__ccp_alpha': Real(0.01, 0.9),
            'pca__n_components': Integer(1, len(digits.data[0])),
            'scaler__with_mean': [True, False],
            'scaler__with_std': [True, False],
        }
    },
    RandomForestClassifier.__name__: {
        'model': RandomForestClassifier,
        'paramSpace': {
            'classify__n_estimators': Integer(10, 2000),
            'classify__criterion': Categorical(['gini', 'entropy', 'log_loss']),
            'classify__max_depth': Integer(1, 1000),
            'classify__min_samples_split': Real(0.01, 0.9),
            'classify__min_samples_leaf': Real(0.01, 0.9),
            'classify__min_weight_fraction_leaf': Real(0.01, 0.5),
            'classify__max_features': Real(0.01, 0.9),
            # scikit-learn rejects max_leaf_nodes < 2, so the lower bound must
            # be 2 (a sampled value of 1 would make every CV fit fail).
            'classify__max_leaf_nodes': Integer(2, 2000),
            'classify__min_impurity_decrease': Real(0.01, 0.9),
            # bootstrap is deliberately not searched: oob_score=True and
            # max_samples both require the default bootstrap=True.
            'classify__oob_score': Categorical([True, False]),
            'classify__warm_start': Categorical([True, False]),
            'classify__max_samples': Real(0.01, 0.9),
            'pca__n_components': Integer(1, len(digits.data[0])),
            'scaler__with_mean': [True, False],
            'scaler__with_std': [True, False],
        }
    }
}

# Fitted BayesSearchCV objects, keyed by classifier class name.
tunedModels = {}

X_train, X_test, y_train, y_test = train_test_split(
    digits.data, digits.target, test_size=0.3, random_state=42)

for name, classDict in classifiers.items():

    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('pca', PCA()),
        # Seed the estimator so repeated searches score candidates identically.
        ('classify', classDict['model'](random_state=42))],
    verbose=False)

    tunedModels[name] = BayesSearchCV(
        pipe,
        classDict['paramSpace'],
        n_iter= 5,#30, # Reduced for performance during development
        cv= 4,#20, # Reduced for performance during development
        scoring='accuracy',
        # TODO: Use GridSearch for scoring criteria
        # NOTE: This will take 13 hours. Execute over night.
        random_state=42
        )
    tunedModels[name].fit(X_train, y_train)

    # Evaluate the refitted best pipeline on the held-out test split.
    predicted = tunedModels[name].best_estimator_.predict(X_test)

    print(name)
    print(classification_report(y_test, predicted))
    sns.heatmap(confusion_matrix(y_test, predicted))
    plt.title(name + ' Confusion Matrix')
    plt.show()




In [None]:
'''Evaluate tuning process and resultant models'''
# NOTE(review): placeholder cell - nothing is evaluated here yet.


In [None]:
'''Import Regression Data'''
# Requested with as_frame=True because later cells use DataFrame/Series
# methods on the result (.data.columns, .quantile, column assignment).
cal_housing = fetch_california_housing(as_frame=True)



In [None]:
'''Initial data EDA'''

def count_iqr_outliers(values, whisker=1.5):
    """Count the entries of a pandas Series that lie outside the IQR fences.

    The fences are Q1 - whisker*IQR and Q3 + whisker*IQR. Comparisons are
    strict, so values sitting exactly on a fence are not outliers; this also
    keeps a constant column (IQR == 0) from reporting every row as an outlier
    twice, once per fence.

    Parameters
    ----------
    values : pandas.Series
        Numeric column to inspect.
    whisker : float, default 1.5
        Multiple of the IQR used to place the fences.

    Returns
    -------
    int
        Number of entries strictly below the lower or above the upper fence.
    """
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - whisker * iqr
    upper = q3 + whisker * iqr
    return int(((values < lower) | (values > upper)).sum())


outlierTable = PrettyTable(['Feature', 'Outlier Count'])

for column in cal_housing.data.columns:
    outlierTable.add_row([column, count_iqr_outliers(cal_housing.data[column])])

print(outlierTable)


In [None]:
'''Data Cleaning and Pre-Processing'''
# Build a new frame instead of aliasing cal_housing.data: assign() returns a
# copy, so the source dataset does not silently gain a 'y' column and this
# cell gives the same result however many times it is run.
cleanData = (
    cal_housing.data
    .assign(y=cal_housing.target)
    .drop(columns=['Longitude', 'Latitude'])
)

# Drop rows whose value is 2.5 or more standard deviations from the mean, one
# feature at a time. Each z-score is computed on the rows that survived the
# previous feature's filter. The target travels along as 'y' so it stays
# aligned with the remaining rows.
for feature in ['AveBedrms', 'AveRooms', 'AveOccup', 'Population']:
    cleanData = cleanData[(np.abs(stats.zscore(cleanData[feature])) < 2.5)]
cleanTarget = cleanData['y'].to_list()

# DATA CLEANING TODOs
# TODO: Bin Lat/Long groupings into city/town clusters. look for available geo-fencing data for cluster labeling - can we do a graph of centroids on top of map?
# TODO: Fix Skew for Population, MedIncome, AvgOccup, AvgBedroom, Target
# TODO: Feature Engineering / Reduction

# Non-inplace drop: cleanData is a filtered selection at this point, and
# mutating it in place can trigger pandas' SettingWithCopyWarning.
cleanData = cleanData.drop(columns=['y'])

In [None]:
'''Instantiate and Evaluate Default Regressors'''
# Work on a copy so modelling never alters the cleaned frame.
modelData = cleanData.copy()

# TODO: How does normalization vs standardization impact model performance
# Shared preprocessing steps; the regressor is appended per model below.
transformPipeline = [
    ('scaler', StandardScaler()),
    # ('feature_reduction', PCA(n_components=5,iterated_power=7))
    ]

X_train, X_test, y_train, y_test = train_test_split(
    modelData, cleanTarget, test_size=0.3, random_state=42)

for regressor in [DecisionTreeRegressor, RandomForestRegressor]:
    # Seed the estimator so the baseline MSE is the same on every Run All.
    pipe = Pipeline(transformPipeline + [('regress', regressor(random_state=42))], verbose=True)
    predicted = pipe.fit(X_train, y_train).predict(X_test)
    mse = mean_squared_error(y_test, predicted)
    print(regressor.__name__)
    print(mse)


    # TODO: Graph Regression Plane using skopt.plots
    # NOTE: try using PCA to force data into 3d space
    

In [None]:
'''Instantiate and Tune Regressors'''
# TODO: Explore how increased dimensionality in the parameter space impacts optimization performance

# Search spaces are keyed by pipeline step name ('scaler', 'pca', 'regress')
# followed by '__' and the parameter name, as required by Pipeline.set_params.
regressors = {
    DecisionTreeRegressor.__name__: {
        'model': DecisionTreeRegressor,
        'paramSpace': {
            'regress__criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
            'regress__splitter': ['best', 'random'],
            'regress__max_depth': Integer(2, 1000),
            'regress__min_samples_split': Real(0.01, 0.9),
            'regress__min_samples_leaf': Real(0.01, 0.9),
            'regress__min_weight_fraction_leaf': Real(0.0, 0.5),
            'regress__max_features': Real(0.01, 0.5),
            'regress__max_leaf_nodes': Integer(2, 1000),
            'regress__min_impurity_decrease': Real(0.0, 0.9),
            'regress__ccp_alpha': Real(0.01, 0.9),
            'pca__n_components': Integer(1, len(modelData.columns)),
            'scaler__with_mean': [True, False],
            'scaler__with_std': [True, False],
        }
    },
    RandomForestRegressor.__name__: {
        'model': RandomForestRegressor,
        'paramSpace': {
            'regress__n_estimators': Integer(50, 500),
            'regress__criterion': Categorical(['squared_error', 'friedman_mse', 'absolute_error', 'poisson']),
            'regress__max_depth': Integer(2, 1000),
            'regress__min_samples_split': Real(0.01, 0.9),
            'regress__min_samples_leaf': Real(0.01, 0.9),
            'regress__min_weight_fraction_leaf': Real(0.01, 0.5),
            'regress__max_features': Real(0.01, 0.9),
            # 'regress__max_features': Categorical(['sqrt', 'log2']),
            'regress__max_leaf_nodes': Integer(2, 1000),
            'regress__min_impurity_decrease': Real(0.01, 0.9),
            # bootstrap is deliberately not searched: oob_score=True and
            # max_samples both require the default bootstrap=True.
            'regress__oob_score': [True, False],
            'regress__warm_start': [True, False],
            'regress__ccp_alpha': Real(0.01, 0.9),
            'regress__max_samples': Real(0.01, 0.9),
            'pca__n_components': Integer(1, len(modelData.columns)),
            'scaler__with_mean': [True, False],
            'scaler__with_std': [True, False],
        }
    }
}

# Fitted BayesSearchCV objects, keyed by regressor class name.
tunedModel = {}
X_train, X_test, y_train, y_test = train_test_split(
    modelData, cleanTarget, test_size=0.3, random_state=42)

for name, regDict in regressors.items():
    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('pca', PCA()),
        # Seed the estimator so repeated searches score candidates identically.
        ('regress', regDict['model'](random_state=42))])
    tunedModel[name] = BayesSearchCV(
        pipe,
        regDict['paramSpace'],
        n_iter=5,#30,
        cv=5,#20,
        # R^2 is what a regressor's default scorer computes; naming it
        # explicitly makes the 'Evaluation Metric' line below informative
        # instead of printing None.
        scoring='r2',
        # TODO: Use GridSearchCV for scoringCriteria param space
        # NOTE: This will take 13 hours. Execute over night.
        # Same seed as the classifier search so the sampled candidates repeat.
        random_state=42,
        )
    tunedModel[name].fit(X_train, y_train)
    predicted = tunedModel[name].best_estimator_.predict(X_test)

    print(name)
    print('Evaluation Metric:', tunedModel[name].get_params()['scoring'])
    print("val. score: %s" % tunedModel[name].best_score_)
    print("test score: %s" % tunedModel[name].score(X_test, y_test))
    # Report MSE as well so the tuned models can be compared directly with
    # the default-regressor baseline, which is reported as MSE.
    print("test MSE: %s" % mean_squared_error(y_test, predicted))
    

In [None]:
'''Evaluate tuning process and resultant models'''

