In [1]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
from matplotlib import rcParams
from matplotlib.cm import rainbow
from functools import lru_cache
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn import svm
import warnings
from yellowbrick.model_selection import LearningCurve
from sklearn.model_selection import StratifiedKFold
from yellowbrick.model_selection import ValidationCurve

import matplotlib.pyplot as plt
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn import datasets
from sklearn.exceptions import ConvergenceWarning

In [2]:
@lru_cache(maxsize=None)
def load_heart():
    # https://www.kaggle.com/ronitf/heart-disease-uci
    df = pd.read_csv("data/heart.csv")
    X = df.iloc[:,0:-1]
    y = df.iloc[:,-1]
    return X, y

In [3]:
@lru_cache(maxsize=None)
def load_mobile():
    # https://www.kaggle.com/iabhishekofficial/mobile-price-classification
    df = pd.read_csv("data/phone_price.csv")
    X = df.iloc[:,0:-1]
    y = df.iloc[:,-1]
    return X, y

In [4]:
@lru_cache(maxsize=None)
def load_adult():
    df = pd.read_csv("data/adult.csv")
    df = pd.get_dummies(df)
    df = df.drop(df.columns[-2], axis=1)
    X = df.iloc[:,0:-1]
    y = df.iloc[:,-1]
    return X, y

In [5]:
@lru_cache(maxsize=None)
def load_weather():
    # https://www.kaggle.com/jsphyg/weather-dataset-rattle-package?select=weatherAUS.csv
    df = pd.read_csv("data/weather.csv")
    df = df.dropna(axis=0)
    df = pd.get_dummies(df, dummy_na=False)
    # drop the nan column
    df = df.drop(df.columns[-1], axis=1)
    # drop the no column
    df = df.drop(df.columns[-2], axis=1)
    X = df.iloc[:,0:-1]
    y = df.iloc[:,-1]
    return X, y


# Understanding the Data

In [6]:
def save_figs(dX, y, name):
    plt.matshow(X.corr())
    plt.yticks(np.arange(X.shape[1]), X.columns)
    plt.xticks(np.arange(X.shape[1]), X.columns)
    plt.colorbar()
    plt.savefig(f'{name}_correlation.png', bbox_inches='tight')
    plt.clf()
    X.hist()
    plt.savefig(f'{name}_histogram.png', bbox_inches='tight')
    plt.clf()
    y.hist()
    plt.savefig(f'{name}_y_histogram.png', bbox_inches='tight')
    plt.clf()
# save_figs(X,y,"weather")

# Decision Tree

In [7]:
def get_ideal_dt_depth(x_train, x_test, y_train, y_test, outpath):
    # function for fitting trees of various depths on the training data using cross-validation
    # Referenced via https://towardsdatascience.com/how-to-find-decision-tree-depth-via-cross-validation-2bf143f0f3d6
    def run_cross_validation_on_trees(X, y, tree_depths, cv=5, scoring='accuracy'):
        cv_scores_list = []
        cv_scores_std = []
        cv_scores_mean = []
        accuracy_scores = []
        for depth in tree_depths:
            tree_model = DecisionTreeClassifier(max_depth=depth)
            cv_scores = cross_val_score(tree_model, X, y, cv=cv, scoring=scoring)
            fitted = tree_model.fit(X, y)
            cv_scores_list.append(cv_scores)
            cv_scores_mean.append(cv_scores.mean())
            cv_scores_std.append(cv_scores.std())
            accuracy_scores.append(fitted.score(X, y))
        cv_scores_mean = np.array(cv_scores_mean)
        cv_scores_std = np.array(cv_scores_std)
        accuracy_scores = np.array(accuracy_scores)
        return cv_scores_mean, cv_scores_std, accuracy_scores

    # function for plotting cross-validation results
    def plot_cross_validation_on_trees(depths, cv_scores_mean, cv_scores_std, accuracy_scores, title):
        fig, ax = plt.subplots(1,1, figsize=(15,5))
        ax.plot(depths, cv_scores_mean, '-o', label='mean cross-validation accuracy', alpha=0.9)
        ax.fill_between(depths, cv_scores_mean-2*cv_scores_std, cv_scores_mean+2*cv_scores_std, alpha=0.2)
        ylim = plt.ylim(0.45,1.05)
        ax.plot(depths, accuracy_scores, '-*', label='train accuracy', alpha=0.9)
        ax.set_title(title, fontsize=16)
        ax.set_xlabel('Tree depth', fontsize=14)
        ax.set_ylabel('Accuracy', fontsize=14)
        ax.set_ylim(ylim)
        ax.set_xticks(depths)
        ax.legend()
        fig.savefig(outpath, bbox_inches='tight')
        fig.clf()
        return fig

    # fitting trees of depth 1 to 24
    sm_tree_depths = range(1,15)
    sm_cv_scores_mean, sm_cv_scores_std, sm_accuracy_scores = run_cross_validation_on_trees(x_train, y_train, sm_tree_depths)
    
    # plotting accuracy\
    fig = plot_cross_validation_on_trees(sm_tree_depths, sm_cv_scores_mean, sm_cv_scores_std, sm_accuracy_scores, 
                                   'Accuracy per decision tree depth on training data')


    # Empty array that will hold our classifiers
    classifiers = []
    dif = sm_cv_scores_mean - sm_accuracy_scores
    return dif

def dt(x_train, x_test, y_train, y_test, depth, outpath, name):
    clf = DecisionTreeClassifier(max_depth=depth)
    # Create the learning curve visualizer
    visualizer = LearningCurve(
        clf,
        scoring='accuracy'
    )
    visualizer.fit(x_train, y_train)        # Fit the data to the visualizer
#     visualizer.show(outpath=outpath, clear_figure=True)
    visualizer.finalize()
    # Get access to the axes object and modify labels
    plt.savefig(outpath, bbox_inches='tight')
    plt.clf()
    start = time.time()
    clf.fit(x_train,y_train)
    end = time.time()
    print("Training DT took" , (end - start)*1000, " ms")
    accuracy = clf.score(x_test,y_test)
    return accuracy

# MLP (Neural Network)

In [13]:
def nn(x_train, x_test, y_train, y_test, outpath, name, hidden_layers, lr):
    clf = MLPClassifier(max_iter=800, hidden_layer_sizes=hidden_layers, learning_rate_init=lr, 
                        shuffle=False, random_state=12)
    # Create the learning curve visualizer
    visualizer = LearningCurve(
        clf,
        scoring='accuracy'
    )
    visualizer.fit(x_train, y_train)
    visualizer.finalize()
    # Get access to the axes object and modify labels
    plt.savefig(outpath, bbox_inches='tight')
    plt.clf()
    start = time.time()
    clf.fit(x_train,y_train)
    end = time.time()
    print("Training NN took" , (end - start)*1000, " ms")
    accuracy = visualizer.score(x_test,y_test)
    return accuracy

# Boosting

In [17]:
def boosting(x_train, x_test, y_train, y_test, best_dt_depth, n_estimators, outpath, name):
    clf = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=best_dt_depth), n_estimators=n_estimators)
    # Create the learning curve visualizer
    visualizer = LearningCurve(
        clf
    )

    visualizer.fit(x_train, y_train)
    visualizer.finalize()
    # Get access to the axes object and modify labels
    plt.savefig(outpath, bbox_inches='tight')
    plt.clf()
    start = time.time()
    clf.fit(x_train,y_train)
    end = time.time()
    print("Training AdaBoost took" , (end - start)*1000, " ms")
    accuracy = clf.score(x_test,y_test)
    return accuracy

# Support Vector Machines

In [10]:
def svm(x_train, x_test, y_train, y_test, outpath):
    clf = svm.SVC()
    # Create the learning curve visualizer
    visualizer = LearningCurve(
        clf
    )

    visualizer.fit(x_train, y_train)
    visualizer.finalize()
    # Get access to the axes object and modify labels
    plt.savefig(outpath, bbox_inches='tight')
    plt.clf()
    start = time.time()
    clf.fit(x_train,y_train)
    end = time.time()
    print("Training SVM took" , (end - start)*1000, " ms")
    accuracy = clf.score(x_test,y_test)
    return accuracy

# K Nearest Neighbors

In [11]:
def knn(x_train, x_test, y_train, y_test):
    clf = KNeighborsClassifier(3)
    # Create the learning curve visualizer
    visualizer = LearningCurve(
        clf
    )

    visualizer.fit(x_train, y_train)        # Fit the data to the visualizer
   # visualizer.show()           # Finalize and render the figure

    clf.fit(x_train,y_train)
    accuracy = clf.score(x_test,y_test)

    
    return visualizer, accuracy

# Generate Train and Test Data

In [None]:
# X, y = load_heart()
X, y = load_mobile()
# X, y = load_weather()
# X, y = load_adult()
# dt nn boosting svm knn
data = [load_mobile(), load_heart()]
names = ['mobile', 'heart']
for i in range(2):
    x_train, x_test, y_train, y_test = train_test_split(data[i][0],data[i][1], test_size=0.1, random_state=12)
    name=names[i]
    print('=============================')
    print(f"Generating Data for {name}")
    print('=============================')
    dif = get_ideal_dt_depth(x_train, x_test, y_train, y_test, f'out/dt_pruning_{name}')
    ideal_depth = np.argsort(dif)[-2]
    accuracy = dt(x_train, x_test, y_train, y_test, ideal_depth, f'out/dt_{name}.png',name)
    print("DT accuracy: {0:.3f}%".format(accuracy))
    
#     accuracy = nn(x_train, x_test, y_train, y_test, f'out/nn_{name}.png', name, hidden_layers=(64,12), lr=0.001)
#     print("NN accuracy: {0:.3f}%".format(accuracy))
#     accuracy = nn(x_train, x_test, y_train, y_test, f'out/nn_big_{name}.png', name, (1000,), 0.003)
#     print("NN With More Layers accuracy: {0:.3f}%".format(accuracy))
    
    accuracy = boosting(x_train, x_test, y_train, y_test, ideal_depth, 50, f'out/boosting_{name}.png', name)
    print("Boosting accuracy: {0:.3f}%".format(accuracy))
    accuracy = boosting(x_train, x_test, y_train, y_test, ideal_depth, 500, f'out/boosting_more_estimators_{name}.png', name)
    print("Boosting More Estimators accuracy: {0:.3f}%".format(accuracy))
    
    
    accuracy = svm(x_train, x_test, y_train, y_test, f'out/boosting_more_estimators_{name}.png')
    print("SVM accuracy: {0:.3f}%".format(accuracy))




Generating Data for mobile
Training DT took 3.2320022583007812  ms
DT accuracy: 0.725%
Training AdaBoost took 156.4490795135498  ms
Boosting accuracy: 0.745%
