In [76]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rcParams
from matplotlib.cm import rainbow
from functools import lru_cache
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn import svm

In [2]:
@lru_cache(maxsize=None)
def load_bank():
    # https://archive.ics.uci.edu/ml/datasets/Bank+Marketing
    df=pd.read_csv('data/bank.csv', sep=';',header=0)
    # Basic data cleanup to get rid of bad values and turn Y into 0-1 val
    df=df.dropna(axis=1, how='all')
    df=df.dropna(axis=0, how='any')
    cat_cols=[]
    i = 0 
    for eachcol in df.dtypes:
        if eachcol.name=="object":
            cat_cols.append(df.columns[i])
        i=i+1
    # Convert the string values into integers, and give each value its own column, hot encode
    df=pd.get_dummies(df,columns=cat_cols)
    df.head()
    X=df.iloc[:,0:-2]
    y=df['y_yes']
    # Use sklearn to split up the dataset
    return X, y


In [3]:
@lru_cache(maxsize=None)
def load_heart():
    # https://www.kaggle.com/ronitf/heart-disease-uci
    df = pd.read_csv("data/heart.csv")
    X = df.iloc[:,0:-1]
    y = df.iloc[:,-1]
    return X, y

In [4]:
@lru_cache(maxsize=None)
def load_mobile():
    # https://www.kaggle.com/iabhishekofficial/mobile-price-classification
    df = pd.read_csv("data/phone_price.csv")
    print(df['price_range'])
    X = df.iloc[:,0:-1]
    y = df.iloc[:,-1]
    return X, y

In [79]:
# X, y = load_heart()
X, y = load_mobile()

# Understanding the Data

In [72]:
def save_figs(dX, y, name):
    plt.matshow(X.corr())
    plt.yticks(np.arange(X.shape[1]), X.columns)
    plt.xticks(np.arange(X.shape[1]), X.columns)
    plt.colorbar()
    plt.savefig(f'{name}_correlation.png', bbox_inches='tight')
    plt.clf()
    X.hist()
    plt.savefig(f'{name}_histogram.png', bbox_inches='tight')
    plt.clf()
    y.hist()
    plt.savefig(f'{name}_y_histogram.png', bbox_inches='tight')
    plt.clf()
make_figs(X,y,"mobile")

Shape is  (2000, 20)


<Figure size 432x432 with 0 Axes>

<Figure size 576x432 with 0 Axes>

In [75]:
print("X Shape is ",X.shape)

# Empty array that will hold our classifiers
classifiers = []

X Shape is  (303, 13)


# Decision Tree

In [80]:
# dtc = DecisionTreeClassifier(criterion='entropy', ccp_alpha=0.015)
# classifiers.append((dtc, "Decision Tree"))
# x_train, x_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=12)
# # https://scikit-learn.org/stable/auto_examples/tree/plot_cost_complexity_pruning.html#sphx-glr-auto-examples-tree-plot-cost-complexity-pruning-py
# path = dtc.cost_complexity_pruning_path(x_train, y_train)
# ccp_alphas, impurities = path.ccp_alphas, path.impurities
# fig, ax = plt.subplots()
# ax.plot(ccp_alphas[:-1], impurities[:-1], marker='o', drawstyle="steps-post")
# ax.set_xlabel("effective alpha")
# ax.set_ylabel("total impurity of leaves")
# ax.set_title("Total Impurity vs effective alpha for training set")
# dtc.fit(x_train,y_train)
# print(dtc.get_depth())
# print(dtc.get_params())
# print(dtc.score(x_test,y_test))

dtc = DecisionTreeClassifier()
x_train, x_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=12)
dtc.fit(x_train,y_train)
y_pred = dtc.predict(x_test)
print(y_pred.shape)
confusion_matrix(y_test, y_pred)


(400,)


array([[92, 10,  0,  0],
       [13, 81, 11,  0],
       [ 0,  8, 72, 15],
       [ 0,  0, 12, 86]])

# Supervised Learning Algorithms 

In [49]:
classifiers = [
    (DecisionTreeClassifier(), "Decision Tree"),
    (MLPClassifier(), "Neural Network"),
    (AdaBoostClassifier(), "Boost"),
    (svm.SVC(), "SVM"),
    (KNeighborsClassifier(3), "KNN")
]
kfold = KFold(10, shuffle=True, random_state=1)
for train_index, test_index in kfold.split(X):
    for clf in classifiers:
        x_train, x_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        model = clf[0].fit(x_train, y_train)
        accuracy = model.score(x_test, y_test)
        print(clf[1])
        print("accuracy: {0:.3f}%".format(accuracy))


Decision Tree
accuracy: 0.810%
Neural Network
accuracy: 0.660%
Boost
accuracy: 0.665%
SVM
accuracy: 0.925%
KNN
accuracy: 0.905%
Decision Tree
accuracy: 0.860%
Neural Network
accuracy: 0.635%
Boost
accuracy: 0.695%
SVM
accuracy: 0.965%
KNN
accuracy: 0.910%
Decision Tree
accuracy: 0.855%
Neural Network
accuracy: 0.665%
Boost
accuracy: 0.685%
SVM
accuracy: 0.950%
KNN
accuracy: 0.935%
Decision Tree
accuracy: 0.845%
Neural Network
accuracy: 0.735%
Boost
accuracy: 0.685%
SVM
accuracy: 0.945%
KNN
accuracy: 0.910%
Decision Tree
accuracy: 0.845%
Neural Network
accuracy: 0.590%
Boost
accuracy: 0.805%
SVM
accuracy: 0.950%
KNN
accuracy: 0.910%
Decision Tree
accuracy: 0.850%
Neural Network
accuracy: 0.540%
Boost
accuracy: 0.800%
SVM
accuracy: 0.960%
KNN
accuracy: 0.915%
Decision Tree
accuracy: 0.860%
Neural Network
accuracy: 0.650%
Boost
accuracy: 0.820%
SVM
accuracy: 0.980%
KNN
accuracy: 0.945%
Decision Tree
accuracy: 0.845%
Neural Network
accuracy: 0.650%
Boost
accuracy: 0.800%
SVM
accuracy: 0.94