In [None]:
!pip install scipy numpy matplotlib pandas sklearn > /dev/null

In [None]:
import sys
import scipy
import numpy
import matplotlib
import pandas
import sklearn
# Load libraries
from pandas import read_csv
from pandas.plotting import scatter_matrix
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
import json

<h2>Dataset configuration</h2>

In [None]:
class Config:
    """Settings for a single dataset run.

    The constructor stores its four arguments unchanged as attributes of
    the same names:

    dataSourceUrl        -- location passed to the CSV reader
    test_size            -- value passed to train_test_split as ``test_size``
    n_splits             -- value passed to StratifiedKFold as ``n_splits``
    should_describe_data -- when truthy, the load cell prints a short summary
    """

    def __init__(self, dataSourceUrl, test_size, n_splits, should_describe_data):
        self.dataSourceUrl, self.test_size = dataSourceUrl, test_size
        self.n_splits, self.should_describe_data = n_splits, should_describe_data

def as_config(dict):
    """Build a Config from the entry selected by ``dict['chosenDataSet']``.

    ``dict`` is the parsed configuration mapping. Its 'chosenDataSet' value
    names another key of the same mapping, and that entry must provide
    'dataSourceUrl', 'test_size', 'n_splits' and 'should_describe_data'.
    A missing key surfaces as the KeyError raised by the lookup itself.
    """
    # The parameter name shadows the builtin `dict`; it is kept so existing
    # keyword callers keep working.
    selected = dict[dict['chosenDataSet']]
    field_order = ('dataSourceUrl', 'test_size', 'n_splits', 'should_describe_data')
    return Config(*(selected[field] for field in field_order))

In [None]:
# Settings for every available dataset, kept as JSON text.
# "chosenDataSet" names the entry that the rest of the notebook runs on.
json_config = """
{
    "chosenDataSet": "cancer",
    "iris":{
        "dataSourceUrl": "./data/iris.csv",
        "test_size": 0.2,
        "n_splits": 10,
        "should_describe_data": true
    },
    "wine-quality":{
        "dataSourceUrl": "./data/winequality-white.csv",
        "test_size": 0.2,
        "n_splits": 10,
        "should_describe_data": true
    },
    "glass": {
        "dataSourceUrl": "./data/glass/glass.csv",
        "test_size": 0.2,
        "n_splits": 10,
        "should_describe_data": true
    },
    "cancer": {
        "dataSourceUrl": "./data/cancer.csv",
        "test_size": 0.2,
        "n_splits": 10,
        "should_describe_data": true
    },
    "titanic": {
        "dataSourceUrl": "./data/titanic.csv",
        "test_size": 0.2,
        "n_splits": 10,
        "should_describe_data": true
    }
}"""
# Parse the text and wrap the selected entry in a Config in a single step,
# so `cfg` only ever refers to the Config object.
cfg = as_config(json.loads(json_config))

In [None]:
def plot_show():
    """Redraw via ``pyplot.draw()`` and then call ``pyplot.pause`` for 0.1 s."""
    refresh_seconds = 0.1
    pyplot.draw()
    pyplot.pause(refresh_seconds)

<h2>Load (and describe) dataset</h2>

In [None]:
# Read the CSV selected in the config; row 0 supplies the column names.
dataset = read_csv(cfg.dataSourceUrl, header=0)

if cfg.should_describe_data:
    # Dimensions of the loaded frame as (rows, columns).
    print(dataset.shape)
    # Further summaries that can be switched on when needed:
    # print(dataset.head(20))
    # print(dataset.describe())
    # The last column is used as the class label; print the row count per class.
    classColumnName = dataset.columns[-1]
    print(dataset.groupby(by=classColumnName).size())

In [None]:
# Optional exploratory plots (disabled). Uncomment to draw them.
# dataset.plot(kind='box', subplots=True, sharex=False, sharey=False)
# plot_show()

# scatter_matrix(dataset)
# plot_show()


<h2>Creating train and test arrays</h2>

In [None]:
# Raw value matrix of the frame: every column except the last is a feature,
# the last column is the label.
array = dataset.values
x, y = array[:, :-1], array[:, -1]
# From x and y, derive the training and validation arrays; the fixed seed
# makes the split repeatable.
x_train, x_validation, y_train, y_validation = train_test_split(
    x, y, test_size=cfg.test_size, random_state=1
)

<h2>Classification models</h2>

In [None]:
# Each entry is (display name, estimator, row index in the confusion-matrix grid).
# Estimators with internal randomness (the decision tree and the MLP) are seeded
# with the same random_state=1 that the train/test split and the k-fold shuffle
# use, so rerunning the notebook reproduces the same scores.
models = [
    ('KNN', KNeighborsClassifier(), 0),
    ('CART', DecisionTreeClassifier(random_state=1), 1),
    ('NB', GaussianNB(), 2),
    ('SVM', SVC(gamma='auto'), 3),
    ('MLP', MLPClassifier(alpha=1e-5, hidden_layer_sizes=(50, 10), max_iter=5000,
                          random_state=1), 4),
]

<h2>Classification</h2>

In [None]:
# Cross-validate every model, evaluate it on the held-out validation split,
# draw its confusion matrices, and finally compare the cross-validation scores.
results = []
names = []

# One grid row per model, two panels per row (raw and normalized matrix).
# The row count follows the model list instead of a hard-coded 5, and
# squeeze=False keeps `axes` two-dimensional even for a single row.
# max(..., 1) avoids asking matplotlib for a zero-row grid when `models` is empty.
fig, axes = pyplot.subplots(max(len(models), 1), 2, sharex=True, sharey=True,
                            figsize=(20, 5), squeeze=False,
                            gridspec_kw={'hspace': 1, 'wspace': 1})
fig.suptitle("Confusion matrices")

# The splitter is seeded with an integer, so one instance yields the same
# folds for every model; there is no need to rebuild it per iteration.
kfold = StratifiedKFold(n_splits=cfg.n_splits, random_state=1, shuffle=True)

for name, model, subplot_row in models:
    print(f"---------------------------\nRunning classification for: {name}")
    cv_results = cross_val_score(model, x_train, y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    # Mean and standard deviation of the per-fold accuracies.
    print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))

    # Fit on the full training split and predict the validation split.
    model.fit(x_train, y_train)
    predictions = model.predict(x_validation)

    print(accuracy_score(y_validation, predictions))
    print(confusion_matrix(y_validation, predictions))

    # (panel title, `normalize` argument, column in the grid)
    titles_options = [(f"{name}: CF", None, 0), (f"{name}: normalized CF", 'true', 1)]
    for title, normalize, subplot_num in titles_options:
        # NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2;
        # ConfusionMatrixDisplay.from_estimator is its replacement, but moving
        # to it also requires changing the notebook's import cell.
        disp = plot_confusion_matrix(model, x_validation, y_validation,
                                     cmap=pyplot.cm.Blues,
                                     ax=axes[subplot_row, subplot_num],
                                     normalize=normalize)
        disp.ax_.set_title(title)
        plot_show()
    print(classification_report(y_validation, predictions))


# Compare Algorithms: one box per model over its cross-validation accuracies.
fig = pyplot.figure()
fig.suptitle("Algorithm Comparison")
ax = fig.add_subplot(1, 1, 1)
ax.set_title("Algorithm Comparison")
ax.boxplot(results, labels=names)
plot_show()

try:
    input("Press Enter to continue")
except (EOFError, NotImplementedError):
    # No interactive stdin is available (closed stdin raises EOFError; a
    # headless notebook run raises IPython's StdinNotImplementedError, which
    # derives from NotImplementedError). There is nothing to wait for.
    pass
