In [None]:
!pip install scipy numpy matplotlib pandas sklearn > /dev/null

In [None]:
# Load libraries
import numpy as np
import matplotlib.pyplot as plt
from pandas import read_csv
from matplotlib import pyplot
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, plot_roc_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
import json

<h2>Dataset configuration</h2>

In [None]:
class Config:
    """Plain holder for the settings of one dataset run.

    Attributes (each stored unchanged from the constructor argument of the
    same name):
        dataSourceUrl: location of the CSV file to load.
        test_size: value handed to the train/validation split.
        n_splits: number of cross-validation folds.
        should_describe_data: whether the loaded dataset should be summarised.
    """

    def __init__(self, dataSourceUrl, test_size, n_splits, should_describe_data):
        # Copy every argument onto the instance in one unpacking assignment.
        (
            self.dataSourceUrl,
            self.test_size,
            self.n_splits,
            self.should_describe_data,
        ) = (dataSourceUrl, test_size, n_splits, should_describe_data)

def as_config(dict):
    """Build a Config from the parsed configuration mapping.

    The mapping must contain a 'chosenDataSet' entry naming one of its other
    keys; the settings stored under that key are passed positionally to
    Config. A missing key raises KeyError.

    Note: the parameter name shadows the builtin ``dict``; it is kept so that
    existing callers are unaffected.
    """
    selected = dict[dict['chosenDataSet']]
    field_order = ('dataSourceUrl', 'test_size', 'n_splits', 'should_describe_data')
    return Config(*[selected[field] for field in field_order])

In [None]:
# Every dataset preset lives in this one JSON document; "chosenDataSet" names
# the preset that the rest of the notebook runs with.
json_config = """
{
    "chosenDataSet": "cancer",
    "iris":{
        "dataSourceUrl": "./data/iris.csv",
        "test_size": 0.2,
        "n_splits": 10,
        "should_describe_data": true
    },
    "wine-quality":{
        "dataSourceUrl": "./data/winequality-white.csv",
        "test_size": 0.2,
        "n_splits": 10,
        "should_describe_data": true
    },
    "glass": {
        "dataSourceUrl": "./data/glass/glass.csv",
        "test_size": 0.2,
        "n_splits": 10,
        "should_describe_data": true
    },
    "cancer": {
        "dataSourceUrl": "./data/cancer.csv",
        "test_size": 0.2,
        "n_splits": 10,
        "should_describe_data": true
    },
    "titanic": {
        "dataSourceUrl": "./data/titanic.csv",
        "test_size": 0.2,
        "n_splits": 10,
        "should_describe_data": true
    }
}"""
# Parse the JSON text and wrap the selected preset in a Config object.
cfg = as_config(json.loads(json_config))

In [None]:
def plot_show():
    """Redraw the current pyplot figure, then pause briefly via pyplot.pause."""
    pause_seconds = 0.1
    pyplot.draw()
    pyplot.pause(pause_seconds)

def get_specificity(y_validate, y_predicted):
    """Return the specificity, TN / (TN + FP), of a set of predictions.

    The confusion matrix is flattened and unpacked into exactly four counts,
    so this only works when the matrix is 2x2 (two classes).
    """
    matrix = confusion_matrix(y_validate, y_predicted)
    true_neg, false_pos, _false_neg, _true_pos = matrix.ravel()
    negatives_total = true_neg + false_pos
    return true_neg / negatives_total
    
def get_learning_curve(classification_model, name_of_model, x_train, y_train, training_set_enlarging_step=10, scoring="accuracy", number_of_cv_generations=10):
    """Plot the mean cross-validated score of a model against the training-set size.

    Parameters
    ----------
    classification_model
        Estimator forwarded unchanged to ``learning_curve``.
    name_of_model : str
        Text used for the legend entry and in the figure title.
    x_train, y_train
        Training features and labels forwarded to ``learning_curve``.
    training_set_enlarging_step : int
        Number of evenly spaced training-set fractions between 0.1 and 1.0
        (despite the name, this is a count of points, not a step width).
    scoring : str or callable
        Scoring forwarded to ``learning_curve``. For string scorers whose name
        starts with ``neg_`` the sign is flipped so the plotted value is the
        positive error.
    number_of_cv_generations
        Forwarded to ``learning_curve`` as ``cv``.

    Returns
    -------
    None. A new figure is created and shown.
    """
    train_fractions = np.linspace(0.1, 1, training_set_enlarging_step)
    train_sizes, _train_scores, test_scores = learning_curve(
        classification_model,
        x_train,
        y_train,
        train_sizes=train_fractions,
        scoring=scoring,
        cv=number_of_cv_generations,
    )

    # Average over the CV folds (axis 1) to get one value per training size.
    mean_test_scores = test_scores.mean(axis=1)

    # Only "neg_*" scorers (e.g. neg_mean_squared_error) need a sign flip;
    # negating accuracy-like scores would plot them upside down.
    metric_label = scoring if isinstance(scoring, str) else getattr(scoring, "__name__", "score")
    if metric_label.startswith("neg_"):
        mean_test_scores = -mean_test_scores
        metric_label = metric_label[len("neg_"):]

    plt.figure()
    plt.plot(train_sizes, mean_test_scores, 'o-', color="r", label=name_of_model)
    plt.xlabel("Train size")
    plt.ylabel("Mean CV score (%s)" % metric_label)
    # Use the argument, not a notebook-level `name` variable that may not exist.
    plt.title('Learning curves %s' % name_of_model)
    plt.legend(loc="best")
    plt.show()

<h2>Load (and describe) dataset</h2>

In [None]:
# Read the CSV selected in the configuration; row 0 supplies the column names.
dataset = read_csv(cfg.dataSourceUrl, header=0)

if cfg.should_describe_data:
    # Dimensions of the loaded frame as (rows, columns).
    print(dataset.shape)
    # The last column is used as the class label here.
    classColumnName = dataset.columns[-1]
    # Number of rows belonging to each class.
    print(dataset.groupby(classColumnName).size())

In [None]:
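# Optional exploratory plots (uncomment to use):
# dataset.plot(kind='box', subplots=True, sharex=False, sharey=False)
# plot_show()

# scatter_matrix needs `from pandas.plotting import scatter_matrix` first.
# scatter_matrix(dataset)
# plot_show()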


<h2>Creating train and test arrays</h2>

In [None]:
# Work on the raw values: every column except the last is a feature,
# the last column is the label.
array = dataset.values
x = array[:, :-1]
y = array[:, -1]
# From x and y, derive the training and validation arrays; the fixed seed
# keeps the split identical between runs.
x_train, x_validation, y_train, y_validation = train_test_split(
    x,
    y,
    test_size=cfg.test_size,
    random_state=1,
)

<h2>Classification models</h2>

In [None]:
# (display name, estimator, index) triples consumed by the classification loop.
# The decision tree and the MLP involve randomness, so they are seeded with the
# same random_state=1 used for the train/validation split and the CV folds;
# without a seed their scores change on every run.
models = [
    ('KNN', KNeighborsClassifier(), 0),
    ('CART', DecisionTreeClassifier(random_state=1), 1),
    ('NB', GaussianNB(), 2),
    ('SVM', SVC(gamma='auto'), 3),
    ('MLP', MLPClassifier(alpha=1e-5, hidden_layer_sizes=(50, 10), max_iter=5000, random_state=1), 4),
]

<h2>Classification</h2>
K-krotna walidacja krzyżowa (k-fold cross-validation) to metoda oceny jakości uczenia modelu (sprawdzania jego wydajności).
Zbiór TRENINGOWY jest dzielony na k podzbiorów. W każdej z k iteracji
model jest uczony na k-1 podzbiorach, a następnie sprawdzana jest 'jakość' nauczonego modelu na pozostałym, k-tym podzbiorze.
Procedura jest powtarzana dla każdego z badanych algorytmów uczenia maszynowego.

In [None]:
def _positive_class_scores(fitted_model, features):
    """Return continuous scores for ROC analysis from a fitted binary classifier.

    Prefers ``predict_proba`` (column 1, i.e. the second entry of
    ``classes_``), then ``decision_function``; falls back to the hard
    predictions only when the model offers neither.
    """
    if hasattr(fitted_model, "predict_proba"):
        return fitted_model.predict_proba(features)[:, 1]
    if hasattr(fitted_model, "decision_function"):
        return fitted_model.decision_function(features)
    return fitted_model.predict(features)


roc_values = []
results = []
names = []

# The splitter does not depend on the model, so it is built once; with an
# integer random_state every model is evaluated on the same folds.
kfold = StratifiedKFold(n_splits=cfg.n_splits, random_state=1, shuffle=True)

# NOTE: precision/recall are called with their defaults and get_specificity
# unpacks a 2x2 confusion matrix, so this cell assumes a binary target.
for name, model, _subplot_row in models:
    print(f"---------------------------\nRunning classification for: {name}")

    # Cross-validated accuracy on the training portion only.
    cv_results = cross_val_score(model, x_train, y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)

    print('\nMean %f' % cv_results.mean())
    print('STD %f' % cv_results.std())

    # Fit on the whole training set and predict the held-out validation set.
    model.fit(x_train, y_train)
    predictions = model.predict(x_validation)

    print('\nConfusion matrix:')
    print(confusion_matrix(y_validation, predictions))

    print('\nAccuracy %f' % accuracy_score(y_validation, predictions))
    print('Precision %f' % precision_score(y_validation, predictions))
    print('Recall %f' % recall_score(y_validation, predictions))
    print('Specificity %f' % get_specificity(y_validation, predictions))

    print('\nClassification report:')
    print(classification_report(y_validation, predictions))

    # Optional: get_learning_curve(model, name, x_train, y_train)

    # ROC curve: use continuous scores rather than hard class predictions,
    # otherwise the curve has a single operating point and the AUC is distorted.
    validation_scores = _positive_class_scores(model, x_validation)
    auc = roc_auc_score(y_validation, validation_scores)
    fpr, tpr, _thresholds = roc_curve(y_validation, validation_scores)
    roc_values.append([name, fpr, tpr, auc])

# Draw one ROC curve per model on a shared figure.
plt.figure()
# Unpack into dedicated names; looping with `x` would overwrite the feature
# matrix `x` created when the train/validation arrays were built.
for roc_name, roc_fpr, roc_tpr, roc_auc in roc_values:
    plt.plot(roc_fpr, roc_tpr, label="%s AUC: %f" % (roc_name, roc_auc))
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend(loc='best')
plt.show()

# Compare algorithms: one box per model showing the spread of its
# cross-validation accuracies collected in `results`.
fig, ax = plt.subplots()
fig.suptitle("Algorithm Comparison")
ax.set_title("Algorithm Comparison")
ax.boxplot(results)
# Tick labels are set separately because the `labels=` keyword of boxplot is
# deprecated since Matplotlib 3.9; set_xticklabels works on old and new versions.
ax.set_xticklabels(names)
plot_show()