In [574]:
import time as time
from tabulate import tabulate

import pandas as pd
from pandas import DataFrame

import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn import preprocessing
from sklearn import neighbors
from sklearn import linear_model
from sklearn import tree
from sklearn import ensemble
from sklearn import svm

# Data Collection & Pre-Processing

In [575]:
breast_cancer = pd.read_csv(r'G:\My Drive\FH_Technikum\MSC\Semester_2_SS2022\MLE\Data\breast-cancer-diagnostic.shuf.lrn.csv')
breast_cancer_data = breast_cancer.loc[:, ~breast_cancer.columns.isin(['ID', 'class'])]
breast_cancer_class = breast_cancer.loc[:, breast_cancer.columns == 'class']
breast_cancer_ids = breast_cancer.loc[:, breast_cancer.columns == 'ID']

# Encoding classes (true, false) to numerical values
le_breast_cancer_class = preprocessing.LabelEncoder()
breast_cancer_class = le_breast_cancer_class.fit_transform(breast_cancer_class.values.ravel())
breast_cancer_class = DataFrame(breast_cancer_class)

In [576]:
stroke = pd.read_csv(r'G:\My Drive\FH_Technikum\MSC\Semester_2_SS2022\MLE\Data\stroke.shuf.lrn.csv')
stroke['bmi'] = stroke['bmi'].fillna(0)

stroke_data = stroke.loc[:, ~stroke.columns.isin(['ID', 'stroke'])]
stroke_class = stroke.loc[:, stroke.columns == 'stroke']
stroke_ids = stroke.loc[:, stroke.columns == 'ID']

# Encoding classes (true, false) to numerical values
le_stroke_class = preprocessing.LabelEncoder()
stroke_class = le_stroke_class.fit_transform(stroke_class.values.ravel())
stroke_class = DataFrame(stroke_class)

# Encoding categorical string values
le_gender = preprocessing.LabelEncoder()
stroke_data.loc[:,'gender'] = le_gender.fit_transform(stroke_data['gender'].values.ravel())

le_gender = preprocessing.LabelEncoder()
stroke_data.loc[:,'ever_married'] = le_gender.fit_transform(stroke_data['ever_married'].values.ravel())

work_data = pd.get_dummies(stroke_data['work_type'], prefix='work')
stroke_data = stroke_data.loc[:, stroke_data.columns != 'work_type']
stroke_data = stroke_data.join(work_data)

residence_data = pd.get_dummies(stroke_data['Residence_type'], prefix='residence')
stroke_data = stroke_data.loc[:, stroke_data.columns != 'Residence_type']
stroke_data = stroke_data.join(residence_data)

smoking_data = pd.get_dummies(stroke_data['smoking_status'], prefix='smoking')
stroke_data = stroke_data.loc[:, stroke_data.columns != 'smoking_status']
stroke_data = stroke_data.join(smoking_data)

len(stroke)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stroke_data.loc[:,'gender'] = le_gender.fit_transform(stroke_data['gender'].values.ravel())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stroke_data.loc[:,'ever_married'] = le_gender.fit_transform(stroke_data['ever_married'].values.ravel())


2555

# Preparation for running loops

In [577]:
# These are our feature sets; we will use each of them individually to train classifiers
trainingSets = [
    {'data': breast_cancer_data, 'class': breast_cancer_class, 'title': 'Breast Cancer'},
    {'data': stroke_data, 'class': stroke_class, 'title': 'Stroke'}
]

# Classification

## K-NN

In [578]:
classifier_results_table_headers = ['Classifier', 'Parameters', 'Dataset', 'Training time (seconds)', 'Test time (seconds)', 'Accuracy', 'Weighted F1', 'Predicted values', 'Test values']
measurements = DataFrame(columns=classifier_results_table_headers)

In [579]:
classifier_name = 'K-NN'

for trainingSet in trainingSets:
    print("Processing training set: " + str(trainingSet['title']))

    X_train, X_test, y_train, y_test = train_test_split(trainingSet['data'], trainingSet['class'], test_size=0.33, random_state=547998)

    training_times = []
    test_times = []
    accuracy_measures = []
    weithged_f1_measures = []
    neighbors_params = [3, 5, 10]

    for n_neighbors in neighbors_params:
        classifier = neighbors.KNeighborsClassifier(n_neighbors = n_neighbors)

        start_training = time.time()
        modle = classifier.fit(X=X_train, y=y_train.values.ravel())
        training_time = time.time() - start_training

        start_testing = time.time()
        y_pred = modle.predict(X=X_test)
        test_time = time.time() - start_testing

        accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
        f1_measure = f1_score(y_true=y_test, y_pred=y_pred, average='weighted')

        measurements.loc[len(measurements)] = [ classifier_name, 'neighbors: ' + str(n_neighbors), trainingSet['title'], training_time, test_time, accuracy, f1_measure, y_pred, y_test]


Processing training set: Breast Cancer
Processing training set: Stroke


## Perceptron

In [580]:
classifier_name = 'Perceptron'

for trainingSet in trainingSets:
    print("Processing training set: " + str(trainingSet['title']))
    best_f1 = 0
    
    X_train, X_test, y_train, y_test = train_test_split(trainingSet['data'], trainingSet['class'], test_size=0.33, random_state=547998)
            
    training_times = []
    test_times = []
    accuracy_measures = []
    weithged_f1_measures = []
    alphas = np.logspace(-1, 1, 5)
    penalties = ['l2', 'l1']

    for alpha in alphas:
        for penalty in penalties:
            classifier = linear_model.Perceptron(alpha=alpha, penalty=penalty, random_state=547998)

            start_training = time.time()
            modle = classifier.fit(X=X_train, y=y_train.values.ravel())
            training_time = time.time() - start_training

            start_testing = time.time()
            y_pred = modle.predict(X=X_test)
            test_time = time.time() - start_testing
            
            accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
            f1_measure = f1_score(y_true=y_test, y_pred=y_pred, average='weighted')

            measurements.loc[len(measurements)] = [ classifier_name, 'alpha: ' + str(alpha) + ' penalty: ' + str(penalty), trainingSet['title'], training_time, test_time, accuracy, f1_measure, y_pred, y_test]

Processing training set: Breast Cancer
Processing training set: Stroke


## Decision Tree

In [581]:
classifier_name = 'Decision Tree'

for trainingSet in trainingSets:
    print("Processing training set: " + str(trainingSet['title']))
    
    X_train, X_test, y_train, y_test = train_test_split(trainingSet['data'], trainingSet['class'], test_size=0.33, random_state=547998)
            
    training_times = []
    test_times = []
    accuracy_measures = []
    weithged_f1_measures = []
    min_samples_splits = [2, 50, 100, 1000]
    min_samples_leafs = [1, 50, 100, 1000]

    for min_samples_split in min_samples_splits:
        for min_samples_leaf in min_samples_leafs:
            classifier = tree.DecisionTreeClassifier(criterion='gini', splitter='best', min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, random_state=547998)

            start_training = time.time()
            modle = classifier.fit(X=X_train, y=y_train.values.ravel())
            training_time = time.time() - start_training

            start_testing = time.time()
            y_pred = modle.predict(X=X_test)
            test_time = time.time() - start_testing
            
            accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
            f1_measure = f1_score(y_true=y_test, y_pred=y_pred, average='weighted')

            measurements.loc[len(measurements)] = [ classifier_name, 'min. samples/split: ' + str(min_samples_split) + ' min. samples/leaf: ' + str(min_samples_leaf), trainingSet['title'], training_time, test_time, accuracy, f1_measure, y_pred, y_test]

Processing training set: Breast Cancer
Processing training set: Stroke


## Random Forest

In [582]:
classifier_name = 'Random Forest'

for trainingSet in trainingSets:
    print("Processing training set: " + str(trainingSet['title']))
    
    X_train, X_test, y_train, y_test = train_test_split(trainingSet['data'], trainingSet['class'], test_size=0.33, random_state=547998)
            
    training_times = []
    test_times = []
    accuracy_measures = []
    weithged_f1_measures = []
    n_estimators = [20, 100] # = Number of trees
    max_features = ['sqrt', 'log2']

    for n_estimators_value in n_estimators:
        for max_features_value in max_features:
            classifier = ensemble.RandomForestClassifier(n_estimators=n_estimators_value, max_features=max_features_value, random_state=547998)

            start_training = time.time()
            modle = classifier.fit(X=X_train, y=y_train.values.ravel())
            training_time = time.time() - start_training

            start_testing = time.time()
            y_pred = modle.predict(X=X_test)
            test_time = time.time() - start_testing
            
            accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
            f1_measure = f1_score(y_true=y_test, y_pred=y_pred, average='weighted')


            measurements.loc[len(measurements)] = [ classifier_name, 'n estimators: ' + str(n_estimators_value) + ' max. feaatures: ' + str(max_features_value), trainingSet['title'], training_time, test_time, accuracy, f1_measure, y_pred, y_test]

Processing training set: Breast Cancer
Processing training set: Stroke


## SVM (SVC-Classifier)

In [583]:
classifier_name = 'SVC'

for trainingSet in trainingSets:
    print("Processing training set: " + str(trainingSet['title']))

    X_train, X_test, y_train, y_test = train_test_split(trainingSet['data'], trainingSet['class'], test_size=0.33, random_state=547998)

    training_times = []
    test_times = []
    accuracy_measures = []
    weithged_f1_measures = []

    classifier = svm.SVC(random_state=547998)

    start_training = time.time()
    modle = classifier.fit(X=X_train, y=y_train.values.ravel())
    training_time = time.time() - start_training

    start_testing = time.time()
    y_pred = modle.predict(X=X_test)
    test_time = time.time() - start_testing
    
    accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
    f1_measure = f1_score(y_true=y_test, y_pred=y_pred, average='weighted')

    measurements.loc[len(measurements)] = [ classifier_name, 'SVC default', trainingSet['title'], training_time, test_time, accuracy, f1_measure, y_pred, y_test]

Processing training set: Breast Cancer
Processing training set: Stroke


## Results

In [584]:
measurements_table_data = measurements.iloc[:, :7]
decimals = 5
measurements_table_data['Training time (seconds)'] = measurements_table_data['Training time (seconds)'].round(decimals)
measurements_table_data['Test time (seconds)'] = measurements_table_data['Test time (seconds)'].round(decimals)
measurements_table_data['Accuracy'] = measurements_table_data['Accuracy'].round(decimals)
measurements_table_data['Weighted F1'] = measurements_table_data['Weighted F1'].round(decimals)
print(tabulate(measurements_table_data, headers='keys', tablefmt='orgtbl'))

|    | Classifier    | Parameters                                       | Dataset       |   Training time (seconds) |   Test time (seconds) |   Accuracy |   Weighted F1 |
|----+---------------+--------------------------------------------------+---------------+---------------------------+-----------------------+------------+---------------|
|  0 | K-NN          | neighbors: 3                                     | Breast Cancer |                   0.00107 |               0.00317 |    0.89474 |       0.89443 |
|  1 | K-NN          | neighbors: 5                                     | Breast Cancer |                   0.001   |               0.002   |    0.85263 |       0.8522  |
|  2 | K-NN          | neighbors: 10                                    | Breast Cancer |                   0.00102 |               0.00199 |    0.87368 |       0.87221 |
|  3 | K-NN          | neighbors: 3                                     | Stroke        |                   0.00102 |               0.03899 |    

In [585]:
measurements_table_data.to_csv('C:/Users/mailb/Desktop/results.csv', sep=';', index=False, decimal=',')

# Retrain

## Perceptron & Stroke Dataset

Retraining the stroke dataset with the K-NN Classifier using 10 neighbors as this yielded the best results

In [586]:
retrain_stroke_test = pd.read_csv(r'G:\My Drive\FH_Technikum\MSC\Semester_2_SS2022\MLE\Data\stroke.shuf.tes.csv')
retrain_stroke_test = retrain_stroke_test.fillna(0)

retrain_stroke_test_data = retrain_stroke_test.loc[:, ~retrain_stroke_test.columns.isin(['ID'])]
retrain_stroke_test_ids = retrain_stroke_test.loc[:, retrain_stroke_test.columns == 'ID']

# Encoding categorical string values
le_gender = preprocessing.LabelEncoder()
retrain_stroke_test_data.loc[:,'gender'] = le_gender.fit_transform(retrain_stroke_test_data['gender'].values.ravel())

le_gender = preprocessing.LabelEncoder()
retrain_stroke_test_data.loc[:,'ever_married'] = le_gender.fit_transform(retrain_stroke_test_data['ever_married'].values.ravel())

work_data = pd.get_dummies(retrain_stroke_test_data['work_type'], prefix='work')
retrain_stroke_test_data = retrain_stroke_test_data.loc[:, retrain_stroke_test_data.columns != 'work_type']
retrain_stroke_test_data = retrain_stroke_test_data.join(work_data)

residence_data = pd.get_dummies(retrain_stroke_test_data['Residence_type'], prefix='residence')
retrain_stroke_test_data = retrain_stroke_test_data.loc[:, retrain_stroke_test_data.columns != 'Residence_type']
retrain_stroke_test_data = retrain_stroke_test_data.join(residence_data)

smoking_data = pd.get_dummies(retrain_stroke_test_data['smoking_status'], prefix='smoking')
retrain_stroke_test_data = retrain_stroke_test_data.loc[:, retrain_stroke_test_data.columns != 'smoking_status']
retrain_stroke_test_data = retrain_stroke_test_data.join(smoking_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  retrain_stroke_test_data.loc[:,'gender'] = le_gender.fit_transform(retrain_stroke_test_data['gender'].values.ravel())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  retrain_stroke_test_data.loc[:,'ever_married'] = le_gender.fit_transform(retrain_stroke_test_data['ever_married'].values.ravel())


In [587]:
classifier = tree.DecisionTreeClassifier(criterion='gini', splitter='best', min_samples_split=100, min_samples_leaf=1, random_state=547998)

trainingSet = trainingSets[1]

modle = classifier.fit(X=trainingSet['data'], y=trainingSet['class'].values.ravel())

y_pred = modle.predict(X=retrain_stroke_test_data)

retrain_stroke_prediction = DataFrame(index=retrain_stroke_test_ids.ID)
retrain_stroke_prediction['stroke'] = le_stroke_class.inverse_transform(y_pred)

retrain_stroke_prediction.to_csv('C:/Users/mailb/Desktop/stroke_predictions.csv', sep=',', index=True, decimal=',', index_label='ID')


## Random Forest & Cancer Dataset

Retraining the cancer dataset with the Random Forest Classifier using n-estimators 20 and max. features log2 as parameters.

In [588]:
retrain_cancer_test = pd.read_csv(r'G:\My Drive\FH_Technikum\MSC\Semester_2_SS2022\MLE\Data\breast-cancer-diagnostic.shuf.tes.csv')
retrain_cancer_test_data = retrain_cancer_test.loc[:, ~retrain_cancer_test.columns.isin(['ID'])]
retrain_cancer_test_ids = retrain_cancer_test.loc[:, retrain_cancer_test.columns == 'ID']

In [589]:
classifier = ensemble.RandomForestClassifier(n_estimators=20, max_features='log2', random_state=547998)

trainingSet = trainingSets[0]

modle = classifier.fit(X=trainingSet['data'], y=trainingSet['class'].values.ravel())

y_pred = modle.predict(X=retrain_cancer_test_data)

retrain_stroke_prediction = DataFrame(index=retrain_cancer_test_ids.ID)
retrain_stroke_prediction['class'] = le_breast_cancer_class.inverse_transform(y_pred)

retrain_stroke_prediction.to_csv('C:/Users/mailb/Desktop/cancer_predictions.csv', sep=',', index=True, decimal=',', index_label='ID')