In [8]:
import pandas as pd
import numpy as np
import time as time
import statistics
from tabulate import tabulate
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [9]:
data = pd.read_csv(r'G:\My Drive\FH_Technikum\MSC\Semester_2_SS2022\DAS\ComparativeExperimentation\covtype.data', header=None)
data.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,45,46,47,48,49,50,51,52,53,54
count,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,...,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0,581012.0
mean,2959.365301,155.656807,14.103704,269.428217,46.418855,2350.146611,212.146049,223.318716,142.528263,1980.291226,...,0.090392,0.077716,0.002773,0.003255,0.000205,0.000513,0.026803,0.023762,0.01506,2.051471
std,279.984734,111.913721,7.488242,212.549356,58.295232,1559.25487,26.769889,19.768697,38.274529,1324.19521,...,0.286743,0.267725,0.052584,0.056957,0.01431,0.022641,0.161508,0.152307,0.121791,1.396504
min,1859.0,0.0,0.0,0.0,-173.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,2809.0,58.0,9.0,108.0,7.0,1106.0,198.0,213.0,119.0,1024.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,2996.0,127.0,13.0,218.0,30.0,1997.0,218.0,226.0,143.0,1710.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
75%,3163.0,260.0,18.0,384.0,69.0,3328.0,231.0,237.0,168.0,2550.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
max,3858.0,360.0,66.0,1397.0,601.0,7117.0,254.0,254.0,254.0,7173.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,7.0


Split data into training and testdata as well as columns that should be predicted (y) and columns that contain data that will be used to predict (X). (= holdout method)

The columns that should be predicted (target/dependent) must be excluded from the trainingsdata to not influence the created modle.

The dependent (to be predicted) data is located in column 54 (Forest Cover Type Classes => values from 1 to 7)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(data.loc[:,:53], data.loc[:,54:], test_size=0.33, random_state=547998)
print("X_train: " + str(X_train.shape))
print("X_test: " + str(X_test.shape))
print("y_train: " + str(y_train.shape))
print("y_test: " + str(y_test.shape))

X_train: (389278, 54)
X_test: (191734, 54)
y_train: (389278, 1)
y_test: (191734, 1)


## Decision Tree
I chose to vary the parameters for min_samples_splits and min_samples_leafs to see the differences between the different values since they seem to be the most promising to have an impact on the results.

In [11]:
# result analysis helper lists
training_times = []
test_times = []
accuracy_measures = []
weithged_f1_measures = []

# algo input parameter variation lists
min_samples_splits = [2, 50, 100, 1000]
min_samples_leafs = [1, 50, 100, 1000]

for min_samples_split in min_samples_splits:
    for min_samples_leaf in min_samples_leafs:
        algo = DecisionTreeClassifier(criterion='gini', splitter='best', min_samples_split=min_samples_split, random_state=547998)

        start_training = time.time()
        modle = algo.fit(X=X_train, y=y_train)
        training_times.append(time.time() - start_training)

        start_testing = time.time()
        y_pred = modle.predict(X=X_test)
        test_times.append(time.time() - start_testing)

        accuracy_measures.append(accuracy_score(y_true=y_test, y_pred=y_pred))
        weithged_f1_measures.append(f1_score(y_true=y_test, y_pred=y_pred, average='weighted'))

        print("Run-parameters min_samples_split: [" + str(min_samples_split) + "] min_samples_leaf: [" + str(min_samples_leaf) + "]")
        print("-------------------------------------------")
        print("training time: " + str(training_times[-1]) + " seconds")
        print("testing time: " + str(test_times[-1]) + " seconds")

        print("accuracy: " + str(accuracy_measures[-1]))
        print("micro f-score: " + str(f1_score(y_true=y_test, y_pred=y_pred, average='micro')))
        print("macro f-score: " + str(f1_score(y_true=y_test, y_pred=y_pred, average='macro')))
        print("weighted f-score: " + str(weithged_f1_measures[-1]))
        print("-------------------------------------------")

        # crosschecking results
        # print(classification_report(y_true=y_test, y_pred=y_pred))

mean_training_time = statistics.mean(training_times)
mean_testing_time = statistics.mean(test_times)
mean_accuracy_measure = statistics.mean(accuracy_measures)
mean_weighted_f1_measure = statistics.mean(weithged_f1_measures)

print("mean training time: " + str(mean_training_time))
print("mean testing time: " + str(mean_testing_time))
print("mean accuracy_measures: " + str(mean_accuracy_measure))
print("mean weighted_f1_measures: " + str(mean_weighted_f1_measure))

dt_mean_training_time = mean_training_time
dt_mean_testing_time = mean_testing_time
dt_mean_accuracy_measure = mean_accuracy_measure
dt_mean_weighted_f1_measure = mean_weighted_f1_measure


Run-parameters min_samples_split: [2] min_samples_leaf: [1]
-------------------------------------------
training time: 5.714843034744263 seconds
testing time: 0.08300065994262695 seconds
accuracy: 0.9340805490940574
micro f-score: 0.9340805490940574
macro f-score: 0.8951065850043266
weighted f-score: 0.9340784980380927
-------------------------------------------
Run-parameters min_samples_split: [2] min_samples_leaf: [50]
-------------------------------------------
training time: 5.55415678024292 seconds
testing time: 0.07399892807006836 seconds
accuracy: 0.9340805490940574
micro f-score: 0.9340805490940574
macro f-score: 0.8951065850043266
weighted f-score: 0.9340784980380927
-------------------------------------------
Run-parameters min_samples_split: [2] min_samples_leaf: [100]
-------------------------------------------
training time: 6.591277599334717 seconds
testing time: 0.1358959674835205 seconds
accuracy: 0.9340805490940574
micro f-score: 0.9340805490940574
macro f-score: 0.89

## Perceptron
I chose to vary the value for perameter alpha according to this article (https://scikit-learn.org/stable/auto_examples/neural_networks/plot_mlp_alpha.html) and the value for parameter penalty, since the penalty for a failed attempt seems to have significant impact on the results.

In [12]:
# result analysis helper lists
training_times = []
test_times = []
accuracy_measures = []
weithged_f1_measures = []

# algo input parameter variation lists
alphas = np.logspace(-1, 1, 5)
penalties = ['l2', 'l1']

for alpha in alphas:
    for penalty in penalties:
        algo = Perceptron(alpha=alpha, penalty=penalty, random_state=547998)

        start_training = time.time()
        modle = algo.fit(X=X_train, y=y_train.values.ravel())
        training_times.append(time.time() - start_training)

        start_testing = time.time()
        y_pred = modle.predict(X=X_test)
        test_times.append(time.time() - start_testing)

        accuracy_measures.append(accuracy_score(y_true=y_test, y_pred=y_pred))
        weithged_f1_measures.append(f1_score(y_true=y_test, y_pred=y_pred, average='weighted'))

        print("Run-parameters penaltiy: [" + str(penalty) + "] alpha: [" + str(alpha) + "]")
        print("-------------------------------------------")
        print("training time: " + str(training_times[-1]) + " seconds")
        print("testing time: " + str(test_times[-1]) + " seconds")

        print("accuracy: " + str(accuracy_measures[-1]))
        print("micro f-score: " + str(f1_score(y_true=y_test, y_pred=y_pred, average='micro')))
        print("macro f-score: " + str(f1_score(y_true=y_test, y_pred=y_pred, average='macro')))
        print("weighted f-score: " + str(weithged_f1_measures[-1]))
        print("-------------------------------------------")

        # crosschecking results
        # print(classification_report(y_true=y_test, y_pred=y_pred))

mean_training_time = statistics.mean(training_times)
mean_testing_time = statistics.mean(test_times)
mean_accuracy_measure = statistics.mean(accuracy_measures)
mean_weighted_f1_measure = statistics.mean(weithged_f1_measures)

print("mean training time: " + str(mean_training_time))
print("mean testing time: " + str(mean_testing_time))
print("mean accuracy_measures: " + str(mean_accuracy_measure))
print("mean weighted_f1_measures: " + str(mean_weighted_f1_measure))

p_mean_training_time = mean_training_time
p_mean_testing_time = mean_testing_time
p_mean_accuracy_measure = mean_accuracy_measure
p_mean_weighted_f1_measure = mean_weighted_f1_measure

Run-parameters penaltiy: [l2] alpha: [0.1]
-------------------------------------------
training time: 15.187188148498535 seconds
testing time: 0.07785916328430176 seconds
accuracy: 0.3645206379671837
micro f-score: 0.36452063796718376
macro f-score: 0.07632625759333833
weighted f-score: 0.19475747278099903
-------------------------------------------
Run-parameters penaltiy: [l1] alpha: [0.1]
-------------------------------------------
training time: 15.86253547668457 seconds
testing time: 0.08685779571533203 seconds
accuracy: 0.464388162767167
micro f-score: 0.464388162767167
macro f-score: 0.19139712133510403
weighted f-score: 0.5018217165406026
-------------------------------------------
Run-parameters penaltiy: [l2] alpha: [0.31622776601683794]
-------------------------------------------
training time: 11.704886674880981 seconds
testing time: 0.07594466209411621 seconds
accuracy: 0.3645206379671837
micro f-score: 0.36452063796718376
macro f-score: 0.07632625759333833
weighted f-scor

The perceptron so far exhibits the worst performance. Accuracy is considerably lower than with the Decision Tree method. Also, execution times are higher.

## K-Nearest Neighbors

When running tests with different algorithms, kd-tree algorithm worked the best (fastest). Other algorithms took too long to be reasonably evaluated.

In [13]:
# result analysis helper lists
training_times = []
test_times = []
accuracy_measures = []
weithged_f1_measures = []

# algo input parameter variation lists
neighbors = [3, 5, 10]

for n_neighbors in neighbors:
    algo = KNeighborsClassifier(n_neighbors=n_neighbors, algorithm='kd_tree')

    start_training = time.time()
    modle = algo.fit(X=X_train, y=y_train.values.ravel())
    training_times.append(time.time() - start_training)

    start_testing = time.time()
    y_pred = modle.predict(X=X_test)
    test_times.append(time.time() - start_testing)

    accuracy_measures.append(accuracy_score(y_true=y_test, y_pred=y_pred))
    weithged_f1_measures.append(f1_score(y_true=y_test, y_pred=y_pred, average='weighted'))
    
    print("Run-parameters n_neighbors: [" + str(n_neighbors) + "]")
    print("-------------------------------------------")
    print("training time: " + str(training_times[-1]) + " seconds")
    print("testing time: " + str(test_times[-1]) + " seconds")

    print("accuracy: " + str(accuracy_measures[-1]))
    print("micro f-score: " + str(f1_score(y_true=y_test, y_pred=y_pred, average='micro')))
    print("macro f-score: " + str(f1_score(y_true=y_test, y_pred=y_pred, average='macro')))
    print("weighted f-score: " + str(weithged_f1_measures[-1]))
    print("-------------------------------------------")
    # crosschecking results
    # print(classification_report(y_true=y_test, y_pred=y_pred))

mean_training_time = statistics.mean(training_times)
mean_testing_time = statistics.mean(test_times)
mean_accuracy_measure = statistics.mean(accuracy_measures)
mean_weighted_f1_measure = statistics.mean(weithged_f1_measures)

print("mean training time: " + str(mean_training_time))
print("mean testing time: " + str(mean_testing_time))
print("mean accuracy_measures: " + str(mean_accuracy_measure))
print("mean weighted_f1_measures: " + str(mean_weighted_f1_measure))

knn_mean_training_time = mean_training_time
knn_mean_testing_time = mean_testing_time
knn_mean_accuracy_measure = mean_accuracy_measure
knn_mean_weighted_f1_measure = mean_weighted_f1_measure

Run-parameters n_neighbors: [3]
-------------------------------------------
training time: 13.0683274269104 seconds
testing time: 22.03102207183838 seconds
accuracy: 0.9665421886572022
micro f-score: 0.9665421886572021
macro f-score: 0.9378683335123738
weighted f-score: 0.966517381946269
-------------------------------------------
Run-parameters n_neighbors: [5]
-------------------------------------------
training time: 13.078352451324463 seconds
testing time: 25.414071559906006 seconds
accuracy: 0.9656086035862184
micro f-score: 0.9656086035862184
macro f-score: 0.933485280191597
weighted f-score: 0.9655512211595585
-------------------------------------------
Run-parameters n_neighbors: [10]
-------------------------------------------
training time: 13.829754114151001 seconds
testing time: 34.75074744224548 seconds
accuracy: 0.9555269279314049
micro f-score: 0.9555269279314049
macro f-score: 0.9157449553320155
weighted f-score: 0.9553630441510249
--------------------------------------

K-Nearest Neighbor produces the best results consistently. The best accuracy can be achieved with 3 nearest neighbors. Execution time is however the longest. But with kd-tree algorithm, execution times are manageable.

In [16]:
headers = ["Coverage", "Accuracy", "F1", "Training time", "Testing time"]

table_data = [
    ["K-NN", str(knn_mean_accuracy_measure), str(knn_mean_weighted_f1_measure), str(knn_mean_training_time), str(knn_mean_testing_time)],
    ["Perceptron", str(p_mean_accuracy_measure), str(p_mean_weighted_f1_measure), str(p_mean_training_time), str(p_mean_testing_time)],
    ["Decision Tree", str(dt_mean_accuracy_measure), str(dt_mean_weighted_f1_measure), str(dt_mean_training_time), str(dt_mean_testing_time)],
]

print(tabulate(table_data, headers=headers, tablefmt="grid"))

+---------------+------------+----------+-----------------+----------------+
| Coverage      |   Accuracy |       F1 |   Training time |   Testing time |
| K-NN          |   0.962559 | 0.962477 |        13.3255  |     27.3986    |
+---------------+------------+----------+-----------------+----------------+
| Perceptron    |   0.323029 | 0.234015 |        16.8397  |      0.0859736 |
+---------------+------------+----------+-----------------+----------------+
| Decision Tree |   0.877266 | 0.876231 |         5.99909 |      0.0807874 |
+---------------+------------+----------+-----------------+----------------+


It is obvious that K-NN is the slowest and Perceptron produces the model with the least accurate predictions. K-NN has both, long testing and training times. However, K-NN also creates the model which classifies best.