# Data Classification

## Imports

In [1]:
# importing libraries
import numpy as np
import pandas as pd
import sklearn as sk
import time
from sklearn import metrics, tree, neighbors
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

## Reading dataset

In [2]:
# Load the magic04 data file into a DataFrame; the column names are not read
# from the file but supplied explicitly through `names`.
dataset_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/magic/magic04.data"
headers = [
    'fLength', 'fWidth', 'fSize', 'fConc', 'fConc1',
    'fAsym', 'fM3Long', 'fM3Trans', 'fAlpha', 'fDist',
    'Class',
]
# ',' is already the default separator of read_csv.
dataset = pd.read_csv(dataset_url, names=headers)

## Data Balancing

In [3]:
# Data Balancing
# Fixed seed so the down-sampled subset is identical on every run.
BALANCE_SEED = 42

# keep every sample with class = h
h_samples_balanced = dataset[dataset['Class'] == 'h']

# randomly down-sample class = g to exactly as many rows as class = h
# (assumes 'g' has at least as many rows as 'h'; otherwise sample() raises)
g_class_samples = dataset[dataset['Class'] == 'g']
g_samples_balanced = g_class_samples.sample(
    n=len(h_samples_balanced),
    random_state=BALANCE_SEED,
)

## Data Split

In [4]:
# Data Split
# Fixed seed so the train/test partition is identical on every run.
SPLIT_SEED = 42

# split the g class samples to 70% and 30% for training & testing respectively
g_70_percent = g_samples_balanced.sample(frac=0.7, random_state=SPLIT_SEED)
g_30_percent = g_samples_balanced.drop(g_70_percent.index)

# split the h class samples to 70% and 30% for training & testing respectively
h_70_percent = h_samples_balanced.sample(frac=0.7, random_state=SPLIT_SEED)
h_30_percent = h_samples_balanced.drop(h_70_percent.index)

# concatenating the 70% of g-class and h-class to form the training set
# (ignore_index=True renumbers the rows 0..n-1)
training_set = pd.concat([g_70_percent, h_70_percent], axis=0, ignore_index=True)

# concatenating the 30% of g-class and h-class to form the testing set
testing_set = pd.concat([g_30_percent, h_30_percent], axis=0, ignore_index=True)

# the last column is the class label; everything before it is a feature
training_data = training_set.iloc[:, :-1]
training_class = training_set.iloc[:, -1]

testing_data = testing_set.iloc[:, :-1]
testing_class = testing_set.iloc[:, -1]

## Classification

### >> Confusion matrix calculation

In [5]:
# 'g' >> positive ,, 'h' >> negative
def conf_matrix_calculations(actual, predicted):
    """Print the confusion matrix and derived metrics for a binary prediction.

    Class 'g' is treated as the positive class and 'h' as the negative class.
    Pairs whose labels are neither 'g' nor 'h' are not counted.

    Args:
        actual: Sequence of true class labels (e.g. a pandas Series or list).
        predicted: Sequence of predicted class labels, same length as `actual`.

    Returns:
        None. The counts, accuracy, precision, recall and F-score are printed.

    Raises:
        ValueError: If `actual` and `predicted` differ in length.
    """
    # Materialise both inputs so they are compared position by position,
    # independent of any pandas index labels the caller's Series may carry.
    actual_labels = list(actual)
    predicted_labels = list(predicted)
    if len(actual_labels) != len(predicted_labels):
        raise ValueError(
            'actual and predicted must have the same length, got %d and %d'
            % (len(actual_labels), len(predicted_labels))
        )

    TP = TN = FP = FN = 0
    # Iterate over every sample that was passed in rather than a fixed count.
    for truth, guess in zip(actual_labels, predicted_labels):
        if truth == 'g' and guess == 'g':
            TP += 1
        elif truth == 'g' and guess == 'h':
            FN += 1
        elif truth == 'h' and guess == 'h':
            TN += 1
        elif truth == 'h' and guess == 'g':
            FP += 1

    print('--- Confusion Matrix ---')
    print('TP: ', TP, '\tFP: ', FP)
    print('FN: ', FN, '\tTN: ', TN)
    print('Accuracy = ', accuracy(TP, FP, FN, TN))
    p = precision(TP, FP)
    print('Precision = ', p)
    r = recall(TP, FN)
    print('Recall = ', r)
    print('F-score = ', f_score(p, r))
    

In [6]:
def accuracy(TP, FP, FN, TN):
    """Return the fraction of correctly classified samples.

    Args:
        TP: Number of true positives.
        FP: Number of false positives.
        FN: Number of false negatives.
        TN: Number of true negatives.

    Returns:
        (TP + TN) / (TP + FP + FN + TN), or 0.0 when all counts are zero.
    """
    total = TP + FP + FN + TN
    # No samples at all: report 0.0 instead of raising ZeroDivisionError.
    if total == 0:
        return 0.0
    return (TP + TN) / total

In [7]:
def precision(TP, FP):
    """Return the fraction of positive predictions that are correct.

    Args:
        TP: Number of true positives.
        FP: Number of false positives.

    Returns:
        TP / (TP + FP), or 0.0 when nothing was predicted positive.
    """
    predicted_positive = TP + FP
    # A classifier that never predicts the positive class has no defined
    # precision; report 0.0 instead of raising ZeroDivisionError.
    if predicted_positive == 0:
        return 0.0
    return TP / predicted_positive

In [8]:
def recall(TP, FN):
    """Return the fraction of actual positives that were found.

    Args:
        TP: Number of true positives.
        FN: Number of false negatives.

    Returns:
        TP / (TP + FN), or 0.0 when there are no actual positives.
    """
    actual_positive = TP + FN
    # No positive samples in the data: report 0.0 instead of raising
    # ZeroDivisionError.
    if actual_positive == 0:
        return 0.0
    return TP / actual_positive

In [9]:
def f_score(p, r):
    """Return the harmonic mean of precision and recall.

    Args:
        p: Precision.
        r: Recall.

    Returns:
        (2 * p * r) / (p + r), or 0.0 when p + r is zero.
    """
    denominator = p + r
    # Both precision and recall are zero: report 0.0 instead of raising
    # ZeroDivisionError.
    if denominator == 0:
        return 0.0
    return (2 * p * r) / denominator

### >> Cross Validation

In [10]:
def get_max_accuracy(classifier_type, p_min, p_max, step, dataset_data, dataset_class):
    """Search a hyper-parameter range and return the value with the best CV accuracy.

    For every value in range(p_min, p_max + 1, step) a classifier is built,
    scored with `cross_validation`, and a progress line is printed.

    Args:
        classifier_type: 'knn' (value is the number of neighbors), 'ada' or
            'rf' (value is n_estimators).
        p_min: First parameter value to try.
        p_max: Upper bound of the search (inclusive when reachable by `step`).
        step: Increment between tried values.
        dataset_data: Feature matrix passed on to `cross_validation`.
        dataset_class: Class labels passed on to `cross_validation`.

    Returns:
        The tried parameter value with the highest mean accuracy. On ties the
        earliest value wins; `p_min` is returned if the range is empty.

    Raises:
        ValueError: If `classifier_type` is not one of 'knn', 'ada', 'rf'.
    """
    builders = {
        'knn': lambda value: neighbors.KNeighborsClassifier(value),
        'ada': lambda value: AdaBoostClassifier(n_estimators=value),
        'rf': lambda value: RandomForestClassifier(n_estimators=value),
    }
    # Fail early with a clear message instead of an UnboundLocalError on the
    # first loop iteration.
    if classifier_type not in builders:
        raise ValueError(
            "unknown classifier_type %r; expected one of %s"
            % (classifier_type, sorted(builders))
        )
    build_classifier = builders[classifier_type]

    max_accuracy = 0
    p_max_acc = p_min
    for i in range(p_min, p_max + 1, step):
        # Named mean_accuracy so it does not shadow the accuracy() helper.
        mean_accuracy = cross_validation(build_classifier(i), dataset_data, dataset_class)
        if max_accuracy < mean_accuracy:
            max_accuracy = mean_accuracy
            p_max_acc = i
            print('updated>> i= ', i, ', accuracy= ', mean_accuracy)
        else:
            print('Not updated>> i= ', i, ', accuracy= ', mean_accuracy)
    return p_max_acc

In [11]:
def cross_validation(classifier, dataset_data, dataset_class):
    """Return the mean accuracy of `classifier` over a shuffled 10-fold split.

    The fold assignment is seeded (random_state=1) and the folds are scored
    in parallel on all available cores (n_jobs=-1).
    """
    fold_scores = cross_val_score(
        classifier,
        dataset_data,
        dataset_class,
        scoring='accuracy',
        cv=KFold(n_splits=10, shuffle=True, random_state=1),
        n_jobs=-1,
    )
    return np.mean(fold_scores)

### >> Decision Trees

In [12]:
# Train a decision tree with default settings on the training split
# (fit() returns the estimator itself, so construction and fitting chain).
dt_classifier = tree.DecisionTreeClassifier().fit(training_data, training_class)

# Predict the held-out test split and report the metrics.
dt_predicted_class = dt_classifier.predict(testing_data)
print('Decision Tree Classifier')
conf_matrix_calculations(testing_class, dt_predicted_class)

Decision Tree Classifier
--- Confusion Matrix ---
TP:  1580 	FP:  432
FN:  426 	TN:  1574
Accuracy =  0.7861415752741775
Precision =  0.7852882703777336
Recall =  0.7876370887337986
F-score =  0.786460925833748


### >> Naive Bayes

In [13]:
# Train a Gaussian naive Bayes model on the training split
# (fit() returns the estimator itself, so construction and fitting chain).
nb_classifier = GaussianNB().fit(training_data, training_class)

# Predict the held-out test split and report the metrics.
nb_predicted_class = nb_classifier.predict(testing_data)
print('Naive Bayes Classifier')
conf_matrix_calculations(testing_class, nb_predicted_class)


Naive Bayes Classifier
--- Confusion Matrix ---
TP:  1798 	FP:  1222
FN:  208 	TN:  784
Accuracy =  0.6435692921236291
Precision =  0.595364238410596
Recall =  0.8963110667996012
F-score =  0.7154795065658576


### >> AdaBoost

In [14]:
print('AdaBoost Classifier')
# Search range for n_estimators: range(ada_p_min, ada_p_max + 1, ada_step)
ada_p_min = 95
ada_p_max = 110
ada_step = 2

# time.perf_counter() is a monotonic clock intended for measuring intervals;
# time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
ada_parameter = get_max_accuracy('ada', ada_p_min, ada_p_max, ada_step, training_data, training_class)
print("Cross Validation: %s seconds"  % (time.perf_counter() - start_time))
print('Used n-estimators value: ', ada_parameter)

# Fit on the full training split with the selected value, then predict the
# test split; the reported time covers both fitting and prediction.
start_time = time.perf_counter()
ada_classifier = AdaBoostClassifier(n_estimators=ada_parameter)
ada_classifier = ada_classifier.fit(training_data, training_class)
ada_predicted_class = ada_classifier.predict(testing_data)
print("Classification: %s seconds"  % (time.perf_counter() - start_time))

conf_matrix_calculations(testing_class, ada_predicted_class)


AdaBoost Classifier
updated>> i=  95 , accuracy=  0.8159985040454625
Not updated>> i=  97 , accuracy=  0.8157844867690118
updated>> i=  99 , accuracy=  0.8167459112096251
Not updated>> i=  101 , accuracy=  0.8162122932800628
Not updated>> i=  103 , accuracy=  0.8159990741500881
Not updated>> i=  105 , accuracy=  0.8162121792591377
Not updated>> i=  107 , accuracy=  0.8166396437074133
Not updated>> i=  109 , accuracy=  0.8159986180663875
Cross Validation: 103.225515127182 seconds
Used n-estimators value:  99
Classification: 3.098869800567627 seconds
--- Confusion Matrix ---
TP:  1663 	FP:  383
FN:  343 	TN:  1623
Accuracy =  0.8190428713858424
Precision =  0.8128054740957967
Recall =  0.82901296111665
F-score =  0.8208292201382034


### >> K-Nearest Neighbors (K-NN)

In [15]:
print('K-Nearest Neighbors Classifier')
# Search range for n_neighbors: range(knn_p_min, knn_p_max + 1, knn_step)
knn_p_min = 5
knn_p_max = 15
knn_step = 2

# time.perf_counter() is a monotonic clock intended for measuring intervals;
# time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
knn_parameter = get_max_accuracy('knn', knn_p_min, knn_p_max, knn_step, training_data, training_class)
print("Cross Validation: %s seconds"  % (time.perf_counter() - start_time))
print('Used n-neighbors value: ', knn_parameter)

# Fit on the full training split with the selected value, then predict the
# test split; the reported time covers both fitting and prediction.
start_time = time.perf_counter()
knn_classifier = neighbors.KNeighborsClassifier(n_neighbors=knn_parameter)
knn_classifier = knn_classifier.fit(training_data, training_class)
knn_predicted_class = knn_classifier.predict(testing_data)
print("Classification: %s seconds"  % (time.perf_counter() - start_time))

conf_matrix_calculations(testing_class, knn_predicted_class)


K-Nearest Neighbors Classifier
updated>> i=  5 , accuracy=  0.7664454660719336
updated>> i=  7 , accuracy=  0.7674068905125468
updated>> i=  9 , accuracy=  0.7709313913289367
Not updated>> i=  11 , accuracy=  0.7699692827627727
Not updated>> i=  13 , accuracy=  0.7706109925293491
Not updated>> i=  15 , accuracy=  0.7674074606171725
Cross Validation: 4.404083490371704 seconds
Used n-neighbors value:  9
Classification: 0.5998930931091309 seconds
--- Confusion Matrix ---
TP:  1708 	FP:  640
FN:  298 	TN:  1366
Accuracy =  0.7662013958125623
Precision =  0.727427597955707
Recall =  0.8514456630109671
F-score =  0.7845659163987138


### >> Random Forests

In [16]:
print('Random Forests Classifier')
# Search range for n_estimators: range(rf_p_min, rf_p_max + 1, rf_step)
rf_p_min = 50
rf_p_max = 100
rf_step = 5

# time.perf_counter() is a monotonic clock intended for measuring intervals;
# time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
rf_parameter = get_max_accuracy('rf', rf_p_min, rf_p_max, rf_step, training_data, training_class)
print("Cross Validation: %s seconds"  % (time.perf_counter() - start_time))
print('Used n-estimators value: ', rf_parameter)

# Fit on the full training split with the selected value, then predict the
# test split; the reported time covers both fitting and prediction.
start_time = time.perf_counter()
rf_classifier = RandomForestClassifier(n_estimators=rf_parameter)
rf_classifier = rf_classifier.fit(training_data, training_class)
rf_predicted_class = rf_classifier.predict(testing_data)
print("Classification: %s seconds"  % (time.perf_counter() - start_time))

conf_matrix_calculations(testing_class, rf_predicted_class)

Random Forests Classifier
updated>> i=  50 , accuracy=  0.8530556467722956
Not updated>> i=  55 , accuracy=  0.8509198068029444
updated>> i=  60 , accuracy=  0.8533753614463325
Not updated>> i=  65 , accuracy=  0.8523084676499831
updated>> i=  70 , accuracy=  0.8539084092712695
Not updated>> i=  75 , accuracy=  0.853374677320782
updated>> i=  80 , accuracy=  0.8547623119794945
Not updated>> i=  85 , accuracy=  0.851773139406544
Not updated>> i=  90 , accuracy=  0.8545508031633965
Not updated>> i=  95 , accuracy=  0.8526287524286456
Not updated>> i=  100 , accuracy=  0.8524150772149705
Cross Validation: 100.908607006073 seconds
Used n-estimators value:  80
Classification: 2.6469428539276123 seconds
--- Confusion Matrix ---
TP:  1787 	FP:  350
FN:  219 	TN:  1656
Accuracy =  0.8581754735792622
Precision =  0.8362189985961629
Recall =  0.890827517447657
F-score =  0.862659908279025
