# Data Classification

## Imports

In [1]:
# importing libraries
import numpy as np
import pandas as pd
import sklearn as sk
import time
from sklearn import metrics, tree, neighbors
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

## Reading dataset

In [2]:
# Fetch the magic04 data file from the UCI repository into a DataFrame.
# Column names are passed explicitly through `names`; the last column holds the class label.
dataset_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/magic/magic04.data"
headers = [
    'fLength', 'fWidth', 'fSize', 'fConc', 'fConc1',
    'fAsym', 'fM3Long', 'fM3Trans', 'fAlpha', 'fDist',
    'Class',
]
dataset = pd.read_csv(dataset_url, names=headers, sep=',')

## Data Balancing

In [3]:
# Data Balancing
# Keep every 'h' row and draw an equally sized random subset of the 'g' rows.
h_samples_balanced = dataset[dataset['Class'] == 'h']

g_class_samples = dataset[dataset['Class'] == 'g']
# The sample size follows the actual 'h' count instead of a hard-coded 6688,
# and the seed is fixed so a fresh kernel run draws the same rows.
g_samples_balanced = g_class_samples.sample(n=len(h_samples_balanced), random_state=1)

# Both classes must now contribute the same number of rows.
assert len(g_samples_balanced) == len(h_samples_balanced)

## Data Split

In [4]:
# Data Split
# Each class is split 70% / 30% (training / testing) separately so that both
# sets keep the same class proportions. The seeds are fixed so the partition
# is identical on every run of the notebook.

# 'g' class: sample 70% for training; the rows not sampled become the testing part
g_70_percent = g_samples_balanced.sample(frac=0.7, random_state=1)
g_30_percent = g_samples_balanced.drop(g_70_percent.index)

# 'h' class: same procedure
h_70_percent = h_samples_balanced.sample(frac=0.7, random_state=1)
h_30_percent = h_samples_balanced.drop(h_70_percent.index)

# concatenating the 70% of g-class and h-class to form the training set
# (ignore_index=True renumbers the rows 0..n-1)
training_set = pd.concat([g_70_percent, h_70_percent], axis=0, ignore_index=True)

# concatenating the 30% of g-class and h-class to form the testing set
testing_set = pd.concat([g_30_percent, h_30_percent], axis=0, ignore_index=True)

# Every balanced row must land in exactly one of the two sets.
assert len(training_set) + len(testing_set) == len(g_samples_balanced) + len(h_samples_balanced)

# Features are all columns but the last; the label is the last column.
training_data = training_set.iloc[:, :-1]
training_class = training_set.iloc[:, -1]

testing_data = testing_set.iloc[:, :-1]
testing_class = testing_set.iloc[:, -1]

## Classification

### >> Confusion matrix calculation

In [5]:
# 'g' >> positive ,, 'h' >> negative
def conf_matrix_calculations(actual, predicted):
    """Print the confusion matrix and derived metrics for the g/h labels.

    'g' is treated as the positive class and 'h' as the negative class.
    Pairs in which either label is neither 'g' nor 'h' are not counted.
    Accuracy, precision, recall and F-score are obtained from the
    `accuracy`, `precision`, `recall` and `f_score` functions.

    Args:
        actual: sized iterable of true labels.
        predicted: sized iterable of predicted labels, aligned by position
            with `actual`.

    Raises:
        ValueError: if `actual` and `predicted` differ in length.
    """
    if len(actual) != len(predicted):
        raise ValueError(
            'actual and predicted must have the same length, got %d and %d'
            % (len(actual), len(predicted))
        )

    TP = TN = FP = FN = 0
    # Walk every pair by position: this covers the whole input whatever its
    # size and does not depend on how `actual` is indexed.
    for truth, guess in zip(actual, predicted):
        if truth == 'g' and guess == 'g':
            TP += 1
        elif truth == 'g' and guess == 'h':
            FN += 1
        elif truth == 'h' and guess == 'h':
            TN += 1
        elif truth == 'h' and guess == 'g':
            FP += 1

    print('--- Confusion Matrix ---')
    print('TP: ', TP, '\tFP: ', FP)
    print('FN: ', FN, '\tTN: ', TN)
    print('Accuracy = ', accuracy(TP, FP, FN, TN))
    p = precision(TP, FP)
    print('Precision = ', p)
    r = recall(TP, FN)
    print('Recall = ', r)
    print('F-score = ', f_score(p, r))
    

In [6]:
def accuracy(TP, FP, FN, TN):
    """Return the fraction of correct predictions, (TP + TN) / total.

    Args:
        TP, FP, FN, TN: confusion-matrix counts.

    Returns:
        The accuracy as a float, or 0.0 when all four counts are zero
        (instead of raising ZeroDivisionError).
    """
    total = TP + FP + FN + TN
    if total == 0:
        return 0.0
    return (TP + TN) / total

In [7]:
def precision(TP, FP):
    """Return the precision, TP / (TP + FP).

    Args:
        TP: number of true positives.
        FP: number of false positives.

    Returns:
        The precision as a float, or 0.0 when nothing was predicted
        positive (TP + FP == 0) instead of raising ZeroDivisionError.
    """
    predicted_positive = TP + FP
    if predicted_positive == 0:
        return 0.0
    return TP / predicted_positive

In [8]:
def recall(TP, FN):
    """Return the recall, TP / (TP + FN).

    Args:
        TP: number of true positives.
        FN: number of false negatives.

    Returns:
        The recall as a float, or 0.0 when there are no actual positives
        (TP + FN == 0) instead of raising ZeroDivisionError.
    """
    actual_positive = TP + FN
    if actual_positive == 0:
        return 0.0
    return TP / actual_positive

In [9]:
def f_score(p, r):
    """Return the harmonic mean of precision and recall, 2pr / (p + r).

    Args:
        p: precision.
        r: recall.

    Returns:
        The F-score as a float, or 0.0 when p + r is zero instead of
        raising ZeroDivisionError.
    """
    denominator = p + r
    if denominator == 0:
        return 0.0
    return (2 * p * r) / denominator

### >> Cross Validation

In [10]:
def get_max_accuracy(classifier_type, p_min, p_max, step, dataset_data, dataset_class):
    """Pick the integer hyper-parameter value with the best cross-validated accuracy.

    Every value in range(p_min, p_max + 1, step) is used to build a classifier,
    which is then scored with `cross_validation`.

    Args:
        classifier_type: 'knn' (value is n_neighbors), 'ada' or 'rf'
            (value is n_estimators).
        p_min: first candidate value.
        p_max: upper bound of the candidates (inclusive when reachable by `step`).
        step: increment between candidates.
        dataset_data: feature data forwarded to `cross_validation`.
        dataset_class: labels forwarded to `cross_validation`.

    Returns:
        The candidate with the highest mean accuracy; on ties the smallest
        such candidate wins. `p_min` is returned when no candidate scores
        above 0 or the range is empty.

    Raises:
        ValueError: if `classifier_type` is not one of the supported names.
    """
    # Factories are lambdas so the estimator is only constructed when a
    # candidate value is actually evaluated.
    builders = {
        'knn': lambda value: neighbors.KNeighborsClassifier(n_neighbors=value),
        'ada': lambda value: AdaBoostClassifier(n_estimators=value),
        'rf': lambda value: RandomForestClassifier(n_estimators=value),
    }
    # Fail early and clearly instead of hitting an unbound `classifier` later.
    if classifier_type not in builders:
        raise ValueError(
            "unknown classifier_type %r; expected one of %s"
            % (classifier_type, sorted(builders))
        )
    build = builders[classifier_type]

    max_accuracy = 0
    p_max_acc = p_min
    for candidate in range(p_min, p_max + 1, step):
        mean_accuracy = cross_validation(build(candidate), dataset_data, dataset_class)
        # Strict comparison keeps the first (smallest) candidate on ties.
        if max_accuracy < mean_accuracy:
            max_accuracy = mean_accuracy
            p_max_acc = candidate
    return p_max_acc

In [11]:
def cross_validation(classifier, dataset_data, dataset_class, n_splits=10, random_state=1):
    """Return the mean accuracy of `classifier` under shuffled K-fold cross-validation.

    Args:
        classifier: estimator passed to `cross_val_score`.
        dataset_data: feature data.
        dataset_class: labels.
        n_splits: number of folds (default 10, the value previously hard-coded).
        random_state: seed for the fold shuffling (default 1, the value
            previously hard-coded).

    Returns:
        The mean of the per-fold accuracy scores.
    """
    cv = KFold(n_splits=n_splits, random_state=random_state, shuffle=True)
    # n_jobs=-1 is forwarded to cross_val_score unchanged from the original call.
    scores = cross_val_score(classifier, dataset_data, dataset_class, scoring='accuracy', cv=cv, n_jobs=-1)
    return np.mean(scores)

### >> Decision Trees

In [12]:
# Fit a decision tree with default settings on the training split,
# then report its metrics on the testing split.
dt_classifier = tree.DecisionTreeClassifier().fit(training_data, training_class)
dt_predicted_class = dt_classifier.predict(testing_data)

print('Decision Tree Classifier')
conf_matrix_calculations(testing_class, dt_predicted_class)

Decision Tree Classifier
--- Confusion Matrix ---
TP:  1595 	FP:  401
FN:  411 	TN:  1605
Accuracy =  0.7976071784646062
Precision =  0.7990981963927856
Recall =  0.7951146560319042
F-score =  0.7971014492753623


### >> Naive Bayes

In [13]:
# Fit a Gaussian naive Bayes model on the training split,
# then report its metrics on the testing split.
nb_classifier = GaussianNB().fit(training_data, training_class)
nb_predicted_class = nb_classifier.predict(testing_data)

print('Naive Bayes Classifier')
conf_matrix_calculations(testing_class, nb_predicted_class)


Naive Bayes Classifier
--- Confusion Matrix ---
TP:  1792 	FP:  1185
FN:  214 	TN:  821
Accuracy =  0.651296111665005
Precision =  0.6019482700705409
Recall =  0.8933200398803589
F-score =  0.7192454344772226


### >> AdaBoost

In [14]:
print('AdaBoost Classifier')
# Candidate n_estimators values: 95, 97, ..., 109
ada_p_min, ada_p_max, ada_step = 95, 110, 2

# Time the cross-validated search for the best n_estimators
start_time = time.time()
ada_parameter = get_max_accuracy('ada', ada_p_min, ada_p_max, ada_step, training_data, training_class)
print("Cross Validation: %s seconds" % (time.time() - start_time))
print('Used n-estimators value: ', ada_parameter)

# Time fitting on the training split plus predicting the testing split
start_time = time.time()
ada_classifier = AdaBoostClassifier(n_estimators=ada_parameter).fit(training_data, training_class)
ada_predicted_class = ada_classifier.predict(testing_data)
print("Classification: %s seconds" % (time.time() - start_time))

conf_matrix_calculations(testing_class, ada_predicted_class)


AdaBoost Classifier
Cross Validation: 122.63887190818787 seconds
Used n-estimators value:  105
Classification: 2.7740225791931152 seconds
--- Confusion Matrix ---
TP:  1675 	FP:  349
FN:  331 	TN:  1657
Accuracy =  0.8305084745762712
Precision =  0.8275691699604744
Recall =  0.8349950149551346
F-score =  0.8312655086848635


### >> K-Nearest Neighbors (K-NN)

In [15]:
print('K-Nearest Neighbors Classifier')
# Candidate n_neighbors values: 5, 7, ..., 15
knn_p_min, knn_p_max, knn_step = 5, 15, 2

# Time the cross-validated search for the best n_neighbors
start_time = time.time()
knn_parameter = get_max_accuracy('knn', knn_p_min, knn_p_max, knn_step, training_data, training_class)
print("Cross Validation: %s seconds" % (time.time() - start_time))
print('Used n-neighbors value: ', knn_parameter)

# Time fitting on the training split plus predicting the testing split
start_time = time.time()
knn_classifier = neighbors.KNeighborsClassifier(n_neighbors=knn_parameter).fit(training_data, training_class)
knn_predicted_class = knn_classifier.predict(testing_data)
print("Classification: %s seconds" % (time.time() - start_time))

conf_matrix_calculations(testing_class, knn_predicted_class)


K-Nearest Neighbors Classifier
Cross Validation: 6.01399564743042 seconds
Used n-neighbors value:  15
Classification: 0.49268293380737305 seconds
--- Confusion Matrix ---
TP:  1724 	FP:  645
FN:  282 	TN:  1361
Accuracy =  0.7689431704885344
Precision =  0.7277332207682566
Recall =  0.8594217347956131
F-score =  0.7881142857142857


### >> Random Forests

In [16]:
print('Random Forests Classifier')
# Candidate n_estimators values: 50, 55, ..., 100
rf_p_min, rf_p_max, rf_step = 50, 100, 5

# Time the cross-validated search for the best n_estimators
start_time = time.time()
rf_parameter = get_max_accuracy('rf', rf_p_min, rf_p_max, rf_step, training_data, training_class)
print("Cross Validation: %s seconds" % (time.time() - start_time))
print('Used n-estimators value: ', rf_parameter)

# Time fitting on the training split plus predicting the testing split
start_time = time.time()
rf_classifier = RandomForestClassifier(n_estimators=rf_parameter).fit(training_data, training_class)
rf_predicted_class = rf_classifier.predict(testing_data)
print("Classification: %s seconds" % (time.time() - start_time))

conf_matrix_calculations(testing_class, rf_predicted_class)

Random Forests Classifier
Cross Validation: 157.61438131332397 seconds
Used n-estimators value:  95
Classification: 2.922390937805176 seconds
--- Confusion Matrix ---
TP:  1770 	FP:  309
FN:  236 	TN:  1697
Accuracy =  0.8641575274177468
Precision =  0.8513708513708513
Recall =  0.8823529411764706
F-score =  0.8665850673194614
