# Data Classification

### Imports

In [None]:
# importing libraries
import numpy as np
import pandas as pd
import sklearn as sk
from sklearn import metrics, tree, neighbors
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

### Reading dataset

In [None]:
# Load magic04.data from the UCI machine-learning repository into a DataFrame.
# Column names are passed via `names=`, so pandas treats every line of the
# file as data. The last column, 'Class', holds the label.
dataset_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/magic/magic04.data"
headers = [
    'fLength', 'fWidth', 'fSize', 'fConc', 'fConc1',
    'fAsym', 'fM3Long', 'fM3Trans', 'fAlpha', 'fDist',
    'Class',
]
dataset = pd.read_csv(dataset_url, names=headers, sep=',')

### Data Balancing

In [None]:
# Data Balancing
# Keep every 'h' row and down-sample the 'g' rows to the same count so both
# classes are equally represented.
h_samples_balanced = dataset[dataset['Class'] == 'h']

g_class_samples = dataset[dataset['Class'] == 'g']

# The sample size is derived from the 'h' rows instead of the previous
# hard-coded 6688 (presumably the 'h' row count of magic04.data), so the cell
# keeps working if the file changes. random_state pins the draw, matching the
# fixed seed already used for the K-fold splitter, so a re-run selects the
# same rows.
# NOTE: sample() without replacement raises ValueError if there are fewer 'g'
# rows than 'h' rows.
g_samples_balanced = g_class_samples.sample(n=len(h_samples_balanced), random_state=1)

### Data Split

In [None]:
# Data Split
# Each class is split 70% / 30% separately, so the training and testing sets
# keep the same class proportions as the balanced data.
# random_state pins the random draw: without it every run produces a different
# split, and therefore different, non-reproducible results downstream.

# 'g' class: 70% sampled for training, the remaining rows (by index) for testing
g_70_percent = g_samples_balanced.sample(frac=0.7, random_state=1)
g_30_percent = g_samples_balanced.drop(g_70_percent.index)

# 'h' class: 70% sampled for training, the remaining rows (by index) for testing
h_70_percent = h_samples_balanced.sample(frac=0.7, random_state=1)
h_30_percent = h_samples_balanced.drop(h_70_percent.index)

# Stack the per-class parts; ignore_index=True renumbers the rows 0..n-1.
training_set = pd.concat([g_70_percent, h_70_percent], axis=0, ignore_index=True)
testing_set = pd.concat([g_30_percent, h_30_percent], axis=0, ignore_index=True)

# Sanity check: every balanced row landed in exactly one of the two sets.
assert len(training_set) + len(testing_set) == len(g_samples_balanced) + len(h_samples_balanced)

# Features are all columns but the last; the last column is the class label.
training_data = training_set.iloc[:, :-1]
training_class = training_set.iloc[:, -1]

testing_data = testing_set.iloc[:, :-1]
testing_class = testing_set.iloc[:, -1]

### Classification

#### >> Confusion matrix calculation

In [None]:
# 'g' >> positive ,, 'h' >> negative
def conf_matrix_calculations(actual, predicted):
    """Count confusion-matrix cells for the two-class 'g' / 'h' problem.

    'g' is treated as the positive class and 'h' as the negative class.

    Parameters
    ----------
    actual : sequence of str
        True labels (list, NumPy array or pandas Series).
    predicted : sequence of str
        Predicted labels, in the same order and of the same length as
        ``actual``.

    Returns
    -------
    tuple of int
        ``(TP, FP, TN, FN)``. Pairs containing a label other than 'g' or 'h'
        are not counted in any cell.

    Raises
    ------
    ValueError
        If ``actual`` and ``predicted`` differ in length.
    """
    if len(actual) != len(predicted):
        raise ValueError(
            'actual and predicted must have the same length, got '
            + str(len(actual)) + ' and ' + str(len(predicted))
        )

    TP = TN = FP = FN = 0
    # Iterate over the values pairwise rather than indexing 0..4011: this
    # works for any number of samples and does not rely on a pandas Series
    # carrying a default 0..n-1 index.
    for truth, guess in zip(actual, predicted):
        if truth == 'g' and guess == 'g':
            TP += 1
        elif truth == 'g' and guess == 'h':
            FN += 1
        elif truth == 'h' and guess == 'h':
            TN += 1
        elif truth == 'h' and guess == 'g':
            FP += 1
    return TP, FP, TN, FN

#### >> Cross Validation

In [None]:
def get_max_accuracy(classifier_type, p_min, p_max, dataset_data, dataset_class):
    """Find the parameter value in ``[p_min, p_max]`` with the best CV accuracy.

    For every integer ``i`` in the inclusive range a classifier is built with
    ``i`` as its tuning parameter and scored with ``cross_validation``.

    Parameters
    ----------
    classifier_type : str
        ``'knn'`` (``i`` is the first argument of ``KNeighborsClassifier``),
        ``'ada'`` (``i`` is AdaBoost's ``n_estimators``) or ``'rf'`` (``i`` is
        the random forest's ``n_estimators``).
    p_min, p_max : int
        Inclusive bounds of the parameter values to try.
    dataset_data, dataset_class
        Features and labels, passed unchanged to ``cross_validation``.

    Returns
    -------
    int
        The parameter value with the highest accuracy. Ties keep the smallest
        value because only a strictly better score replaces the current best.
        ``p_min`` is returned if the range is empty or no score exceeds 0.

    Raises
    ------
    ValueError
        If ``classifier_type`` is not one of the supported names. Previously
        this surfaced as an unbound-variable error inside the loop.
    """
    # One factory per supported classifier; the sklearn names are only
    # resolved when a factory is actually called.
    factories = {
        'knn': lambda value: neighbors.KNeighborsClassifier(value),
        'ada': lambda value: AdaBoostClassifier(n_estimators=value),
        'rf': lambda value: RandomForestClassifier(n_estimators=value),
    }
    if classifier_type not in factories:
        raise ValueError(
            'unknown classifier_type ' + repr(classifier_type)
            + '; expected one of ' + str(sorted(factories))
        )
    build_classifier = factories[classifier_type]

    max_accuracy = 0
    p_max_acc = p_min
    for i in range(p_min, p_max + 1):
        accuracy = cross_validation(build_classifier(i), dataset_data, dataset_class)
        if max_accuracy < accuracy:
            max_accuracy = accuracy
            p_max_acc = i
            print('updated>> i= ', i, ', accuracy= ', accuracy)
    return p_max_acc

In [None]:
def cross_validation(classifier, dataset_data, dataset_class):
    """Return the mean accuracy of ``classifier`` over a shuffled 10-fold split.

    The splitter is created with ``random_state=1``. All folds are scored with
    the 'accuracy' metric, and ``n_jobs=-1`` is forwarded to
    ``cross_val_score``.
    """
    folds = KFold(n_splits=10, shuffle=True, random_state=1)
    fold_scores = cross_val_score(
        classifier,
        dataset_data,
        dataset_class,
        scoring='accuracy',
        cv=folds,
        n_jobs=-1,
    )
    return np.mean(fold_scores)

#### >> Decision Trees

In [None]:
# Decision tree with default settings: fit on the training split, then
# predict labels for the testing split.
dt_classifier = tree.DecisionTreeClassifier().fit(training_data, training_class)
dt_predicted_class = dt_classifier.predict(testing_data)

# Confusion-matrix counts on the testing split ('g' is the positive class).
dt_TP, dt_FP, dt_TN, dt_FN = conf_matrix_calculations(
    actual=testing_class,
    predicted=dt_predicted_class,
)
print('Using Decision Tree Classifier:')
print('TP:', dt_TP)
print('FP:', dt_FP)
print('FN:', dt_FN)
print('TN:', dt_TN)

#### >> AdaBoost

In [None]:
# Choose n_estimators in 1..50 by cross-validated accuracy on the training
# split, then refit AdaBoost with that value on the whole training split.
ada_parameter = get_max_accuracy(
    classifier_type='ada',
    p_min=1,
    p_max=50,
    dataset_data=training_data,
    dataset_class=training_class,
)
ada_classifier = AdaBoostClassifier(n_estimators=ada_parameter).fit(training_data, training_class)
ada_predicted_class = ada_classifier.predict(testing_data)

# Confusion-matrix counts on the testing split ('g' is the positive class).
ada_TP, ada_FP, ada_TN, ada_FN = conf_matrix_calculations(
    actual=testing_class,
    predicted=ada_predicted_class,
)
print('Using AdaBoost Classifier:')
print('TP:', ada_TP)
print('FP:', ada_FP)
print('FN:', ada_FN)
print('TN:', ada_TN)

#### >> K-Nearest Neighbors (K-NN)

In [None]:
# Choose the number of neighbours in 1..13 by cross-validated accuracy on the
# training split, then refit K-NN with that value on the whole training split.
knn_parameter = get_max_accuracy(
    classifier_type='knn',
    p_min=1,
    p_max=13,
    dataset_data=training_data,
    dataset_class=training_class,
)
knn_classifier = neighbors.KNeighborsClassifier(n_neighbors=knn_parameter).fit(training_data, training_class)
knn_predicted_class = knn_classifier.predict(testing_data)

# Confusion-matrix counts on the testing split ('g' is the positive class).
knn_TP, knn_FP, knn_TN, knn_FN = conf_matrix_calculations(
    actual=testing_class,
    predicted=knn_predicted_class,
)
print('Using K-Nearest Neighbors Classifier:')
print('TP:', knn_TP)
print('FP:', knn_FP)
print('FN:', knn_FN)
print('TN:', knn_TN)

#### >> Random Forests

In [None]:
# Choose n_estimators in 1..30 by cross-validated accuracy on the training
# split, then refit the random forest with that value on the whole training split.
rf_parameter = get_max_accuracy(
    classifier_type='rf',
    p_min=1,
    p_max=30,
    dataset_data=training_data,
    dataset_class=training_class,
)
rf_classifier = RandomForestClassifier(n_estimators=rf_parameter).fit(training_data, training_class)
rf_predicted_class = rf_classifier.predict(testing_data)

# Confusion-matrix counts on the testing split ('g' is the positive class).
rf_TP, rf_FP, rf_TN, rf_FN = conf_matrix_calculations(
    actual=testing_class,
    predicted=rf_predicted_class,
)
print('Using Random Forests Classifier:')
print('TP:', rf_TP)
print('FP:', rf_FP)
print('FN:', rf_FN)
print('TN:', rf_TN)

#### >> Naive Bayes

In [None]:
# Gaussian naive Bayes with default settings: fit on the training split, then
# predict labels for the testing split.
nb_classifier = GaussianNB().fit(training_data, training_class)
nb_predicted_class = nb_classifier.predict(testing_data)

# Confusion-matrix counts on the testing split ('g' is the positive class).
nb_TP, nb_FP, nb_TN, nb_FN = conf_matrix_calculations(
    actual=testing_class,
    predicted=nb_predicted_class,
)
print('Using Naive Bayes Classifier:')
print('TP:', nb_TP)
print('FP:', nb_FP)
print('FN:', nb_FN)
print('TN:', nb_TN)