# Data Classification

### Imports

In [1]:
# importing libraries
import numpy as np
import pandas as pd
import sklearn as sk
from sklearn import metrics

### Reading dataset

In [2]:
# Load the magic04 data file from the UCI archive into a DataFrame.
# Column labels are passed through `names=` because they are supplied here
# rather than read from the file.
dataset_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/magic/magic04.data"
headers = [
    'fLength', 'fWidth', 'fSize', 'fConc', 'fConc1', 'fAsym',
    'fM3Long', 'fM3Trans', 'fAlpha', 'fDist', 'Class',
]
dataset = pd.read_csv(dataset_url, names=headers, sep=',')

### Data Balancing

In [3]:
# Data Balancing
# Keep every 'h' sample and draw an equally sized random subset of the 'g'
# samples, so both classes contribute the same number of rows.
h_samples_balanced = dataset[dataset['Class'] == 'h']

g_class_samples = dataset[dataset['Class'] == 'g']
# The draw size is taken from the 'h' subset itself rather than a hard-coded
# row count, so the balancing stays correct if the input data changes.
# random_state is fixed so Restart & Run All reproduces the same subset.
g_samples_balanced = g_class_samples.sample(n=len(h_samples_balanced), random_state=42)

### Data Split

In [22]:
# Data Split
# Each class is split 70% / 30% (training / testing) on its own, so both
# resulting sets contain 'g' and 'h' rows in the same ratio.
# random_state is fixed so the split (and every metric computed from it) is
# reproducible after Restart & Run All.
g_70_percent = g_samples_balanced.sample(frac=0.7, random_state=42)
g_30_percent = g_samples_balanced.drop(g_70_percent.index)

h_70_percent = h_samples_balanced.sample(frac=0.7, random_state=42)
h_30_percent = h_samples_balanced.drop(h_70_percent.index)

# Training set = the two 70% parts, testing set = the two 30% parts.
# ignore_index=True renumbers the rows 0..n-1 in each concatenated frame.
training_set = pd.concat([g_70_percent, h_70_percent], axis=0, ignore_index=True)
testing_set = pd.concat([g_30_percent, h_30_percent], axis=0, ignore_index=True)

# Invariant: the split must neither lose nor duplicate rows.
assert len(training_set) + len(testing_set) == len(g_samples_balanced) + len(h_samples_balanced)

# Features are every column except the last; the last column is the label.
training_data = training_set.iloc[:, :-1]
training_class = training_set.iloc[:, -1]

testing_data = testing_set.iloc[:, :-1]
testing_class = testing_set.iloc[:, -1]

Unnamed: 0,fLength,fWidth,fSize,fConc,fConc1,fAsym,fM3Long,fM3Trans,fAlpha,fDist,Class
0,39.5766,15.6353,2.5065,0.5327,0.2913,18.5136,36.2208,9.6697,8.6074,127.6930,g
1,41.3260,15.8124,2.6479,0.3127,0.1609,-15.2693,19.7499,-7.4930,13.4291,36.1810,g
2,117.3060,24.6754,2.9445,0.2886,0.1563,116.2220,90.6799,-29.0852,0.0969,342.7120,g
3,75.5326,29.0068,3.2583,0.3608,0.2232,34.4086,-47.6490,21.1596,2.4050,365.0450,g
4,28.4641,9.4391,2.2730,0.4533,0.2533,-40.3157,15.9289,6.6576,63.5110,50.8947,g
...,...,...,...,...,...,...,...,...,...,...,...
9359,41.6580,14.2577,2.5695,0.3673,0.2081,4.1599,38.6242,8.0686,51.8428,228.2805,h
9360,199.0230,86.2958,3.8184,0.1279,0.0662,-102.3000,-270.9360,-36.5230,4.8536,354.8260,h
9361,102.0137,16.8966,2.5624,0.4903,0.2540,-109.2418,46.6447,-5.9363,13.2785,237.2388,h
9362,114.1760,96.2985,2.9388,0.4571,0.3057,-49.8439,-129.5150,104.7810,60.2210,412.6360,h


Unnamed: 0,fLength,fWidth,fSize,fConc,fConc1,fAsym,fM3Long,fM3Trans,fAlpha,fDist,Class
0,30.7392,19.9317,2.8254,0.2855,0.1502,-0.5503,-8.1904,14.3208,10.8862,160.3060,g
1,12.8944,11.3551,2.0917,0.7368,0.4089,14.9833,-7.2857,-10.7610,58.0350,89.5326,g
2,58.8813,27.5643,3.3251,0.1272,0.0636,23.5287,-29.2176,-6.3803,6.4020,150.4130,g
3,23.8641,22.2821,2.7336,0.3546,0.1930,26.6777,20.5925,15.1969,39.8008,136.0960,g
4,65.8244,18.8813,3.0978,0.2028,0.1066,-41.7522,53.5763,-12.4085,8.0310,218.9710,g
...,...,...,...,...,...,...,...,...,...,...,...
4007,172.2442,20.1926,2.9009,0.2887,0.1940,124.5990,112.9452,-21.4436,28.9553,210.8496,h
4008,110.7140,26.8312,3.2212,0.1856,0.1247,-158.0625,73.2259,21.2864,9.3725,281.8795,h
4009,32.1454,13.8296,2.4844,0.5182,0.2761,-36.0633,-15.9648,-12.2698,47.3704,246.0565,h
4010,75.4455,47.5305,3.4483,0.1417,0.0549,-9.3561,41.0562,-9.4662,30.2987,256.5166,h


### Classification

#### >> Confusion matrix calculation

In [24]:
# 'g' >> positive ,, 'h' >> negative
def conf_matrix_calculations(actual, predicted):
    """Count confusion-matrix cells for the two-class 'g' / 'h' problem.

    'g' is treated as the positive class and 'h' as the negative class.
    Pairs whose labels are anything other than 'g' or 'h' are not counted.

    Parameters
    ----------
    actual : sequence of str
        Ground-truth labels (list, pandas Series, numpy array, ...).
    predicted : sequence of str
        Predicted labels, positionally aligned with ``actual``.

    Returns
    -------
    tuple of int
        ``(TP, FP, TN, FN)`` -- note the order.

    Raises
    ------
    ValueError
        If ``actual`` and ``predicted`` differ in length.
    """
    if len(actual) != len(predicted):
        raise ValueError(
            "actual and predicted must have the same length, got "
            f"{len(actual)} and {len(predicted)}"
        )

    TP = TN = FP = FN = 0
    # Walk the two sequences in lockstep instead of indexing a fixed number of
    # rows: this works for any input size and pairs items by position, so it
    # does not depend on a pandas Series carrying a 0..n-1 index.
    for truth, guess in zip(actual, predicted):
        if truth == 'g' and guess == 'g':
            TP += 1
        elif truth == 'g' and guess == 'h':
            FN += 1
        elif truth == 'h' and guess == 'h':
            TN += 1
        elif truth == 'h' and guess == 'g':
            FP += 1
    return TP, FP, TN, FN

#### >> Cross Validation

#### >> Decision Trees

In [27]:
from sklearn import tree

# Fit a decision tree on the training split and predict the testing split.
# random_state is fixed so the fitted tree, and therefore the counts printed
# below, are the same on every run.
dt_classifier = tree.DecisionTreeClassifier(random_state=42)
dt_classifier = dt_classifier.fit(training_data, training_class)
dt_predicted_class = dt_classifier.predict(testing_data)

# Confusion-matrix counts: testing_class holds the actual labels,
# dt_predicted_class the predicted ones.
dt_TP, dt_FP, dt_TN, dt_FN = conf_matrix_calculations(testing_class, dt_predicted_class)
print('Using Decision Tree Classifier:')
print('TP:', dt_TP)
print('FP:', dt_FP)
print('FN:', dt_FN)
print('TN:', dt_TN)

Using Decision Tree Classifier:
TP: 1591
FP: 464
FN: 415
TN: 1542


#### >> AdaBoost

#### >> K-Nearest Neighbors (K-NN)

#### >> Random Forests

#### >> Naive Bayes

In [28]:
from sklearn.naive_bayes import GaussianNB

# Train a Gaussian Naive Bayes model on the training split, then label the
# testing split with it.
nb_classifier = GaussianNB().fit(training_data, training_class)
nb_predicted_class = nb_classifier.predict(testing_data)

# Confusion-matrix counts: testing_class is the ground truth,
# nb_predicted_class the model output.
nb_TP, nb_FP, nb_TN, nb_FN = conf_matrix_calculations(testing_class, nb_predicted_class)
print('Using Naive Bayes Classifier:')
for _label, _count in (('TP:', nb_TP), ('FP:', nb_FP), ('FN:', nb_FN), ('TN:', nb_TN)):
    print(_label, _count)

Using Naive Bayes Classifier:
TP: 1795
FP: 1166
FN: 211
TN: 840
