<a href="https://colab.research.google.com/github/martharegina/machine-learning/blob/main/fcc_MAGIC_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

# Dataset
Bock, R. (2004). MAGIC Gamma Telescope [Dataset]. UCI Machine Learning Repository. https://doi.org/10.24432/C52C8B.

In [None]:
# Column names supplied to read_csv; the last entry ("class") is the label column.
cols = [
    "fLength", "fWidth", "fSize", "fConc", "fConc1",
    "fAsym", "fM3Long", "fM3Trans", "fAlpha", "fDist",
    "class",
]
df = pd.read_csv("magic04.data", names=cols)
df.head()

Unnamed: 0,fLength,fWidth,fSize,fConc,fConc1,fAsym,fM3Long,fM3Trans,fAlpha,fDist,class
0,28.7967,16.0021,2.6449,0.3918,0.1982,27.7004,22.011,-8.2027,40.092,81.8828,g
1,31.6036,11.7235,2.5185,0.5303,0.3773,26.2722,23.8238,-9.9574,6.3609,205.261,g
2,162.052,136.031,4.0612,0.0374,0.0187,116.741,-64.858,-45.216,76.96,256.788,g
3,23.8172,9.5728,2.3385,0.6147,0.3922,27.2107,-6.4633,-7.1513,10.449,116.737,g
4,75.1362,30.9205,3.1611,0.3168,0.1832,-5.5277,28.5525,21.8393,4.648,356.462,g


In [None]:
# Encode the label as an integer: 1 where the raw label is 'g', 0 otherwise.
# The guard keeps this cell idempotent: without it, re-running the cell would
# compare the already-encoded integers against 'g' and silently set every
# label to 0.
if not pd.api.types.is_integer_dtype(df['class']):
  df['class'] = (df['class'] == 'g').astype(int)

In [None]:
# Preview the first five rows to confirm the label column is now numeric.
df.head(n=5)

Unnamed: 0,fLength,fWidth,fSize,fConc,fConc1,fAsym,fM3Long,fM3Trans,fAlpha,fDist,class
0,28.7967,16.0021,2.6449,0.3918,0.1982,27.7004,22.011,-8.2027,40.092,81.8828,1
1,31.6036,11.7235,2.5185,0.5303,0.3773,26.2722,23.8238,-9.9574,6.3609,205.261,1
2,162.052,136.031,4.0612,0.0374,0.0187,116.741,-64.858,-45.216,76.96,256.788,1
3,23.8172,9.5728,2.3385,0.6147,0.3922,27.2107,-6.4633,-7.1513,10.449,116.737,1
4,75.1362,30.9205,3.1611,0.3168,0.1832,-5.5277,28.5525,21.8393,4.648,356.462,1


# Preparing Data

In [None]:
# Shuffle once with a fixed seed so the split is reproducible after a kernel
# restart (an unseeded sample() yields a different split on every run).
shuffled = df.sample(frac=1, random_state=42)

# 60% train / 20% validation / 20% test. Positional slicing is used instead of
# np.split, which routes DataFrames through the deprecated DataFrame.swapaxes
# and emits a FutureWarning.
train_end = int(0.6 * len(shuffled))
valid_end = int(0.8 * len(shuffled))
train = shuffled.iloc[:train_end]
valid = shuffled.iloc[train_end:valid_end]
test = shuffled.iloc[valid_end:]

  return bound(*args, **kwds)


In [None]:
# Inspect the training split (the first 60% of the shuffled rows).
train

Unnamed: 0,fLength,fWidth,fSize,fConc,fConc1,fAsym,fM3Long,fM3Trans,fAlpha,fDist,class
15818,22.1614,15.3687,2.7987,0.2914,0.1358,11.4247,-10.3657,-20.4911,13.5920,141.7225,0
18080,18.4914,9.7635,2.4829,0.6579,0.3734,-1.8060,7.6520,6.7260,33.8161,188.8670,0
14279,141.9091,34.0649,3.2953,0.2417,0.1856,-200.6945,-23.6763,20.0167,36.9793,325.0043,0
12768,17.4001,12.2241,2.1761,0.6983,0.4177,1.6054,4.1679,7.7922,28.9819,46.7294,0
11358,18.1313,7.4821,2.1746,0.7759,0.5117,-1.2575,4.6898,-2.9614,33.8342,182.8880,1
...,...,...,...,...,...,...,...,...,...,...,...
16306,137.1550,60.4155,3.4747,0.0969,0.0498,-68.4920,-115.4150,66.0016,24.9674,93.6126,0
10889,24.7808,16.3039,2.5938,0.4586,0.2739,19.5027,25.0490,-10.7515,10.7326,138.1490,1
15238,147.1506,107.5474,4.1057,0.1186,0.0397,29.5201,66.0310,99.9023,78.6086,382.2958,0
7735,93.0600,26.3056,3.8134,0.1421,0.0746,64.4540,72.1375,-20.4546,0.5580,261.1790,1


In [None]:
# Inspect the validation split (shuffled rows between the 60% and 80% marks).
valid

Unnamed: 0,fLength,fWidth,fSize,fConc,fConc1,fAsym,fM3Long,fM3Trans,fAlpha,fDist,class
13571,56.5912,29.0345,3.0775,0.2298,0.1248,-25.5512,-9.6603,17.2837,43.3511,268.6244,0
16535,15.8646,7.6278,2.5200,0.5872,0.3069,-19.3034,14.7897,-10.1066,7.6553,188.0232,0
1505,25.2213,12.6447,2.4409,0.4348,0.2591,-17.8855,-16.3947,-6.3854,7.2470,150.2870,1
17998,14.3539,9.1943,2.4770,0.8392,0.5497,-10.1834,-8.5917,-6.7582,49.0790,142.0123,0
9257,26.3469,16.3506,2.6893,0.3701,0.1871,34.2159,21.3861,-7.0471,11.3556,142.1280,1
...,...,...,...,...,...,...,...,...,...,...,...
2050,81.7537,22.9181,2.9741,0.3577,0.1980,-26.1529,-66.9286,-11.1344,0.2053,359.0740,1
2203,41.6067,10.5486,2.4639,0.4364,0.2251,-7.2166,-31.2745,-5.2903,8.1700,124.7300,1
13339,103.0710,30.1342,3.3185,0.2142,0.1117,-100.6860,64.7012,23.9835,20.3454,286.0720,0
2965,32.8254,19.3483,2.7892,0.3038,0.1649,13.5131,-12.1285,10.2888,37.5250,177.1840,1


In [None]:
# Inspect the test split (the final 20% of the shuffled rows).
test

Unnamed: 0,fLength,fWidth,fSize,fConc,fConc1,fAsym,fM3Long,fM3Trans,fAlpha,fDist,class
8666,17.3282,10.9182,2.1703,0.5946,0.3547,1.0210,-5.5946,6.8793,82.2683,184.2990,1
16383,108.8780,26.4940,2.5521,0.3450,0.1865,-104.0760,51.1615,18.1802,22.6807,291.1320,0
17206,28.2374,16.2718,2.3720,0.4416,0.2272,-27.5441,-24.6387,-11.7319,40.3891,172.7860,0
7043,39.9972,18.2776,3.0765,0.2784,0.1572,35.5163,36.3041,10.3638,0.8510,261.9510,1
14369,54.0843,18.7006,2.6594,0.5667,0.3720,17.0846,63.9990,-11.8984,41.7125,222.4798,0
...,...,...,...,...,...,...,...,...,...,...,...
14857,60.0815,23.0309,2.7255,0.2239,0.1214,18.4052,40.3108,-3.5778,82.9643,90.9217,0
9936,36.8172,15.6980,2.6123,0.3126,0.1746,27.1143,19.0649,9.4098,22.2590,182.5500,1
3156,28.3143,11.3820,2.4440,0.4101,0.2212,-10.3046,-9.2940,-7.2713,10.6780,199.5320,1
18589,59.1493,19.1074,2.8940,0.2897,0.1563,-49.1153,-32.1387,-13.5766,42.7620,197.4300,0


In [None]:
def scale_dataset(dataframe, oversample=False, scaler=None, random_state=None):
  """Scale the features of one split and optionally oversample it.

  The last column of `dataframe` is treated as the label; every other column
  is treated as a feature.

  Args:
    dataframe: pandas DataFrame whose last column holds the labels.
    oversample: when True, the scaled features and labels are resampled with
      RandomOverSampler. Intended for the training split only.
    scaler: optional, already-fitted scaler exposing `transform`. When given,
      the features are transformed with it instead of fitting a new scaler,
      so validation/test splits can reuse the statistics learned from the
      training split. When None (the default) a fresh StandardScaler is
      fitted on this split, which is the original behaviour.
    random_state: optional seed forwarded to RandomOverSampler so the
      resampling is reproducible. Ignored when `oversample` is False.

  Returns:
    A tuple `(data, X, y)` where `X` is the 2-D array of scaled features,
    `y` is the 1-D array of labels, and `data` is `X` with `y` appended as
    the last column.
  """
  X = dataframe.iloc[:, :-1].to_numpy()
  y = dataframe.iloc[:, -1].to_numpy()

  if scaler is None:
    # No scaler supplied: fit one on this split (backward-compatible default).
    X = StandardScaler().fit_transform(X)
  else:
    # Reuse the caller's fitted scaler; do not refit on this split.
    X = scaler.transform(X)

  if oversample:
    ros = RandomOverSampler(random_state=random_state)
    X, y = ros.fit_resample(X, y)

  # y is reshaped into a column so it can be stacked next to the features.
  data = np.hstack((X, np.reshape(y, (-1, 1))))

  return data, X, y

In [None]:
# Class balance of the training split: rows labelled 1 first, then rows labelled 0.
print((train['class'] == 1).sum())
print((train['class'] == 0).sum())

7376
4036


In [None]:
# Scale every split; only the training split is oversampled.
# NOTE(review): each call fits a fresh StandardScaler on its own split, so
# valid/test are not scaled with the training statistics - confirm intended.
# NOTE(review): train/valid/test are rebound from DataFrames to NumPy arrays
# here, so this cell cannot be re-run without re-creating the splits first.
train, X_train, y_train = scale_dataset(dataframe=train, oversample=True)
valid, X_valid, y_valid = scale_dataset(dataframe=valid)
test, X_test, y_test = scale_dataset(dataframe=test)

# kNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

In [None]:
# 1-nearest-neighbour classifier trained on the scaled, oversampled training set.
knn_model = KNeighborsClassifier(n_neighbors=1)
knn_model.fit(X=X_train, y=y_train)

In [None]:
# Predict labels for the test features.
y_pred = knn_model.predict(X=X_test)

In [None]:
# Per-class precision/recall/F1 of the kNN predictions on the test split.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.74      0.68      0.71      1279
           1       0.84      0.88      0.86      2525

    accuracy                           0.81      3804
   macro avg       0.79      0.78      0.79      3804
weighted avg       0.81      0.81      0.81      3804

