In [68]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler  # oversampling of the minority class
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import GridSearchCV, KFold  # cross-validation
from sklearn.model_selection import train_test_split  # splitting data

Read data

In [28]:
# Column names for the data file: ten features followed by the class label.
cols = [
    "fLength",
    "fWidth",
    "fSize",
    "fConc",
    "fConc1",
    "fAsym",
    "fM3Long",
    "fM3Trans",
    "fAlpha",
    "fDist",
    "class",
]

# Load the CSV into a DataFrame, labelling the columns with the names above.
df = pd.read_csv("magic04.data", names=cols)
df.head()

Unnamed: 0,fLength,fWidth,fSize,fConc,fConc1,fAsym,fM3Long,fM3Trans,fAlpha,fDist,class
0,28.7967,16.0021,2.6449,0.3918,0.1982,27.7004,22.011,-8.2027,40.092,81.8828,g
1,31.6036,11.7235,2.5185,0.5303,0.3773,26.2722,23.8238,-9.9574,6.3609,205.261,g
2,162.052,136.031,4.0612,0.0374,0.0187,116.741,-64.858,-45.216,76.96,256.788,g
3,23.8172,9.5728,2.3385,0.6147,0.3922,27.2107,-6.4633,-7.1513,10.449,116.737,g
4,75.1362,30.9205,3.1611,0.3168,0.1832,-5.5277,28.5525,21.8393,4.648,356.462,g


In [29]:
# Encode the target as integers: 'g' (gamma) -> 1, 'h' (hadron) -> 0.
# The guard keeps this cell idempotent: once the column is numeric, comparing it
# with 'g' again would silently turn every label into 0.
if df["class"].isin(["g", "h"]).any():
    df["class"] = (df["class"] == "g").astype(int)
df.head()

Unnamed: 0,fLength,fWidth,fSize,fConc,fConc1,fAsym,fM3Long,fM3Trans,fAlpha,fDist,class
0,28.7967,16.0021,2.6449,0.3918,0.1982,27.7004,22.011,-8.2027,40.092,81.8828,1
1,31.6036,11.7235,2.5185,0.5303,0.3773,26.2722,23.8238,-9.9574,6.3609,205.261,1
2,162.052,136.031,4.0612,0.0374,0.0187,116.741,-64.858,-45.216,76.96,256.788,1
3,23.8172,9.5728,2.3385,0.6147,0.3922,27.2107,-6.4633,-7.1513,10.449,116.737,1
4,75.1362,30.9205,3.1611,0.3168,0.1832,-5.5277,28.5525,21.8393,4.648,356.462,1


Split

In [30]:
# Split the rows into roughly 70% train / 15% validation / 15% test with scikit-learn.
# Step 1: hold out 15% of all rows as the test set.
# Step 2: take 0.1765 of the remaining 85% as validation (0.1765 * 0.85 is about 0.15
# of the whole data set); what is left is the training set.
train_validate, test = train_test_split(df, test_size=0.15, random_state=42)
train, valid = train_test_split(train_validate, test_size=0.1765, random_state=42)

print("Length of whole data", len(df))
print("Train data number ", len(train))
print("Validation data number ", len(valid))
print("Test data number ", len(test))

Length of whole data 19020
Train data number  13313
vVlidation data number  2854
Test data number  2853


In [31]:
# Rows per class in the training split: gammas (1) first, then hadrons (0).
for label in (1, 0):
    print((train["class"] == label).sum())
# Gammas clearly outnumber hadrons, so the training split gets oversampled.

8646
4667


In [32]:
def scale_dataset(dataframe, oversample=False, random_state=42):
    """Split a DataFrame into feature and label arrays, optionally oversampling.

    Despite its name, this helper performs no feature scaling; it only separates
    the columns and, on request, resamples the rows with ``RandomOverSampler``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table whose last column is the label; all other columns are features.
    oversample : bool, default False
        When True, pass the arrays through ``RandomOverSampler.fit_resample``.
    random_state : int or None, default 42
        Seed handed to ``RandomOverSampler`` so repeated runs draw the same
        samples. Pass ``None`` for an unseeded (non-reproducible) draw.

    Returns
    -------
    data : numpy.ndarray
        2-D array with the features followed by the label as the last column.
    X : numpy.ndarray
        2-D feature array.
    y : numpy.ndarray
        1-D label array.
    """
    X = dataframe.iloc[:, :-1].to_numpy()  # every column except the last
    y = dataframe.iloc[:, -1].to_numpy()   # the last column

    if oversample:
        ros = RandomOverSampler(random_state=random_state)
        X, y = ros.fit_resample(X, y)

    # y is 1-D, so turn it into a single column before stacking it next to X.
    data = np.hstack((X, np.reshape(y, (-1, 1))))

    return data, X, y

In [33]:
# Build feature/label arrays for every split. Only the training split is
# oversampled; validation and test keep their original class distribution.
# The stacked arrays get their own names so that `train`, `valid` and `test`
# stay DataFrames and this cell can be re-run without failing.
train_data, X_train, y_train = scale_dataset(train, oversample=True)
valid_data, X_valid, y_valid = scale_dataset(valid, oversample=False)
test_data, X_test, y_test = scale_dataset(test, oversample=False)

Check balance

In [34]:
# Total number of training labels after oversampling ...
print(y_train.size)
# ... followed by the count of each class; the two counts should now match.
print(np.count_nonzero(y_train == 1))
print(np.count_nonzero(y_train == 0))

17292
8646
8646


Scaling

In [35]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply that same fitted
# transformation to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

Decision Trees

In [47]:
from sklearn.tree import DecisionTreeClassifier

# Fit a single decision tree on the scaled training data.
# random_state is fixed because the tree permutes features randomly while
# searching for splits; without a seed the metrics below change between runs.
decision_tree_classifier = DecisionTreeClassifier(random_state=42)
decision_tree_classifier.fit(X_train_scaled, y_train)

# Predict the held-out test set.
y_pred_decision_tree = decision_tree_classifier.predict(X_test_scaled)

# Evaluation on the test set (the positive class is 1, i.e. gamma).
accuracy_decision_tree = accuracy_score(y_test, y_pred_decision_tree)
precision_decision_tree = precision_score(y_test, y_pred_decision_tree)
recall_decision_tree = recall_score(y_test, y_pred_decision_tree)
f1_score_decision_tree = f1_score(y_test, y_pred_decision_tree)
confusion_matrix_decision_tree = confusion_matrix(y_test, y_pred_decision_tree)
print("Accuracy:", accuracy_decision_tree)
print("Precision:", precision_decision_tree)
print("Recall:", recall_decision_tree)
print("F1-score:", f1_score_decision_tree)
print("Confusion Matrix:")
print(confusion_matrix_decision_tree)

Accuracy: 0.8030143708377147
Precision: 0.8488624052004333
Recall: 0.847027027027027
F1-score: 0.8479437229437229
Confusion Matrix:
[[ 724  279]
 [ 283 1567]]


Naive bayes

In [60]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes: fit on the scaled training features, then predict the
# labels of the scaled test features.
naive_bayes_classifier = GaussianNB().fit(X_train_scaled, y_train)
y_pred_naive_bayes = naive_bayes_classifier.predict(X_test_scaled)

# Test-set metrics for the predictions above.
confusion_matrix_naive_bayes = confusion_matrix(y_test, y_pred_naive_bayes)
f1_score_naive_bayes = f1_score(y_test, y_pred_naive_bayes)
recall_naive_bayes = recall_score(y_test, y_pred_naive_bayes)
precision_naive_bayes = precision_score(y_test, y_pred_naive_bayes)
accuracy_naive_bayes = accuracy_score(y_test, y_pred_naive_bayes)

# Report the scalar metrics in a fixed order, then the confusion matrix.
for label, value in (
    ("Accuracy:", accuracy_naive_bayes),
    ("Precision:", precision_naive_bayes),
    ("Recall: ", recall_naive_bayes),
    ("F1-score:", f1_score_naive_bayes),
):
    print(label, value)
print("Confusion Matrix:")
print(confusion_matrix_naive_bayes)

Accuracy: 0.7266035751840169
Precision: 0.7358906525573192
Recall:  0.9021621621621622
F1-score: 0.8105876639145216
Confusion Matrix:
[[ 404  599]
 [ 181 1669]]


Adaboost

In [67]:
from sklearn.ensemble import AdaBoostClassifier

# Tune AdaBoost's number of weak learners with 5-fold cross-validation on the
# training data, then refit the winner on all training rows and score it on the
# test set. The same algorithm ('SAMME') and seed are used for the search and
# for the final model, so the selected value applies to the model that is reported.
# NOTE(review): X_train_scaled was oversampled and scaled before the folds are
# cut, so copies of a row can land in both parts of a fold; treat the
# cross-validated scores as optimistic.

# Candidate values for n_estimators (number of weak learners).
param_n_estimators = [50, 100, 200]

# Mean cross-validated metrics, one entry per candidate in param_n_estimators.
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

# Shuffled, seeded k-fold splitter.
k_folds = 5
kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)

for n_estimators in param_n_estimators:
    adaboost_classifier = AdaBoostClassifier(
        n_estimators=n_estimators, algorithm='SAMME', random_state=42
    )
    fold_accuracy = []
    fold_precision = []
    fold_recall = []
    fold_f1 = []

    # Fit on k-1 folds and score on the remaining fold.
    for train_index, test_index in kf.split(X_train_scaled):
        X_train_fold, X_val_fold = X_train_scaled[train_index], X_train_scaled[test_index]
        y_train_fold, y_val_fold = y_train[train_index], y_train[test_index]

        adaboost_classifier.fit(X_train_fold, y_train_fold)
        y_pred_fold = adaboost_classifier.predict(X_val_fold)

        fold_accuracy.append(accuracy_score(y_val_fold, y_pred_fold))
        fold_precision.append(precision_score(y_val_fold, y_pred_fold))
        fold_recall.append(recall_score(y_val_fold, y_pred_fold))
        fold_f1.append(f1_score(y_val_fold, y_pred_fold))

    # Average each metric across the folds for this candidate.
    accuracy_scores.append(np.mean(fold_accuracy))
    precision_scores.append(np.mean(fold_precision))
    recall_scores.append(np.mean(fold_recall))
    f1_scores.append(np.mean(fold_f1))

# Pick the candidate with the highest mean cross-validated accuracy.
best_index = np.argmax(accuracy_scores)
best_n_estimators = param_n_estimators[best_index]

# Refit the selected configuration on the full training set.
best_adaboost_classifier = AdaBoostClassifier(
    n_estimators=best_n_estimators, algorithm='SAMME', random_state=42
)
best_adaboost_classifier.fit(X_train_scaled, y_train)

# Predict the held-out test set.
y_pred_adaboost = best_adaboost_classifier.predict(X_test_scaled)

# Evaluation on the test set.
accuracy_adaboost = accuracy_score(y_test, y_pred_adaboost)
precision_adaboost = precision_score(y_test, y_pred_adaboost)
recall_adaboost = recall_score(y_test, y_pred_adaboost)
f1_score_adaboost = f1_score(y_test, y_pred_adaboost)
confusion_matrix_adaboost = confusion_matrix(y_test, y_pred_adaboost)
print("Best n_estimators:", best_n_estimators)
print("Accuracy:", accuracy_adaboost)
print("Precision:", precision_adaboost)
print("Recall:", recall_adaboost)
print("F1-score:", f1_score_adaboost)
print("Confusion Matrix:")
print(confusion_matrix_adaboost)


Best n_estimators: 200
Accuracy: 0.8086225026288117
Precision: 0.8862559241706162
Recall: 0.8086486486486486
F1-score: 0.8456755228942906
Confusion Matrix:
[[ 811  192]
 [ 354 1496]]


Random forests

In [71]:
from sklearn.ensemble import RandomForestClassifier

# Tune the number of trees with 5-fold cross-validation on the training data,
# then refit the winner on all training rows and score it on the test set.
# Every forest is seeded so the search and the reported metrics are repeatable.
# NOTE(review): X_train_scaled was oversampled and scaled before the folds are
# cut, so copies of a row can land in both parts of a fold; treat the
# cross-validated scores as optimistic.

# Candidate values for n_estimators (number of trees).
param_grid_random_forest = {'n_estimators': [75, 150, 300]}

# Mean cross-validated metrics, one entry per candidate. They have their own
# names so the test-set scalars computed further down do not overwrite them.
cv_accuracy_random_forest = []
cv_precision_random_forest = []
cv_recall_random_forest = []
cv_f1_score_random_forest = []

# Shuffled, seeded k-fold splitter.
k_folds = 5
kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)

for n_estimators in param_grid_random_forest['n_estimators']:
    fold_accuracy = []
    fold_precision = []
    fold_recall = []
    fold_f1 = []

    # Fit on k-1 folds and score on the remaining fold.
    for train_index, test_index in kf.split(X_train_scaled):
        X_train_fold, X_val_fold = X_train_scaled[train_index], X_train_scaled[test_index]
        y_train_fold, y_val_fold = y_train[train_index], y_train[test_index]

        random_forest_classifier = RandomForestClassifier(n_estimators=n_estimators, random_state=42)
        random_forest_classifier.fit(X_train_fold, y_train_fold)
        y_pred_fold = random_forest_classifier.predict(X_val_fold)

        fold_accuracy.append(accuracy_score(y_val_fold, y_pred_fold))
        fold_precision.append(precision_score(y_val_fold, y_pred_fold))
        fold_recall.append(recall_score(y_val_fold, y_pred_fold))
        fold_f1.append(f1_score(y_val_fold, y_pred_fold))

    # Average each metric across the folds for this candidate.
    cv_accuracy_random_forest.append(np.mean(fold_accuracy))
    cv_precision_random_forest.append(np.mean(fold_precision))
    cv_recall_random_forest.append(np.mean(fold_recall))
    cv_f1_score_random_forest.append(np.mean(fold_f1))

# Pick the candidate with the highest mean cross-validated accuracy.
best_index = np.argmax(cv_accuracy_random_forest)
best_n_estimators = param_grid_random_forest['n_estimators'][best_index]

# Refit the selected configuration on the full training set.
best_random_forest_classifier = RandomForestClassifier(n_estimators=best_n_estimators, random_state=42)
best_random_forest_classifier.fit(X_train_scaled, y_train)

# Predict the held-out test set.
y_pred_random_forest = best_random_forest_classifier.predict(X_test_scaled)

# Evaluation on the test set.
accuracy_random_forest = accuracy_score(y_test, y_pred_random_forest)
precision_random_forest = precision_score(y_test, y_pred_random_forest)
recall_random_forest = recall_score(y_test, y_pred_random_forest)
f1_score_random_forest = f1_score(y_test, y_pred_random_forest)
confusion_matrix_random_forest = confusion_matrix(y_test, y_pred_random_forest)
print("Best n_estimators:", best_n_estimators)
print("Random Forest Classifier Metrics:")
print("Accuracy:", accuracy_random_forest)
print("Precision:", precision_random_forest)
print("Recall:", recall_random_forest)
print("F1-score:", f1_score_random_forest)
print("Confusion Matrix:")
print(confusion_matrix_random_forest)


Best n_estimators: 300
Random Forest Classifier Metrics:
Accuracy: 0.8741675429372591
Precision: 0.886870783601453
Recall: 0.9237837837837838
F1-score: 0.9049510193275087
Confusion Matrix:
[[ 785  218]
 [ 141 1709]]


Conclusion

In [None]:
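# Judged by F1-score on the test set, the random forest (with tuned n_estimators) is the best classifier (about 0.90).
# The single decision tree and the tuned AdaBoost model are nearly tied behind it (about 0.85 each),
# and Gaussian Naive Bayes comes last (about 0.81).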