# Magic Gamma Telescope

Dataset is from https://archive.ics.uci.edu/dataset/159/magic+gamma+telescope

n = 19020

10 features

- X1.  fLength:  continuous  # major axis of ellipse [mm]
- X2.  fWidth:   continuous  # minor axis of ellipse [mm] 
- X3.  fSize:    continuous  # 10-log of sum of content of all pixels [in #phot]
- X4.  fConc:    continuous  # ratio of sum of two highest pixels over fSize  [ratio]
- X5.  fConc1:   continuous  # ratio of highest pixel over fSize  [ratio]
- X6.  fAsym:    continuous  # distance from highest pixel to center, projected onto major axis [mm]
- X7.  fM3Long:  continuous  # 3rd root of third moment along major axis  [mm] 
- X8.  fM3Trans: continuous  # 3rd root of third moment along minor axis  [mm]
- X9.  fAlpha:   continuous  # angle of major axis with vector to origin [deg]
- X10.  fDist:    continuous  # distance from origin to center of ellipse [mm]
- Y.  class:    g,h         # gamma (signal), hadron (background)
  - g = gamma (signal):     12332
  - h = hadron (background): 6688

In [4]:
import pandas as pd

In [13]:
# The raw data file has no header row, so the column labels are supplied here:
# ten feature columns X1..X10 followed by the class label Y.
url = "https://raw.githubusercontent.com/maxxxxc/SIR-Summer-2023/main/dataset/magic04.data"
column_names = [f"X{k}" for k in range(1, 11)] + ["Y"]
df = pd.read_csv(url, names=column_names, header=None)

In [14]:
# Sanity-check the load: (number of rows, number of columns).
df.shape

(19020, 11)

In [15]:
# Preview the first rows to confirm the columns were parsed as expected.
print(df.head())

         X1        X2      X3      X4      X5        X6       X7       X8  \
0   28.7967   16.0021  2.6449  0.3918  0.1982   27.7004  22.0110  -8.2027   
1   31.6036   11.7235  2.5185  0.5303  0.3773   26.2722  23.8238  -9.9574   
2  162.0520  136.0310  4.0612  0.0374  0.0187  116.7410 -64.8580 -45.2160   
3   23.8172    9.5728  2.3385  0.6147  0.3922   27.2107  -6.4633  -7.1513   
4   75.1362   30.9205  3.1611  0.3168  0.1832   -5.5277  28.5525  21.8393   

        X9       X10  Y  
0  40.0920   81.8828  g  
1   6.3609  205.2610  g  
2  76.9600  256.7880  g  
3  10.4490  116.7370  g  
4   4.6480  356.4620  g  


In [47]:
from sklearn import svm
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [26]:
# Separate the feature columns (everything except "Y") from the class label.
X = df.drop(columns=["Y"])
y = df.loc[:, "Y"]

In [24]:
# Count the samples per class label to check the class balance.
y.value_counts()

g    12332
h     6688
Name: Y, dtype: int64

## Logistic Regression

In [28]:
# Hold out 40% of the samples for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Fit a logistic regression on the raw (unscaled) features. max_iter is set
# explicitly to 500 to give the solver more iterations to converge.
model = LogisticRegression(max_iter=500)
model.fit(X_train, y_train)

# Predict on both splits so training and test accuracy can be compared.
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Report each accuracy under its own label (the training figure was
# previously printed as "Test Accuracy").
train_accuracy = accuracy_score(y_train, y_train_pred)
print("Train Accuracy:", train_accuracy)
accuracy = accuracy_score(y_test, y_test_pred)
print("Test Accuracy:", accuracy)

Test Accuracy: 0.790571328426218
Test Accuracy: 0.7879863301787592


# Divide and Conquer

Divide the training data into 11 batches, train a logistic regression model on each batch, and then combine the 11 sets of predictions. Consider the following two ensemble methods:
- majority voting
- average (or sum) the logit outputs of the 11 models, then decide the class based on the sign of the result

In [51]:
def _row_slice(data, start, stop):
    """Return rows ``start:stop`` of *data* selected by position.

    pandas objects are sliced through ``.iloc`` so a shuffled integer index
    cannot be mistaken for positions; anything else is sliced directly.
    """
    indexer = getattr(data, "iloc", data)
    return indexer[start:stop]


def iterate_process(X, y, num_batches=11, test_size=0.4, random_state=None):
    """Compare two divide-and-conquer ensembles with a full-data model.

    The data is split into train and test sets. The training rows are cut
    into ``num_batches`` contiguous batches, and a standardised logistic
    regression is fitted on each batch. The per-batch models are combined on
    the test set in two ways, and a single model fitted on all training rows
    serves as the baseline.

    Args:
        X: Feature matrix (pandas DataFrame or array-like supporting row
            slicing).
        y: Binary class labels aligned with ``X``.
        num_batches: Number of training batches / ensemble members.
        test_size: Passed to ``train_test_split``.
        random_state: Passed to ``train_test_split``; ``None`` gives a
            different random split on every call.

    Returns:
        A NumPy array of three test accuracies:
        ``[majority voting, summed logit, model trained on all batches]``.

    Raises:
        ValueError: If the training labels are not binary, or if
            ``num_batches`` is not between 1 and the number of training rows.
            Fitting a batch that happens to contain a single class is also
            expected to raise from scikit-learn.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # np.unique returns the labels sorted, which matches the ``classes_``
    # order of the fitted models: a positive decision value means labels[1].
    labels = np.unique(y_train)
    if len(labels) != 2:
        raise ValueError(
            f"iterate_process expects exactly 2 classes, got {len(labels)}"
        )
    negative_label, positive_label = labels

    n_train = len(X_train)
    if not 1 <= num_batches <= n_train:
        raise ValueError(
            f"num_batches must be between 1 and {n_train}, got {num_batches}"
        )

    # Batch boundaries that cover every training row; batch sizes differ by
    # at most one, so no remainder rows are dropped.
    edges = [n_train * k // num_batches for k in range(num_batches + 1)]

    # Per test sample: number of models voting for the positive label ...
    preds_voting = np.zeros(len(y_test))
    # ... and the sum of the models' logit (decision function) outputs.
    preds_logit = np.zeros(len(y_test))

    for start, stop in zip(edges[:-1], edges[1:]):
        X_batch = _row_slice(X_train, start, stop)
        y_batch = _row_slice(y_train, start, stop)

        # Each ensemble member standardises with its own batch statistics,
        # so the test set has to be re-scaled for every model.
        scaler = StandardScaler()
        X_batch_scaled = scaler.fit_transform(X_batch)
        X_test_scaled = scaler.transform(X_test)

        model = LogisticRegression(max_iter=500)
        model.fit(X_batch_scaled, y_batch)

        # Accumulate the votes for majority voting
        preds_voting += model.predict(X_test_scaled) == positive_label
        # Accumulate the logits for the averaged-logit ensemble
        preds_logit += model.decision_function(X_test_scaled)

    accuracy = np.zeros(3)

    # Majority voting: positive only with a strict majority of the models
    # (a tie, possible with an even num_batches, goes to the negative label).
    final_predictions = np.where(
        preds_voting > num_batches / 2, positive_label, negative_label
    )
    accuracy[0] = accuracy_score(y_test, final_predictions)

    # Average of logit: the sign of the sum equals the sign of the mean.
    final_predictions = np.where(preds_logit > 0, positive_label, negative_label)
    accuracy[1] = accuracy_score(y_test, final_predictions)

    # Baseline: one model trained on all batches of training data
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    model = LogisticRegression(max_iter=500)
    model.fit(X_train_scaled, y_train)
    accuracy[2] = accuracy_score(y_test, model.predict(X_test_scaled))

    return accuracy

Try the divide-and-conquer approaches 10 times, reporting the following accuracy matrix.
- 10-by-3
- col_1: majority voting
- col_2: average the logit
- col_3: using the model trained on all the training data

In [54]:
# Number of times to repeat the process
num_repeats = 10

# Initialize an empty matrix (10-by-3) to store accuracies
accuracies = np.zeros((num_repeats, 3))

# Repeat the process and store accuracies
for i in range(num_repeats):
    accuracies[i] = iterate_process(X, y)
    

# Print the accuracies
print("Accuracies:", accuracies)

Accuracies: [[0.79206099 0.79179811 0.79100946]
 [0.79035226 0.79100946 0.7904837 ]
 [0.7856204  0.78680336 0.78680336]
 [0.7904837  0.79271819 0.79271819]
 [0.78706625 0.78824921 0.78851209]
 [0.78995794 0.79061514 0.79100946]
 [0.7862776  0.78588328 0.78535752]
 [0.78404311 0.78430599 0.78456887]
 [0.78746057 0.78785489 0.78759201]
 [0.79298107 0.79153523 0.79245531]]


In [33]:

    # How many chunks the training rows are cut into
    num_batches = 11

    # A random ordering of all row positions in X
    indices = np.random.permutation(X.shape[0])

    # Rows per chunk (quotient of the floor division)
    batch_size = divmod(len(X_train), num_batches)[0]
    

In [52]:
# Fit a logistic regression on the current X_train / y_train and predict the
# labels of X_test; fit() returns the fitted estimator itself.
model = LogisticRegression(max_iter=500).fit(X_train, y_train)
y_pred = model.predict(X_test)

In [53]:
# Single run on a fresh random split; displays the three test accuracies
# (majority voting, averaged logit, model trained on all training data).
iterate_process(X, y)

array([0.79390116, 0.79337539, 0.79311251])