# Magic Gamma Telescope

Dataset is from https://archive.ics.uci.edu/dataset/159/magic+gamma+telescope

n = 19020

10 features

- X1.  fLength:  continuous  # major axis of ellipse [mm]
- X2.  fWidth:   continuous  # minor axis of ellipse [mm] 
- X3.  fSize:    continuous  # 10-log of sum of content of all pixels [in #phot]
- X4.  fConc:    continuous  # ratio of sum of two highest pixels over fSize  [ratio]
- X5.  fConc1:   continuous  # ratio of highest pixel over fSize  [ratio]
- X6.  fAsym:    continuous  # distance from highest pixel to center, projected onto major axis [mm]
- X7.  fM3Long:  continuous  # 3rd root of third moment along major axis  [mm] 
- X8.  fM3Trans: continuous  # 3rd root of third moment along minor axis  [mm]
- X9.  fAlpha:   continuous  # angle of major axis with vector to origin [deg]
- X10.  fDist:    continuous  # distance from origin to center of ellipse [mm]
- Y.  class:    g,h         # gamma (signal), hadron (background)
  - g = gamma (signal):     12332
  - h = hadron (background): 6688

In [19]:
import pandas as pd

In [20]:
url = "https://raw.githubusercontent.com/maxxxxc/SIR-Summer-2023/main/dataset/magic04.data"
column_names = ["X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8", "X9", "X10", "Y"]
df = pd.read_csv(url, header=None, names=column_names)

In [21]:
from sklearn import svm
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score

import statsmodels.api as sm

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

In [22]:
X = df.drop("Y", axis=1)
y = df["Y"]

In [23]:
y = y.replace({'g' : 1, 'h' : 0})
y.value_counts()

1    12332
0     6688
Name: Y, dtype: int64

# Divide and Conquer

Divide the training data into 11 batches, train a logistic model on each of the batch, and then combine the 11 prediction results. Consider the following two ensemble methods:
- majority voting
- average (or sum) of the logit output and then make decision based on its sign

In [24]:
def iterate_process(X, y):
    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
    
    # Define the number of batches
    num_batches = 11
    
    # Randomly shuffle the data indices
    #indices = np.random.permutation(len(X))
    
    #change 7/12
    indices = np.random.permutation(len(X_train))
    
    # Calculate the batch size
    batch_size = len(X_train) // num_batches
    
    # Make predictions on the test set using majority voting
    preds_voting = np.zeros(len(y_test))
    # Make predictions on the test set using average of logit
    preds_logit = np.zeros(len(y_test))
    #Make predictions on the test set using average of probs
    preds_prob = np.zeros(len(y_test))
    
    preds_voting_weighted = np.zeros(len(y_test))
    preds_logit_weighted = np.zeros(len(y_test))
    preds_prob_weighted = np.zeros(len(y_test))
    
    total_cverr = 0
    
    # Split the training data into batches, fit a logistic regression model on each batch
    for i in range(num_batches):
        # Calculate the starting and ending indices for the current batch
        start_index = i * batch_size
        end_index = (i + 1) * batch_size
        
        # Select the current batch for training
        #X_batch = X_train[start_index:end_index]
        #y_batch = y_train[start_index:end_index]
        
        #change 7/12
        X_batch = X_train.iloc[indices[start_index:end_index]]
        y_batch = y_train.iloc[indices[start_index:end_index]]
        
        #scaler = StandardScaler()
        X_batch_scaled = X_batch
        X_test_scaled = X_test
        
        # Fit the model on the current batch
        model = sm.GLM(family = sm.families.Binomial())
        model.fit(y_batch, X_batch_scaled)
        current_cverr = cross_val_score(model, X_batch_scaled, y_batch, cv = 5, scoring = 'accuracy').mean()
        total_cverr += current_cverr
               
        y_pred = model.predict(X_test_scaled)
        # Accumulate the predictions using majority voting
        preds_voting += (y_pred == 1)
        preds_voting_weighted += (y_pred == 1) * current_cverr
    
        # Accumulate the predictions using majority voting
        y_pred = model.decision_function(X_test_scaled)
        preds_logit += y_pred
        preds_logit_weighted += y_pred * current_cverr
        
        #Accumulate the probs
        y_pred = model.predict_proba(X_test_scaled)
        preds_prob += y_pred[:,1]
        preds_prob_weighted += y_pred[:,1] * current_cverr
    
    accuracy = np.zeros(7)
    auc_accuracy = np.zeros(7)
    
    preds_voting_weighted = preds_voting_weighted / total_cverr * num_batches
    preds_logit_weighted = preds_logit_weighted / total_cverr * num_batches
    preds_prob_weighted = preds_prob_weighted / total_cverr * num_batches
    
    # Majority voting (selecting the most frequent prediction for each sample)
    final_predictions = np.where(preds_voting > num_batches / 2, 1, 0)
    accuracy[0] = accuracy_score(y_test, final_predictions)
    auc_accuracy[0] = roc_auc_score(y_test, preds_voting)
    
    final_predictions = np.where(preds_voting_weighted > num_batches / 2, 1, 0)
    accuracy[1] = accuracy_score(y_test, final_predictions)
    auc_accuracy[1] = roc_auc_score(y_test, preds_voting_weighted)
    
    # Average of logit
    final_predictions = np.where(preds_logit > 0, 1, 0)
    accuracy[2] = accuracy_score(y_test, final_predictions)
    auc_accuracy[2] = roc_auc_score(y_test, preds_logit)
    
    final_predictions = np.where(preds_logit_weighted > 0, 1, 0)
    accuracy[3] = accuracy_score(y_test, final_predictions)
    auc_accuracy[3] = roc_auc_score(y_test, preds_logit_weighted)
    
    #Average of probs
    final_predictions = np.where(preds_prob / num_batches > 0.5, 1, 0)
    accuracy[4] = accuracy_score(y_test, final_predictions)
    auc_accuracy[4] = roc_auc_score(y_test, preds_prob)
    
    final_predictions = np.where(preds_prob_weighted / num_batches > 0.5, 1, 0)
    accuracy[5] = accuracy_score(y_test, final_predictions)
    auc_accuracy[5] = roc_auc_score(y_test, preds_prob_weighted)
    
    # Train a model on all 11 batches of training data
    model = sm.GLM(family = sm.families.Binomial())
    
    #scaler = StandardScaler()
    X_train_scaled = X_train
    X_test_scaled = X_test
    #model.fit(X_train_scaled, y_train)
    model.fit(y_train, X_train_scaled)
    y_pred = model.predict(X_test_scaled)
    accuracy[6] = accuracy_score(y_test, y_pred)
    y_pred = model.decision_function(X_test_scaled)
    auc_accuracy[6] = roc_auc_score(y_test, y_pred)
    
    return accuracy, auc_accuracy

Try the divide and conquer approaches 10 times, reporting the following error matrix. 
- 10-by-4
- col_1: majority voting
- col_2: average the logit
- col_3: average probabilities
- col_4: using the model trained on all the training data

In [25]:
# Number of times to repeat the process
num_repeats = 50

# Initialize an empty matrix (10-by-4) to store accuracies
accuracies = np.zeros((num_repeats, 7))
auc_accuracies = np.zeros((num_repeats, 7))

seed = 42
# Repeat the process and store accuracies
for i in range(num_repeats):
    np.random.seed(seed)
    accuracies[i], auc_accuracies[i] = iterate_process(X, y)
    seed += 2
    

# Print the accuracies
print("Accuracies:", accuracies)
print("AUC:", auc_accuracies)

TypeError: GLM.__init__() missing 2 required positional arguments: 'endog' and 'exog'

In [None]:
np.mean(accuracies, axis = 0) 

In [None]:
np.std(accuracies, axis = 0)

In [None]:
np.mean(auc_accuracies, axis = 0)

In [26]:
np.std(auc_accuracies, axis = 0)

array([0., 0., 0., 0., 0., 0., 0.])

# Wireless Indoor Localization

In [13]:
url = "https://raw.githubusercontent.com/maxxxxc/SIR-Summer-2023/main/dataset/wifi_localization.txt"
df = pd.read_csv(url, sep = '\t', header = None)

X = df.drop(7, axis=1)
y = df[7]

y = y.replace({2 : 0, 1 : 0, 3: 1, 4 : 1})
y.value_counts()

0    1000
1    1000
Name: 7, dtype: int64

In [14]:
# Number of times to repeat the process
num_repeats = 50

# Initialize an empty matrix (10-by-4) to store accuracies
accuracies = np.zeros((num_repeats, 7))
auc_accuracies = np.zeros((num_repeats, 7))

seed = 42
# Repeat the process and store accuracies
for i in range(num_repeats):
    np.random.seed(seed)
    accuracies[i], auc_accuracies[i] = iterate_process(X, y)
    seed += 2
    

# Print the accuracies
print("Accuracies:", accuracies)
print("AUC:", auc_accuracies)

Accuracies: [[0.92875 0.92875 0.93    0.93    0.93    0.93    0.93375]
 [0.915   0.915   0.9175  0.9175  0.91875 0.9175  0.90625]
 [0.9275  0.9275  0.93125 0.93125 0.92875 0.9275  0.9325 ]
 [0.9175  0.9175  0.9175  0.9175  0.9175  0.9175  0.9175 ]
 [0.91    0.91    0.91875 0.91875 0.915   0.91375 0.92   ]
 [0.9225  0.9225  0.92125 0.9225  0.925   0.925   0.915  ]
 [0.9275  0.9275  0.925   0.92375 0.92375 0.92375 0.9225 ]
 [0.9175  0.9175  0.92125 0.92125 0.9225  0.9225  0.9225 ]
 [0.93625 0.93625 0.93875 0.93875 0.93875 0.93875 0.94   ]
 [0.92375 0.92375 0.92875 0.92875 0.925   0.92625 0.92125]
 [0.92375 0.92375 0.92375 0.9225  0.9225  0.9225  0.9225 ]
 [0.9175  0.9175  0.915   0.915   0.91625 0.91625 0.9175 ]
 [0.92    0.92    0.91875 0.91875 0.92125 0.92125 0.9275 ]
 [0.9325  0.9325  0.93875 0.93875 0.93875 0.93875 0.93   ]
 [0.93    0.93    0.93    0.93125 0.9325  0.9325  0.9275 ]
 [0.91375 0.91375 0.91375 0.91625 0.9125  0.9125  0.91375]
 [0.92625 0.92625 0.92125 0.92125 0.92    0.

In [15]:
np.mean(accuracies, axis = 0)

array([0.9229  , 0.9229  , 0.92355 , 0.923525, 0.923325, 0.9233  ,
       0.9231  ])

In [16]:
np.std(accuracies, axis = 0)

array([0.00729486, 0.00729486, 0.00697029, 0.00688608, 0.00692193,
       0.00695414, 0.00688858])

In [17]:
np.mean(auc_accuracies, axis = 0)

array([0.9679879 , 0.96798493, 0.98029316, 0.98028103, 0.98044895,
       0.98044083, 0.98062023])

In [18]:
np.std(auc_accuracies, axis = 0)

array([0.00481803, 0.00485437, 0.00250549, 0.00251837, 0.0025293 ,
       0.00253446, 0.00233349])

#