In [1]:
# Install Scikit Learning Lib
!pip install -U scikit-learn --target=/kaggle/working/

Collecting scikit-learn
  Obtaining dependency information for scikit-learn from https://files.pythonhosted.org/packages/3f/61/047b353f0ad550226ef962da182b4a09b689eb6df6bd84a03e44f9ee95bb/scikit_learn-1.4.0-1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading scikit_learn-1.4.0-1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting numpy<2.0,>=1.19.5 (from scikit-learn)
  Obtaining dependency information for numpy<2.0,>=1.19.5 from https://files.pythonhosted.org/packages/a5/37/d1453c9ff4f7630e68ec036c6fb56ba0d7c769daa8a4083cb4ef8ee45995/numpy-1.26.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading numpy-1.26.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.2/61.2 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy>=1.6.0 (from scikit-learn)
  Obtaining dependency informa

In [2]:
# Import PyTorch and pick the compute device used by the rest of the notebook.
import torch

# Preference order: Apple MPS, then CUDA, then CPU.
# is_available() is used instead of is_built(): a PyTorch build can ship with
# MPS support compiled in on a machine where the backend cannot actually run.
has_mps = torch.backends.mps.is_available()
if has_mps:
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

Using device: cuda


In [3]:
import copy
import time
import numpy as np
import os
import pandas as pd
import tqdm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from torch import nn
from torch.autograd import Variable
from torch.utils.data import DataLoader, TensorDataset

# Fix the NumPy and PyTorch global seeds (reproducibility)
SEED = 69
np.random.seed(SEED)
torch.manual_seed(SEED)

# Class from Jeff Heaton on early stopping
class EarlyStopping:
    """Signal when training should stop because validation loss stopped improving.

    Call the instance once per validation pass with the model and its loss.
    It keeps a deep copy of the best ``state_dict`` seen so far and counts the
    consecutive calls that did not improve on the best loss.

    Args:
        patience: Number of consecutive non-improving calls after which
            ``__call__`` returns True.
        min_delta: Minimum decrease of the loss (``best_loss - val_loss``)
            that counts as an improvement.
        restore_best_weights: If True, the best recorded ``state_dict`` is
            loaded back into the model when stopping is triggered.

    Attributes set while running:
        best_model: deep copy of the best ``state_dict`` (None before first call).
        best_loss: best validation loss as a Python float (None before first call).
        counter: consecutive calls without improvement.
        status: human-readable description of the latest call's outcome.
    """

    def __init__(self, patience=5, min_delta=0, restore_best_weights=True):
        self.patience = patience
        self.min_delta = min_delta
        self.restore_best_weights = restore_best_weights
        self.best_model = None
        self.best_loss = None
        self.counter = 0
        self.status = ""

    def __call__(self, model, val_loss):
        """Record ``val_loss`` and report whether training should stop.

        Args:
            model: Object exposing ``state_dict()`` and ``load_state_dict()``.
            val_loss: Validation loss; a number or a single-element tensor.

        Returns:
            True when ``patience`` consecutive calls brought no improvement
            (after optionally restoring the best weights), otherwise False.
        """
        # Keep only the scalar value. Storing a loss tensor would also keep its
        # autograd graph alive for as long as it stays the best loss.
        val_loss = float(val_loss)

        if self.best_loss is None:
            # First call: nothing to compare against yet.
            self.best_loss = val_loss
            self.best_model = copy.deepcopy(model.state_dict())
            self.status = "Initial validation loss recorded"
        elif self.best_loss - val_loss >= self.min_delta:
            self.best_model = copy.deepcopy(model.state_dict())
            self.best_loss = val_loss
            self.counter = 0
            self.status = f"Improvement found, counter reset to {self.counter}"
        else:
            self.counter += 1
            self.status = f"No improvement in the last {self.counter} epochs"
            if self.counter >= self.patience:
                self.status = f"Early stopping triggered after {self.counter} epochs."
                if self.restore_best_weights:
                    model.load_state_dict(self.best_model)
                return True
        return False
    
# Load the MC data and process it into tensors to be used by NN
# modeled after Jeff Heaton's function
def load_data(input_dir="/kaggle/input"):
    """Read the MC data files and return scaled train/test tensors.

    Every file found under ``input_dir`` is read with ``pd.read_csv``, its last
    row is dropped, and the frames are concatenated. Twelve feature columns are
    standardised and the "Primary ID" column is label-encoded as the target.

    Relies on the module-level ``device`` for tensor placement.

    Args:
        input_dir: Directory that is searched recursively for data files.

    Returns:
        Tuple ``(x_train, x_test, y_train, y_test, species)``: float32 feature
        tensors, long label tensors, and the array of original class labels
        (``LabelEncoder.classes_``).

    Raises:
        FileNotFoundError: If no file exists under ``input_dir``.
    """
    # os.walk yields entries in arbitrary order. Sorting fixes the order in
    # which files are concatenated, so the fixed-seed split below selects the
    # same rows on every run.
    data_paths = sorted(
        os.path.join(dirname, filename)
        for dirname, _, filenames in os.walk(input_dir)
        for filename in filenames
    )
    if not data_paths:
        raise FileNotFoundError(f"No data files found under {input_dir!r}")

    # Drop the last row of each file because of an incomplete entry
    frames = [pd.read_csv(path).iloc[:-1] for path in data_paths]

    # ignore_index gives the combined frame one continuous index
    full_data = pd.concat(frames, ignore_index=True)
    print(full_data)

    feature_columns = [
        "Xcore", "Ycore", "Theta", "Phi", "MD Rp", "MD Psi",
        "Energy (EeV)", "Xmax", "RA", "Declination", "Mir ID", "Pass Code",
    ]
    le = LabelEncoder()
    x = full_data[feature_columns].values
    y = le.fit_transform(full_data["Primary ID"])
    species = le.classes_

    # Hold out 25% of the rows; random_state keeps the split repeatable
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

    # Fit the scaler on the training rows only, then apply it to both splits
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    # Numpy to Torch Tensor
    x_train = torch.tensor(x_train, device=device, dtype=torch.float32)
    y_train = torch.tensor(y_train, device=device, dtype=torch.long)

    x_test = torch.tensor(x_test, device=device, dtype=torch.float32)
    y_test = torch.tensor(y_test, device=device, dtype=torch.long)

    return x_train, x_test, y_train, y_test, species

x_train, x_test, y_train, y_test, species = load_data()

# Report the size of each split and check that features and labels have the same row count
train_rows_match = x_train.shape[0] == y_train.shape[0]
test_rows_match = x_test.shape[0] == y_test.shape[0]
print("Training Set Shape:", "x_train -", x_train.shape, "y_train -", y_train.shape[0], "Shape Check -", train_rows_match)
print("Testing Set Shape: ", "x_test -", x_test.shape, "y_test -", y_test.shape[0], "Shape Check -", test_rows_match)
print("Species:", species, ", Length of NDarray", len(species))

# Create datasets
BATCH_SIZE = 16

# Training batches are drawn in a new random order each time the loader is iterated
dataset_train = TensorDataset(x_train, y_train)
dataloader_train = DataLoader(
    dataset_train, batch_size=BATCH_SIZE, shuffle=True)

# The evaluation loader keeps the dataset order: shuffling does not change any
# metric and only makes batch results harder to line up with y_test.
dataset_test = TensorDataset(x_test, y_test)
dataloader_test = DataLoader(dataset_test, batch_size=BATCH_SIZE, shuffle=False)

# Create model using nn.Sequential: two hidden ReLU layers, one output per class
model = nn.Sequential(
    nn.Linear(x_train.shape[1], 50),
    nn.ReLU(),
    nn.Linear(50, 25),
    nn.ReLU(),
    nn.Linear(25, len(species)),
    nn.LogSoftmax(dim=1),  # the model outputs log-probabilities
)

# Place the parameters on the target device first, then compile the module
model = torch.compile(model.to(device), backend="aot_eager")

# The network already ends in LogSoftmax, so the matching criterion is NLLLoss.
# CrossEntropyLoss applies log-softmax internally and would repeat that step.
loss_fn = nn.NLLLoss()

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
es = EarlyStopping()

# Train for at most 1000 epochs; the early stopper ends the run sooner once the
# validation loss has stopped improving.
epoch = 0
done = False
n_batches = len(dataloader_train)
while epoch < 1000 and not done:
    epoch += 1
    # Iterate the loader lazily instead of materialising every batch in a list;
    # total= tells tqdm the length of the bar.
    pbar = tqdm.tqdm(enumerate(dataloader_train), total=n_batches)
    model.train()
    for i, (x_batch, y_batch) in pbar:
        y_batch_pred = model(x_batch.to(device))
        loss = loss_fn(y_batch_pred, y_batch.to(device))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        tloss = loss.item()
        if i == n_batches - 1:
            # Last batch of the epoch: validate while the bar is still open so
            # the result appears in its final description.
            model.eval()
            # No gradients are needed for validation; no_grad avoids building
            # an autograd graph, and .item() hands the stopper a plain float.
            with torch.no_grad():
                pred = model(x_test)
                vloss = loss_fn(pred, y_test).item()
            if es(model, vloss):
                done = True
            pbar.set_description(
                f"Epoch: {epoch}, tloss: {tloss}, vloss: {vloss:>7f}, {es.status}"
            )
        else:
            pbar.set_description(f"Epoch: {epoch}, tloss {tloss:}")

       # Date          Time  Julian date  Local Siderial    Xcore    Ycore  0  \
0    20140624   53933.92820  2456832.736        4.263530  -6.3408  20.4095  0   
1    20140705   73421.20612  2456843.816        4.955041  -7.1231  18.8881  0   
2    20140723   52732.08209  2456861.727        4.709835  -7.4253  18.9204  0   
3    20140829   74736.05817  2456898.825        5.959167  -8.6920  20.0007  0   
4    20140829  110357.86420  2456898.961        0.535066  -7.3625  22.2284  0   
..        ...           ...          ...             ...      ...      ... ..   
669  20140626   73318.90217  2456834.815        4.795622  -6.1932  22.2920  0   
670  20140626   74059.02815  2456834.820        4.829239  -7.9590  20.9098  0   
671  20140626   74144.29622  2456834.821        4.832520  -7.9716  21.5741  0   
672  20140626   73706.67222  2456834.817        4.812248 -10.0983  20.2481  0   
673  20140626   74152.58822  2456834.821        4.833104  -7.7783  22.1255  0   

       Theta       Phi  MD 

  0%|          | 0/32 [00:00<?, ?it/s][2024-01-25 00:55:13,688] torch._dynamo.symbolic_convert: [INFO] Step 1: torchdynamo start tracing forward
[2024-01-25 00:55:13,754] torch._dynamo.symbolic_convert: [INFO] Step 1: torchdynamo done tracing forward (RETURN_VALUE)
[2024-01-25 00:55:13,759] torch._dynamo.output_graph: [INFO] Step 2: calling compiler function compiler_fn
[2024-01-25 00:55:13,962] torch._dynamo.output_graph: [INFO] Step 2: done compiler function compiler_fn
Epoch: 1, tloss 1.4950764179229736:   3%|▎         | 1/32 [00:01<00:32,  1.05s/it][2024-01-25 00:55:14,825] torch._dynamo.symbolic_convert: [INFO] Step 1: torchdynamo start tracing forward
[2024-01-25 00:55:14,861] torch._dynamo.symbolic_convert: [INFO] Step 1: torchdynamo done tracing forward (RETURN_VALUE)
[2024-01-25 00:55:14,864] torch._dynamo.output_graph: [INFO] Step 2: calling compiler function compiler_fn
[2024-01-25 00:55:15,023] torch._dynamo.output_graph: [INFO] Step 2: done compiler function compiler_fn
[2

In [4]:
# Evaluate on the held-out set. This is inference only, so put the model in
# eval mode and skip building the autograd graph.
model.eval()
with torch.no_grad():
    pred = model(x_test)
    vloss = loss_fn(pred, y_test)
print(f"Loss = {vloss}")

Loss = 0.8436582684516907


In [5]:
from sklearn.metrics import accuracy_score

# The predicted class of each row is the column holding the largest model output
predict_classes = torch.max(pred, dim=1).indices
correct = accuracy_score(y_test.cpu(), predict_classes.cpu())
print(f"Accuracy: {correct}")

Accuracy: 0.6153846153846154


In [6]:
# Show predicted and expected class indices one after the other
print(f"Predictions: {predict_classes}", f"Expected: {y_test}", sep="\n")

Predictions: tensor([2, 1, 2, 2, 2, 0, 2, 3, 2, 2, 4, 2, 2, 1, 1, 1, 0, 1, 1, 2, 0, 0, 2, 1,
        3, 1, 2, 2, 0, 2, 2, 0, 1, 2, 1, 1, 1, 0, 2, 1, 2, 3, 2, 0, 2, 1, 4, 2,
        2, 2, 0, 2, 2, 2, 0, 2, 2, 1, 1, 2, 0, 1, 0, 2, 1, 2, 1, 3, 2, 0, 1, 3,
        1, 2, 4, 4, 0, 1, 4, 0, 2, 1, 0, 2, 2, 2, 1, 3, 1, 2, 4, 0, 1, 4, 1, 2,
        0, 1, 2, 0, 2, 1, 3, 1, 1, 1, 0, 2, 2, 2, 2, 1, 1, 2, 4, 2, 0, 3, 2, 2,
        0, 4, 2, 3, 3, 2, 2, 1, 2, 1, 3, 4, 1, 2, 2, 1, 0, 3, 2, 2, 2, 0, 1, 2,
        1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 0, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 1,
        1], device='cuda:0')
Expected: tensor([2, 3, 2, 2, 2, 4, 2, 3, 2, 2, 3, 2, 2, 1, 4, 1, 1, 4, 3, 2, 3, 0, 2, 4,
        0, 4, 2, 2, 1, 2, 2, 0, 1, 2, 1, 4, 0, 0, 2, 1, 2, 4, 2, 0, 2, 0, 0, 2,
        2, 2, 3, 2, 2, 2, 3, 2, 0, 0, 0, 2, 0, 4, 4, 2, 4, 2, 4, 3, 2, 3, 0, 0,
        0, 2, 3, 1, 3, 1, 0, 4, 2, 4, 4, 2, 2, 2, 3, 4, 3, 2, 3, 3, 4, 4, 4, 2,
        0, 1, 2, 0, 2, 0, 0, 4, 3, 4, 1, 2, 2, 2, 2, 3, 4, 2, 4, 2, 

In [7]:
# Translate predicted class indices back into the original "Primary ID" labels
predicted_indices = predict_classes.cpu().detach()
print(species[predicted_indices])

[3. 2. 3. 3. 3. 1. 3. 4. 3. 3. 5. 3. 3. 2. 2. 2. 1. 2. 2. 3. 1. 1. 3. 2.
 4. 2. 3. 3. 1. 3. 3. 1. 2. 3. 2. 2. 2. 1. 3. 2. 3. 4. 3. 1. 3. 2. 5. 3.
 3. 3. 1. 3. 3. 3. 1. 3. 3. 2. 2. 3. 1. 2. 1. 3. 2. 3. 2. 4. 3. 1. 2. 4.
 2. 3. 5. 5. 1. 2. 5. 1. 3. 2. 1. 3. 3. 3. 2. 4. 2. 3. 5. 1. 2. 5. 2. 3.
 1. 2. 3. 1. 3. 2. 4. 2. 2. 2. 1. 3. 3. 3. 3. 2. 2. 3. 5. 3. 1. 4. 3. 3.
 1. 5. 3. 4. 4. 3. 3. 2. 3. 2. 4. 5. 2. 3. 3. 2. 1. 4. 3. 3. 3. 1. 2. 3.
 2. 3. 3. 3. 2. 3. 3. 3. 3. 3. 2. 1. 3. 3. 3. 2. 3. 3. 3. 3. 2. 3. 3. 2.
 2.]


In [8]:
# The model outputs log-probabilities; exponentiate rows 1-9 to view class probabilities
print(pred[1:10].exp())
# The probabilities of a single row should add up to 1
print("Sum Check: ", sum(pred[1].exp()).item())

tensor([[1.2060e-01, 4.6174e-01, 5.1190e-03, 1.3016e-01, 2.8237e-01],
        [8.9354e-04, 1.9929e-05, 9.9646e-01, 1.2002e-03, 1.4311e-03],
        [4.9734e-03, 9.1202e-05, 9.8874e-01, 3.1163e-03, 3.0755e-03],
        [1.7235e-02, 3.7906e-03, 9.4033e-01, 1.5570e-02, 2.3072e-02],
        [6.8592e-01, 1.3318e-02, 1.4276e-01, 4.7620e-02, 1.1038e-01],
        [2.9537e-13, 4.6199e-21, 1.0000e+00, 1.0605e-14, 2.0863e-16],
        [2.5526e-01, 1.1671e-01, 1.1300e-02, 4.1610e-01, 2.0063e-01],
        [6.1601e-02, 4.6370e-03, 8.8633e-01, 3.0486e-02, 1.6942e-02],
        [2.9202e-02, 1.1061e-04, 9.6540e-01, 2.7677e-03, 2.5184e-03]],
       device='cuda:0', grad_fn=<ExpBackward0>)
Sum Check:  1.0
