# Exoplanet Detection CNN Modle



## Setup

In [7]:
import os
import torch
#import mlflow
from torch.utils.data import DataLoader
import numpy as np
from torch.utils.data import Dataset, TensorDataset
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import pandas as pd

from imblearn.over_sampling import SMOTE


import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
#import mlflow
#import mlflow.pytorch
from torch.optim.lr_scheduler import StepLR

**Connect to Google Drive**

In [8]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Load Processed Lightcurve Data

In [9]:
#Path to processed data saved in my drive
confirmed_planets_dir = '/content/drive/MyDrive/ELE391_Final_Project/data_v4/K2_conf_pt'
false_positives_dir = '/content/drive/MyDrive/ELE391_Final_Project/data_v4/K2_fp_pt'

# Helper function to load tensors and add labels
def load_and_label_tensors(directory, label):
    tensors = []
    labels = []
    for filename in os.listdir(directory):
        if filename.endswith('.pt'):
            filepath = os.path.join(directory, filename)
            lightcurve = torch.load(filepath)
            tensors.append(lightcurve)  # Add batch dimension
            labels.append(label)  # Assign the label
    return tensors, labels

# Load confirmed planets (label 1)
confirmed_tensors, confirmed_labels = load_and_label_tensors(confirmed_planets_dir, label=1)

# Load false positives (label 0)
false_tensors, false_labels = load_and_label_tensors(false_positives_dir, label=0)

# Combine the tensors and labels
all_tensors = confirmed_tensors + false_tensors
all_labels = confirmed_labels + false_labels

In [10]:
# Stack all light curves into a single tensor (num_samples, 200)
data = torch.stack(all_tensors)

# Convert labels to a tensor (num_samples,)
labels = torch.tensor(all_labels, dtype=torch.long)

print("Data Shape:", data.shape)
print("Labels Shape:", labels.shape)

Data Shape: torch.Size([1002, 400])
Labels Shape: torch.Size([1002])


**Data Augmentatino with SMOTE**

In [11]:
smote = SMOTE(random_state=42)  # You can set a random state for reproducibility
balanced_data_np, balanced_labels_np = smote.fit_resample(data.cpu().numpy(), labels.cpu().numpy())

# Convert back to PyTorch tensors
data = torch.tensor(balanced_data_np, dtype=data.dtype, device=data.device)
labels = torch.tensor(balanced_labels_np, dtype=labels.dtype, device=labels.device)
data = data.unsqueeze(1)

**Normalize Dataset**

In [12]:
# Normalize each sample in the dataset
data_mean = data.mean(dim=-1, keepdim=True)  # Mean across the sequence
data_std = data.std(dim=-1, keepdim=True)    # Standard deviation across the sequence
data = (data - data_mean) / data_std

dataset = TensorDataset(data, labels)

**Split the Dataset**

In [13]:
dataset_size = len(dataset)
train_size = int(0.8* dataset_size)
val_size = int(0.1 * dataset_size)
test_size = dataset_size - train_size - val_size

train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, val_size, test_size])

**Create DataLoaders**

In [14]:

batch_size = 16

train_loader = DataLoader(train_dataset, batch_size, shuffle = True, drop_last = True)
val_loader = DataLoader(val_dataset, batch_size, shuffle = True, drop_last = True)
test_loader = DataLoader(test_dataset, batch_size, shuffle = False, drop_last = True)

## CNN Model

In [15]:
class ConvolutionalNetwork(nn.Module):
  def __init__(self):
    super(ConvolutionalNetwork, self).__init__()
    #Describe convolutional layer and what it's doing (2 convolutional layers)
    self.conv1 = nn.Conv1d(in_channels = 1, out_channels = 16, kernel_size = 4, stride = 2, padding = 1)
    self.conv2 = nn.Conv1d(in_channels = 16, out_channels = 32, kernel_size = 8, stride = 2, padding = 1)
    self.conv3 = nn.Conv1d(in_channels = 32, out_channels =  64, kernel_size = 12, stride = 2, padding = 1)
    self.conv4 = nn.Conv1d(in_channels = 64, out_channels = 16, kernel_size = 20, stride = 2, padding = 1)
    self.conv5 = nn.Conv1d(in_channels = 16, out_channels = 16, kernel_size = 4, stride = 2, padding = 1)



    self.fc1 = nn.Linear(16 * 7,128)
    self.fc2 = nn.Linear(128,128)
    self.fc3 = nn.Linear(128,2)

  def forward(self, x):
    X = F.relu(self.conv1(x))
    X = F.relu(self.conv2(X))
    X = F.relu(self.conv3(X))
    X = F.relu(self.conv4(X))
    X = F.relu(self.conv5(X))



    X = X.view(-1, 16 * 7)

    #Fully Connected Layers
    X = F.relu(self.fc1(X))
    X = F.relu(self.fc2(X))
    X = self.fc3(X)

    return F.log_softmax(X, dim = 1)


In [16]:
#Create instance of the model
torch.manual_seed(41)
model = ConvolutionalNetwork()

In [17]:
# Loss Function Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)

## Training Loop

In [18]:


#For loops of Epochs
epochs = 20
train_losses = []
val_losses = []
train_correct= []
val_correct =[]

scheduler = StepLR(optimizer, step_size=10, gamma=0.5)


for i in range(epochs):
  trn_corr = 0
  val_corr = 0

  for data, labels in train_loader:

    y_pred = model(data)
    loss = criterion(y_pred, labels)

    predicted = torch.max(y_pred.data, 1)[1]
    batch_corr = (predicted == labels).sum()
    trn_corr += batch_corr # keep track as we go along

    #Update our paremeters
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()





  train_losses.append(loss)
  train_correct.append(trn_corr)

  with torch.no_grad():
    for data, labels in val_loader:
      y_val = model(data)
      predicted = torch.max(y_val.data, 1)[1]
      val_corr += (predicted == labels).sum()

    val_loss = criterion(y_val, labels)
  val_losses.append(val_loss.item())
  val_correct.append(val_corr.item())
  scheduler.step()
  train_acc = trn_corr.item() / len(train_loader.dataset) * 100
  val_acc = val_corr.item() / len(val_loader.dataset) * 100

  print(f"Epoch {i+1}/{epochs}")
  print(f"Train Loss: {loss.item():.4f} | Train Accuracy: {train_acc:.2f}%")
  print(f"Val Loss: {val_loss.item():.4f} | Val Accuracy: {val_acc:.2f}%")

Epoch 1/20
Train Loss: 0.6927 | Train Accuracy: 50.94%
Val Loss: 0.7233 | Val Accuracy: 51.25%
Epoch 2/20
Train Loss: 0.7310 | Train Accuracy: 54.92%
Val Loss: 0.7124 | Val Accuracy: 62.50%
Epoch 3/20
Train Loss: 0.4404 | Train Accuracy: 73.28%
Val Loss: 0.6171 | Val Accuracy: 74.38%
Epoch 4/20
Train Loss: 0.5276 | Train Accuracy: 82.27%
Val Loss: 0.3931 | Val Accuracy: 76.25%
Epoch 5/20
Train Loss: 0.2767 | Train Accuracy: 88.75%
Val Loss: 0.1690 | Val Accuracy: 83.12%
Epoch 6/20
Train Loss: 0.1744 | Train Accuracy: 92.03%
Val Loss: 0.6338 | Val Accuracy: 78.12%
Epoch 7/20
Train Loss: 0.1625 | Train Accuracy: 95.00%
Val Loss: 0.5569 | Val Accuracy: 88.75%
Epoch 8/20
Train Loss: 0.0187 | Train Accuracy: 97.27%
Val Loss: 0.6408 | Val Accuracy: 86.88%
Epoch 9/20
Train Loss: 0.0054 | Train Accuracy: 97.73%
Val Loss: 0.3652 | Val Accuracy: 88.12%
Epoch 10/20
Train Loss: 0.0154 | Train Accuracy: 99.14%
Val Loss: 1.1049 | Val Accuracy: 88.75%
Epoch 11/20
Train Loss: 0.0031 | Train Accuracy: 

## Evaluate

In [19]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Evaluate the model on the test set
test_corr = 0
test_loss = 0
all_labels = []
all_predictions = []

# Set the model to evaluation mode
model.eval()

with torch.no_grad():  # Disable gradient computation
    for data, labels in test_loader:
        y_test = model(data)  # Forward pass
        test_loss += criterion(y_test, labels).item()  # Accumulate test loss
        predicted = torch.max(y_test.data, 1)[1]  # Get predictions
        test_corr += (predicted == labels).sum()  # Count correct predictions

        # Store predictions and labels for metrics
        all_predictions.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Calculate test accuracy and average loss
test_acc = test_corr.item() / len(test_loader.dataset) * 100
test_loss /= len(test_loader)

# Calculate precision, recall, and F1-score
precision = precision_score(all_labels, all_predictions, average="weighted")
recall = recall_score(all_labels, all_predictions, average="weighted")
f1 = f1_score(all_labels, all_predictions, average="weighted")

print(f"Test Loss: {test_loss:.4f} | Test Accuracy: {test_acc:.2f}%")
print(f"Precision: {precision:.2f} | Recall: {recall:.2f} | F1-Score: {f1:.2f}")

Test Loss: 0.3161 | Test Accuracy: 95.00%
Precision: 0.95 | Recall: 0.95 | F1-Score: 0.95
