## Data exploration

In [None]:
import os
import glob
import pandas as pd
import numpy as np
from PIL import Image, ImageChops

In [None]:
def mean_images_per_sample(image_count, sample_count):
    """Return the mean number of images per sample.

    Returns NaN instead of raising ZeroDivisionError when no samples were
    found (e.g. the training directory is missing or empty).
    """
    if sample_count == 0:
        return float("nan")
    return image_count / sample_count


# Run each glob once and reuse the result for the counts below
first_sample_images = glob.glob("./training/11390/2012_01_05_17_06_01_0/*")
sample_dirs = glob.glob("./training/*/*")
all_images = glob.glob("./training/*/*/*")

print(f"{len(first_sample_images)} Images for sample 1")
print(f"{len(sample_dirs)} Different Samples")
# Avg number of images per sample
print(f"{mean_images_per_sample(len(all_images), len(sample_dirs))} Average number images per sample")

In [None]:
# Data for a single sample.
# glob returns paths in a file-system dependent order, so sort them to make
# the positional lookups into `sample` reproducible across machines.
sample = []
for i in sorted(glob.glob("./training/11390/2012_01_05_17_06_01_0/*")):
    print(i)
    sample.append(Image.open(i))

In [None]:
# List every continuum image of the first sample
continuum_pattern = "./training/11390/2012_01_05_17_06_01_0/*_continuum.jpg"
for continuum_path in glob.iglob(continuum_pattern):
    print(continuum_path)

In [None]:
# List every magnetogram image of the first sample
magnetogram_pattern = "./training/11390/2012_01_05_17_06_01_0/*_magnetogram.jpg"
for magnetogram_path in glob.iglob(magnetogram_pattern):
    print(magnetogram_path)

In [None]:
# List every image of the first sample whose filename ends in _211.jpg
band_211_pattern = "./training/11390/2012_01_05_17_06_01_0/*_211.jpg"
for band_211_path in glob.iglob(band_211_pattern):
    print(band_211_path)

In [None]:
# Author's notes on the file naming:
#   - a numeric suffix (_<number>) is the AIA wavelength band
#   - hour times seen in this sample: 05, 12, 15, 16
#   - open question: what the continuum images represent
band_304_pattern = "./training/11390/2012_01_05_17_06_01_0/*_304.jpg"
for band_304_path in glob.iglob(band_304_pattern):
    print(band_304_path)

In [None]:
# Display the image stored at index 14 of `sample` as the cell output.
# NOTE(review): which file this is depends on the order in which the paths
# were appended to `sample`.
sample[14]

In [None]:
# Keep the image at index 17 of `sample` for the inspection cells below.
# NOTE(review): which file this is depends on the order in which the paths
# were appended to `sample`.
img = sample[17]

In [None]:
# Open one magnetogram frame of the first sample directly by path and show it
# as the cell output.
Image.open("./training/11390/2012_01_05_17_06_01_0/2012-01-05T153601__magnetogram.jpg")

In [None]:
# Report the file format and the pixel mode of the selected image, one per line
print(img.format, img.mode, sep="\n")

In [None]:
# Show the selected image as the cell output
img

In [None]:
# Author's note: 8E-07 is the peak flux for this sample.
# Data Transformation 1: take the first band of the image and inspect its
# histogram.
i = img.split()[0]
# Number of entries in that band's histogram, shown as the cell output
len(i.histogram())

## Baseline network

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Define the CNN architecture
class SimpleCNN(nn.Module):
    """Baseline CNN: three conv + ReLU + max-pool stages, then two linear layers.

    The first convolution takes 10 input channels, and the first linear layer
    expects 64 * 32 * 32 features. Because each of the three pooling steps
    halves the height and width, the input must be shaped
    (batch, 10, 256, 256). The network returns 10 raw (un-activated) outputs
    per input.
    """

    def __init__(self):
        super().__init__()
        # Convolutional layer (sees a 10-channel 256x256 tensor)
        self.conv1 = nn.Conv2d(in_channels=10, out_channels=16, kernel_size=3, padding=1)
        # Convolutional layer (sees a 16-channel 128x128 tensor)
        self.conv2 = nn.Conv2d(16, 32, 3, padding=1)
        # Convolutional layer (sees a 32-channel 64x64 tensor)
        self.conv3 = nn.Conv2d(32, 64, 3, padding=1)
        # Max pooling layer: halves height and width
        self.pool = nn.MaxPool2d(2, 2)
        # Linear layer (64 * 32 * 32 = 65536 -> 512)
        self.fc1 = nn.Linear(64 * 32 * 32, 512)
        # Output layer (512 -> 10)
        self.fc2 = nn.Linear(512, 10)
        # Dropout layer (p=0.25), shared by both dropout steps in forward()
        self.dropout = nn.Dropout(0.25)

    def forward(self, x):
        """Run the network on `x` and return the raw outputs of the last layer.

        Raises:
            ValueError: if the feature map after the convolutional stages is
                not 64x32x32, i.e. the input was not 10x256x256.
        """
        # Sequence of convolutional and max pooling layers
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))
        # Without this check, view(-1, ...) below would silently merge several
        # batch elements into one row whenever the element count happens to be
        # a multiple of 64 * 32 * 32.
        if tuple(x.shape[-3:]) != (64, 32, 32):
            raise ValueError(
                "SimpleCNN expects 10x256x256 inputs; got a feature map of shape "
                "{} after the convolutional stages".format(tuple(x.shape))
            )
        # Flatten each feature map into a vector
        x = x.view(-1, 64 * 32 * 32)
        # Dropout before the hidden layer
        x = self.dropout(x)
        # Hidden layer with relu activation
        x = F.relu(self.fc1(x))
        # Dropout before the output layer
        x = self.dropout(x)
        # Output layer, no activation
        x = self.fc2(x)
        return x

# Instantiate the baseline CNN and print its layer summary
model = SimpleCNN()
print(model)


## Training Loop
A single input will be a 4x256x256x10 tensor whose dimensions are [time step x height x width x channel], where the 10 channels are the 8 AIA wavelength bands plus the continuum and magnetogram images.

In [None]:
from tqdm import tqdm
import pandas as pd
from sklearn.model_selection import KFold
import numpy as np

# Get a sorted list of all active region directories. Plain files that sit
# next to them (such as training/meta_data.csv) are skipped so they do not
# occupy a slot in the K-fold split.
all_files = sorted(p for p in glob.glob("./training/*") if os.path.isdir(p))

# Create a KFold object
kf = KFold(n_splits=10)

wavelengths = ["94","131", "171","193","211","304","335","1700","continuum","magnetogram"]
N_TIMESTEPS = 4   # frames kept per channel; shorter series are zero-padded
IMAGE_SIZE = 256  # height and width of every frame

df = pd.read_csv('training/meta_data.csv')
# Index the labels once instead of scanning the frame for every sample.
# If an id is repeated, the first row wins.
peak_flux_by_id = df.drop_duplicates("id").set_index("id")["peak_flux"]


def _load_channel_frames(sample_dir, wave):
    """Return a float32 (N_TIMESTEPS, IMAGE_SIZE, IMAGE_SIZE) tensor for one channel.

    Frames are ordered by filename, so the time axis is reproducible. Missing
    frames are left as zeros at the end of the time axis.

    Raises:
        ValueError: if the sample has more than N_TIMESTEPS frames for `wave`.
    """
    frame_paths = sorted(glob.glob(sample_dir + "/*_{}.jpg".format(wave)))
    if len(frame_paths) > N_TIMESTEPS:
        raise ValueError(
            "{} has {} '{}' frames, expected at most {}".format(
                sample_dir, len(frame_paths), wave, N_TIMESTEPS
            )
        )
    frames = torch.zeros((N_TIMESTEPS, IMAGE_SIZE, IMAGE_SIZE))
    for t, frame_path in enumerate(frame_paths):
        # Close the file as soon as the pixels have been copied out
        with Image.open(frame_path) as frame:
            frames[t] = torch.as_tensor(np.array(frame))
    return frames


def _load_sample(sample_dir):
    """Return one sample as a (N_TIMESTEPS, IMAGE_SIZE, IMAGE_SIZE, 10) tensor."""
    channels = [_load_channel_frames(sample_dir, wave) for wave in wavelengths]
    return torch.stack(channels, dim=-1)


# Collect samples in lists and stack once at the end. Repeated torch.cat
# copies the whole tensor on every sample and needs a dummy first row.
inputs = []
labels = []

# Use the KFold object to split the data into 10 folds
for fold, (train_index, val_index) in enumerate(kf.split(all_files)):
    train_files = np.array(all_files)[train_index]
    val_files = np.array(all_files)[val_index]

    # First we will loop over every active region number
    for region_dir in train_files:
        for sample_dir in sorted(glob.glob(region_dir + "/*")):
            if not os.path.isdir(sample_dir):
                continue
            # Metadata ids have the form "<region>_<sample directory name>"
            sample_id = os.path.basename(region_dir) + "_" + os.path.basename(sample_dir)
            # Look the label up first so a missing id fails before any image I/O
            labels.append(float(peak_flux_by_id[sample_id]))
            inputs.append(_load_sample(sample_dir))

    # Only the training split of the first fold is materialised for now
    break

# x: (n_samples, 4, 256, 256, 10); y: (n_samples, 1).
# Neither tensor contains a placeholder all-zero first row any more.
if inputs:
    x = torch.stack(inputs)
    y = torch.tensor(labels, dtype=torch.float64).reshape(-1, 1)
else:
    x = torch.zeros((0, N_TIMESTEPS, IMAGE_SIZE, IMAGE_SIZE, len(wavelengths)))
    y = torch.zeros((0, 1), dtype=torch.float64)

In [None]:
# Report the assembled input shape, the label tensor and the label shape, one per line
print(x.shape, y, y.shape, sep="\n")