In [1]:
import pandas as pd
from datetime import datetime
import gdal
import numpy as np
import subprocess
import glob
# from shapely.geometry import Polygon
import matplotlib.pyplot as plt
from dateutil.parser import parse
from collections import Counter
import os
import torch
from torch import Tensor
from pathlib import Path
from typing import List, Optional, Sequence, Union, Any, Callable
from torchvision.datasets.folder import default_loader
from pytorch_lightning import LightningDataModule
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from torchvision.datasets import CelebA
import zipfile

In [2]:
train_labels = pd.read_csv("train_labels.csv")
grid_metadata = pd.read_csv("grid_metadata.csv")
satellite_metadata = pd.read_csv("pm25_satellite_metadata.csv")
satellite_metadata['Date'] =  pd.to_datetime(satellite_metadata['time_end'], format='%Y-%m-%d')
test_labels = pd.read_csv("submission_format.csv")
train_labels["datetime_dt"]= pd.to_datetime(train_labels["datetime"])

In [3]:
# from .types_ import *
from torch import nn

class BaseVAE(nn.Module):
    
    def __init__(self) -> None:
        super(BaseVAE, self).__init__()

    def encode(self, input) :
        raise NotImplementedError

    def decode(self, input):
        raise NotImplementedError

    def sample(self, batch_size:int, current_device: int, **kwargs) :
        raise NotImplementedError

    def generate(self, x, **kwargs) :
        raise NotImplementedError

#     @abstractmethod
    def forward(self, *inputs) :
        pass

#     @abstractmethod
    def loss_function(self, *inputs, **kwargs) :
        pass
from typing import List
from torch.nn import functional as F
class BetaVAE(BaseVAE):

    num_iter = 0 # Global static variable to keep track of iterations

    def __init__(self,
                 in_channels: int,
                 latent_dim: int,
                 hidden_dims: List = None,
                 beta: int = 4,
                 gamma:float = 1000.,
                 max_capacity: int = 25,
                 Capacity_max_iter: int = 1e5,
                 loss_type:str = 'B',
                 **kwargs) -> None:
        super(BetaVAE, self).__init__()

        self.latent_dim = latent_dim
        self.beta = beta
        self.gamma = gamma
        self.loss_type = loss_type
        self.C_max = torch.Tensor([max_capacity])
        self.C_stop_iter = Capacity_max_iter

        modules = []
        if hidden_dims is None:
            hidden_dims = [2,4,8,16,32, 64, 128, 256, 512]

        # Build Encoder
        for h_dim in hidden_dims:
            modules.append(
                nn.Sequential(
                    nn.Conv2d(in_channels, out_channels=h_dim,
                              kernel_size= 3, stride= 2, padding  = 1),
                    nn.BatchNorm2d(h_dim),
                    nn.LeakyReLU())
            )
            in_channels = h_dim

        self.encoder = nn.Sequential(*modules)
        self.fc_mu = nn.Linear(2048, latent_dim)
        self.fc_var = nn.Linear(2048, latent_dim)


        # Build Decoder
        modules = []

        self.decoder_input = nn.Linear(latent_dim, hidden_dims[-1] * 4)

        hidden_dims.reverse()

        for i in range(len(hidden_dims) - 1):
            modules.append(
                nn.Sequential(
                    nn.ConvTranspose2d(hidden_dims[i],
                                       hidden_dims[i + 1],
                                       kernel_size=3,
                                       stride = 2,
                                       padding=1,
                                       output_padding=1),
                    nn.BatchNorm2d(hidden_dims[i + 1]),
                    nn.LeakyReLU())
            )



        self.decoder = nn.Sequential(*modules)

        self.final_layer = nn.Sequential(
                            nn.ConvTranspose2d(hidden_dims[-1],
                                               hidden_dims[-1],
                                               kernel_size=3,
                                               stride=2,
                                               padding=1,
                                               output_padding=1),
                            nn.BatchNorm2d(hidden_dims[-1]),
                            nn.LeakyReLU(),
                            nn.Conv2d(hidden_dims[-1], out_channels= 1,
                                      kernel_size= 3, padding= 1),
                            nn.Tanh())

    def encode(self, input):
        """
        Encodes the input by passing through the encoder network
        and returns the latent codes.
        :param input: (Tensor) Input tensor to encoder [N x C x H x W]
        :return: (Tensor) List of latent codes
        """
        result = self.encoder(input)
        result = torch.flatten(result, start_dim=1)

        # Split the result into mu and var components
        # of the latent Gaussian distribution
        mu = self.fc_mu(result)
        log_var = self.fc_var(result)

        return [mu, log_var]

    def decode(self, z) :
        result = self.decoder_input(z)
        result = result.view(-1, 512, 2, 2)
        result = self.decoder(result)
        result = self.final_layer(result)
        return result

    def reparameterize(self, mu, logvar) :
        """
        Will a single z be enough ti compute the expectation
        for the loss??
        :param mu: (Tensor) Mean of the latent Gaussian
        :param logvar: (Tensor) Standard deviation of the latent Gaussian
        :return:
        """
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return eps * std + mu

    def forward(self, input, **kwargs):
        mu, log_var = self.encode(input)
        z = self.reparameterize(mu, log_var)
        return  [self.decode(z), input, mu, log_var]

    def loss_function(self,
                      *args,
                      **kwargs) -> dict:
        self.num_iter += 1
        recons = args[0]
        input = args[1]
        mu = args[2]
        log_var = args[3]
        kld_weight = kwargs['M_N']  # Account for the minibatch samples from the dataset

        recons_loss =F.mse_loss(recons, input)

        kld_loss = torch.mean(-0.5 * torch.sum(1 + log_var - mu ** 2 - log_var.exp(), dim = 1), dim = 0)

        if self.loss_type == 'H': # https://openreview.net/forum?id=Sy2fzU9gl
            loss = recons_loss + self.beta * kld_weight * kld_loss
        elif self.loss_type == 'B': # https://arxiv.org/pdf/1804.03599.pdf
            self.C_max = self.C_max.to(input.device)
            C = torch.clamp(self.C_max/self.C_stop_iter * self.num_iter, 0, self.C_max.data[0])
            loss = recons_loss + self.gamma * kld_weight* (kld_loss - C).abs()
        else:
            raise ValueError('Undefined loss type.')

        return {'loss': loss, 'Reconstruction_Loss':recons_loss, 'KLD':kld_loss}

    def sample(self,
               num_samples:int,
               current_device: int, **kwargs) :
        """
        Samples from the latent space and return the corresponding
        image space map.
        :param num_samples: (Int) Number of samples
        :param current_device: (Int) Device to run the model
        :return: (Tensor)
        """
        z = torch.randn(num_samples,
                        self.latent_dim)

        z = z.to(current_device)

        samples = self.decode(z)
        return samples

    def generate(self, x, **kwargs) :
        """
        Given an input image x, returns the reconstructed image
        :param x: (Tensor) [B x C x H x W]
        :return: (Tensor) [B x C x H x W]
        """

        return self.forward(x)[0]


In [4]:
import glob
glob.glob("*.pth")

['model47.pth',
 'model55.pth',
 'uncertainty.pth',
 'oceanfraction.pth',
 'watervapor.pth',
 'Smoke.pth',
 'AOD_QA.pth',
 'cosine.pth']

In [5]:
import torch
model45 = torch.load("model47.pth")
model55 = torch.load("model55.pth")
uncertainty = torch.load("uncertainty.pth")
oceanfraction = torch.load("oceanfraction.pth")
watervapor = torch.load("watervapor.pth")
Smoke = torch.load("Smoke.pth")
AOD_QA = torch.load("AOD_QA.pth")
cosine = torch.load("cosine.pth")
# model45.eval()

In [6]:
import glob




class Clean_Data():
    
    def __init__(loc="train"):
        print("total files found " + len(glob.glob(loc+"/maiac/*/*.hdf")))
        all_files= glob.glob("test/maiac/*/*.hdf")
        self.all_data = all_files
        
    def get_all_data_for_loci(self,ds,granule_id,parellel = False):
        each = {}
        metadata = ds.GetMetadata()
        for i in range(len(ds.GetSubDatasets())):
            raster = gdal.Open(ds.GetSubDatasets()[i][0]) #grid5km:cosSZA features only
            each_raster = raster.GetMetadata()

            long_name = each_raster["long_name"]


            all_rasters = []
            if(parellel):
                listi = list(range(int(raster.GetMetadata()["ADDITIONALLAYERS"])))

                N = pool.imap(partial(rast_each_info, rasteri=raster), listi)

                print(N)
            else:
                for j in range(int(raster.GetMetadata()["ADDITIONALLAYERS"])):
                    try:
                        band = raster.GetRasterBand(j+1)
                        band_arr = band.ReadAsArray()
                        all_rasters.append(band_arr.tolist())
                    except:
                        pass
                each[long_name] = all_rasters

        each_data_f = {'file':granule_id,'data':each}
        return each_data_f

    def get_all_data_for_loci_specific(ds,granule_id,parellel = False,i=0):
        each = {}
        s = ds.GetSubDatasets()
        raster = gdal.Open(s[i][0]) #grid5km:cosSZA features only
        each_raster = raster.GetMetadata()
        long_name = each_raster["long_name"]

        all_rasters = []
        if(parellel):
            listi = list(range(int(raster.GetMetadata()["ADDITIONALLAYERS"])))

            N = pool.imap(partial(rast_each_info, rasteri=raster), listi)

            print(N)
        else:
            for j in range(int(raster.GetMetadata()["ADDITIONALLAYERS"])):
                try:
                    band = raster.GetRasterBand(j+1)
                    band_arr = band.ReadAsArray()
                    all_rasters.append(band_arr)
                except:
                    pass
            each[long_name] = all_rasters



        each_data_f = {'file':granule_id,'data':each}
        del ds
        return each_data_f

    # Opens the HDF file
    def load_data(FILEPATH):
        ds = gdal.Open(FILEPATH)
        return ds

    def format_file_path(granule_id):
        year = granule_id[:4]
        res = 'test/maiac/'+year+'/'+granule_id
        return res

    def fetch_subset(granule_id,j=0):
        formatted = format_file_path(granule_id)
        ds = load_data( formatted)
        return get_all_data_for_loci_specific(ds,granule_id,i=j)

    def get_grid_data(metadata, grid_id):
        return metadata[metadata["grid_id"] == grid_id]
    def fetch_training_features(grid_id, datetime, split):
        temp = get_grid_data(grid_metadata, grid_id)
        sat_met = fetch_satellite_meta(satellite_metadata, 
                                   datetime, 
                                   temp.iloc[0]['location'], 
                                   "maiac", 
                                   split)
        counter = 0
        features = None
        for i in range(len(sat_met)):
            counter+=1
            granule_id = sat_met.iloc[i]['granule_id']

            subset = fetch_subset(granule_id)
            if features is None:
                features = subset
            else:
                features+=subset
        return features/counter
    def fetch_training_features(grid_id, datetime, split):
        temp = get_grid_data(grid_metadata, grid_id)
        sat_met = fetch_satellite_meta(satellite_metadata, 
                                   datetime, 
                                   temp.iloc[0]['location'], 
                                   "maiac", 
                                   split)
        counter = 0
        features = None
        granule_id_loc = []
        for i in range(len(sat_met)):
            counter+=1
            granule_id = sat_met.iloc[i]['granule_id']
            granule_id_loc.append(granule_id)
        return granule_id_loc

    from multiprocessing import Pool
    import tqdm

    split = "train"
    tasks = list(zip(train_labels["grid_id"],train_labels["datetime"]))
    def get_ds_loc(i):
        feature = fetch_training_features(i[0], i[1], split)
        return feature


In [7]:
1+1

2

In [8]:
len(all_files)

2444

In [9]:
%%time
all_keys = {list(fetch_subset('20170114T032500_maiac_tpe_0.hdf',i)["data"].keys())[0]:i for i in range(12)}
all_keys

CPU times: user 988 ms, sys: 34.3 ms, total: 1.02 s
Wall time: 1.06 s


{'AOD at 0.47 micron': 0,
 'AOD at 0.55 micron': 1,
 'AOD uncertainty at 0.47 micron, range 0-4': 2,
 'Fine mode fraction for Ocean': 3,
 'Column Water Vapor (in cm liquid water)': 4,
 'AOD_QA': 5,
 'Regional background model used': 6,
 'Smoke Injection Height over local surface height, in meters': 7,
 'cosine of Solar Zenith Angle': 8,
 'cosine of View Zenith Angle': 9,
 'Relative Azimuth Angle': 10,
 'Scattering Angle': 11}

In [10]:
dict_model_track = {'AOD at 0.47 micron':model45,
 'AOD at 0.55 micron':model55,
 'AOD uncertainty at 0.47 micron, range 0-4':uncertainty,
 'Fine mode fraction for Ocean':oceanfraction,
 'Column Water Vapor (in cm liquid water)':watervapor,
 'AOD_QA':AOD_QA,
 'Smoke Injection Height over local surface height, in meters':Smoke,
 'cosine of Solar Zenith Angle':cosine}

In [11]:
one_important = list(dict_model_track.keys())
one_important


['AOD at 0.47 micron',
 'AOD at 0.55 micron',
 'AOD uncertainty at 0.47 micron, range 0-4',
 'Fine mode fraction for Ocean',
 'Column Water Vapor (in cm liquid water)',
 'AOD_QA',
 'Smoke Injection Height over local surface height, in meters',
 'cosine of Solar Zenith Angle']

In [12]:
args = [all_keys[i] for i in one_important]
args

[0, 1, 2, 3, 4, 5, 7, 8]

In [13]:
all_files = [i.split("/")[-1] for i in all_files]

In [14]:
all_files_with_index = [ (i,j) for i in all_files for j in args]

In [15]:
len(all_files_with_index)

19552

In [None]:
from multiprocessing import get_context

# pool = get_context("fork").Pool()
pool = Pool(processes=32)

all_files_d_100 = []
for x in tqdm.tqdm(pool.starmap(fetch_subset, all_files_with_index), total=len(all_files_with_index)):
    all_files_d_100.append(x)

ERROR 5: HDF4_EOS:EOS_GRID:"test/maiac/2017/20170201T194000_maiac_la_0.hdf":grid5km:cosSZA: GDALDataset::GetRasterBand(5) - Illegal band #

ERROR 5: HDF4_EOS:EOS_GRID:"test/maiac/2017/20170425T070000_maiac_dl_0.hdf":grid5km:cosSZA: GDALDataset::GetRasterBand(4) - Illegal band #

ERROR 5: HDF4_EOS:EOS_GRID:"test/maiac/2017/20170211T021000_maiac_tpe_0.hdf":grid1km:AOD_QA: GDALDataset::GetRasterBand(4) - Illegal band #

ERROR 5: HDF4_EOS:EOS_GRID:"test/maiac/2017/20170211T021000_maiac_tpe_0.hdf":grid1km:AOD_QA: GDALDataset::GetRasterBand(5) - Illegal band #

ERROR 5: HDF4_EOS:EOS_GRID:"test/maiac/2017/20170325T025000_maiac_tpe_1.hdf":grid1km:Optical_Depth_047: GDALDataset::GetRasterBand(4) - Illegal band #

ERROR 5: HDF4_EOS:EOS_GRID:"test/maiac/2017/20170325T025000_maiac_tpe_1.hdf":grid1km:Optical_Depth_047: GDALDataset::GetRasterBand(5) - Illegal band #

ERROR 5: HDF4_EOS:EOS_GRID:"test/maiac/2017/20170325T025000_maiac_tpe_1.hdf":grid1km:Optical_Depth_047: GDALDataset::GetRasterBand(6) 

In [20]:
data_model_track = {'AOD at 0.47 micron':[],
 'AOD at 0.55 micron':[],
 'AOD uncertainty at 0.47 micron, range 0-4':[],
 'Fine mode fraction for Ocean':[],
 'Column Water Vapor (in cm liquid water)':[],
 'AOD_QA':[],
 'Smoke Injection Height over local surface height, in meters':[],
 'cosine of Solar Zenith Angle':[]}

In [21]:
hashes_list = {}

In [22]:
for i in all_files_d_100:
    each_d = i["data"]
    each_d_k = list(i["data"].keys())[0]
    [data_model_track[each_d_k].append(k) for k in each_d[each_d_k] ]

In [23]:
vals =all_files_d_100[1]["data"]['AOD at 0.55 micron'][0]
hash(vals[0].tobytes())

-116340482115425593

In [24]:
class SatelliteDataset_XONLY(Dataset):
    def __init__(self, file_list, transform=None):
        self.file_list = file_list
        self.transform = transform
        self.max = max([i.max() for i in file_list])

    def __len__(self):
        self.filelength = len(self.file_list)
        return self.filelength

    def __getitem__(self, idx):
        img = self.file_list[idx]
        img = np.where(img <0 , 0, img)/self.max
        img_transformed = self.transform(img)
        
        return img_transformed.float() , 0.0

In [25]:
train_transforms = transforms.Compose([
                                              transforms.ToTensor(),transforms.Resize(1024),])

In [26]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [27]:
%%time
data_model_track_datasets = {}
for i,j in data_model_track.items():
    dti = SatelliteDataset_XONLY(j,train_transforms)
    model = dict_model_track[i]
    loader = torch.utils.data.DataLoader(dti, batch_size=500, num_workers=8)
    model.to(device)
    all_predictions = []
    with torch.no_grad():
        for batch in loader:
            all_predictions.append(model45.encode(batch[0].to(device)))
    data_model_track_datasets[i] = all_predictions
    print(i, " done")
    

AOD at 0.47 micron  done
AOD at 0.55 micron  done
AOD uncertainty at 0.47 micron, range 0-4  done
Fine mode fraction for Ocean  done
Column Water Vapor (in cm liquid water)  done
AOD_QA  done
Smoke Injection Height over local surface height, in meters  done
cosine of Solar Zenith Angle  done
CPU times: user 1min 48s, sys: 3min 40s, total: 5min 29s
Wall time: 10min 25s


In [28]:
data_model_track_datasets_f = {}

In [29]:
data_model_track_datasets.keys()

dict_keys(['AOD at 0.47 micron', 'AOD at 0.55 micron', 'AOD uncertainty at 0.47 micron, range 0-4', 'Fine mode fraction for Ocean', 'Column Water Vapor (in cm liquid water)', 'AOD_QA', 'Smoke Injection Height over local surface height, in meters', 'cosine of Solar Zenith Angle'])

In [30]:
each = data_model_track_datasets['AOD at 0.47 micron']

In [31]:
torch.cat([i[0] for i in each])

tensor([[-0.5503,  0.1146, -0.1166,  ..., -0.3067, -0.0568, -0.4609],
        [-0.5504,  0.1146, -0.1166,  ..., -0.3067, -0.0568, -0.4609],
        [-0.5504,  0.1146, -0.1166,  ..., -0.3067, -0.0568, -0.4609],
        ...,
        [-0.5648,  0.1149, -0.1150,  ..., -0.3107, -0.0569, -0.4691],
        [-0.5642,  0.1149, -0.1151,  ..., -0.3106, -0.0569, -0.4688],
        [-0.5647,  0.1149, -0.1150,  ..., -0.3107, -0.0569, -0.4691]],
       device='cuda:0')

In [32]:
for i in data_model_track_datasets.keys():
    each = data_model_track_datasets[i]
    mean = torch.cat([i[0] for i in each])
    var = torch.cat([i[1] for i in each])
    each_d = {}
    each_d["mean"] = mean.cpu().numpy()
    each_d["var"] = var.cpu().numpy()
    data_model_track_datasets_f[i] = each_d

In [33]:
data_model_track_datasets_f

{'AOD at 0.47 micron': {'mean': array([[-0.5503041 ,  0.11458613, -0.11661965, ..., -0.3066923 ,
          -0.05679096, -0.4608949 ],
         [-0.55036366,  0.11458879, -0.11661395, ..., -0.30671048,
          -0.05679384, -0.46093073],
         [-0.5503553 ,  0.11458831, -0.11661317, ..., -0.30670786,
          -0.05679232, -0.46092537],
         ...,
         [-0.56475025,  0.11494646, -0.11504363, ..., -0.3107498 ,
          -0.05692383, -0.46913975],
         [-0.56423956,  0.11494236, -0.11506341, ..., -0.3106317 ,
          -0.05692936, -0.46883395],
         [-0.56471026,  0.11494301, -0.11504397, ..., -0.31073698,
          -0.0569191 , -0.46911243]], dtype=float32),
  'var': array([[-0.22315677,  0.0242437 , -0.00243355, ...,  0.3339639 ,
          -0.01363353, -0.44978496],
         [-0.22317033,  0.02423966, -0.00244427, ...,  0.33398542,
          -0.01363161, -0.44983464],
         [-0.22317082,  0.02423951, -0.00244258, ...,  0.33398318,
          -0.01363004, -0.4498290

In [34]:
# satellite_metadata

In [35]:
hash(data_model_track_datasets_f['AOD at 0.47 micron']["mean"][3].tobytes())

6683762572019853390

In [36]:
data_model_track_datasets_f['AOD at 0.47 micron']["mean"]

array([[-0.5503041 ,  0.11458613, -0.11661965, ..., -0.3066923 ,
        -0.05679096, -0.4608949 ],
       [-0.55036366,  0.11458879, -0.11661395, ..., -0.30671048,
        -0.05679384, -0.46093073],
       [-0.5503553 ,  0.11458831, -0.11661317, ..., -0.30670786,
        -0.05679232, -0.46092537],
       ...,
       [-0.56475025,  0.11494646, -0.11504363, ..., -0.3107498 ,
        -0.05692383, -0.46913975],
       [-0.56423956,  0.11494236, -0.11506341, ..., -0.3106317 ,
        -0.05692936, -0.46883395],
       [-0.56471026,  0.11494301, -0.11504397, ..., -0.31073698,
        -0.0569191 , -0.46911243]], dtype=float32)

In [37]:
len(data_model_track['AOD at 0.47 micron'])

8850

In [38]:
data_model_track['AOD at 0.47 micron']

[array([[-28672, -28672, -28672, ..., -28672, -28672, -28672],
        [-28672, -28672, -28672, ..., -28672, -28672, -28672],
        [-28672, -28672, -28672, ..., -28672, -28672, -28672],
        ...,
        [-28672, -28672, -28672, ..., -28672, -28672, -28672],
        [-28672, -28672, -28672, ..., -28672, -28672, -28672],
        [-28672, -28672, -28672, ..., -28672, -28672, -28672]], dtype=int16),
 array([[   713,    739,    763, ..., -28672, -28672, -28672],
        [   731,    723,    702, ..., -28672, -28672, -28672],
        [   804,    762,    767, ..., -28672, -28672, -28672],
        ...,
        [-28672, -28672, -28672, ..., -28672, -28672, -28672],
        [-28672, -28672, -28672, ..., -28672, -28672, -28672],
        [-28672, -28672, -28672, ..., -28672, -28672, -28672]], dtype=int16),
 array([[-28672, -28672, -28672, ..., -28672, -28672, -28672],
        [-28672, -28672, -28672, ..., -28672, -28672, -28672],
        [-28672, -28672, -28672, ..., -28672, -28672, -28672],

In [39]:
all_files_d_100[0]["data"]['AOD at 0.47 micron'][0]

array([[-28672, -28672, -28672, ..., -28672, -28672, -28672],
       [-28672, -28672, -28672, ..., -28672, -28672, -28672],
       [-28672, -28672, -28672, ..., -28672, -28672, -28672],
       ...,
       [-28672, -28672, -28672, ..., -28672, -28672, -28672],
       [-28672, -28672, -28672, ..., -28672, -28672, -28672],
       [-28672, -28672, -28672, ..., -28672, -28672, -28672]], dtype=int16)

In [40]:
all_files_d_100[0]["data"]['AOD at 0.47 micron'][0]

array([[-28672, -28672, -28672, ..., -28672, -28672, -28672],
       [-28672, -28672, -28672, ..., -28672, -28672, -28672],
       [-28672, -28672, -28672, ..., -28672, -28672, -28672],
       ...,
       [-28672, -28672, -28672, ..., -28672, -28672, -28672],
       [-28672, -28672, -28672, ..., -28672, -28672, -28672],
       [-28672, -28672, -28672, ..., -28672, -28672, -28672]], dtype=int16)

In [41]:
data_model_track['AOD at 0.47 micron'][0] ==all_files_d_100[0]["data"]['AOD at 0.47 micron'][0]

array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ...,
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]])

In [42]:
np.array_equal(data_model_track['AOD at 0.47 micron'][0], all_files_d_100[0]["data"]['AOD at 0.47 micron'][0])

True

In [43]:
data_model_track_datasets.keys()

dict_keys(['AOD at 0.47 micron', 'AOD at 0.55 micron', 'AOD uncertainty at 0.47 micron, range 0-4', 'Fine mode fraction for Ocean', 'Column Water Vapor (in cm liquid water)', 'AOD_QA', 'Smoke Injection Height over local surface height, in meters', 'cosine of Solar Zenith Angle'])

In [44]:
hsh_dict47= {hash(i.tobytes()):[j,k] for i,j,k in zip(data_model_track['AOD at 0.47 micron'],
                                    data_model_track_datasets_f['AOD at 0.47 micron']["mean"],
                                    data_model_track_datasets_f['AOD at 0.47 micron']["var"])}

In [45]:
hsh_dict55= {hash(i.tobytes()):[j,k] for i,j,k in zip(data_model_track['AOD at 0.55 micron'],
                                    data_model_track_datasets_f['AOD at 0.55 micron']["mean"],
                                    data_model_track_datasets_f['AOD at 0.55 micron']["var"])}

In [46]:
hsh_uncertain= {hash(i.tobytes()):[j,k] for i,j,k in zip(data_model_track['AOD uncertainty at 0.47 micron, range 0-4'],
                                    data_model_track_datasets_f['AOD uncertainty at 0.47 micron, range 0-4']["mean"],
                                    data_model_track_datasets_f['AOD uncertainty at 0.47 micron, range 0-4']["var"])}

In [47]:
fraction_uncertain= {hash(i.tobytes()):[j,k] for i,j,k in zip(data_model_track['Fine mode fraction for Ocean'],
                                    data_model_track_datasets_f['Fine mode fraction for Ocean']["mean"],
                                    data_model_track_datasets_f['Fine mode fraction for Ocean']["var"])}

In [48]:
wv_uncertain= {hash(i.tobytes()):[j,k] for i,j,k in zip(data_model_track['Column Water Vapor (in cm liquid water)'],
                                    data_model_track_datasets_f['Column Water Vapor (in cm liquid water)']["mean"],
                                    data_model_track_datasets_f['Column Water Vapor (in cm liquid water)']["var"])}

In [49]:
AOD_QA_uncertain= {hash(i.tobytes()):[j,k] for i,j,k in zip(data_model_track['AOD_QA'],
                                    data_model_track_datasets_f['AOD_QA']["mean"],
                                    data_model_track_datasets_f['AOD_QA']["var"])}

In [50]:
smoke_uncertain= {hash(i.tobytes()):[j,k] for i,j,k in zip(data_model_track['Smoke Injection Height over local surface height, in meters'],
                                    data_model_track_datasets_f['Smoke Injection Height over local surface height, in meters']["mean"],
                                    data_model_track_datasets_f['Smoke Injection Height over local surface height, in meters']["var"])}

In [51]:
cosine = {hash(i.tobytes()):[j,k] for i,j,k in zip(data_model_track['cosine of Solar Zenith Angle'],
                                    data_model_track_datasets_f['cosine of Solar Zenith Angle']["mean"],
                                    data_model_track_datasets_f['cosine of Solar Zenith Angle']["var"])}

In [52]:
data_file= {
    'AOD at 0.47 micron':hsh_dict47,
    'AOD at 0.55 micron':hsh_dict55,
    'AOD uncertainty at 0.47 micron, range 0-4':hsh_uncertain,
    'Fine mode fraction for Ocean':fraction_uncertain,
    'Column Water Vapor (in cm liquid water)':wv_uncertain,
    'Smoke Injection Height over local surface height, in meters':smoke_uncertain,
    'cosine of Solar Zenith Angle':cosine,
    'AOD_QA':AOD_QA_uncertain
    
}

In [53]:
final_processed = []
for i in all_files_d_100:
    each = {}
    data = i["data"]
    data_k = list(data.keys())[0]
    which_file = data_file[data_k]
    
    resiii = []
    for k in data[data_k]:
        try:
            resiii.append( np.concatenate(which_file[hash(k.tobytes())]))
        except:
            pass
    
    each[i["file"]] = { data_k:np.average(resiii,axis=0)}
    final_processed.append(each)
#     break

In [None]:
all

In [54]:
final_processed[0]

{'20170114T032500_maiac_tpe_0.hdf': {'AOD at 0.47 micron': array([-0.55729836,  0.11476561, -0.11583881,  0.25725174, -0.04947025,
          0.24351715, -0.58955777, -0.1076078 , -0.3107169 , -0.04964919,
          0.1328803 , -0.12924325,  0.1518136 ,  0.5606433 ,  0.33208936,
          0.5449922 ,  0.2859234 , -0.00322788, -0.2632717 , -0.03959442,
         -0.10357113, -0.41908753, -0.13195848, -0.16854578,  0.0342277 ,
         -0.00110735,  0.08439159, -0.30841604,  0.10725187, -0.4547249 ,
          0.3085395 ,  0.10951454, -0.23351951, -0.53956723,  0.2769993 ,
         -0.24157175,  0.05518243, -0.24523155,  0.24229145, -0.08235653,
          0.3517058 , -0.53752834, -0.64649963,  0.1451605 , -0.19046006,
          0.17286982,  0.13967209,  0.6337421 , -0.18471625, -0.24463296,
          0.43111748, -0.22125351,  0.36255595, -0.40556288, -0.4743403 ,
          0.22696164,  0.18267138, -0.25730968,  0.5299843 , -0.58268595,
         -0.35649794, -0.12749434, -0.11048275, -0.0605

In [55]:
len(final_processed)

19552

In [56]:
all_names = {i:{} for i in list(set([list(i.keys())[0] for i in final_processed]))}

In [57]:
for i in final_processed:
    file = list(i.keys())[0]
    j = (i[file])
    typei = list(j.keys())[0]
    all_names[file][typei]=j[typei]
            
    

In [61]:
all_names['20170802T023500_maiac_tpe_0.hdf'].keys()

dict_keys(['AOD at 0.47 micron', 'AOD at 0.55 micron', 'AOD uncertainty at 0.47 micron, range 0-4', 'Fine mode fraction for Ocean', 'Column Water Vapor (in cm liquid water)', 'AOD_QA', 'Smoke Injection Height over local surface height, in meters', 'cosine of Solar Zenith Angle'])

Process ForkPoolWorker-118:
Process ForkPoolWorker-119:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/opt/conda/lib/python3.8/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/opt/conda/lib/python3.8/multiprocessing/queues.py", line 355, in get
    with self._rlock:
  File "/opt/conda/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.8/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
  File "/opt/conda/lib/python3.8/multiprocessing/pool.py", line 114, in worker
    task = get()
  Fil

In [62]:
# all_names.keys()

Process ForkPoolWorker-121:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/opt/conda/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.8/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/opt/conda/lib/python3.8/multiprocessing/queues.py", line 355, in get
    with self._rlock:
  File "/opt/conda/lib/python3.8/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
KeyboardInterrupt


In [None]:
# all_names

In [65]:
test_labels

Unnamed: 0,datetime,grid_id,value
0,2017-01-07T16:00:00Z,1X116,0.0
1,2017-01-07T16:00:00Z,9Q6TA,0.0
2,2017-01-07T16:00:00Z,KW43U,0.0
3,2017-01-07T16:00:00Z,VR4WG,0.0
4,2017-01-07T16:00:00Z,XJF9O,0.0
...,...,...,...
13499,2021-08-24T08:00:00Z,QJHW4,0.0
13500,2021-08-24T08:00:00Z,VBLD0,0.0
13501,2021-08-24T08:00:00Z,WT52R,0.0
13502,2021-08-24T08:00:00Z,ZP1FZ,0.0


In [63]:
import pickle


with open('all_d_test.pickle', 'wb') as handle:
    pickle.dump(all_names, handle, protocol=pickle.HIGHEST_PROTOCOL)

Process ForkPoolWorker-127:
Process ForkPoolWorker-126:
Process ForkPoolWorker-128:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/opt/conda/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/opt/conda/lib/python3.8/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/opt/conda/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.8/mult

In [64]:
train_labels

Unnamed: 0,datetime,grid_id,value,datetime_dt
0,2018-02-01T08:00:00Z,3S31A,11.400000,2018-02-01 08:00:00+00:00
1,2018-02-01T08:00:00Z,A2FBI,17.000000,2018-02-01 08:00:00+00:00
2,2018-02-01T08:00:00Z,DJN0F,11.100000,2018-02-01 08:00:00+00:00
3,2018-02-01T08:00:00Z,E5P9N,22.100000,2018-02-01 08:00:00+00:00
4,2018-02-01T08:00:00Z,FRITQ,29.800000,2018-02-01 08:00:00+00:00
...,...,...,...,...
34307,2020-12-31T18:30:00Z,P8JA5,368.611111,2020-12-31 18:30:00+00:00
34308,2020-12-31T18:30:00Z,PW0JT,294.425000,2020-12-31 18:30:00+00:00
34309,2020-12-31T18:30:00Z,VXNN3,224.857143,2020-12-31 18:30:00+00:00
34310,2020-12-31T18:30:00Z,VYH7U,287.000000,2020-12-31 18:30:00+00:00


In [69]:
test_labels["datetime_dt"] = pd.to_datetime(test_labels["datetime"])

In [70]:
partial = grid_metadata[["location","grid_id"]]

In [71]:
train_labels = test_labels.merge(partial,on="grid_id")
train_labels.head()

Unnamed: 0,datetime,grid_id,value,datetime_dt,location
0,2017-01-07T16:00:00Z,1X116,0.0,2017-01-07 16:00:00+00:00,Taipei
1,2017-01-08T16:00:00Z,1X116,0.0,2017-01-08 16:00:00+00:00,Taipei
2,2017-01-09T16:00:00Z,1X116,0.0,2017-01-09 16:00:00+00:00,Taipei
3,2017-01-10T16:00:00Z,1X116,0.0,2017-01-10 16:00:00+00:00,Taipei
4,2017-01-11T16:00:00Z,1X116,0.0,2017-01-11 16:00:00+00:00,Taipei


In [72]:
train_labels["date"] = [i.date() for i in train_labels["datetime_dt"]]

In [73]:
satellite_metadata["time_start"]= pd.to_datetime(satellite_metadata["time_start"])

In [74]:
satellite_metadata["time_start"][0].date()

datetime.date(2018, 2, 1)

In [75]:
satellite_metadata["date"] = [i.date() for i in satellite_metadata["time_start"]]

In [76]:
satellite_metadata = satellite_metadata[satellite_metadata["product"]=="maiac"]

In [77]:
locs = satellite_metadata[["granule_id","location","date"]]

In [78]:
set(locs["location"])

{'dl', 'la', 'tpe'}

In [79]:
set(train_labels["location"])

{'Delhi', 'Los Angeles (SoCAB)', 'Taipei'}

In [80]:
loci = {'dl':'Delhi', 'la':'Los Angeles (SoCAB)', 'tpe':'Taipei'}

In [81]:
locs["location"]= [loci[i] for i in locs["location"]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  locs["location"]= [loci[i] for i in locs["location"]]


In [82]:
locs

Unnamed: 0,granule_id,location,date
0,20180201T191000_maiac_la_0.hdf,Los Angeles (SoCAB),2018-02-01
1,20180202T195000_maiac_la_0.hdf,Los Angeles (SoCAB),2018-02-02
2,20180203T203000_maiac_la_0.hdf,Los Angeles (SoCAB),2018-02-03
3,20180204T194000_maiac_la_0.hdf,Los Angeles (SoCAB),2018-02-04
4,20180205T202000_maiac_la_0.hdf,Los Angeles (SoCAB),2018-02-05
...,...,...,...
6699,20210819T065000_maiac_dl_0.hdf,Delhi,2021-08-19
6700,20210820T055500_maiac_dl_0.hdf,Delhi,2021-08-20
6701,20210821T064000_maiac_dl_0.hdf,Delhi,2021-08-21
6702,20210822T054500_maiac_dl_0.hdf,Delhi,2021-08-22


In [83]:
finalish = train_labels.merge(locs, on=["date","location"],how='left')

In [84]:
finalish.head()

Unnamed: 0,datetime,grid_id,value,datetime_dt,location,date,granule_id
0,2017-01-07T16:00:00Z,1X116,0.0,2017-01-07 16:00:00+00:00,Taipei,2017-01-07,20170107T032000_maiac_tpe_0.hdf
1,2017-01-07T16:00:00Z,1X116,0.0,2017-01-07 16:00:00+00:00,Taipei,2017-01-07,20170107T032000_maiac_tpe_1.hdf
2,2017-01-08T16:00:00Z,1X116,0.0,2017-01-08 16:00:00+00:00,Taipei,2017-01-08,20170108T022500_maiac_tpe_0.hdf
3,2017-01-08T16:00:00Z,1X116,0.0,2017-01-08 16:00:00+00:00,Taipei,2017-01-08,20170108T040000_maiac_tpe_0.hdf
4,2017-01-09T16:00:00Z,1X116,0.0,2017-01-09 16:00:00+00:00,Taipei,2017-01-09,20170109T030500_maiac_tpe_0.hdf


In [85]:
# np.concatenate(list(all_names['20190316T065000_maiac_dl_0.hdf'].values()))

In [91]:
all_names['20170802T023500_maiac_tpe_0.hdf']

{'AOD at 0.47 micron': array([-0.55808395,  0.11477984, -0.11574371,  0.25728688, -0.04940792,
         0.24366061, -0.59033656, -0.1076102 , -0.31095007, -0.04964493,
         0.13294367, -0.12928978,  0.1517825 ,  0.5612682 ,  0.3323322 ,
         0.5458088 ,  0.28618056, -0.00323435, -0.26347253, -0.0395557 ,
        -0.10360149, -0.4194537 , -0.13202445, -0.16856548,  0.03422196,
        -0.00107255,  0.08438057, -0.30862343,  0.107225  , -0.45525014,
         0.30880165,  0.10960071, -0.23357502, -0.54026294,  0.2772022 ,
        -0.24178143,  0.05521212, -0.24531728,  0.24244827, -0.08230755,
         0.35203418, -0.53824776, -0.6474627 ,  0.14517492, -0.19058071,
         0.17290577,  0.13969252,  0.63471156, -0.18477151, -0.24482201,
         0.431629  , -0.22133654,  0.36289605, -0.40596813, -0.47490892,
         0.22711062,  0.18269077, -0.25740826,  0.5306582 , -0.5834872 ,
        -0.35678345, -0.12754077, -0.11051598, -0.06063333,  0.09181044,
        -0.38204738,  0.28566

In [None]:
all_names['20190316T065000_maiac_dl_0.hdf']

In [92]:
ordered = list(all_names['20170802T023500_maiac_tpe_0.hdf'].keys())

In [94]:
eachiuiii = all_names['20170802T023500_maiac_tpe_0.hdf']

In [95]:
np.concatenate(list(eachiuiii.values())).shape

(2048,)

In [96]:
finalish_dict = {}

In [100]:

for i,eachiuiii in all_names.items():
#     finalish_dict[i]
    re = np.concatenate([eachiuiii[i] for i in ordered])
    eacii = {}
#     [eacii[n]=j for n,j in zip(cols,re)]
    for n,j in zip(cols,re):
        eacii[n]=j
    finalish_dict[i]=eacii
    
    

In [103]:
pd.DataFrame(finalish_dict).T.reset_index().to_csv("processed_data_test.csv")

In [104]:
# np.concatenate([eachiuiii[i] for i in ordered]).shape

In [98]:
cols = [ i+"_"+str(j) for i in ordered for j in range(256)]

In [105]:
finii = pd.DataFrame(finalish_dict).T.reset_index()

In [106]:
finii

Unnamed: 0,index,AOD at 0.47 micron_0,AOD at 0.47 micron_1,AOD at 0.47 micron_2,AOD at 0.47 micron_3,AOD at 0.47 micron_4,AOD at 0.47 micron_5,AOD at 0.47 micron_6,AOD at 0.47 micron_7,AOD at 0.47 micron_8,...,cosine of Solar Zenith Angle_246,cosine of Solar Zenith Angle_247,cosine of Solar Zenith Angle_248,cosine of Solar Zenith Angle_249,cosine of Solar Zenith Angle_250,cosine of Solar Zenith Angle_251,cosine of Solar Zenith Angle_252,cosine of Solar Zenith Angle_253,cosine of Solar Zenith Angle_254,cosine of Solar Zenith Angle_255
0,20170802T023500_maiac_tpe_0.hdf,-0.558084,0.114780,-0.115744,0.257287,-0.049408,0.243661,-0.590337,-0.107610,-0.310950,...,0.501047,0.136191,0.164090,-1.337607,-0.239585,1.760710,0.255791,-0.126079,0.189282,0.503428
1,20170121T033000_maiac_tpe_1.hdf,-0.553782,0.114674,-0.116230,0.256980,-0.049639,0.242908,-0.586203,-0.107516,-0.310056,...,0.256331,0.074706,0.051141,-0.882760,-0.089293,0.688563,0.132961,0.027860,0.097618,0.209097
2,20170710T055000_maiac_dl_0.hdf,-0.558843,0.114796,-0.115658,0.257346,-0.049385,0.243791,-0.591058,-0.107625,-0.311074,...,0.303935,0.084806,0.064546,-0.822369,-0.099036,1.033163,0.154799,0.025009,0.107464,0.161448
3,20210718T033500_maiac_tpe_0.hdf,-0.556461,0.114700,-0.116064,0.256991,-0.049471,0.243354,-0.588749,-0.107510,-0.310704,...,0.311173,0.121905,0.035544,-1.612222,-0.237681,1.771729,0.138020,-0.063127,0.080115,0.398185
4,20210818T025000_maiac_tpe_1.hdf,-0.564543,0.114949,-0.115045,0.257812,-0.049129,0.244763,-0.596463,-0.107791,-0.312078,...,0.353503,0.132627,0.077534,-1.031832,-0.131718,1.438628,0.167540,0.036466,0.120251,0.160037
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2439,20170228T025500_maiac_tpe_0.hdf,-0.559287,0.114802,-0.115671,0.257437,-0.049387,0.243854,-0.591509,-0.107667,-0.311124,...,0.292871,0.107643,-0.010598,-1.875918,-0.275423,2.112602,0.101750,-0.100719,0.044907,0.452276
2440,20170322T202000_maiac_la_0.hdf,-0.561326,0.115202,-0.116736,0.258319,-0.048782,0.243426,-0.598564,-0.109822,-0.315572,...,0.445857,0.147389,0.178899,-0.910460,-0.144588,0.942284,0.252768,0.029451,0.207607,0.254454
2441,20171201T023000_maiac_tpe_0.hdf,-0.549250,0.114569,-0.116915,0.256440,-0.049804,0.242080,-0.581849,-0.107376,-0.309292,...,0.628713,0.151977,0.155291,-2.196995,-0.431727,3.384795,0.281848,-0.289001,0.187811,0.739742
2442,20171130T195000_maiac_la_0.hdf,-0.479309,0.080623,-0.082579,0.234267,-0.053488,0.219989,-0.551631,-0.118143,-0.282451,...,0.244460,0.104776,0.014619,-0.892199,-0.018672,0.694771,0.079163,0.109386,0.046578,0.123763


In [107]:
finii = finii.rename(columns={'index': 'granule_id'})

In [112]:
finalisiii = finalish.merge(finii, on ="granule_id")

In [109]:
finalisiii = finalisiii.groupby(["location","datetime"]).mean().reset_index()

In [113]:
finalisiii

Unnamed: 0,datetime,grid_id,value,datetime_dt,location,date,granule_id,AOD at 0.47 micron_0,AOD at 0.47 micron_1,AOD at 0.47 micron_2,...,cosine of Solar Zenith Angle_246,cosine of Solar Zenith Angle_247,cosine of Solar Zenith Angle_248,cosine of Solar Zenith Angle_249,cosine of Solar Zenith Angle_250,cosine of Solar Zenith Angle_251,cosine of Solar Zenith Angle_252,cosine of Solar Zenith Angle_253,cosine of Solar Zenith Angle_254,cosine of Solar Zenith Angle_255
0,2017-01-07T16:00:00Z,1X116,0.0,2017-01-07 16:00:00+00:00,Taipei,2017-01-07,20170107T032000_maiac_tpe_0.hdf,-0.552611,0.114593,-0.116292,...,0.514868,0.139191,0.183938,-1.667788,-0.356834,2.620512,0.295454,-0.233992,0.197363,0.768304
1,2017-01-07T16:00:00Z,9Q6TA,0.0,2017-01-07 16:00:00+00:00,Taipei,2017-01-07,20170107T032000_maiac_tpe_0.hdf,-0.552611,0.114593,-0.116292,...,0.514868,0.139191,0.183938,-1.667788,-0.356834,2.620512,0.295454,-0.233992,0.197363,0.768304
2,2017-01-07T16:00:00Z,KW43U,0.0,2017-01-07 16:00:00+00:00,Taipei,2017-01-07,20170107T032000_maiac_tpe_0.hdf,-0.552611,0.114593,-0.116292,...,0.514868,0.139191,0.183938,-1.667788,-0.356834,2.620512,0.295454,-0.233992,0.197363,0.768304
3,2017-01-07T16:00:00Z,VR4WG,0.0,2017-01-07 16:00:00+00:00,Taipei,2017-01-07,20170107T032000_maiac_tpe_0.hdf,-0.552611,0.114593,-0.116292,...,0.514868,0.139191,0.183938,-1.667788,-0.356834,2.620512,0.295454,-0.233992,0.197363,0.768304
4,2017-01-07T16:00:00Z,XJF9O,0.0,2017-01-07 16:00:00+00:00,Taipei,2017-01-07,20170107T032000_maiac_tpe_0.hdf,-0.552611,0.114593,-0.116292,...,0.514868,0.139191,0.183938,-1.667788,-0.356834,2.620512,0.295454,-0.233992,0.197363,0.768304
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15994,2021-06-05T18:30:00Z,YHOPV,0.0,2021-06-05 18:30:00+00:00,Delhi,2021-06-05,20210605T071000_maiac_dl_0.hdf,-0.561104,0.114857,-0.115555,...,0.282401,0.094115,0.101234,-0.598972,-0.056394,-0.085390,0.155456,0.061431,0.139627,0.156484
15995,2021-06-05T18:30:00Z,ZF3ZW,0.0,2021-06-05 18:30:00+00:00,Delhi,2021-06-05,20210605T071000_maiac_dl_0.hdf,-0.561104,0.114857,-0.115555,...,0.282401,0.094115,0.101234,-0.598972,-0.056394,-0.085390,0.155456,0.061431,0.139627,0.156484
15996,2021-06-05T18:30:00Z,GVQXS,0.0,2021-06-05 18:30:00+00:00,Delhi,2021-06-05,20210605T071000_maiac_dl_0.hdf,-0.561104,0.114857,-0.115555,...,0.282401,0.094115,0.101234,-0.598972,-0.056394,-0.085390,0.155456,0.061431,0.139627,0.156484
15997,2021-06-17T18:30:00Z,A7UCQ,0.0,2021-06-17 18:30:00+00:00,Delhi,2021-06-17,20210617T060000_maiac_dl_0.hdf,-0.556475,0.114697,-0.116058,...,0.364079,0.141253,0.068522,-1.220976,-0.168101,1.756613,0.158490,0.021657,0.110188,0.200711


In [114]:
finalisiii.to_csv("processed_data_test.csv")

In [354]:
finalisiii.to_csv("processed_data_l.csv")

In [335]:
finii.to_csv("processed_data.csv")

In [283]:
train_labels

Unnamed: 0,datetime,grid_id,value,datetime_dt,location,date
0,2018-02-01T08:00:00Z,3S31A,11.400000,2018-02-01 08:00:00+00:00,Los Angeles (SoCAB),2018-02-01
1,2018-02-03T08:00:00Z,3S31A,27.200000,2018-02-03 08:00:00+00:00,Los Angeles (SoCAB),2018-02-03
2,2018-02-04T08:00:00Z,3S31A,19.844444,2018-02-04 08:00:00+00:00,Los Angeles (SoCAB),2018-02-04
3,2018-02-05T08:00:00Z,3S31A,10.600000,2018-02-05 08:00:00+00:00,Los Angeles (SoCAB),2018-02-05
4,2018-02-06T08:00:00Z,3S31A,20.300000,2018-02-06 08:00:00+00:00,Los Angeles (SoCAB),2018-02-06
...,...,...,...,...,...,...
34307,2020-12-27T08:00:00Z,VBLD0,14.672222,2020-12-27 08:00:00+00:00,Los Angeles (SoCAB),2020-12-27
34308,2020-12-28T08:00:00Z,VBLD0,13.492683,2020-12-28 08:00:00+00:00,Los Angeles (SoCAB),2020-12-28
34309,2020-12-29T08:00:00Z,VBLD0,19.248485,2020-12-29 08:00:00+00:00,Los Angeles (SoCAB),2020-12-29
34310,2020-12-30T08:00:00Z,VBLD0,26.918421,2020-12-30 08:00:00+00:00,Los Angeles (SoCAB),2020-12-30


In [224]:
with open('all_d.pickle', 'rb') as handle:
    b = pickle.load(handle)


In [228]:
b["20180219T023000_maiac_tpe_0.hdf"]['AOD at 0.47 micron']

array([-5.51183879e-01,  1.14624895e-01, -1.16517939e-01,  2.56809831e-01,
       -4.97524142e-02,  2.42434695e-01, -5.83747983e-01, -1.07470028e-01,
       -3.09596211e-01, -4.95937616e-02,  1.32482275e-01, -1.29315153e-01,
        1.51919767e-01,  5.55845976e-01,  3.30459774e-01,  5.38745105e-01,
        2.83685893e-01, -2.99965963e-03, -2.61916280e-01, -3.98741104e-02,
       -1.03742681e-01, -4.16607499e-01, -1.31681383e-01, -1.68525174e-01,
        3.43770832e-02, -1.60373619e-03,  8.45362246e-02, -3.07267547e-01,
        1.07228540e-01, -4.51318234e-01,  3.06745976e-01,  1.09153263e-01,
       -2.32835904e-01, -5.34690619e-01,  2.75598258e-01, -2.40506470e-01,
        5.53333052e-02, -2.44903147e-01,  2.41493478e-01, -8.31173584e-02,
        3.49697739e-01, -5.32527208e-01, -6.39139175e-01,  1.45638704e-01,
       -1.89722419e-01,  1.72584459e-01,  1.39521182e-01,  6.26229107e-01,
       -1.84318006e-01, -2.43525922e-01,  4.27956343e-01, -2.20966503e-01,
        3.59961033e-01, -

In [226]:
len(all_names)

4260

In [178]:
final_processed[0]['20180219T023000_maiac_tpe_0.hdf']['AOD at 0.47 micron'].shape

(256,)

In [160]:
np.concatenate(final_processed[0]['20180219T023000_maiac_tpe_0.hdf']['AOD at 0.47 micron'][0]).shape

(256,)

In [None]:
np.cat

In [200]:
all_names['20180219T023000_maiac_tpe_0.hdf']

{}

In [146]:
len(all_names)

4260

In [147]:
len(all_files)

4260

In [122]:
for k in data[data_k]:
    print(hash(k.tobytes()))

-3919007085415630956
1273037231629836195
-5058059258359658176


In [125]:
hsh_dict47[-5058059258359658176]

[array([-5.51200032e-01,  1.14626490e-01, -1.16518497e-01,  2.56814271e-01,
        -4.97494154e-02,  2.42437094e-01, -5.83765745e-01, -1.07471153e-01,
        -3.09600323e-01, -4.95973788e-02,  1.32484943e-01, -1.29314497e-01,
         1.51919201e-01,  5.55863678e-01,  3.30464005e-01,  5.38760722e-01,
         2.83690900e-01, -3.00240214e-03, -2.61921793e-01, -3.98728326e-02,
        -1.03740767e-01, -4.16615129e-01, -1.31682754e-01, -1.68523073e-01,
         3.43756638e-02, -1.60010322e-03,  8.45326185e-02, -3.07269752e-01,
         1.07227460e-01, -4.51324701e-01,  3.06752443e-01,  1.09153904e-01,
        -2.32836887e-01, -5.34700334e-01,  2.75602639e-01, -2.40510821e-01,
         5.53338639e-02, -2.44907618e-01,  2.41497234e-01, -8.31151158e-02,
         3.49702597e-01, -5.32538176e-01, -6.39161885e-01,  1.45640776e-01,
        -1.89724758e-01,  1.72584042e-01,  1.39521077e-01,  6.26249313e-01,
        -1.84317991e-01, -2.43531644e-01,  4.27963793e-01, -2.20966369e-01,
         3.5

In [None]:
'cosine of Solar Zenith Angle'

In [101]:
hsh_dict[-3919007085415630956][0].shape

(128,)

In [103]:
len(hsh_dict47)

14445

In [106]:
import numpy as np
arraylist = data_model_track['AOD at 0.47 micron']
L = {array.tostring(): array for array in arraylist}
# L.values() # [array([1, 3, 2, 4]), array([1, 2, 3, 4])]

  L = {array.tostring(): array for array in arraylist}


In [107]:
len(L)

14445

In [184]:
len(data_model_track_datasets)

1

In [192]:
def get_runs(loci):
    return model45.encode(item[loci])

In [191]:
item.__len__()

15364

In [137]:

data_model_track_datasets['AOD at 0.47 micron'].__len__()
item = data_model_track_datasets['AOD at 0.47 micron']

ERROR 5: HDF4_EOS:EOS_GRID:"train/maiac/2018/20181020T192500_maiac_la_0.hdf":grid1km:Optical_Depth_047: GDALDataset::GetRasterBand(5) - Illegal band #



In [None]:
%%time
compres, compress_log = model45.encode(vali)

In [24]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [199]:
%%time
for i,j in data_model_track_datasets.items():
    

AOD at 0.47 micron 15364
CPU times: user 1min 10s, sys: 13.4 s, total: 1min 23s
Wall time: 5.22 s


In [216]:
torch.stack([j[k][0] for k in range(10)]).shape

torch.Size([10, 1, 1024, 1024])

In [None]:
for i, vdata in enumerate(validation_loader):
        vinputs, vlabels = vdata
        voutputs = model(vinputs)

In [238]:
# images = ImageDataset(paths, transform=transform)
loader = torch.utils.data.DataLoader(j, batch_size=500, num_workers=8)

In [256]:
%%time
model45.to(device)
all_predictions = []
with torch.no_grad():
    for batch in loader:
        all_predictions.append(model45.encode(batch[0].to(device)))

CPU times: user 33.6 s, sys: 48.9 s, total: 1min 22s
Wall time: 1min 13s


In [259]:
len(all_predictions[-1][0])

364

In [250]:

for i in model45.encode(batch[0].to(device)):
    

torch.Size([500, 128])

In [234]:
%%time
with torch.no_grad():
    compres, compress_log = model45.encode(vi)

CPU times: user 13.9 s, sys: 2.31 s, total: 16.2 s
Wall time: 977 ms


In [235]:
%%time
compres, compress_log = model45.encode(vi)

CPU times: user 15.6 s, sys: 1.22 s, total: 16.8 s
Wall time: 979 ms


In [183]:
vali = data_model_track_datasets['AOD at 0.47 micron'][0][0].unsqueeze(0)

In [222]:
compres.shape

torch.Size([200, 128])

In [224]:
j.__len__()

15364

In [225]:
15386/200

76.93