In [34]:
import dask
import dask.dataframe as dd
import glob
import pandas as pd
from pathlib import Path 

import numpy as np
import scipy.stats as sps
from sklearn import linear_model

from distributed import Client
# Create a dask.distributed client with default settings.
# NOTE(review): the warning printed below ("Port 8787 is already in use")
# suggests a cluster was already running when this cell was executed;
# re-running the cell presumably starts another one — confirm and reuse/close.
client = Client()

Port 8787 is already in use. 
Perhaps you already have a cluster running?
Hosting the diagnostics dashboard on a random port instead.


In [35]:
# Local data locations: everything is resolved relative to PATH_MAVEN.
PATH_MAVEN = Path("/home/marek")
PATH_NGI = PATH_MAVEN.joinpath("maven", "data", "sci", "ngi")
PATH_NGI_L2 = PATH_NGI.joinpath("l2")

# Tokens passed as `na_values` to the CSV readers in this notebook.
NA_VALUES = [" ", "-999", np.inf, "Inf", "inf"]

In [36]:
# Year component used to build the input file pattern (`test_dir`).
year = 2019

In [37]:
# Despite the name this is a glob pattern, not a directory: it matches CSV
# files whose name contains "0101" under <PATH_NGI_L2>/<year>/01/.
test_dir = PATH_NGI_L2.joinpath(f"{year}/01/*0101*.csv")

In [38]:
# Column name -> dtype. The keys double as the `usecols` selection and the
# mapping itself is passed as `dtype` when reading the CSV files.
meta_cols = dict(
    orbit=int,
    alt=float,
    species=str,
    abundance=float,
    t_unix=float,
)

In [39]:
# Lazily read the files matched by `test_dir`. The column selection, dtypes
# and NA tokens come from the shared `meta_cols` / `NA_VALUES` definitions so
# this reader cannot drift from the other read_csv calls in the notebook.
# `include_path_column=True` adds the source file of each row as a `path`
# column, which make_orbit_path_map reads.
ddf = dd.read_csv(
    test_dir,
    assume_missing=True,
    usecols=list(meta_cols.keys()),
    include_path_column=True,
    dtype=meta_cols,
    na_values=NA_VALUES,
)

In [40]:
# Preview the first rows, including the added `path` column.
ddf.head()

Unnamed: 0,t_unix,orbit,alt,species,abundance,path
0,1546313000.0,8306,1180.228,Ar,0.0,/home/marek/maven/data/sci/ngi/l2/2019/01/mvn_...
1,1546313000.0,8306,1180.19,Ar,0.0,/home/marek/maven/data/sci/ngi/l2/2019/01/mvn_...
2,1546313000.0,8306,1180.152,Ar,0.0,/home/marek/maven/data/sci/ngi/l2/2019/01/mvn_...
3,1546313000.0,8306,1180.113,Ar,0.0,/home/marek/maven/data/sci/ngi/l2/2019/01/mvn_...
4,1546313000.0,8306,1180.075,Ar,0.0,/home/marek/maven/data/sci/ngi/l2/2019/01/mvn_...


In [44]:
# Same read as `ddf` but without the `path` column; columns, dtypes and NA
# tokens all come from the shared definitions.
temp_ddf = dd.read_csv(
    test_dir,
    usecols=list(meta_cols),
    dtype=meta_cols,
    na_values=NA_VALUES,
    assume_missing=True,
)

In [45]:
# Preview the first rows of the frame read without the `path` column.
temp_ddf.head()

Unnamed: 0,t_unix,orbit,alt,species,abundance
0,1546313000.0,8306,1180.228,Ar,0.0
1,1546313000.0,8306,1180.19,Ar,0.0
2,1546313000.0,8306,1180.152,Ar,0.0
3,1546313000.0,8306,1180.113,Ar,0.0
4,1546313000.0,8306,1180.075,Ar,0.0


In [14]:
def make_orbit_path_map(ddf, orb_span):
    """Map every orbit number to the files holding it and its neighbours.

    Parameters
    ----------
    ddf
        Lazy frame with ``orbit`` and ``path`` columns; ``.compute()`` is
        called on its de-duplicated (orbit, path) pairs.
    orb_span : int
        Width of the orbit window. For an orbit ``x`` the window is
        ``x - orb_span // 2`` through ``x + orb_span // 2`` inclusive.

    Returns
    -------
    dict
        ``{orbit: [path, ...]}`` where the list holds the ``path`` of every
        (orbit, path) pair whose orbit falls inside that orbit's window.
    """
    pairs = ddf[["orbit", "path"]].drop_duplicates().compute()
    half_span = orb_span // 2

    orb_filename_map = {}
    for orbit in pairs["orbit"]:
        window = list(range(orbit - half_span, orbit + half_span + 1))
        in_window = pairs["orbit"].isin(window)
        orb_filename_map[orbit] = pairs.loc[in_window, "path"].tolist()
    return orb_filename_map

def IO_orb(orbdata,io='I') -> pd.DataFrame:
    """Select the rows before or after the minimum-altitude time.

    The split time is the ``t_unix`` of the first row whose ``alt`` equals
    the minimum ``alt`` found in ``orbdata``.

    Parameters
    ----------
    orbdata : pd.DataFrame
        Rows with ``alt`` and ``t_unix`` columns. The minimum is taken over
        every row passed in, so this presumably expects the rows of a single
        orbit — verify against the caller.
    io : str, default 'I'
        ``'I'`` keeps rows with ``t_unix`` <= the split time, ``'O'`` keeps
        rows with ``t_unix`` > the split time; any other value returns
        ``orbdata`` unchanged.

    Returns
    -------
    pd.DataFrame
        The selected rows. When no minimum altitude can be located (no rows,
        or ``alt`` is entirely missing) an empty selection with the same
        columns is returned instead of raising ``IndexError``.
    """
    if io not in ('I', 'O'):
        return orbdata

    minalt = orbdata['alt'].min()
    peri_t = orbdata.loc[orbdata['alt'] == minalt, 't_unix'].unique()
    if len(peri_t) == 0:
        # Empty input or all-NaN altitudes: there is nothing to split on, so
        # hand back an empty frame that keeps the column layout (this also
        # keeps per-partition application safe on empty partitions).
        return orbdata.iloc[0:0]

    # If the minimum altitude occurs at more than one timestamp, the first
    # one encountered is used as the split time.
    split_time = peri_t[0]
    if io == 'I':
        return orbdata[orbdata['t_unix'] <= split_time]
    return orbdata[orbdata['t_unix'] > split_time]

In [15]:
# Window width handed to make_orbit_path_map as `orb_span`; with a value of 2
# each orbit is grouped with orbit-1 and orbit+1 (span // 2 on either side).
DEFAULT_ORBIT_SPAN = 2
# Disabled import: the helpers are defined in this notebook instead.
#from src.homopause_rolling_orbits import make_orbit_path_map, IO_orb
# Needs `ddf` to carry the `path` column (read with include_path_column=True).
orb_path_map = make_orbit_path_map(ddf, DEFAULT_ORBIT_SPAN)

In [54]:
#@dask.delayed()
def exo_files(files):
    """Load a group of CSV files and return the filtered rows as one frame.

    The files are read lazily with the notebook-wide ``meta_cols`` and
    ``NA_VALUES`` settings, ``IO_orb`` is applied to each partition with its
    default ``io='I'``, and only rows with ``abundance`` > 0 whose
    ``species`` is "Ar" or "CO2" are kept.

    Parameters
    ----------
    files : iterable
        CSV file paths; materialised into a list before reading.

    Returns
    -------
    The computed (in-memory) result of the filtered frame.
    """
    file_list = list(files)
    lazy_rows = dd.read_csv(
        file_list,
        assume_missing=True,
        usecols=list(meta_cols.keys()),
        dtype=meta_cols,
        na_values=NA_VALUES,
    )
    # NOTE(review): IO_orb runs once per partition, so its altitude minimum is
    # taken over whatever rows share a partition — confirm that each
    # partition holds exactly one orbit.
    inbound = lazy_rows.map_partitions(IO_orb, meta=lazy_rows)

    positive = inbound["abundance"] > 0.
    wanted_species = inbound["species"].isin(["Ar", "CO2"])
    return inbound[positive & wanted_species].compute()

In [55]:
# Only the first (orbit, files) entry is processed; nothing is assigned when
# the map is empty.
for orb, files in list(orb_path_map.items())[:1]:
    df = exo_files(files)

In [56]:
# Inspect the frame produced for the first orbit.
# NOTE(review): the stored output below shows a lazy frame (npartitions=2),
# whereas exo_files ends with `.compute()` — the output looks stale; confirm
# by re-running on a fresh kernel.
df

Unnamed: 0_level_0,t_unix,orbit,alt,species,abundance
npartitions=2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
,float64,int64,float64,object,float64
,...,...,...,...,...
,...,...,...,...,...


In [57]:
# NOTE(review): `get_partition` is a dask DataFrame method, but `df` comes
# from exo_files, which returns the result of `.compute()` — this cell
# presumably only worked with an earlier version of exo_files; confirm.
df.get_partition(1).head()

Unnamed: 0,t_unix,orbit,alt,species,abundance
0,1546328000.0,8307,1175.881,Ar,0.0
1,1546328000.0,8307,1175.843,Ar,0.0
2,1546328000.0,8307,1175.805,Ar,0.0
3,1546328000.0,8307,1175.767,Ar,0.0
4,1546328000.0,8307,1175.729,Ar,0.0


In [58]:
# Preview the first rows of the frame for the first orbit.
df.head()

Unnamed: 0,t_unix,orbit,alt,species,abundance
0,1546313000.0,8306,1180.228,Ar,0.0
1,1546313000.0,8306,1180.19,Ar,0.0
2,1546313000.0,8306,1180.152,Ar,0.0
3,1546313000.0,8306,1180.113,Ar,0.0
4,1546313000.0,8306,1180.075,Ar,0.0
