In [1]:
import pickle

# Load the three pickled collections of trajectory file names (train / test / validation).
# NOTE(review): pickle.load can execute arbitrary code from the file -- only use on trusted files.
with open("/data/manon/MasterThesis/NOAA/training_files_1000_new.pkl", "rb") as fp:   # unpickle the training split
    train_files = pickle.load(fp)
with open("/data/manon/MasterThesis/NOAA/testing_files_1000_new.pkl", "rb") as fp:   # unpickle the testing split
    test_files = pickle.load(fp)
with open("/data/manon/MasterThesis/NOAA/validation_files_1000_new.pkl", "rb") as fp:   # unpickle the validation split
    val_files = pickle.load(fp)

# Concatenation of the three splits (assumes each pickle holds a list of file names -- TODO confirm)
all_files = train_files + test_files + val_files

In [2]:
def convert_name_to_config(name):
    """Return the config file name that corresponds to a data file name.

    The last four characters of ``name`` (e.g. a ``.csv`` suffix) are removed and
    the remainder is wrapped as ``config_<stem>.yml``.
    """
    stem = name[:-4]
    return f"config_{stem}.yml"

# One config file name per data file, in the same order as all_files
list_configs = list(map(convert_name_to_config, all_files))

In [None]:
from utils.read_data import water_interpolated, wind_interpolated, wave_interpolated, bathymetry_interpolated
import numpy as np


def get_water_context(path_water, lat_min, lon_min, time_init, lat_max, lon_max, points_per_degree=100, n_timesteps=72):
    """Sample the interpolated water-velocity fields on a regular time/lon/lat grid.

    Parameters
    ----------
    path_water
        Forwarded unchanged to ``water_interpolated``, which returns the
        (u, v) interpolators.
    lat_min, lat_max, lon_min, lon_max
        Bounds of the sampled box. May be ints or floats.
    time_init
        First sampled time; the following samples are ``time_init + k``.
    points_per_degree
        Number of grid points per unit of latitude/longitude extent.
    n_timesteps
        Number of consecutive time samples (default 72).

    Returns
    -------
    (context_water_u, context_water_v)
        Two arrays of shape ``(n_timesteps, n_lons, n_lats)``.

    Notes
    -----
    The grid includes both end points (``np.linspace``), so the spacing is
    ``extent / (n - 1)`` rather than exactly ``1 / points_per_degree``. Code that
    rebuilds the axes of a saved context must use the same formula.
    """
    water_u_interpolation, water_v_interpolation = water_interpolated(path_water)

    # np.linspace, np.zeros and range all need integer counts; bounds coming
    # from a config file may be floats, so coerce explicitly.
    npoint_lats = int(round((lat_max - lat_min) * points_per_degree))
    npoint_lons = int(round((lon_max - lon_min) * points_per_degree))

    # Grid axes
    lats = np.linspace(lat_min, lat_max, npoint_lats)
    lons = np.linspace(lon_min, lon_max, npoint_lons)

    # Output tensors, indexed [time, lon, lat]
    context_water_u = np.zeros([n_timesteps, npoint_lons, npoint_lats])
    context_water_v = np.zeros([n_timesteps, npoint_lons, npoint_lats])

    for i, lon in enumerate(lons):
        for j, lat in enumerate(lats):
            for k in range(n_timesteps):
                # Second coordinate is fixed at 0 (presumably the depth axis -- TODO confirm)
                query = [time_init + k, 0, lat, lon]
                context_water_u[k, i, j] = water_u_interpolation(query)
                context_water_v[k, i, j] = water_v_interpolation(query)

    return context_water_u, context_water_v

def get_wind_context(path_wind, lat_min, lon_min, time_init, lat_max, lon_max, points_per_degree=100, n_timesteps=72):
    """Sample the interpolated wind fields on a regular time/lon/lat grid.

    Parameters
    ----------
    path_wind
        Forwarded unchanged to ``wind_interpolated``, which returns the
        (u, v) interpolators.
    lat_min, lat_max, lon_min, lon_max
        Bounds of the sampled box. May be ints or floats.
    time_init
        First sampled time; the following samples are ``time_init + k``.
    points_per_degree
        Number of grid points per unit of latitude/longitude extent.
    n_timesteps
        Number of consecutive time samples (default 72).

    Returns
    -------
    (context_wind_u, context_wind_v)
        Two arrays of shape ``(n_timesteps, n_lons, n_lats)``.

    Notes
    -----
    The grid includes both end points (``np.linspace``), so the spacing is
    ``extent / (n - 1)`` rather than exactly ``1 / points_per_degree``.
    """
    wind_u_interpolation, wind_v_interpolation = wind_interpolated(path_wind)

    # np.linspace, np.zeros and range all need integer counts; bounds coming
    # from a config file may be floats, so coerce explicitly.
    npoint_lats = int(round((lat_max - lat_min) * points_per_degree))
    npoint_lons = int(round((lon_max - lon_min) * points_per_degree))

    # Grid axes
    lats = np.linspace(lat_min, lat_max, npoint_lats)
    lons = np.linspace(lon_min, lon_max, npoint_lons)

    # Output tensors, indexed [time, lon, lat]
    context_wind_u = np.zeros([n_timesteps, npoint_lons, npoint_lats])
    context_wind_v = np.zeros([n_timesteps, npoint_lons, npoint_lats])

    for i, lon in enumerate(lons):
        for j, lat in enumerate(lats):
            for k in range(n_timesteps):
                query = [time_init + k, lat, lon]
                context_wind_u[k, i, j] = wind_u_interpolation(query)
                context_wind_v[k, i, j] = wind_v_interpolation(query)

    return context_wind_u, context_wind_v


def get_waves_context(path_waves, lat_min, lon_min, time_init, lat_max, lon_max, points_per_degree=100, n_timesteps=72):
    """Sample the interpolated wave fields on a regular time/lon/lat grid.

    Parameters
    ----------
    path_waves
        Forwarded unchanged to ``wave_interpolated``, which returns the two
        interpolators (named ``ust`` / ``vst`` here).
    lat_min, lat_max, lon_min, lon_max
        Bounds of the sampled box. May be ints or floats.
    time_init
        First sampled time; the following samples are ``time_init + k``.
    points_per_degree
        Number of grid points per unit of latitude/longitude extent.
    n_timesteps
        Number of consecutive time samples (default 72).

    Returns
    -------
    (context_waves_u, context_waves_v)
        Two arrays of shape ``(n_timesteps, n_lons, n_lats)``.

    Notes
    -----
    The grid includes both end points (``np.linspace``), so the spacing is
    ``extent / (n - 1)`` rather than exactly ``1 / points_per_degree``.
    """
    ust_interpolation, vst_interpolation = wave_interpolated(path_waves)

    # np.linspace, np.zeros and range all need integer counts; bounds coming
    # from a config file may be floats, so coerce explicitly.
    npoint_lats = int(round((lat_max - lat_min) * points_per_degree))
    npoint_lons = int(round((lon_max - lon_min) * points_per_degree))

    # Grid axes
    lats = np.linspace(lat_min, lat_max, npoint_lats)
    lons = np.linspace(lon_min, lon_max, npoint_lons)

    # Output tensors, indexed [time, lon, lat]
    context_waves_u = np.zeros([n_timesteps, npoint_lons, npoint_lats])
    context_waves_v = np.zeros([n_timesteps, npoint_lons, npoint_lats])

    for i, lon in enumerate(lons):
        for j, lat in enumerate(lats):
            for k in range(n_timesteps):
                query = [time_init + k, lat, lon]
                context_waves_u[k, i, j] = ust_interpolation(query)
                context_waves_v[k, i, j] = vst_interpolation(query)

    return context_waves_u, context_waves_v


In [11]:
import os

def get_context(path_water, path_wind, path_waves,lat_min, lon_min, time_init, lat_max, lon_max, name_file,saving_folder,points_per_degree=100, overwrite=True):
    """Build the stacked water/wind/waves context tensor and save it as a ``.npy`` file.

    Parameters
    ----------
    path_water, path_wind, path_waves
        Forwarded to ``get_water_context``, ``get_wind_context`` and
        ``get_waves_context`` respectively.
    lat_min, lon_min, time_init, lat_max, lon_max, points_per_degree
        Forwarded unchanged to the three ``get_*_context`` helpers.
    name_file
        File name of the saved array, joined onto ``saving_folder``.
    saving_folder
        Directory the array is written to (must already exist).
    overwrite
        When True (default) the context is always recomputed and the file
        rewritten. When False, an existing file at the target path is reused
        and nothing is recomputed.

    Returns
    -------
    str
        Path of the saved file.

    Notes
    -----
    The saved array stacks six channels on axis 0, in this order:
    water_u, water_v, wind_u, wind_v, waves_u, waves_v.
    """
    path_save = os.path.join(saving_folder, name_file)

    # Opt-in cache: skip the (slow) point-by-point sampling if the file is already there
    if not overwrite and os.path.exists(path_save):
        return path_save

    context_water_u, context_water_v = get_water_context(path_water, lat_min, lon_min, time_init, lat_max, lon_max, points_per_degree)
    print('context water ok')
    context_waves_u, context_waves_v = get_waves_context(path_waves, lat_min, lon_min, time_init, lat_max, lon_max, points_per_degree)
    print('context waves ok')
    context_wind_u, context_wind_v = get_wind_context(path_wind, lat_min, lon_min, time_init, lat_max, lon_max, points_per_degree)
    print('context wind ok')

    # Merge the six fields; bathymetry / coast channels are not part of this tensor
    context = np.stack((context_water_u, context_water_v, context_wind_u, context_wind_v, context_waves_u, context_waves_v))

    print(np.shape(context))

    # Write through an open handle so the file name is exactly path_save
    with open(path_save, 'wb') as f:
        np.save(f, context)

    return path_save

In [12]:
from tqdm import tqdm
import os
import yaml
from utils.read_data import get_initial_position

# Where the per-trajectory YAML configs live and where the big contexts are written
config_path = '/data/manon/MasterThesis/configs_NOAA/all_configs'
saving_path = '/data/manon/MasterThesis/NOAA/bigcontexts'
point_per_deg = 10

for data_file in tqdm(all_files):
    config_name = convert_name_to_config(data_file)
    filename = 'context_' + data_file[:-4] + '.npy'
    try:
        with open(os.path.join(config_path, config_name), 'r') as f:
            config = yaml.safe_load(f)
    except (OSError, yaml.YAMLError):
        # Skip trajectories whose config is missing or unreadable. Only these
        # errors are swallowed, so Ctrl-C and genuine bugs still surface.
        continue

    # Time origin of the big context: start time of the drifter trajectory
    _, init_time = get_initial_position(config['PATH_DRIFT'], NOAA = True)

    final_path = get_context(config['PATH_WATER'],config['PATH_WIND'],config['PATH_WAVES'],config['min_lat'], config['min_lon'],init_time, config['max_lat'], config['max_lon'],filename, saving_path, points_per_degree=point_per_deg)

    # Trial run: stop after the first trajectory with a loadable config.
    # Remove this `break` to generate the contexts for every file in all_files.
    break

  0%|          | 0/1000 [00:00<?, ?it/s]

context water ok
context waves ok
30 40


  0%|          | 0/1000 [00:36<?, ?it/s]

context wind ok
(6, 72, 40, 30)





## Test reading from big contexts

In [1]:
import os 
import numpy as np
import yaml

# Directory holding the saved big-context arrays, and the directory of the matching YAML configs
saving_path = '/data/manon/MasterThesis/NOAA/bigcontexts'
config_path = '/data/manon/MasterThesis/configs_NOAA/all_configs'

# NOTE(review): os.listdir returns entries in arbitrary order, so the file picked by
# files[1] below can differ between machines/runs; the later cells hard-code a point
# (lat 15, lon 140) that only makes sense for the trajectory selected in this run.
files = os.listdir(saving_path)

In [2]:
files[1]

'context_traj_37291.npy'

In [3]:
# Load one saved big context together with the config it was generated from.
test_file = os.path.join(saving_path, files[1])
# [8:-4] strips the 'context_' prefix (8 characters) and the '.npy' suffix from the file name
config_filename = 'config_' + files[1][8:-4] + '.yml'
bigcontext = np.load(test_file)

with open(os.path.join(config_path,config_filename), 'r') as f:
    config = yaml.safe_load(f)

In [4]:
config

{'PATH_DRIFT': '/data/manon/MasterThesis/NOAA/trajectories/traj_37291.csv',
 'PATH_WATER': '/data/manon/MasterThesis/HYCOM/HYCOM_traj_37291.nc4',
 'PATH_WAVES': '/data/manon/MasterThesis/ERA5_Waves/Waves_traj_37291.nc',
 'PATH_WIND': '/data/manon/MasterThesis/ERA5_Wind/Wind_traj_37291.nc',
 'max_lat': 17,
 'max_lon': 141,
 'min_lat': 14,
 'min_lon': 137}

In [3]:
np.shape(bigcontext)

(6, 72, 40, 30)

In [28]:
config

{'PATH_DRIFT': '/data/manon/MasterThesis/NOAA/trajectories/traj_37291.csv',
 'PATH_WATER': '/data/manon/MasterThesis/HYCOM/HYCOM_traj_37291.nc4',
 'PATH_WAVES': '/data/manon/MasterThesis/ERA5_Waves/Waves_traj_37291.nc',
 'PATH_WIND': '/data/manon/MasterThesis/ERA5_Wind/Wind_traj_37291.nc',
 'max_lat': 17,
 'max_lon': 141,
 'min_lat': 14,
 'min_lon': 137}

In [6]:
from utils.read_data import get_initial_position
# Keep only the second value returned by get_initial_position; it is used below as the
# first value of the big context's time axis. Note that `_` shadows IPython's last-output variable.
_, init_time = get_initial_position(config['PATH_DRIFT'], NOAA = True)

In [19]:
# Rebuild the coordinate axes of the saved big context. This must mirror the grid
# construction used when the context was generated (same bounds, same points_per_degree,
# same linspace formula), otherwise the interpolation below is done on the wrong coordinates.
lat_min = config['min_lat']
lat_max = config['max_lat']
lon_min = config['min_lon']
lon_max = config['max_lon']

# Same value as the one passed when the contexts were generated
points_per_degree = 10

# Number of samples along each axis (integers here because the config bounds are integers)
npoint_lats = (lat_max-lat_min)*points_per_degree
npoint_lons = (lon_max-lon_min)*points_per_degree

# get grid 
lats_full = np.linspace(lat_min, lat_max, npoint_lats)
lons_full = np.linspace(lon_min, lon_max, npoint_lons)
# 72 consecutive time steps starting at init_time (matches axis 1 of the saved array)
time = np.arange(72) + init_time
# Channel index; the saved array stacks water_u, water_v, wind_u, wind_v, waves_u, waves_v
variables = np.arange(6)  # 6 variables

In [8]:
# Replace NaN entries with 0.0 so they do not propagate through the interpolation below
# (presumably NaNs mark cells without data, e.g. land -- TODO confirm). Re-running this cell is harmless.
bigcontext = np.nan_to_num(bigcontext, nan=0.0)

In [14]:
bigcontext.shape

(6, 72, 40, 30)

In [18]:
time.shape

(72,)

In [None]:
from scipy.interpolate import RegularGridInterpolator
from utils.convert_lats import new_latitude, new_longitude

# Extract an npoints x npoints window centred on (lat_init, lon_init) from the big context
# by interpolation, for all six channels at time init_time.
lat_init = 15
lon_init= 140
d = 50         # side length of the window, in km (converted to metres below)
npoints = 32   # samples per side of the window

d_m = d*1000 #convert to m
# Window bounds: half the side length on each side of the centre.
# NOTE(review): these four assignments overwrite the lat/lon bounds that an earlier cell
# read from the config; re-run that cell before rebuilding lats_full / lons_full.
lat_max = new_latitude(lat_init, lon_init, d_m / 2)
lat_min = new_latitude(lat_init, lon_init, -d_m / 2)
lon_max = new_longitude(lat_init, lon_init, d_m / 2)
lon_min = new_longitude(lat_init, lon_init, -d_m / 2)

# get grid 
lats = np.linspace(lat_min, lat_max, npoints)
lons = np.linspace(lon_min, lon_max, npoints)

# Create meshgrid for latitudes and longitudes
# (default 'xy' indexing: both outputs have shape (len(lats), len(lons)); latitude varies
# along axis 0 and longitude along axis 1)
lon_grid, lat_grid = np.meshgrid(lons, lats)

# Prepare the interpolator
# Axis order (variable, time, lon, lat) matches the layout of the saved array.
interpolator = RegularGridInterpolator((variables, time, lons_full, lats_full), bigcontext)

# Flatten the spatial grid
# Each row is (lon, lat); rows are ordered with longitude varying fastest.
points_spatial = np.array([lon_grid.ravel(), lat_grid.ravel()]).T
n_points = points_spatial.shape[0]

# One query per (variable, spatial point): the variable index is repeated block-wise and
# the spatial points are tiled once per variable, so the two stay aligned row by row.
time_points = np.full(n_points, init_time)
var_points = np.repeat(variables, n_points)
# Despite the name, the columns are (lon, lat), as built in points_spatial
lat_lon_repeated = np.tile(points_spatial, (len(variables), 1))

# Combine time, spatial, and variable dimensions into a single points array
# Column order: variable, time, lon, lat -- the same order as the interpolator axes.
points = np.column_stack((var_points, np.repeat(time_points, len(variables)), lat_lon_repeated))

# Interpolate all variables
interpolated_values = interpolator(points)

# Reshape the results: (variables, npoints, npoints), indexed [variable, lat, lon].
# With the trailing transpose left commented out the channel axis stays first; note that
# the spatial order here is (lat, lon) whereas the big context is stored as (lon, lat).
context = interpolated_values.reshape(len(variables), npoints, npoints) #.transpose(1, 2, 0)



In [32]:
context.shape

(32, 32, 6)

# Creating CSV Files for datasets

In [63]:
from data_processing.dataset import DriftPairDataset, DriftPairDataset_Wo_Computation
import pandas as pd

# NOTE(review): despite the variable name, this path points to the *validation* CSV
# ("..._val_new.csv"); change the path by hand to process another split.
csvfile_train = '/data/manon/MasterThesis/NOAA/nextpoint_ds/contexts/pt32d50/next_point_dataset_val_new.csv'
df = pd.read_csv(csvfile_train)

In [64]:
df.head()

Unnamed: 0.1,Unnamed: 0,Latitude_init,Longitude_init,time_init,Latitude_final,Longitude_final,Lat_phys,Lon_phys,PATH_CONTEXT,name
0,502,-28.3818,53.71343,68849.0,-28.36065,53.71698,-28.36702,53.717979,/data/manon/MasterThesis/NOAA/nextpoint_ds/con...,traj_63898.csv
1,503,-28.41421,53.71144,68848.0,-28.3818,53.71343,-28.400036,53.718109,/data/manon/MasterThesis/NOAA/nextpoint_ds/con...,traj_63898.csv
2,504,-28.44377,53.72484,68847.0,-28.41421,53.71144,-28.430346,53.733463,/data/manon/MasterThesis/NOAA/nextpoint_ds/con...,traj_63898.csv
3,505,-28.46642,53.70544,68846.0,-28.44377,53.72484,-28.453428,53.715549,/data/manon/MasterThesis/NOAA/nextpoint_ds/con...,traj_63898.csv
4,506,-28.494,53.70222,68845.0,-28.46642,53.70544,-28.482584,53.713055,/data/manon/MasterThesis/NOAA/nextpoint_ds/con...,traj_63898.csv


In [65]:
def get_config_name(name):
    """Return the full path of the YAML config that belongs to a trajectory file name.

    The last four characters of ``name`` (e.g. ``.csv``) are dropped to form
    ``config_<stem>.yml`` inside the shared config directory.
    """
    config_dir = '/data/manon/MasterThesis/configs_NOAA/all_configs/'
    return f"{config_dir}config_{name[:-4]}.yml"

# Config path of the trajectory each row belongs to
df['CONFIG_PATH'] = df['name'].apply(get_config_name)

In [66]:
import os
saving_path = '/data/manon/MasterThesis/NOAA/bigcontexts'

def get_bigcontext_name(name):
    """Return the path of the saved big context for ``name``, or None if it is missing.

    The path is ``<saving_path>/context_<stem>.npy`` where ``<stem>`` is ``name``
    without its last four characters; ``saving_path`` is read from module scope.
    """
    candidate = f"{saving_path}/context_{name[:-4]}.npy"
    return candidate if os.path.exists(candidate) else None

# Path of the big context per row; None where no file has been generated for the trajectory
df['PATH_BIG_CONTEXT'] = df['name'].apply(get_bigcontext_name)

In [67]:
# True for every column that contains at least one missing value (e.g. a trajectory
# without a saved big context). `.all()` would only flag columns that are entirely
# missing and so would hide partially missing paths.
df.isnull().any()

Unnamed: 0          False
Latitude_init       False
Longitude_init      False
time_init           False
Latitude_final      False
Longitude_final     False
Lat_phys            False
Lon_phys            False
PATH_CONTEXT        False
name                False
CONFIG_PATH         False
PATH_BIG_CONTEXT    False
dtype: bool

In [68]:
from utils.read_data import get_initial_position
import yaml

def get_init_time(path_context):
    """Return the initial time of the drift trajectory referenced by a YAML config.

    ``path_context`` is the path of the YAML config file; its ``PATH_DRIFT``
    entry is handed to ``get_initial_position`` and the second returned value
    is the result.
    """
    with open(path_context, 'r') as config_file:
        drift_path = yaml.safe_load(config_file)['PATH_DRIFT']
    _position, start_time = get_initial_position(drift_path, NOAA = True)
    return start_time

# All rows of a trajectory share one CONFIG_PATH, so read each config (and its drift
# file) once per distinct path instead of once per row, then broadcast with map().
init_time_by_config = {path: get_init_time(path) for path in df['CONFIG_PATH'].unique()}
df['init_time_bigcontext'] = df['CONFIG_PATH'].map(init_time_by_config)

In [69]:
df.head()

Unnamed: 0.1,Unnamed: 0,Latitude_init,Longitude_init,time_init,Latitude_final,Longitude_final,Lat_phys,Lon_phys,PATH_CONTEXT,name,CONFIG_PATH,PATH_BIG_CONTEXT,init_time_bigcontext
0,502,-28.3818,53.71343,68849.0,-28.36065,53.71698,-28.36702,53.717979,/data/manon/MasterThesis/NOAA/nextpoint_ds/con...,traj_63898.csv,/data/manon/MasterThesis/configs_NOAA/all_conf...,/data/manon/MasterThesis/NOAA/bigcontexts/cont...,68778.0
1,503,-28.41421,53.71144,68848.0,-28.3818,53.71343,-28.400036,53.718109,/data/manon/MasterThesis/NOAA/nextpoint_ds/con...,traj_63898.csv,/data/manon/MasterThesis/configs_NOAA/all_conf...,/data/manon/MasterThesis/NOAA/bigcontexts/cont...,68778.0
2,504,-28.44377,53.72484,68847.0,-28.41421,53.71144,-28.430346,53.733463,/data/manon/MasterThesis/NOAA/nextpoint_ds/con...,traj_63898.csv,/data/manon/MasterThesis/configs_NOAA/all_conf...,/data/manon/MasterThesis/NOAA/bigcontexts/cont...,68778.0
3,505,-28.46642,53.70544,68846.0,-28.44377,53.72484,-28.453428,53.715549,/data/manon/MasterThesis/NOAA/nextpoint_ds/con...,traj_63898.csv,/data/manon/MasterThesis/configs_NOAA/all_conf...,/data/manon/MasterThesis/NOAA/bigcontexts/cont...,68778.0
4,506,-28.494,53.70222,68845.0,-28.46642,53.70544,-28.482584,53.713055,/data/manon/MasterThesis/NOAA/nextpoint_ds/con...,traj_63898.csv,/data/manon/MasterThesis/configs_NOAA/all_conf...,/data/manon/MasterThesis/NOAA/bigcontexts/cont...,68778.0


In [70]:
# shift(-1) copies each value from the *following* row. In the rows displayed above
# time_init decreases by one per row, so the following row is the preceding time step
# (assumes the whole file is ordered this way -- TODO confirm). The shift is not grouped
# by trajectory, so the last row of a trajectory receives values from the next
# trajectory; the continuity condition in the next cell is what removes such rows.
df['PREVIOUS_PATH_CONTEXT'] = df['PATH_CONTEXT'].shift(-1)
df['Latitude_prev'] = df['Latitude_init'].shift(-1)
df['Longitude_prev'] = df['Longitude_init'].shift(-1)

In [71]:
# Keep a row only if the following row ends exactly where this row starts, i.e. the
# "previous" values attached above really belong to the step just before this one.
# The comparison is exact float equality, and the very last row never matches because
# shift(-1) yields NaN there.
condition = (df['Latitude_final'].shift(-1)==df['Latitude_init']) & (df['Longitude_final'].shift(-1)==df['Longitude_init']) 
# where() keeps the frame's shape and fills non-matching rows with NaN (this also turns
# integer columns into floats); those rows are removed by the dropna() that follows.
df_previous = df.where(condition)

In [72]:
# Drops the rows blanked out by where(), and also any row with a missing value in any
# other column (for example a PATH_BIG_CONTEXT that is None).
df_previous = df_previous.dropna()

In [73]:
df_previous.head()

Unnamed: 0.1,Unnamed: 0,Latitude_init,Longitude_init,time_init,Latitude_final,Longitude_final,Lat_phys,Lon_phys,PATH_CONTEXT,name,CONFIG_PATH,PATH_BIG_CONTEXT,init_time_bigcontext,PREVIOUS_PATH_CONTEXT,Latitude_prev,Longitude_prev
0,502.0,-28.3818,53.71343,68849.0,-28.36065,53.71698,-28.36702,53.717979,/data/manon/MasterThesis/NOAA/nextpoint_ds/con...,traj_63898.csv,/data/manon/MasterThesis/configs_NOAA/all_conf...,/data/manon/MasterThesis/NOAA/bigcontexts/cont...,68778.0,/data/manon/MasterThesis/NOAA/nextpoint_ds/con...,-28.41421,53.71144
1,503.0,-28.41421,53.71144,68848.0,-28.3818,53.71343,-28.400036,53.718109,/data/manon/MasterThesis/NOAA/nextpoint_ds/con...,traj_63898.csv,/data/manon/MasterThesis/configs_NOAA/all_conf...,/data/manon/MasterThesis/NOAA/bigcontexts/cont...,68778.0,/data/manon/MasterThesis/NOAA/nextpoint_ds/con...,-28.44377,53.72484
2,504.0,-28.44377,53.72484,68847.0,-28.41421,53.71144,-28.430346,53.733463,/data/manon/MasterThesis/NOAA/nextpoint_ds/con...,traj_63898.csv,/data/manon/MasterThesis/configs_NOAA/all_conf...,/data/manon/MasterThesis/NOAA/bigcontexts/cont...,68778.0,/data/manon/MasterThesis/NOAA/nextpoint_ds/con...,-28.46642,53.70544
3,505.0,-28.46642,53.70544,68846.0,-28.44377,53.72484,-28.453428,53.715549,/data/manon/MasterThesis/NOAA/nextpoint_ds/con...,traj_63898.csv,/data/manon/MasterThesis/configs_NOAA/all_conf...,/data/manon/MasterThesis/NOAA/bigcontexts/cont...,68778.0,/data/manon/MasterThesis/NOAA/nextpoint_ds/con...,-28.494,53.70222
4,506.0,-28.494,53.70222,68845.0,-28.46642,53.70544,-28.482584,53.713055,/data/manon/MasterThesis/NOAA/nextpoint_ds/con...,traj_63898.csv,/data/manon/MasterThesis/configs_NOAA/all_conf...,/data/manon/MasterThesis/NOAA/bigcontexts/cont...,68778.0,/data/manon/MasterThesis/NOAA/nextpoint_ds/con...,-28.52174,53.70157


In [74]:
# Attach the start positions of the 1, 2 and 3 rows *above* each row (shift is positional).
# NOTE(review): in the output displayed below, lat1/lon1 equal the row's own
# Latitude_final/Longitude_final, i.e. with this row order shift(1) pulls positions from
# later time steps -- confirm that this is the intended meaning of lat1..lat3.
# NOTE(review): rows were removed by the dropna() above, so positional neighbours are not
# guaranteed to be consecutive time steps when a gap lies inside one trajectory; only
# changes of trajectory are filtered out below -- verify against the data.
df_previous['lat1'] = df_previous['Latitude_init'].shift(1)
df_previous['lon1'] = df_previous['Longitude_init'].shift(1)
df_previous['lat2'] = df_previous['Latitude_init'].shift(2)
df_previous['lon2'] = df_previous['Longitude_init'].shift(2)
df_previous['lat3'] = df_previous['Latitude_init'].shift(3)
df_previous['lon3'] = df_previous['Longitude_init'].shift(3)
# Trajectory name three rows above, used only for the filter below
df_previous['name3'] = df_previous['name'].shift(3)

# Keep a row only when the row after it and the row three above it carry the same
# trajectory name, so the shifted values are not taken across a trajectory boundary
# (assumes rows of one trajectory are contiguous -- TODO confirm).
condition = (df_previous['name'].shift(-1)==df_previous['name3'])
df_previous_ok = df_previous.where(condition)
# Removes the rows rejected above as well as the first rows, whose shifted values are NaN
df_previous_ok = df_previous_ok.dropna()

In [75]:
df_previous_ok.head()

Unnamed: 0.1,Unnamed: 0,Latitude_init,Longitude_init,time_init,Latitude_final,Longitude_final,Lat_phys,Lon_phys,PATH_CONTEXT,name,...,PREVIOUS_PATH_CONTEXT,Latitude_prev,Longitude_prev,lat1,lon1,lat2,lon2,lat3,lon3,name3
3,505.0,-28.46642,53.70544,68846.0,-28.44377,53.72484,-28.453428,53.715549,/data/manon/MasterThesis/NOAA/nextpoint_ds/con...,traj_63898.csv,...,/data/manon/MasterThesis/NOAA/nextpoint_ds/con...,-28.494,53.70222,-28.44377,53.72484,-28.41421,53.71144,-28.3818,53.71343,traj_63898.csv
4,506.0,-28.494,53.70222,68845.0,-28.46642,53.70544,-28.482584,53.713055,/data/manon/MasterThesis/NOAA/nextpoint_ds/con...,traj_63898.csv,...,/data/manon/MasterThesis/NOAA/nextpoint_ds/con...,-28.52174,53.70157,-28.46642,53.70544,-28.44377,53.72484,-28.41421,53.71144,traj_63898.csv
5,507.0,-28.52174,53.70157,68844.0,-28.494,53.70222,-28.511827,53.713295,/data/manon/MasterThesis/NOAA/nextpoint_ds/con...,traj_63898.csv,...,/data/manon/MasterThesis/NOAA/nextpoint_ds/con...,-28.54795,53.69397,-28.494,53.70222,-28.46642,53.70544,-28.44377,53.72484,traj_63898.csv
6,508.0,-28.54795,53.69397,68843.0,-28.52174,53.70157,-28.539038,53.705891,/data/manon/MasterThesis/NOAA/nextpoint_ds/con...,traj_63898.csv,...,/data/manon/MasterThesis/NOAA/nextpoint_ds/con...,-28.57424,53.68682,-28.52174,53.70157,-28.494,53.70222,-28.46642,53.70544,traj_63898.csv
7,509.0,-28.57424,53.68682,68842.0,-28.54795,53.69397,-28.56632,53.697876,/data/manon/MasterThesis/NOAA/nextpoint_ds/con...,traj_63898.csv,...,/data/manon/MasterThesis/NOAA/nextpoint_ds/con...,-28.59682,53.68177,-28.54795,53.69397,-28.52174,53.70157,-28.494,53.70222,traj_63898.csv


In [76]:
len(df_previous_ok)

9710

In [77]:
# NOTE(review): `saving_path` is reused here for a CSV file path; earlier cells use the
# same name for the big-context directory (and get_bigcontext_name reads it from module
# scope), so re-run the cell defining that function's saving_path before calling it again.
saving_path = '/data/manon/MasterThesis/NOAA/nextpoint_ds/contexts/pt32d50/next_point_dataset_val_all_models.csv'
# The DataFrame index is written as an extra unnamed column, which comes back as
# 'Unnamed: 0' when the file is read again with read_csv.
df_previous_ok.to_csv(saving_path)

### Check incorrect times

In [33]:
import pandas as pd
# Load the training CSV. The last cell of this section writes the filtered result back
# to this same path, so the file on disk changes after a full run of the section.
csvfile_train = '/data/manon/MasterThesis/NOAA/nextpoint_ds/contexts/pt32d50/next_point_dataset_train_all_models.csv'
df = pd.read_csv(csvfile_train)

In [34]:
def check_time(row, n_timesteps=72):
    """Tell whether a sample's time lies inside the time window of its big context.

    Parameters
    ----------
    row
        Mapping / Series with the keys ``'time_init'`` (time of the sample) and
        ``'init_time_bigcontext'`` (first time step stored in the big context).
    n_timesteps
        Number of time steps stored in a big context (default 72).

    Returns
    -------
    True when ``0 <= time_init - init_time_bigcontext < n_timesteps``, otherwise
    None, so that a later ``dropna()`` discards the out-of-window rows.
    """
    diff = row['time_init'] - row['init_time_bigcontext']
    # A negative offset means the sample is earlier than the first stored time step,
    # which is just as far outside the window as an offset of n_timesteps or more.
    if 0 <= diff < n_timesteps:
        return True
    return None

# True for rows inside the big-context time window, missing (None) otherwise
df['OK'] = df.apply(check_time, axis=1)

In [35]:
# Rows whose 'OK' flag is None are dropped here. dropna() looks at every column, so a
# row with a missing value anywhere else would be removed as well.
df_ok = df.dropna()

In [36]:
print(len(df))
print(len(df_ok))

34860
34515


In [37]:
df.isna().sum()

Unnamed: 0.1               0
Unnamed: 0                 0
Latitude_init              0
Longitude_init             0
time_init                  0
Latitude_final             0
Longitude_final            0
Lat_phys                   0
Lon_phys                   0
PATH_CONTEXT               0
name                       0
CONFIG_PATH                0
PATH_BIG_CONTEXT           0
init_time_bigcontext       0
PREVIOUS_PATH_CONTEXT      0
Latitude_prev              0
Longitude_prev             0
lat1                       0
lon1                       0
lat2                       0
lon2                       0
lat3                       0
lon3                       0
name3                      0
OK                       345
dtype: int64

In [38]:
# Remove the helper columns and the index columns left over from earlier CSV round trips.
# errors='ignore' keeps this cell re-runnable: the CSV is overwritten in place by the next
# cell, so on a later run some of these columns (e.g. 'name3', 'Unnamed: 0.1') no longer exist.
df_ok = df_ok.drop(columns=['OK', 'name3', 'Unnamed: 0', 'Unnamed: 0.1'], errors='ignore')

In [39]:
# NOTE(review): this is the same path the section started by reading, so the original
# CSV is overwritten in place with the filtered rows -- keep a copy if the unfiltered
# file is still needed. The DataFrame index is written as an extra unnamed column.
saving_path = '/data/manon/MasterThesis/NOAA/nextpoint_ds/contexts/pt32d50/next_point_dataset_train_all_models.csv'
df_ok.to_csv(saving_path)