In [1]:
import pickle
import pandas as pd
from datetime import datetime, timedelta
import numpy as np
from whittaker_eilers import WhittakerSmoother
import matplotlib.pyplot as plt
from glob import glob
from tqdm import tqdm

In [2]:
def generate_date_pairs(year, period_days=12):
    """Split a calendar year into consecutive, non-overlapping date windows.

    Windows start on 1 January and each one spans ``period_days`` calendar
    days, counting both the start and the end date. A trailing window that
    would spill into the following year is dropped, so the last days of the
    year can be left uncovered.

    Args:
        year: Calendar year to split.
        period_days: Inclusive window length in days; must be at least 1.
            Defaults to 12, which reproduces the original fixed windows.

    Returns:
        list[list[str]]: ``[start, end]`` pairs formatted as ``YYYY-MM-DD``,
        in chronological order.

    Raises:
        ValueError: If ``period_days`` is smaller than 1.
    """
    if period_days < 1:
        raise ValueError(f'period_days must be >= 1, got {period_days}')

    # The end date is inclusive, hence the "- 1"; the next window starts the
    # day after the previous one ends.
    span = timedelta(days=period_days - 1)
    step = timedelta(days=period_days)

    date_pairs = []
    start_date = datetime(year, 1, 1)
    while True:
        end_date = start_date + span
        # A window ending outside the year is discarded. This also ends the
        # loop once the start date itself has rolled over.
        if end_date.year != year:
            break
        date_pairs.append([start_date.strftime('%Y-%m-%d'), end_date.strftime('%Y-%m-%d')])
        start_date += step
    return date_pairs

def prepare_dates(years=(2021, 2022, 2023)):
    """Build the ordered list of period labels used as the time axis.

    Every ``[start, end]`` pair returned by ``generate_date_pairs`` is turned
    into a ``YYYYMMDD_YYYYMMDD`` label by stripping the dashes from both dates
    and joining them with an underscore.

    Args:
        years: Iterable of calendar years, processed in the given order.
            Defaults to 2021-2023, the years that were previously hard-coded.

    Returns:
        list[str]: One label per period, year after year.
    """
    labels = []
    for year in years:
        for start, end in generate_date_pairs(year):
            labels.append(f"{start.replace('-', '')}_{end.replace('-', '')}")
    return labels

In [3]:
# Folder key under the sampling data directory (presumably a province code - confirm).
kdprov = '32'

# Period labels shared by the cells further down.
list_date = prepare_dates()

# Every pickle file found in the selected folder.
pickle_prov = glob(f'/data/ksa/03_Sampling/data/{kdprov}/*.pkl')
print('Found:', len(pickle_prov), 'data')

Found: 15 data


In [4]:
def do_preparation(ls_pickle):
    """Load and return the object stored in one pickle file.

    Args:
        ls_pickle: Path of the pickle file to read.

    Returns:
        Whatever object the file contains (the later cells treat it as a
        pandas DataFrame).
    """
    # NOTE: pickle.load executes arbitrary code; only open files you trust.
    with open(ls_pickle, 'rb') as handle:
        return pickle.load(handle)

In [5]:
# Replaces the globbed file list: from here on only this one sampling file is processed.
pickle_prov = ['/data/ksa/03_Sampling/data/32/sampling_48MYU.pkl']

In [17]:
import concurrent.futures
import pandas as pd
from tqdm import tqdm

# Start from an empty accumulator; the processing loop below concatenates onto it.
temp = pd.DataFrame()
# NOTE(review): `d_pkl` is not read by any visible cell - possibly a typo for `dt_pkl`.
d_pkl = None

def process_idpoint(j, dt_pkl, list_date, mgrs_map, min_coverage=0.6):
    """Align one point's backscatter series to the full calendar and gap-fill it.

    The rows of ``dt_pkl`` whose ``idpoint`` equals ``j`` are left-joined onto
    ``list_date`` so that every period has a row. Periods without data get 0
    in ``Sigma0_VH_db`` / ``Sigma0_VV_db`` and a ``weight`` of 0; a stored
    value of exactly 0 is therefore treated as missing as well. When enough
    periods are observed, both series are smoothed with a Whittaker smoother
    that receives ``weight`` as its weights, and the smoothed value replaces
    each 0 entry in the ``*_imputted`` columns.

    Args:
        j: Identifier of the point to process.
        dt_pkl: DataFrame with at least the columns ``idpoint``, ``periode``,
            ``Sigma0_VH_db`` and ``Sigma0_VV_db``.
        list_date: Ordered period labels forming the target time axis.
        mgrs_map: Value written to the ``MGRS`` column of every output row.
        min_coverage: The share of rows with a non-zero VH value must be
            strictly greater than this for the point to be kept.

    Returns:
        pandas.DataFrame: The aligned frame with the added columns ``weight``,
        ``Sigma0_VH_db_interp``, ``Sigma0_VV_db_interp``,
        ``Sigma0_VH_db_imputted`` and ``Sigma0_VV_db_imputted``; an empty
        DataFrame when the coverage requirement is not met.
    """
    point_rows = dt_pkl[dt_pkl['idpoint'] == j].sort_values('periode')
    calendar = pd.DataFrame({'periode': list_date})
    # 'periode' is the only column the two frames share; naming it keeps the
    # join stable if further columns are ever added to the calendar.
    temp2 = calendar.merge(point_rows, on='periode', how='left')

    # Only the two signal columns use 0 as the "missing" marker. Identifier
    # columns are overwritten right below, so they are not zero-filled.
    signal_cols = ['Sigma0_VH_db', 'Sigma0_VV_db']
    temp2[signal_cols] = temp2[signal_cols].fillna(0)
    temp2['idpoint'] = j
    temp2['MGRS'] = mgrs_map

    # Weights are derived from VH alone and reused for the VV smoothing.
    observed_vh = temp2['Sigma0_VH_db'] != 0
    temp2['weight'] = observed_vh.astype('int64')

    # Written as "not (x > t)" so a NaN coverage (empty calendar) is rejected.
    if not temp2['weight'].mean() > min_coverage:
        return pd.DataFrame()

    whittaker_smoother = WhittakerSmoother(lmbda=1, order=2, data_length=temp2.shape[0], weights=temp2['weight'])
    temp2['Sigma0_VH_db_interp'] = whittaker_smoother.smooth(temp2['Sigma0_VH_db'])
    temp2['Sigma0_VV_db_interp'] = whittaker_smoother.smooth(temp2['Sigma0_VV_db'])

    # Keep every non-zero measurement; fall back to the smoothed value elsewhere.
    temp2['Sigma0_VH_db_imputted'] = temp2['Sigma0_VH_db'].where(observed_vh, temp2['Sigma0_VH_db_interp'])
    observed_vv = temp2['Sigma0_VV_db'] != 0
    temp2['Sigma0_VV_db_imputted'] = temp2['Sigma0_VV_db'].where(observed_vv, temp2['Sigma0_VV_db_interp'])
    return temp2

num_workers = 10  # Adjust this based on your system's capability

for i in pickle_prov:
    dt_pkl = do_preparation(i)
    if dt_pkl.empty:
        # Nothing to process, and there is no MGRS value to read either.
        continue
    mgrs_map = dt_pkl['MGRS'].unique()[0]

    # Split the table into per-point frames once. Handing every worker the
    # whole table made each call scan all rows again, which is what dominated
    # the multi-hour run recorded below. `sort=False` keeps the points in
    # order of first appearance, like `unique()` did.
    point_groups = list(dt_pkl.groupby('idpoint', sort=False))
    list_idpoint = [point_id for point_id, _ in point_groups]
    point_frames = [frame for _, frame in point_groups]

    # NOTE(review): the per-point work is pandas code, so threads may give
    # little speed-up under the GIL - measure before raising num_workers.
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
        results = list(tqdm(
            executor.map(
                process_idpoint,
                list_idpoint,
                point_frames,
                [list_date] * len(list_idpoint),
                [mgrs_map] * len(list_idpoint),
            ),
            total=len(list_idpoint),
        ))

    # Points rejected for low coverage come back as empty frames; leaving them
    # out keeps the column dtypes of the concatenated result clean.
    temp_list = [frame for frame in [temp] + results if not frame.empty]
    if temp_list:
        temp = pd.concat(temp_list, ignore_index=True)


10


 94%|█████████▍| 46710/49650 [3:22:50<2:58:58,  3.65s/it]IOStream.flush timed out
100%|██████████| 49650/49650 [3:39:39<00:00,  3.77it/s]  


In [18]:
# Plain-text preview of the first five rows of the result.
print(temp.head(n=5))

             periode         idpoint   MGRS  Sigma0_VH_db  Sigma0_VV_db  \
0  20210101_20210112  321318003A1#01  48MYU    -16.498114     -6.830626   
1  20210113_20210124  321318003A1#01  48MYU    -20.321995    -10.826198   
2  20210125_20210205  321318003A1#01  48MYU      0.000000      0.000000   
3  20210206_20210217  321318003A1#01  48MYU    -24.251793    -14.901100   
4  20210218_20210301  321318003A1#01  48MYU      0.000000      0.000000   

   weight  Sigma0_VH_db_interp  Sigma0_VV_db_interp  Sigma0_VH_db_imputted  \
0       1           -17.270226            -7.580071             -16.498114   
1       1           -20.017186           -10.463774             -20.321995   
2       0           -21.992033           -12.598032             -21.992033   
3       1           -22.727463           -13.595825             -24.251793   
4       0           -21.756172           -13.070132             -21.756172   

   Sigma0_VV_db_imputted  
0              -6.830626  
1             -10.826198  

In [19]:
# Number of distinct points in the result (a missing id counts as one value).
temp['idpoint'].nunique(dropna=False)

49638

In [7]:
# Write the current `temp` object to disk with pickle.
with open('temp_missing_coverage.pkl', 'wb') as handle:
    pickle.dump(temp, handle)

In [13]:
from scipy import stats
import matplotlib.pyplot as plt

In [15]:
# Per distinct `weight` value, count the non-null entries of every other column.
temp.groupby('weight').count()

Unnamed: 0_level_0,idpoint
weight,Unnamed: 1_level_1
17,1
18,17
19,49632


In [None]:
# NOTE(review): `d_pkl` is not read by any visible cell - possibly a typo for `dt_pkl`.
d_pkl = None
temp = pd.DataFrame()

# The calendar frame is identical for every file and point, so build it once.
ls_date = pd.DataFrame({'periode': list_date})
# Number of trailing rows that are counted. `list_date` holds 30 periods per
# year, so without duplicate rows this is the final year.
n_last_rows = 30

for i in pickle_prov:
    dt_pkl = do_preparation(i)
    list_idpoint = dt_pkl.idpoint.unique()
    mgrs_map = dt_pkl['MGRS'].unique()[0]
    temp_list = [] if temp.empty else [temp]
    for j in tqdm(list_idpoint):
        u = dt_pkl[dt_pkl['idpoint'] == j].sort_values('periode')
        # Only the VH signal is needed to tell observed periods from gaps.
        temp2 = ls_date.merge(u[['periode', 'Sigma0_VH_db']], on='periode', how='left')
        temp2['idpoint'] = j
        # A period counts as observed when its VH value is present and non-zero.
        temp2['weight'] = (temp2['Sigma0_VH_db'].fillna(0) != 0).astype('int64')
        # Select `weight` before summing so string columns are never aggregated.
        temp2 = temp2.iloc[-n_last_rows:].groupby('idpoint', as_index=False)['weight'].sum()
        temp_list.append(temp2)
    if temp_list:
        temp = pd.concat(temp_list, ignore_index=True)

In [None]:
import pandas as pd
import numpy as np

# Calendar of period labels, built once and reused for every point.
ls_date = pd.DataFrame({'periode': list_date})

# Per-point results are collected here and combined after the loops.
weight_frames = []

for i in pickle_prov:
    dt_pkl = do_preparation(i)

    # One pass over the table: groupby hands out each point's rows directly
    # instead of filtering the whole table once per point. `sort=False`
    # keeps the points in order of first appearance.
    grouped = dt_pkl.groupby('idpoint', sort=False)
    for j, u in tqdm(grouped, total=grouped.ngroups):
        # Left-join onto the calendar so every period has a row; periods
        # without data come out as NaN.
        observed = u[['periode', 'Sigma0_VH_db']].sort_values('periode')
        temp2 = ls_date.merge(observed, on='periode', how='left')
        temp2['idpoint'] = j

        # 1 where a non-zero VH value exists, 0 for gaps (NaN or stored 0).
        temp2['weight'] = np.where(temp2['Sigma0_VH_db'].fillna(0).values != 0, 1, 0)

        # Count the observed periods among the last 30 rows.
        weight_frames.append(
            temp2.iloc[-30:].groupby('idpoint', as_index=False)['weight'].sum()
        )

# Combine all results at once; fall back to an empty frame with the expected
# columns when no point was processed.
if weight_frames:
    temp = pd.concat(weight_frames, ignore_index=True)
else:
    temp = pd.DataFrame(columns=['idpoint', 'weight'])


In [None]:
# One-column frame holding every period label.
pd.DataFrame({'periode': list_date})

In [None]:
# Point to inspect in the next cell. It is stored in `idpoint` because that
# cell filters the `dt_pkl` table with `@idpoint`; assigning the id to
# `dt_pkl` would replace the table with a string.
idpoint = '321216004A1#01'

In [None]:
# Rows of the single point under inspection, in period order.
u = dt_pkl.query('idpoint == @idpoint').sort_values('periode')
if u.empty:
    raise ValueError(f'no rows found for idpoint {idpoint!r}')

# Left-join the point's rows onto the full calendar in one step instead of
# concatenating one small frame per period. `indicator=True` adds a `_merge`
# column that marks the periods with no matching row.
calendar = pd.DataFrame({'periode': list_date})
temp = calendar.merge(u, on='periode', how='left', indicator=True)

# Fill the placeholder rows: identifiers from the point, zero signal.
gap = temp['_merge'] == 'left_only'
temp.loc[gap, 'idpoint'] = idpoint
temp.loc[gap, 'MGRS'] = u['MGRS'].unique()[0]
temp.loc[gap, ['Sigma0_VH_db', 'Sigma0_VV_db']] = 0

# Back to the column layout of `u`, which also drops the `_merge` helper.
temp = temp.reindex(columns=list(u.columns))

# 0 marks a period whose VH value is zero (the placeholder value), 1 otherwise.
temp['weight'] = (temp['Sigma0_VH_db'] != 0).astype('int64')

In [None]:
# Sum of `weight` per point over every row from position 30 onwards.
# NOTE(review): the loop cells above slice with `iloc[-30:]` (last 30 rows) -
# confirm which of the two windows is intended.
temp.iloc[30:].groupby('idpoint')[['weight']].sum().reset_index()

In [None]:
# Every row from position 30 onwards.
temp.iloc[30:]

#### temp.iloc[30:,]