# Combine clouds

Combine cloud records from different files and fill in missing values by sampling from a month-dependent stochastic (transition) matrix.

## Prepare the notebook

In [1]:
import os
from glob import glob

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

## Load human evaluations

In [2]:
# Read the stored cloud table, then key every row by (date, quarter).
clouds = pd.read_hdf('clouds_ctio_blanco.h5')
clouds = clouds.set_index(['date', 'quarter'])
clouds.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,sday,eday,month,year,clouds,source
date,quarter,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1975-01-01,1,1,2,1,1975,0,ctio
1975-01-01,2,1,2,1,1975,0,ctio
1975-01-01,3,1,2,1,1975,0,ctio
1975-01-01,4,1,2,1,1975,0,ctio
1975-01-02,1,2,3,1,1975,0,ctio


### Track cloud value source

In [3]:
# Add a boolean `simulated` column, set to False for every row loaded so far.
clouds = clouds.assign(simulated=False)
clouds

Unnamed: 0_level_0,Unnamed: 1_level_0,sday,eday,month,year,clouds,source,simulated
date,quarter,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1975-01-01,1,1,2,1,1975,0,ctio,False
1975-01-01,2,1,2,1,1975,0,ctio,False
1975-01-01,3,1,2,1,1975,0,ctio,False
1975-01-01,4,1,2,1,1975,0,ctio,False
1975-01-02,1,2,3,1,1975,0,ctio,False
...,...,...,...,...,...,...,...,...
2022-09-03,4,3,4,9,2022,0,blanco,False
2022-09-04,1,4,5,9,2022,0,blanco,False
2022-09-04,2,4,5,9,2022,0,blanco,False
2022-09-04,3,4,5,9,2022,0,blanco,False


### Read quarters known to be cloudy from satellite data

In [4]:
# Show the names of the index levels of `clouds`.
clouds.index.names

FrozenList(['date', 'quarter'])

In [5]:
# Optional input: the satellite overrides below are applied only if this file exists.
satellite_cloudy_fname = 'satellite_cloudy.txt'
if os.path.exists(satellite_cloudy_fname):
    # Columns of the tab-separated satellite file that together identify a quarter;
    # they become the index of `satellite_cloudy`.
    sat_index_names = ['year', 'month', 'sday', 'quarter']
    satellite_cloudy = pd.read_csv(satellite_cloudy_fname, sep="\t", index_col=sat_index_names)
    # Temporarily re-key `clouds` by the same columns so its rows can be selected
    # with the satellite index; the original index is restored afterwards.
    old_index_names = clouds.index.names
    clouds = clouds.reset_index().set_index(sat_index_names)
    # Overwrite every quarter listed in the satellite file with cloud value 8 and
    # record where that value came from.
    clouds.loc[satellite_cloudy.index.values, 'clouds'] = 8
    clouds.loc[satellite_cloudy.index.values, 'source'] = 'satellite'
    # NOTE: this index round trip leaves year/month/sday as the leading columns,
    # so the column order differs from the one the table had when it was loaded.
    clouds = clouds.reset_index().set_index(old_index_names)
    
clouds

Unnamed: 0_level_0,Unnamed: 1_level_0,year,month,sday,eday,clouds,source,simulated
date,quarter,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1975-01-01,1,1975,1,1,2,0,ctio,False
1975-01-01,2,1975,1,1,2,0,ctio,False
1975-01-01,3,1975,1,1,2,0,ctio,False
1975-01-01,4,1975,1,1,2,0,ctio,False
1975-01-02,1,1975,1,2,3,0,ctio,False
...,...,...,...,...,...,...,...,...
2022-09-03,4,2022,9,3,4,0,blanco,False
2022-09-04,1,2022,9,4,5,0,blanco,False
2022-09-04,2,2022,9,4,5,0,blanco,False
2022-09-04,3,2022,9,4,5,0,blanco,False


### Build a stochastic matrix

In [6]:
# Pair every row with the row that follows it: `month` and `clouds` come from the
# first row of each pair, `next_clouds` from the second.
transitions = pd.DataFrame(
    {
        'month': clouds['month'].iloc[:-1].values,
        'clouds': clouds['clouds'].iloc[:-1].values,
        'next_clouds': clouds['clouds'].iloc[1:].values,
        'count': 1,
    }
)
# Counting the constant `count` column yields the number of occurrences of each
# (month, clouds, next_clouds) combination.
transition_counts = transitions.groupby(['month', 'clouds', 'next_clouds']).count()

Remove transitions to and from missing data (cloud value 9):

In [7]:
# Keep only transitions whose start and end cloud values are both different from 9,
# then restore the (month, clouds, next_clouds) index.
transition_counts = (
    transition_counts
    .reset_index()
    .loc[lambda df: (df['clouds'] != 9) & (df['next_clouds'] != 9)]
    .set_index(['month', 'clouds', 'next_clouds'])
)

Reshape into a matrix:

In [8]:
# Pivot the innermost index level (`next_clouds`) into columns; combinations that
# never occurred have no count and are filled with 0.
transition_matrix = transition_counts.unstack('next_clouds')
transition_matrix = transition_matrix.fillna(0)
transition_matrix

Unnamed: 0_level_0,Unnamed: 1_level_0,count,count,count,count,count,count,count,count,count
Unnamed: 0_level_1,next_clouds,0,1,2,3,4,5,6,7,8
month,clouds,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
1,0,4148.0,135.0,50.0,48.0,23.0,17.0,5.0,2.0,7.0
1,1,166.0,165.0,44.0,20.0,15.0,6.0,4.0,1.0,0.0
1,2,46.0,81.0,70.0,28.0,18.0,3.0,3.0,1.0,2.0
1,3,34.0,21.0,39.0,50.0,16.0,15.0,8.0,3.0,2.0
1,4,21.0,8.0,29.0,27.0,34.0,11.0,5.0,6.0,4.0
...,...,...,...,...,...,...,...,...,...,...
12,4,11.0,3.0,15.0,22.0,24.0,12.0,17.0,2.0,4.0
12,5,8.0,4.0,7.0,5.0,14.0,20.0,12.0,6.0,2.0
12,6,11.0,3.0,7.0,5.0,9.0,5.0,15.0,9.0,11.0
12,7,6.0,0.0,2.0,0.0,3.0,5.0,6.0,13.0,13.0


In [9]:
def counts_to_freq(count_df):
    """Convert a table of counts into row-wise relative frequencies.

    Each value is divided by the sum of its row, so every row of the result
    sums to 1. A row whose counts sum to 0 becomes all-NaN.

    Parameters
    ----------
    count_df : pandas.DataFrame
        Table of counts. If its columns are a ``MultiIndex`` (as produced by
        unstacking a one-column count table), the outermost column level is
        dropped from the result; single-level columns are kept as they are.

    Returns
    -------
    pandas.DataFrame
        A new table of frequencies with the same index; ``count_df`` itself is
        not modified.
    """
    row_totals = count_df.sum(axis=1)
    # Divide every column by the row totals in one vectorised step instead of
    # adding a temporary 'sum' column and looping over the columns.
    freq_df = count_df.div(row_totals, axis=0)
    if isinstance(freq_df.columns, pd.MultiIndex):
        # Strip the outer level (e.g. 'count') so columns are the category labels.
        freq_df.columns = freq_df.columns.droplevel(0)
    return freq_df

In [10]:
# Normalise each row of transition counts into frequencies.
stoch_matrix = transition_matrix.pipe(counts_to_freq)
stoch_matrix

Unnamed: 0_level_0,next_clouds,0,1,2,3,4,5,6,7,8
month,clouds,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0,0.935287,0.030440,0.011274,0.010823,0.005186,0.003833,0.001127,0.000451,0.001578
1,1,0.394299,0.391924,0.104513,0.047506,0.035629,0.014252,0.009501,0.002375,0.000000
1,2,0.182540,0.321429,0.277778,0.111111,0.071429,0.011905,0.011905,0.003968,0.007937
1,3,0.180851,0.111702,0.207447,0.265957,0.085106,0.079787,0.042553,0.015957,0.010638
1,4,0.144828,0.055172,0.200000,0.186207,0.234483,0.075862,0.034483,0.041379,0.027586
...,...,...,...,...,...,...,...,...,...,...
12,4,0.100000,0.027273,0.136364,0.200000,0.218182,0.109091,0.154545,0.018182,0.036364
12,5,0.102564,0.051282,0.089744,0.064103,0.179487,0.256410,0.153846,0.076923,0.025641
12,6,0.146667,0.040000,0.093333,0.066667,0.120000,0.066667,0.200000,0.120000,0.146667
12,7,0.125000,0.000000,0.041667,0.000000,0.062500,0.104167,0.125000,0.270833,0.270833


In [11]:
def simulate_clouds(clouds, seed=6563, transition_probabilities=None):
    """Fill missing cloud values by sampling from a transition-probability matrix.

    Rows whose ``clouds`` value is 9 are treated as missing. They are visited in
    row order and each one is replaced by a value drawn from the matrix row
    selected by that row's ``month`` and the cloud value of the preceding row.
    When the preceding row was itself missing, its newly simulated value is used.

    Parameters
    ----------
    clouds : pandas.DataFrame
        Table with at least ``month`` and ``clouds`` columns. It is not modified.
    seed : int, optional
        Seed for the NumPy random generator, so results are reproducible.
    transition_probabilities : pandas.DataFrame, optional
        Matrix indexed by ``(month, clouds)`` with one column per possible next
        cloud value. Defaults to the notebook-level ``stoch_matrix``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``clouds`` in which every filled row has its new ``clouds``
        value, ``source`` set to ``'stochastic_matrix'`` and ``simulated`` set to
        ``True``; all other rows have ``simulated`` set to ``False``.

    Raises
    ------
    KeyError
        If the matrix has no row for a required ``(month, previous clouds)`` pair.
    """
    if transition_probabilities is None:
        # Backward-compatible default: the matrix built earlier in the notebook.
        transition_probabilities = stoch_matrix

    random_number_generator = np.random.default_rng(seed)

    filled_clouds = clouds.copy()
    filled_clouds['simulated'] = False

    previous_quarter_clouds = clouds.clouds.shift(1)
    missing_clouds = clouds.query('clouds==9')
    for quarter, row in missing_clouds.iterrows():
        if previous_quarter_clouds[quarter] == 9:
            # The preceding row was missing too, so it was filled in the previous
            # iteration and `previous_sim_clouds` is already defined.
            previous_clouds = previous_sim_clouds
        else:
            previous_clouds = previous_quarter_clouds[quarter]

        frequencies = transition_probabilities.loc[(row.month, previous_clouds), :]
        random_number = random_number_generator.random()
        # Pick the smallest cloud value whose cumulative probability exceeds
        # the random draw.
        filled_clouds.loc[quarter, 'clouds'] = frequencies[frequencies.cumsum() > random_number].index.min()
        filled_clouds.loc[quarter, 'source'] = 'stochastic_matrix'
        # Mark the row as simulated; previously this flag stayed False everywhere.
        filled_clouds.loc[quarter, 'simulated'] = True
        previous_sim_clouds = filled_clouds.loc[quarter, 'clouds']

    return filled_clouds

In [12]:
# Fill the missing quarters using the default seed, then display the result.
filled_clouds = simulate_clouds(clouds)
filled_clouds

Unnamed: 0_level_0,Unnamed: 1_level_0,year,month,sday,eday,clouds,source,simulated
date,quarter,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1975-01-01,1,1975,1,1,2,0,ctio,False
1975-01-01,2,1975,1,1,2,0,ctio,False
1975-01-01,3,1975,1,1,2,0,ctio,False
1975-01-01,4,1975,1,1,2,0,ctio,False
1975-01-02,1,1975,1,2,3,0,ctio,False
...,...,...,...,...,...,...,...,...
2022-09-03,4,2022,9,3,4,0,blanco,False
2022-09-04,1,2022,9,4,5,0,blanco,False
2022-09-04,2,2022,9,4,5,0,blanco,False
2022-09-04,3,2022,9,4,5,0,blanco,False


In [13]:
# Store the filled table under the key 'clouds'. `key` is passed by keyword because
# pandas has deprecated passing `to_hdf` arguments after the path positionally.
filled_clouds.to_hdf('clouds.h5', key='clouds')