This notebook evaluates the GMM implementation using hold-out sets. For each combination of macro wind speed and direction, we hold out the matching observations from the data, train the GMM on the remaining observations, and then sample from that GMM for the held-out category.

GMM code is the same as that from `train_gmm.ipynb`.

This notebook does the following things:
1. Assembles the dataset
2. Train the GMM models
3. Generate and save samples using the emergent distribution from the sampling
4. Look at a sample to evaluate what is happening with the under-sampling of some categories
5. Generate and save samples with over-sampling to account for the under-sampling of some categories

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import seaborn as sns
from scipy import stats
from tqdm.notebook import tqdm

# Assemble Dataset

In [2]:
def wd_s_to_uv(ws, wd):
    """
    Convert wind speed and direction (in degrees) to the (u, v) components.

    Returns the tuple (u, v) = (-ws * sin(wd), -ws * cos(wd)), with wd
    converted from degrees to radians first.
    """
    wd_rad = wd * (np.pi / 180.)
    u = -ws * np.sin(wd_rad)
    v = -ws * np.cos(wd_rad)
    return u, v

def wd_s_to_u(ws, wd):
    """
    Convert wind speed and direction (in degrees) to the u component,
    computed as -ws * sin(wd) with wd converted to radians.
    """
    wd_rad = wd * (np.pi / 180.)
    return -ws * np.sin(wd_rad)

def wd_s_to_v(ws, wd):
    """
    Convert wind speed and direction (in degrees) to the v component,
    computed as -ws * cos(wd) with wd converted to radians.
    """
    wd_rad = wd * (np.pi / 180.)
    return -ws * np.cos(wd_rad)

def uv_to_dir(u, v):
    """
    Wind components --> direction (in degrees).

    Inverse of wd_s_to_uv: for u = -ws * sin(wd) and v = -ws * cos(wd) this
    returns wd. The result lies in [0, 360].

    The previous arccos-based formula ignored the sign of u, so every
    direction was folded into [180, 360] and winds with u < 0 were mirrored
    (e.g. a 90 degree wind was reported as 270). arctan2 uses the signs of
    both components and therefore resolves the full circle.

    Calm wind (u == v == 0) has no defined direction; arctan2(0, 0) is 0, so
    180 is returned for that input instead of the NaN produced by the former
    division by the zero speed.

    Parameters
    ----------
        u (float or np arr) : u wind component
        v (float or np arr) : v wind component

    Returns
    -------
        direction in degrees (same shape as the broadcast inputs)
    """
    # negating both components rotates the angle by 180 degrees, hence
    # atan2(-u, -v) == atan2(u, v) + 180 (mod 360)
    return np.arctan2(u, v) * (180 / np.pi) + 180.

def deg_to_dir(deg):
    """
    Translates a direction in degrees to its string direction label.

    Each label covers the 22.5 degree bin ending at (and including) the listed
    upper edge; the first bin also captures every value at or below 22.5.
    Anything above 337.5 (and NaN, for which every comparison is False) yields
    'Unknown'.

    NOTE(review): there is no 'N' label, so degrees in (337.5, 360] come back
    as 'Unknown' -- confirm this matches how 'macro_wd_str' was binned.
    """
    upper_edge_labels = (
        (22.5, 'NNE'),
        (45., 'NE'),
        (67.5, 'ENE'),
        (90., 'E'),
        (112.5, 'ESE'),
        (135., 'SE'),
        (157.5, 'SSE'),
        (180., 'S'),
        (202.5, 'SSW'),
        (225., 'SW'),
        (247.5, 'WSW'),
        (270., 'W'),
        (292.5, 'WNW'),
        (315., 'NW'),
        (337.5, 'NNW'),
    )

    # the first bin whose upper edge is not exceeded wins
    for upper_edge, label in upper_edge_labels:
        if deg <= upper_edge:
            return label
    return 'Unknown'

def speed_num_to_str(speed):
    """
    Converts a computed wind speed to its string category.

    A speed is assigned to the first interval whose (inclusive) upper bound
    it does not exceed; speeds above 15.646 (and NaN) are 'Uncategorized'.
    """
    upper_bound_labels = (
        (2.235, '(-0.001, 2.235]'),
        (5.364, '(2.235, 5.364]'),
        (8.047, '(5.364, 8.047]'),
        (15.646, '(8.047, 15.646]'),
    )
    return next(
        (label for upper_bound, label in upper_bound_labels if speed <= upper_bound),
        'Uncategorized'
    )

In [3]:
# NOTE: despite its name, DATA_DIR is the path of the CSV file itself
DATA_DIR = '../../data/combined_macro_micro_wind_data.csv'

data = pd.read_csv(DATA_DIR)
print(data.shape)

# isolate U and V columns: u20, u25, ..., u250 (and the same for v)
u_cols = [f'u{i}' for i in np.arange(20, 255, 5)]
v_cols = [f'v{i}' for i in np.arange(20, 255, 5)]

# create dataframe with just desired columns, dropping every row that has a
# missing value in any of them
data_df = data[u_cols + v_cols].dropna()
print(data_df.shape)

# index labels of the rows that SURVIVED the dropna (the name is historical);
# used to apply the same row filter to the original dataset
drop_idxs = data_df.index.values
data = data.loc[drop_idxs]
print(data.shape)

# big dataset
data_full = data_df.values

# define the positions of the u and v components inside data_full; derived
# from the column lists so they stay correct if the column range changes
u_idxs = np.arange(len(u_cols))
v_idxs = np.arange(len(u_cols), len(u_cols) + len(v_cols))

# create dataframe for macro
data_wind_macro = data[u_cols + v_cols].copy()

# create columns for macro u and v -- the helpers are plain numpy expressions,
# so they are applied to whole columns instead of row by row
data_wind_macro['u_macro'] = wd_s_to_u(ws=data['macro_ws'], wd=data['macro_wd'])
data_wind_macro['v_macro'] = wd_s_to_v(ws=data['macro_ws'], wd=data['macro_wd'])

(7200, 255)
(6542, 94)
(6542, 255)


In [4]:
# rank the macro wind direction / speed labels by how often they occur
dir_counts = data['macro_wd_str'].value_counts()
speed_counts = data['macro_ws_str'].value_counts()

# keep only the four most frequent directions, but every speed category
macro_wind_dirs = dir_counts.index.values[:4]
macro_wind_speeds = speed_counts.index.values
print(macro_wind_dirs)
print(macro_wind_speeds)

['WSW' 'SW' 'WNW' 'W']
['(-0.001, 2.235]' '(8.047, 15.646]' '(5.364, 8.047]' '(2.235, 5.364]']


In [5]:
# create one training mask per combination of the top wind directions and the
# wind speeds; column k is True for the rows used to TRAIN model k, i.e. every
# row that is NOT in the held-out (direction, speed) category
n_dirs, n_speeds = len(macro_wind_dirs), len(macro_wind_speeds)
macro_masks = np.zeros(shape=(data.shape[0], n_dirs * n_speeds), dtype=bool)
dir_speed_pairs = {}

for i, wind_dir in enumerate(macro_wind_dirs):
    for j, speed in enumerate(macro_wind_speeds):
        k = n_speeds * i + j

        # held-out rows match BOTH the direction and the speed. The previous
        # mask, (wd != dir) & (ws != speed), also removed every row sharing
        # only the direction or only the speed with the held-out category.
        held_out = (data['macro_wd_str'] == wind_dir) & (data['macro_ws_str'] == speed)
        macro_masks[:, k] = (~held_out).values
        dir_speed_pairs[k] = (wind_dir, speed)

# Train GMM Models

In [6]:
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture

In [7]:
def train_gmm(data, num_pcs=7, num_comp=16, max_iter=200, n_init=15, random_state=None):
    """
    Trains GMM by first performing a dimension reduction with PCA.

    Returns GMM and necessary PCA components.

    Parameters
    ----------
        data         (np arr) : N x p
        num_pcs      (int)    : number of principal components
        num_comp     (int)    : number of GMM components
        max_iter     (int)    : max number of EM alg iterations
        n_init       (int)    : number of EM initializations
        random_state (int, RandomState or None) : forwarded to GaussianMixture
            so the EM initializations can be reproduced; the default None
            keeps the previous unseeded behavior

    Returns
    -------
        orth_pca_vecs (np arr) : num_pcs x p, leading principal axes
        x_bar         (np arr) : p, column means of data
        gm            (GaussianMixture) : model fitted on the N x num_pcs
                                          projected data
    """
    # compute principal components
    pca = PCA()
    pca.fit(data)

    # save orthogonal transformation: the first num_pcs principal axes
    orth_pca_vecs = pca.components_[:num_pcs]

    # apply transformation to data (center, then project onto the axes)
    x_bar = data.mean(axis=0)
    data_pca = (data - x_bar) @ orth_pca_vecs.T

    # fit GMM in the reduced space
    gm = GaussianMixture(
        n_components=num_comp,
        covariance_type='full',
        max_iter=max_iter,
        n_init=n_init,
        random_state=random_state
    ).fit(data_pca)

    return orth_pca_vecs, x_bar, gm

In [8]:
# train the models
NUM_PCS = 7
NUM_GMM_COMPONENTS = 16

# one slot per held-out (direction, speed) pair
num_models = len(dir_speed_pairs)
num_features = data_wind_macro.shape[1]
pca_vecs = np.zeros(shape=(num_models, NUM_PCS, num_features))
x_bars = np.zeros(shape=(num_models, num_features))
gms = []

design_matrix = data_wind_macro.values
for i in tqdm(range(num_models)):

    # rows that are allowed to be seen by model i
    train_rows = design_matrix[macro_masks[:, i], :]

    # train the model and store the PCA pieces directly in their slots
    pca_vecs[i], x_bars[i], gm_i = train_gmm(
        train_rows,
        num_pcs=NUM_PCS,
        num_comp=NUM_GMM_COMPONENTS
    )
    gms.append(gm_i)

  0%|          | 0/16 [00:00<?, ?it/s]

In [15]:
# save the above models
for i in range(len(dir_speed_pairs)):

    # file stem is the direction and speed labels concatenated; it is built
    # outside the f-string because reusing the enclosing quote character
    # inside an f-string expression is a SyntaxError before Python 3.12
    model_name = ''.join(dir_speed_pairs[i])

    with open(f'../../models/gmm_hold_out_models/{model_name}.pkl', 'wb') as f:
        pickle.dump(obj=gms[i], file=f)

In [17]:
# save the above PCA objects
for i in range(len(dir_speed_pairs)):

    # file stem is the direction and speed labels concatenated; it is built
    # outside the f-string because reusing the enclosing quote character
    # inside an f-string expression is a SyntaxError before Python 3.12
    pca_name = ''.join(dir_speed_pairs[i])

    with open(f'../../models/gmm_hold_out_models/pca_obj_{pca_name}.npz', 'wb') as f:
        np.savez(file=f, pca_vecs=pca_vecs[i], x_bars=x_bars[i])

# Generate and save samples

In [None]:
# generate data: draw as many samples per model as there are observations
N = data_wind_macro.shape[0]
sample_shape = (len(dir_speed_pairs), N, data_wind_macro.shape[1])
data_sampled = np.zeros(shape=sample_shape)

for i in range(sample_shape[0]):

    # sample from gmm -- "ld" == "low-dimensional"; the second element
    # returned by sample() is not used here
    sample_ld_i, _ = gms[i].sample(n_samples=N)

    # project the sample to higher dimension and undo the centering
    data_sampled[i] = sample_ld_i @ pca_vecs[i] + x_bars[i]

In [None]:
# obtain macro wind dir and speed for each sample
num_models = len(dir_speed_pairs)

# the last two columns of every sample are the macro (u, v) components
macro_u = data_sampled[:, :, -2]
macro_v = data_sampled[:, :, -1]

# speed -- computed for all models and samples at once
sampled_macro_speed = np.sqrt(macro_u ** 2 + macro_v ** 2)

# direction -- deg_to_dir works on scalars, so this stays an explicit loop
sampled_macro_dir = np.zeros(shape=(num_models, N), dtype=np.object_)
for i in range(num_models):
    for j in range(N):
        sampled_macro_dir[i, j] = deg_to_dir(uv_to_dir(u=macro_u[i, j], v=macro_v[i, j]))

In [None]:
# save data frame for each sample WITH macro dir and speed
for i in range(len(dir_speed_pairs)):

    # generate the data frame from the samples of model i (previously every
    # file was built from data_sampled[0], i.e. the samples of the first model
    # paired with the labels of model i)
    df_i = pd.DataFrame(
        data_sampled[i],
        columns=u_cols + v_cols + ['macro_u', 'macro_v']
    )

    df_i['macro_wd_str'] = sampled_macro_dir[i]
    ws_i_ser = pd.Series(sampled_macro_speed[i])
    df_i['macro_ws_str'] = ws_i_ser.apply(speed_num_to_str)

    # save the data
    # NOTE(review): this path starts with '../data' while the input CSV and the
    # model files live under '../../' -- confirm the intended output directory
    dir_i, speed_i = dir_speed_pairs[i]
    SAVE_PATH_I = f'../data/gmm_hold_out/{dir_i}_{speed_i}.csv'
    df_i.to_csv(SAVE_PATH_I)

# Over-sampling to achieve correct amounts of data

Steps
1. Calculate how many samples we need to obtain for each category
2. For each category, sample from the corresponding model enough times to obtain enough samples in the held-out category.

In [None]:
# calculate the number of samples in each category, i.e. how many observed
# rows carry both the direction and the speed label of that category
category_counts = np.zeros(16, dtype=int)
for i in range(16):
    dir_i, speed_i = dir_speed_pairs[i]
    same_dir = data['macro_wd_str'] == dir_i
    same_speed = data['macro_ws_str'] == speed_i
    category_counts[i] = (same_dir & same_speed).sum()
    print(f'Category: {dir_i}-{speed_i} | Number of Samples: {category_counts[i]}')

In [None]:
def sample_gmm(gmm, num_samps, pca_vectors, x_bar):
    """
    Draw samples from a fitted GMM, map them back to the original feature
    space and label each one with its macro direction and wind speed category.

    Parameters
    ----------
        gmm         (sklearn gmm model) : fitted model
        num_samps   (int)               : total number of samples to generate
        pca_vectors (np arr)            : pre-computed principal components
        x_bar       (np arr)            : column averages of original design matrix

    Returns
    -------
        df_sample (pd DataFrame) : one row per sample with the u/v columns,
                                   'macro_u', 'macro_v', 'macro_wd_str' and
                                   'macro_ws_str'
    """
    # draw in the low-dimensional (PCA) space; the second element returned by
    # sample() is not used here
    low_dim_draws, _ = gmm.sample(n_samples=num_samps)

    # back-project to the high-dimensional space and undo the centering
    full_draws = low_dim_draws @ pca_vectors + x_bar

    # the last two columns of each draw hold the macro (u, v) components
    dir_labels = np.zeros(num_samps, dtype=np.object_)
    speeds = np.zeros(num_samps)
    for row, (u_macro, v_macro) in enumerate(full_draws[:, -2:]):
        dir_labels[row] = deg_to_dir(uv_to_dir(u=u_macro, v=v_macro))
        speeds[row] = np.sqrt(u_macro ** 2 + v_macro ** 2)

    # assemble the output dataframe
    df_sample = pd.DataFrame(full_draws, columns=u_cols + v_cols + ['macro_u', 'macro_v'])
    df_sample['macro_wd_str'] = dir_labels
    df_sample['macro_ws_str'] = pd.Series(speeds, index=df_sample.index).apply(speed_num_to_str)

    return df_sample
    

def sample_gmm_until_enough(
    gmm, num_samps, pca_vectors, x_bar,
    total_num_samps, max_samp,
    macro_dir, macro_speed
):
    """
    Samples from a given GMM model until enough samples have been obtained for
    a particular category defined by macro_dir and macro_speed

    Sampling happens in batches of num_samps draws and stops as soon as
    total_num_samps matching samples were collected or at least max_samp draws
    were made, whichever comes first. If the draw budget runs out first, fewer
    than total_num_samps rows are returned.

    Parameters
    ----------
        gmm             (sklearn gmm model) : fitted model
        num_samps       (int)               : number of samples to generate on each attempt
        pca_vectors     (np arr)            : pre-computed principal components
        x_bar           (np arr)            : column averages of original design matrix
        total_num_samps (int)               : the total number of desired samples
        max_samp        (int)               : maximum number of samples to draw
        macro_dir       (str)               : desired macro direction
        macro_speed     (str)               : desired macro speed

    Returns
    -------
        samples         (pd DataFrame) : actual samples (at most total_num_samps
                                         rows; an empty DataFrame without
                                         columns when no batch was drawn)
        tot_num_samples (int)          : total number of generated samples (diagnostic)
    """
    num_good_samples = 0
    tot_num_samples = 0

    # matching rows of every batch; concatenated once at the end instead of
    # growing a DataFrame inside the loop
    good_batches = []
    while num_good_samples < total_num_samps and tot_num_samples < max_samp:

        # generate data
        df_sample = sample_gmm(
            gmm=gmm,
            num_samps=num_samps,
            pca_vectors=pca_vectors,
            x_bar=x_bar
        )

        # rows that fall into the requested category
        in_category = (df_sample.macro_wd_str == macro_dir) & (df_sample.macro_ws_str == macro_speed)

        # keep no more than what is still missing
        good_samples = df_sample.loc[in_category].iloc[:(total_num_samps - num_good_samples), :]
        good_batches.append(good_samples)

        # update counters
        num_good_samples += good_samples.shape[0]
        tot_num_samples += num_samps

    samples = pd.concat(good_batches) if good_batches else pd.DataFrame()

    return samples, tot_num_samples

In [None]:
# obtain the samples: for every held-out category keep drawing from its model
# until the observed category count is reached or MAX_SAMP draws were made
MAX_SAMP = int(1e8)
sampled_dfs = [None] * 16
tot_num_samples = np.zeros(16)
for i in tqdm(range(16)):
    dir_i, speed_i = dir_speed_pairs[i]

    # store the samples and the number of draws directly in their slots
    sampled_dfs[i], tot_num_samples[i] = sample_gmm_until_enough(
        gmm=gms[i],
        num_samps=data.shape[0],
        pca_vectors=pca_vecs[i],
        x_bar=x_bars[i],
        total_num_samps=category_counts[i],
        max_samp=MAX_SAMP,
        macro_dir=dir_i,
        macro_speed=speed_i
    )

In [None]:
# bar chart (log scale) of how many draws each group needed, with the draw
# budget shown as a dashed reference line
fig, ax = plt.subplots(figsize=(8, 4))
group_ids = np.arange(16)
ax.bar(group_ids, tot_num_samples)
ax.set_yscale('log')
ax.set_xticks(group_ids)
ax.axhline(MAX_SAMP, linestyle='--', color='gray', label='Max num allowed samples')
ax.set_title('Total number of draws to achieve sample')
ax.set_xlabel('Group Number')
ax.legend()
fig.tight_layout()
plt.show()