In [2]:
# Import libraries

# Data processing and manipulation
import pandas as pd
import numpy as np
from tqdm import tqdm
from collections import defaultdict
import matplotlib.pyplot as plt

from typing import Iterable, Any, Tuple, Dict


# Machine learning models
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, log_loss


# Custom models
import importlib
import sys, os
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '../src')))

import preprocess_data as ppd
import GAMinferenceModels as gam_models

# Data

In [3]:
# Input CSV files, given relative to the notebook's working directory.
weather_data_file = "../Data/hourly/hourly_weather_by_state.csv"
power_load_file = "../Data/hourly/hourly_load_by_state.csv"
failure_data_file = "../Data/hourly/hourly_failure_dataset_compressed.csv"

# Build the modelling table from the three files above. preprocess_data returns
# five values; the second one is not used in this notebook and is discarded.
(
    all_data_df,
    _,
    feature_names,
    target_columns,
    integer_encoding,
) = ppd.preprocess_data(
    failure_data_path=failure_data_file,
    weather_data_path=weather_data_file,
    power_load_data_path=power_load_file,
    feature_names=['Temperature', 'Relative_humidity', 'Load', 'State'],
    cyclic_features=["Season", "Month", "DayOfWeek", "DayOfYear"],
    # 'State' is kept as a single encoded column; it is decoded back to names
    # through integer_encoding['States'] later in the notebook.
    state_one_hot=False,
    initial_MC_state_filter='all',
    # NOTE(review): presumably restricts the data to this one technology -
    # confirm against preprocess_data.
    technology_filter=['Gas Turbine/Jet Engine (Simple Cycle Operation)'],
    test_periods=None,
)

In [4]:
# Temporal features for the regional classifiers: the calendar month of
# 'Datetime_UTC' mapped onto the unit circle, so that December and January
# end up next to each other.
month_angle = 2 * np.pi * all_data_df['Datetime_UTC'].dt.month / 12
all_data_df['month_sin'] = np.sin(month_angle)
all_data_df['month_cos'] = np.cos(month_angle)

# Decode the encoded 'State' column back to state names.
# integer_encoding['States'] maps name -> code, so it is inverted first.
idx2state = {code: name for name, code in integer_encoding['States'].items()}

# 'State' is overwritten in place, so running this cell a second time would
# look up already-decoded names in idx2state and raise KeyError. Only decode
# when the column does not already consist of state names.
if not all_data_df['State'].isin(list(idx2state.values())).all():
    decoded_states = all_data_df['State'].map(idx2state)
    if decoded_states.isna().any():
        # Keep the strictness of a plain dict lookup: unknown codes are an error.
        unknown_codes = all_data_df.loc[decoded_states.isna(), 'State'].unique().tolist()
        raise KeyError(f"State codes missing from integer_encoding['States']: {unknown_codes}")
    all_data_df['State'] = decoded_states

# Alphabetically sorted names of all states present in the data.
states_list = sorted(all_data_df['State'].unique().tolist())

In [None]:
# Regions handed to the training calls below; this replaces whatever
# states_list held before this cell ran.
# NOTE(review): the saved progress bars of the three training cells report
# 1, 47 and 3 items, so this list apparently differed between those runs -
# confirm which regions each saved result set was produced with.
states_list = [
    'TEXAS',
    'NEW YORK',
    'CALIFORNIA',
]

# Regional Classifiers

# Train Models

In [5]:
# Re-import GAMinferenceModels so that edits made to the module's source file
# since it was first imported take effect without restarting the kernel.
# The reloaded module object is the cell's displayed value.
importlib.reload(gam_models)

<module 'GAMinferenceModels' from '/Users/malo/MIT Dropbox/Malo Lahogue/Research/Generator-outage-probability/src/GAMinferenceModels.py'>

In [6]:
# Train the regional transition models with both w_region and w_stress
# switched on. The call returns the fitted models, the held-out test sets and
# a third result (ess_res), each bound to a name below.
# NOTE(review): no seed is passed here - confirm that the test_frac split made
# inside train_region_models is reproducible.
transition_models, test_datasets, ess_res = gam_models.train_region_models(
    all_data_df,
    regions=states_list,
    classifier_features=[
        'Temperature',
        'Relative_humidity',
        'Load_CDF',
        'Temperature_3Dsum_hot',
        'Temperature_3Dsum_cold',
        'month_sin',
        'month_cos',
    ],
    clipping_quantile=0.95,
    gamma=1.0,
    w_region=True,
    w_stress=True,
    test_frac=0.2,
    verbose=False,
)

Training regional models: 100%|██████████| 1/1 [07:57<00:00, 477.70s/it]


In [7]:
# Integer codes used in the 'Initial_gen_state' / 'Final_gen_state' columns.
# NOTE(review): presumably A/D/O = available / derated / outage - confirm.
is2idx = {'A': 0, 'D': 1, 'O': 2}

# to_csv does not create missing folders, so make sure the target exists.
out_dir = '../Results/GAM/full'
os.makedirs(out_dir, exist_ok=True)

# Score every region's held-out test set and write one CSV per region.
#   p0, p1, p2 : predicted probability of each end state (columns of predict_proba)
#   p          : predicted probability of the end state that was actually observed
for state, test_data_state in test_datasets.items():
    prob_models = transition_models[state]
    data = test_data_state.copy()

    # Create all prediction columns up front as float NaN. This keeps 'p'
    # numeric (None would make it an object column) and gives every region's
    # CSV the same columns even when some initial states are skipped.
    for col in ['p'] + [f'p{end_idx}' for end_idx in is2idx.values()]:
        data[col] = np.nan

    for initial_state, model_is in prob_models.items():
        is_mask = data['Initial_gen_state'] == is2idx[initial_state]
        # NOTE(review): this frame also carries the (still empty) prediction
        # columns; this relies on the model picking its own feature columns.
        test_data = data.loc[is_mask]
        if test_data.empty:
            print(f"   No data for initial state {initial_state} in state {state}. Skipping.")
            continue

        # Assumed to be an (n_rows, 3) array with columns ordered like is2idx.
        model_res = model_is.predict_proba(test_data)

        # Work positionally on NumPy arrays: model_res rows line up with the
        # rows of test_data, not with the index labels of data.
        final_state = test_data['Final_gen_state'].to_numpy()
        p_observed = np.full(len(test_data), np.nan)
        for end_idx in is2idx.values():
            data.loc[is_mask, f'p{end_idx}'] = model_res[:, end_idx]
            reached = final_state == end_idx
            p_observed[reached] = model_res[reached, end_idx]
        # Rows whose final state is not one of the known codes stay NaN.
        data.loc[is_mask, 'p'] = p_observed

    data.to_csv(os.path.join(out_dir, f'GAM_full_results_test_{state}.csv'), index=False)
    

In [4]:
# "Ar" variant: same arguments as the training call with both weights enabled,
# except that w_region is switched off. Results are bound to *_Ar names so
# they do not overwrite the other variants.
# NOTE(review): no seed is passed here - confirm that the test_frac split made
# inside train_region_models is reproducible.
transition_models_Ar, test_datasets_Ar, ess_res_Ar = gam_models.train_region_models(
    all_data_df,
    regions=states_list,
    classifier_features=[
        'Temperature',
        'Relative_humidity',
        'Load_CDF',
        'Temperature_3Dsum_hot',
        'Temperature_3Dsum_cold',
        'month_sin',
        'month_cos',
    ],
    clipping_quantile=0.95,
    gamma=1.0,
    w_region=False,
    w_stress=True,
    test_frac=0.2,
    verbose=False,
)

Training regional models: 100%|██████████| 47/47 [6:17:32<00:00, 481.96s/it]  


In [6]:
# Integer codes used in the 'Initial_gen_state' / 'Final_gen_state' columns.
# NOTE(review): presumably A/D/O = available / derated / outage - confirm.
is2idx = {'A': 0, 'D': 1, 'O': 2}

# to_csv does not create missing folders, so make sure the target exists.
out_dir = '../Results/GAM/Ar'
os.makedirs(out_dir, exist_ok=True)

# Score every region's held-out test set with the "Ar" models and write one
# CSV per region.
#   p0, p1, p2 : predicted probability of each end state (columns of predict_proba)
#   p          : predicted probability of the end state that was actually observed
for state, test_data_state in test_datasets_Ar.items():
    prob_models = transition_models_Ar[state]
    data = test_data_state.copy()

    # Create all prediction columns up front as float NaN. This keeps 'p'
    # numeric (None would make it an object column) and gives every region's
    # CSV the same columns even when some initial states are skipped.
    for col in ['p'] + [f'p{end_idx}' for end_idx in is2idx.values()]:
        data[col] = np.nan

    for initial_state, model_is in prob_models.items():
        is_mask = data['Initial_gen_state'] == is2idx[initial_state]
        # NOTE(review): this frame also carries the (still empty) prediction
        # columns; this relies on the model picking its own feature columns.
        test_data = data.loc[is_mask]
        if test_data.empty:
            print(f"   No data for initial state {initial_state} in state {state}. Skipping.")
            continue

        # Assumed to be an (n_rows, 3) array with columns ordered like is2idx.
        model_res = model_is.predict_proba(test_data)

        # Work positionally on NumPy arrays: model_res rows line up with the
        # rows of test_data, not with the index labels of data.
        final_state = test_data['Final_gen_state'].to_numpy()
        p_observed = np.full(len(test_data), np.nan)
        for end_idx in is2idx.values():
            data.loc[is_mask, f'p{end_idx}'] = model_res[:, end_idx]
            reached = final_state == end_idx
            p_observed[reached] = model_res[reached, end_idx]
        # Rows whose final state is not one of the known codes stay NaN.
        data.loc[is_mask, 'p'] = p_observed

    data.to_csv(os.path.join(out_dir, f'GAM_Ar_results_test_{state}.csv'), index=False)
    

   No data for initial state D in state MAINE. Skipping.
   No data for initial state D in state NEW HAMPSHIRE. Skipping.


In [7]:
# "As" variant: same arguments as the training call with both weights enabled,
# except that w_stress is switched off. Results are bound to *_As names so
# they do not overwrite the other variants.
# NOTE(review): no seed is passed here - confirm that the test_frac split made
# inside train_region_models is reproducible.
transition_models_As, test_datasets_As, ess_res_As = gam_models.train_region_models(
    all_data_df,
    regions=states_list,
    classifier_features=[
        'Temperature',
        'Relative_humidity',
        'Load_CDF',
        'Temperature_3Dsum_hot',
        'Temperature_3Dsum_cold',
        'month_sin',
        'month_cos',
    ],
    clipping_quantile=0.95,
    gamma=1.0,
    w_region=True,
    w_stress=False,
    test_frac=0.2,
    verbose=False,
)

Training regional models: 100%|██████████| 3/3 [23:45<00:00, 475.17s/it]


In [9]:
# Integer codes used in the 'Initial_gen_state' / 'Final_gen_state' columns.
# NOTE(review): presumably A/D/O = available / derated / outage - confirm.
is2idx = {'A': 0, 'D': 1, 'O': 2}

# to_csv does not create missing folders, so make sure the target exists.
out_dir = '../Results/GAM/As'
os.makedirs(out_dir, exist_ok=True)

# Score every region's held-out test set with the "As" models and write one
# CSV per region.
#   p0, p1, p2 : predicted probability of each end state (columns of predict_proba)
#   p          : predicted probability of the end state that was actually observed
for state, test_data_state in test_datasets_As.items():
    prob_models = transition_models_As[state]
    data = test_data_state.copy()

    # Create all prediction columns up front as float NaN. This keeps 'p'
    # numeric (None would make it an object column) and gives every region's
    # CSV the same columns even when some initial states are skipped.
    for col in ['p'] + [f'p{end_idx}' for end_idx in is2idx.values()]:
        data[col] = np.nan

    for initial_state, model_is in prob_models.items():
        is_mask = data['Initial_gen_state'] == is2idx[initial_state]
        # NOTE(review): this frame also carries the (still empty) prediction
        # columns; this relies on the model picking its own feature columns.
        test_data = data.loc[is_mask]
        if test_data.empty:
            print(f"   No data for initial state {initial_state} in state {state}. Skipping.")
            continue

        # Assumed to be an (n_rows, 3) array with columns ordered like is2idx.
        model_res = model_is.predict_proba(test_data)

        # Work positionally on NumPy arrays: model_res rows line up with the
        # rows of test_data, not with the index labels of data.
        final_state = test_data['Final_gen_state'].to_numpy()
        p_observed = np.full(len(test_data), np.nan)
        for end_idx in is2idx.values():
            data.loc[is_mask, f'p{end_idx}'] = model_res[:, end_idx]
            reached = final_state == end_idx
            p_observed[reached] = model_res[reached, end_idx]
        # Rows whose final state is not one of the known codes stay NaN.
        data.loc[is_mask, 'p'] = p_observed

    data.to_csv(os.path.join(out_dir, f'GAM_As_results_test_{state}.csv'), index=False)
    