In [1]:
import sys, os
from os.path import join
sys.path.append('/work/mflora/ROAD_SURFACE')
sys.path.insert(0, '/home/monte.flora/python_packages/scikit-explain')
sys.path.insert(0, '/home/monte.flora/python_packages/master/ml_workflow')
sys.path.insert(0,'/home/monte.flora/python_packages/compare-explain-methods/src/paper_1_experiments')
from load_rankings import load_imp

from skexplain import ExplainToolkit
from skexplain.common.importance_utils import to_skexplain_importance

from ml_workflow.ml_methods import norm_aupdc, brier_skill_score
from ml_workflow import CalibratedPipelineHyperOptCV
from ml_workflow.ml_methods import get_bootstrap_score
from calibration_classifier import CalibratedClassifier
from skexplain.common.multiprocessing_utils import run_parallel, to_iterator

from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np
import pickle
from joblib import load

import hyperopt.hp as hp
from probsr_config import TARGET_COLUMN, PREDICTOR_COLUMNS

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
# Root directory for this analysis; the dataset and model directories are
# derived from it so only BASE_PATH needs editing when the data moves.
BASE_PATH = '/work/mflora/explainability_work/'
DATA_BASE_PATH, MODEL_BASE_PATH = (
    join(BASE_PATH, subdir) for subdir in ('datasets', 'models')
)

## Create the Association between feature importance and model performance. 

### Train models on the top and bottom 15 predictors.

In [3]:
def _score_nested_subsets(ranked_features, X, y, dates, fti, params,
                          param_grid, known_skew, verbose=False):
    """Fit and score one model per nested prefix of ``ranked_features``.

    For k = 1 .. len(ranked_features), a fresh elastic-net
    ``LogisticRegression`` wrapped in ``CalibratedPipelineHyperOptCV`` is
    fit on the first k features and scored with ``get_bootstrap_score``.

    Parameters
    ----------
    ranked_features : list of str
        Column names of ``X``, in the order they should be added.
    X : pandas.DataFrame
        Predictor table; indexed by column name to build each subset.
    y : array-like
        Target values.
    dates : pandas.Series
        Passed to the pipeline as ``cv_kwargs['dates']``.
    fti : pandas.Series
        Passed to the scorer as ``forecast_time_indices``.
    params : dict
        Passed to ``clf.fit(..., params=params)``.
        NOTE(review): presumably these fixed hyperparameters bypass the
        hyperopt search -- confirm against ml_workflow.
    param_grid : dict
        Hyperopt search space handed to the pipeline constructor.
    known_skew : float
        Forwarded to both the pipeline scorer and the bootstrap scorer.
    verbose : bool, default False
        If True, print each feature subset before fitting.

    Returns
    -------
    list
        One ``get_bootstrap_score`` result per subset size, in order.
    """
    scores = []
    for n_used in range(1, len(ranked_features) + 1):
        # A fresh estimator/pipeline per subset so no fitted state carries over.
        estimator = LogisticRegression(solver='saga',
                                       penalty='elasticnet', max_iter=300, random_state=42)
        clf = CalibratedPipelineHyperOptCV(base_estimator=estimator,
                                           param_grid=param_grid,
                                           resample='under',
                                           scaler='standard',
                                           imputer=None,
                                           hyperopt='tpe',
                                           n_jobs=1,
                                           max_iter=40,
                                           scorer_kwargs={'known_skew': known_skew},
                                           cv='date_based',
                                           cv_kwargs={'n_splits': 5,
                                                      'dates': dates,
                                                      'valid_size': 20},
                                           )

        feature_subset = ranked_features[:n_used]
        if verbose:
            print('\n', feature_subset)
        X_subset = X[feature_subset]
        clf.fit(X_subset, y, params=params)
        # Scores are computed on the same rows the model was fit on.
        preds = clf.predict_proba(X_subset)[:, 1]
        scores.append(get_bootstrap_score(y, preds, n_bootstrap=100,
                                          known_skew=known_skew,
                                          forecast_time_indices=fti,
                                          metric_mapper={'naupdc': norm_aupdc}))
    return scores


def train_top_and_bottom_predictors(target, opt, n_features=5):
    """Score models built from the best- and worst-ranked predictors.

    For every ranking method stored in the importances pickle for
    ``target``/``opt``, models are fit on nested subsets (1, 2, ...,
    ``n_features`` features) of the first ``n_features`` columns and of
    the last ``n_features`` columns (taken from the last column
    backwards). The scores are pickled to
    ``top_worst_predictors_{target}_{model_opt}_scores.pkl`` under
    ``DATA_BASE_PATH`` as
    ``{method: {'top_scores': [...], 'bottom_scores': [...]}}``.

    Parameters
    ----------
    target : str
        Target name used to build the dataset, model and output filenames
        and the ``matched_to_{target}_0km`` target column.
    opt : str
        ``'reduced'`` selects the ``L1_based_feature_selection_with_manual``
        model; any other value selects the model with an empty option tag.
        Also used verbatim in the importances filename.
    n_features : int, default 5
        Number of columns taken from each end of the ranking.

    Returns
    -------
    None
        Results are written to disk only.

    Raises
    ------
    ValueError
        If ``n_features`` is smaller than 1.
    """
    if n_features < 1:
        # columns[-0:] would silently select every column.
        raise ValueError(f'n_features must be >= 1, got {n_features}')

    # For any opt other than 'reduced' the tag is empty, so the filenames
    # below contain an empty segment ('__scores.pkl', 'standard_.pkl').
    model_opt = 'L1_based_feature_selection_with_manual' if opt == 'reduced' else ''

    time_tag = 'first_hour'
    param_grid = {
        'l1_ratio': hp.choice('l1_ratio', [0.0001, 0.001, 0.01, 0.1, 0.5, 0.6, 0.8, 1.0]),
        'C': hp.choice('C', [0.0001, 0.001, 0.01, 0.1, 0.25, 0.5, 0.75, 1.0]),
    }

    save_fname = join(DATA_BASE_PATH, f'top_worst_predictors_{target}_{model_opt}_scores.pkl')

    print(f'Working on {target}...')
    # Load training dataset
    df = pd.read_pickle(join(DATA_BASE_PATH, f'{time_tag}_training_matched_to_{target}_0km_dataset'))
    dates = df['Run Date']
    fti = df['FCST_TIME_IDX'].astype(int)

    # Reuse the feature list and hyperparameters of the stored model.
    model_fname = join(MODEL_BASE_PATH,
                       f'LogisticRegression_{time_tag}_{target}_under_standard_{model_opt}.pkl')
    model_bundle = load(model_fname)
    original_features = model_bundle['features']
    model = model_bundle['model']
    params = model.base_estimator.named_steps['model'].get_params()
    params['max_iter'] = 300
    # n_jobs is dropped from the reused parameters; pop() tolerates its absence.
    params.pop('n_jobs', None)

    X = df[original_features].astype(float)
    y = df[f'matched_to_{target}_0km'].astype(float).values
    known_skew = np.mean(y)

    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    fname = join(DATA_BASE_PATH, f'importances_{target}_{opt}_unnormalized.pkl')
    with open(fname, 'rb') as f:
        importance_data = pickle.load(f)

    feature_importances = importance_data['feature_importances']
    results = {}

    for method, ranking in feature_importances.items():
        print('*'*40)
        print(f'\nMethod : {method}\n')
        print('*'*40)

        best = list(ranking.columns[:n_features])
        # Reversed so the subsets grow from the last column backwards.
        worst = list(ranking.columns[-n_features:])[::-1]

        shared = dict(X=X, y=y, dates=dates, fti=fti, params=params,
                      param_grid=param_grid, known_skew=known_skew)
        results[method] = {
            'top_scores': _score_nested_subsets(best, verbose=True, **shared),
            'bottom_scores': _score_nested_subsets(worst, **shared),
        }

    with open(save_fname, 'wb') as f:
        pickle.dump(results, f)

In [4]:
#train_top_and_bottom_predictors('tornado', 'original')

In [5]:
# Hazards and model variants to process; each (target, opt) pairing is
# handed to train_top_and_bottom_predictors as its arguments.
targets = ['severe_hail', 'severe_wind']
opts = ['original']

# Request as many workers as there are target/opt combinations.
run_parallel(
    train_top_and_bottom_predictors,
    args_iterator=to_iterator(targets, opts),
    n_jobs=len(targets) * len(opts),
    description='Feature Importance Experiment',
)

Feature Importance Experiment:   0%|                                                                                                                               | 0/2 [00:00<?, ?it/s]

Working on severe_wind...Working on severe_hail...

****************************************

Method : backward_singlepass

****************************************

 ['ws_80_time_max_ens_mean_of_90th']
Unable to convert hyperparam results!
****************************************

Method : backward_singlepass

****************************************

 ['dbz_3to5km_max_time_max_ens_mean_spatial_mean']
Unable to convert hyperparam results!

 ['ws_80_time_max_ens_mean_of_90th', 'lcl_ml_ens_mean_spatial_mean']
Unable to convert hyperparam results!

 ['dbz_3to5km_max_time_max_ens_mean_spatial_mean', 'major_axis_length']
Unable to convert hyperparam results!

 ['ws_80_time_max_ens_mean_of_90th', 'lcl_ml_ens_mean_spatial_mean', 'wz_0to2_time_max_ens_mean_of_90th']
Unable to convert hyperparam results!

 ['dbz_3to5km_max_time_max_ens_mean_spatial_mean', 'major_axis_length', 'low_level_lapse_rate_ens_mean_spatial_mean']
Unable to convert hyperparam results!

 ['ws_80_time_max_ens_mean_of_90th

 ['dbz_3to5km_max_time_max_ens_mean_spatial_mean', 'major_axis_length', 'w_up_time_max_ens_mean_of_90th', 'minor_axis_length', 'comp_dz_time_max_ens_mean_of_90th', 'divergence_10m_time_min_ens_mean_of_10th', 'w_up_time_max_ens_mean_spatial_mean', 'uh_2to5_time_max_ens_mean_of_90th']
Unable to convert hyperparam results!

 ['ws_80_time_max_ens_mean_of_90th', 'wz_0to2_time_max_ens_mean_of_90th', 'comp_dz_time_max_ens_mean_of_90th', 'minor_axis_length', 'w_1km_time_max_ens_mean_of_90th', 'w_up_time_max_ens_mean_spatial_mean', 'comp_dz_time_max_ens_mean_spatial_mean', 'major_axis_length', 'hailcast_time_max_ens_mean_of_90th', 'mid_level_lapse_rate_ens_mean_spatial_mean', 'hailcast_time_max_ens_std_spatial_mean', 'comp_dz_time_max_ens_std_of_90th', 'uh_2to5_time_max_ens_std_spatial_mean']
Unable to convert hyperparam results!

 ['dbz_3to5km_max_time_max_ens_mean_spatial_mean', 'major_axis_length', 'w_up_time_max_ens_mean_of_90th', 'minor_axis_length', 'comp_dz_time_max_ens_mean_of_90th', 'd

 ['w_up_time_max_ens_mean_of_90th', 'comp_dz_time_max_ens_mean_of_90th', 'major_axis_length', 'minor_axis_length', 'divergence_10m_time_min_ens_mean_of_10th', 'w_up_time_max_ens_mean_spatial_mean', 'uh_2to5_time_max_ens_mean_of_90th', 'uh_0to2_time_max_ens_std_spatial_mean', 'dbz_1to3km_max_time_max_ens_mean_of_90th', 'ws_80_time_max_ens_mean_of_90th', 'hailcast_time_max_ens_std_spatial_mean', 'comp_dz_time_max_ens_mean_spatial_mean', 'shear_v_0to6_ens_std_spatial_mean', 'w_down_time_min_ens_mean_of_10th', 'bouyancy_time_min_ens_mean_of_10th']
Unable to convert hyperparam results!
Unable to convert hyperparam results!
Unable to convert hyperparam results!
Unable to convert hyperparam results!
Unable to convert hyperparam results!
Unable to convert hyperparam results!
Unable to convert hyperparam results!
Unable to convert hyperparam results!
Unable to convert hyperparam results!
Unable to convert hyperparam results!
Unable to convert hyperparam results!
Unable to convert hyperparam res

 ['ws_80_time_max_ens_mean_of_90th', 'comp_dz_time_max_ens_mean_of_90th', 'wz_0to2_time_max_ens_mean_of_90th', 'minor_axis_length', 'w_1km_time_max_ens_mean_of_90th', 'major_axis_length', 'w_up_time_max_ens_mean_spatial_mean', 'hailcast_time_max_ens_mean_of_90th', 'comp_dz_time_max_ens_mean_spatial_mean', 'mid_level_lapse_rate_ens_mean_spatial_mean', 'uh_2to5_time_max_ens_std_spatial_mean', 'hailcast_time_max_ens_std_spatial_mean', 'comp_dz_time_max_ens_std_of_90th', 'bouyancy_time_min_ens_mean_of_10th']
Unable to convert hyperparam results!

 ['ws_80_time_max_ens_mean_of_90th', 'comp_dz_time_max_ens_mean_of_90th', 'wz_0to2_time_max_ens_mean_of_90th', 'minor_axis_length', 'w_1km_time_max_ens_mean_of_90th', 'major_axis_length', 'w_up_time_max_ens_mean_spatial_mean', 'hailcast_time_max_ens_mean_of_90th', 'comp_dz_time_max_ens_mean_spatial_mean', 'mid_level_lapse_rate_ens_mean_spatial_mean', 'uh_2to5_time_max_ens_std_spatial_mean', 'hailcast_time_max_ens_std_spatial_mean', 'comp_dz_time_m

 ['dbz_3to5km_max_time_max_ens_mean_spatial_mean', 'comp_dz_time_max_ens_mean_of_90th', 'low_level_lapse_rate_ens_mean_spatial_mean', 'w_up_time_max_ens_mean_of_90th', 'major_axis_length', 'divergence_10m_time_min_ens_mean_of_10th', 'divergence_10m_time_min_ens_std_spatial_mean', 'minor_axis_length', 'w_1km_time_max_ens_mean_spatial_mean', 'ws_80_time_max_ens_mean_spatial_mean', 'dbz_1to3km_max_time_max_ens_mean_of_90th', 'bouyancy_time_min_ens_mean_of_10th']
Unable to convert hyperparam results!

 ['ws_80_time_max_ens_mean_of_90th', 'lcl_ml_ens_mean_spatial_mean']
Unable to convert hyperparam results!

 ['dbz_3to5km_max_time_max_ens_mean_spatial_mean', 'comp_dz_time_max_ens_mean_of_90th', 'low_level_lapse_rate_ens_mean_spatial_mean', 'w_up_time_max_ens_mean_of_90th', 'major_axis_length', 'divergence_10m_time_min_ens_mean_of_10th', 'divergence_10m_time_min_ens_std_spatial_mean', 'minor_axis_length', 'w_1km_time_max_ens_mean_spatial_mean', 'ws_80_time_max_ens_mean_spatial_mean', 'dbz_1t

Unable to convert hyperparam results!
Unable to convert hyperparam results!

 ['major_axis_length', 'minor_axis_length', 'ws_80_time_max_ens_mean_of_90th', 'wz_0to2_time_max_ens_mean_of_90th', 'mid_level_lapse_rate_ens_mean_spatial_mean', 'uh_2to5_time_max_ens_std_spatial_mean', 'uh_0to2_time_max_ens_mean_of_90th', 'w_1km_time_max_ens_mean_of_90th', 'hailcast_time_max_ens_mean_of_90th', 'w_up_time_max_ens_mean_spatial_mean', 'divergence_10m_time_min_ens_std_spatial_mean', 'srh_0to3_ens_mean_spatial_mean']
Unable to convert hyperparam results!
Unable to convert hyperparam results!

 ['major_axis_length', 'minor_axis_length', 'ws_80_time_max_ens_mean_of_90th', 'wz_0to2_time_max_ens_mean_of_90th', 'mid_level_lapse_rate_ens_mean_spatial_mean', 'uh_2to5_time_max_ens_std_spatial_mean', 'uh_0to2_time_max_ens_mean_of_90th', 'w_1km_time_max_ens_mean_of_90th', 'hailcast_time_max_ens_mean_of_90th', 'w_up_time_max_ens_mean_spatial_mean', 'divergence_10m_time_min_ens_std_spatial_mean', 'srh_0to3_en

Feature Importance Experiment:  50%|██████████████████████████████████████████████████████████▌                                                          | 1/2 [50:56<50:56, 3056.34s/it]


 ['dbz_3to5km_max_time_max_ens_mean_spatial_mean', 'comp_dz_time_max_ens_mean_of_90th', 'w_up_time_max_ens_mean_of_90th']
Unable to convert hyperparam results!

 ['dbz_3to5km_max_time_max_ens_mean_spatial_mean', 'comp_dz_time_max_ens_mean_of_90th', 'w_up_time_max_ens_mean_of_90th', 'low_level_lapse_rate_ens_mean_spatial_mean']
Unable to convert hyperparam results!

 ['dbz_3to5km_max_time_max_ens_mean_spatial_mean', 'comp_dz_time_max_ens_mean_of_90th', 'w_up_time_max_ens_mean_of_90th', 'low_level_lapse_rate_ens_mean_spatial_mean', 'divergence_10m_time_min_ens_mean_of_10th']
Unable to convert hyperparam results!

 ['dbz_3to5km_max_time_max_ens_mean_spatial_mean', 'comp_dz_time_max_ens_mean_of_90th', 'w_up_time_max_ens_mean_of_90th', 'low_level_lapse_rate_ens_mean_spatial_mean', 'divergence_10m_time_min_ens_mean_of_10th', 'major_axis_length']
Unable to convert hyperparam results!

 ['dbz_3to5km_max_time_max_ens_mean_spatial_mean', 'comp_dz_time_max_ens_mean_of_90th', 'w_up_time_max_ens_m

Feature Importance Experiment: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [56:54<00:00, 1707.23s/it]


[None, None]

In [6]:
#func('road_surface', 'reduced', 'unnormalized')