## Paper 2 Experiment. 

Train the WoFS-ML-Severe and road-surface ML models on random feature subsets, similar to the experiment in Covert et al. (2020).

In [1]:
import sys, os 
from os.path import dirname
# Project root: two directory levels above the current working directory.
path = dirname(dirname(os.getcwd()))

# Prepend both locations in a single step so they take precedence over any
# installed copies; the scikit-explain checkout ends up first, followed by
# the project root.
sys.path[:0] = ['/home/monte.flora/python_packages/scikit-explain', path]

In [13]:
from sklearnex import patch_sklearn
patch_sklearn()

from src.io.io import load_data_and_model
from skexplain.common.multiprocessing_utils import run_parallel, to_iterator
import numpy as np 
from tqdm import tqdm 
import itertools
import pickle

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [10]:
# --- Experiment configuration -------------------------------------------
# Input locations: datasets and pre-trained models live side by side under
# a common base directory.
BASE_PATH = '/work/mflora/explainability_work/'
DATA_BASE_PATH, MODEL_BASE_PATH = (
    os.path.join(BASE_PATH, subdir) for subdir in ('datasets', 'models')
)

# Output location: a `results` folder under the project root (`path`).
RESULTS_PATH = os.path.join(path, 'results')

# Number of random feature subsets drawn for each dataset/option pair.
N_ITER = 5000
# Number of parallel jobs handed to `run_parallel`.
N_JOBS = 30

In [4]:
def get_feature_subsets(X, n_iter, seed=123):
    """Draw reproducible random feature subsets from the columns of ``X``.

    Each subset has a random size between 2 and ``len(X.columns) - 1``
    (inclusive) and is sampled without replacement by its own
    ``RandomState``. The per-subset seeds are themselves drawn, without
    replacement, from a master random state.

    Parameters
    ----------
    X : object with a ``columns`` attribute (e.g. a pandas DataFrame)
        Only the column labels are used.
    n_iter : int
        Number of feature subsets to generate.
    seed : int, default=123
        Seed of the master random state. The default matches the value
        that was previously hard-coded, so existing results are unchanged.

    Returns
    -------
    list of list
        ``n_iter`` lists of column labels.

    Raises
    ------
    ValueError
        If ``n_iter`` is positive and ``X`` has fewer than 3 columns, since
        no subset size in ``[2, n_columns - 1]`` exists.
    """
    features = list(X.columns)
    if n_iter > 0 and len(features) < 3:
        raise ValueError(
            "X must have at least 3 columns to draw feature subsets; "
            f"got {len(features)}."
        )

    master = np.random.RandomState(seed)
    subset_sizes = master.choice(np.arange(2, len(features)), n_iter)

    # Seeds are sampled without replacement so every subset gets a distinct
    # generator. The pool stays at 10000 whenever that is large enough (so
    # results for n_iter <= 10000 are unchanged) and only grows when more
    # seeds are requested, which previously raised an error.
    seed_pool = max(10000, len(subset_sizes))
    subset_seeds = master.choice(seed_pool, size=len(subset_sizes), replace=False)

    return [
        list(np.random.RandomState(s).choice(features, size=n, replace=False))
        for s, n in zip(subset_seeds, subset_sizes)
    ]

In [14]:
# For every dataset/option pair: draw N_ITER random feature subsets, refit a
# fresh classifier on each subset, score it with average precision on the
# same data it was fit on, and pickle the subsets together with the scores.
datasets = ['tornado', 'severe_wind', 'severe_hail', 'road_surface']
options = ['original', 'reduced']

# Create the output directory up front so that a missing folder cannot make
# the final write fail after all the models have already been fit.
os.makedirs(RESULTS_PATH, exist_ok=True)

for dataset, option in itertools.product(datasets, options):

    model, X, y = load_data_and_model(dataset, option, DATA_BASE_PATH, MODEL_BASE_PATH)

    # Start from the hyperparameters of the already-trained model.
    if dataset == 'tornado' and option == 'original':
        # NOTE(review): `rf_orig` is not defined anywhere in the visible
        # code, so this branch raises NameError unless an earlier session
        # or cell defines it. Confirm which estimator was intended here.
        params = rf_orig.get_params()
    else:
        params = model[1].base_estimator.named_steps['model'].get_params()

    # Road-surface subsets are fit with a random forest; every other dataset
    # uses logistic regression. Apply the estimator-specific overrides once,
    # rather than mutating the shared dict inside every fit call.
    if dataset == 'road_surface':
        estimator_cls = RandomForestClassifier
        params['max_features'] = 'sqrt'
        params['n_jobs'] = 40
    else:
        estimator_cls = LogisticRegression
        params['max_iter'] = 300

    # Not used below in the visible code; kept in case later cells read it.
    known_skew = np.mean(y)
    inds = get_feature_subsets(X, n_iter=N_ITER)

    def _fit(feature_subset):
        """Fit a fresh classifier on one feature subset and return its
        average precision computed on the training data itself."""
        X_train = X[feature_subset]
        clf = estimator_cls(**params)
        clf.fit(X_train, y)
        prediction = clf.predict_proba(X_train)[:, 1]
        return average_precision_score(y, prediction)

    if dataset != 'road_surface':
        scores = run_parallel(_fit, to_iterator(inds), n_jobs=N_JOBS)
    else:
        # Each random forest already trains in parallel (n_jobs=40), so the
        # subsets are processed one after another here. Per the original
        # (truncated) note, this was faster than the alternative that was
        # tried -- presumably fitting several forests at once.
        scores = [_fit(feature_subset) for feature_subset in tqdm(inds, total=len(inds))]

    scores = np.array(scores)

    results_dict = {
        'subsets': inds,
        'scores': scores,
    }

    # Save the results.
    out_fname = os.path.join(RESULTS_PATH, f'{dataset}_{option}__scores.pkl')
    with open(out_fname, 'wb') as f:
        pickle.dump(results_dict, f)