In [1]:
import os

# `src` is imported as a package by a later cell and the config uses relative
# paths, so the kernel has to run from the directory that contains `src/`.
# Only hop up when we are not already there: an unconditional chdir would
# climb two more levels every time this cell is re-run.
if not os.path.isdir('src'):
    os.chdir('../../')
os.getcwd()

'/home/maxspad/proj/nlp-qual-max'

In [2]:
import ast
import copy
from pprint import pprint

import mlflow
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from omegaconf import DictConfig, OmegaConf
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler

import src.models.train as train

## Global Parameters

In [3]:
# Name of the MLflow experiment whose runs are loaded with `mlflow.search_runs`.
EXPERIMENT_NAME = '220630_154710_random_trials10000'
# Column used both to rank the runs and as the regression target.
KEY_METRIC = 'metrics_mean_test_balanced_accuracy'

## Get list of runs

In [4]:
# Load every run of the experiment, swap the dots in the column names for
# underscores, and order the runs best-first on KEY_METRIC.
df = (
    mlflow.search_runs(experiment_names=[EXPERIMENT_NAME])
    .rename(columns=lambda name: name.replace('.', '_'))
    .sort_values(KEY_METRIC, ascending=False)
)
print(df.shape)

(10000, 58)


## Linear regression for parameter importance
Fit a simple linear regression of `KEY_METRIC` (balanced accuracy score) on the hyperparameters, which are
min-max scaled so that the coefficients are comparable. The coefficients give us a rough estimate of hyperparameter importance.

In [5]:
# Column groups, picked by substring match on the column name.
metric_cols = df.filter(like='metrics_mean_', axis='columns').columns
param_cols = df.filter(like='params_', axis='columns').columns

In [6]:
# Design matrix: the logged parameters with the 'True'/'False' strings mapped
# to 1/0, minus the parameters that are excluded from the regression.
X = (
    df[param_cols]
    .replace({'True': 1, 'False': 0})
    .drop(columns=[
        'params_mlflow_experiment_name',
        'params_random_seed',
        'params_train_path',
        'params_target_var',
        'params_mlflow_tracking_dir',
        'params_max_iter',
        'params_text_var',
        'params_conda_yaml_path',
        'params_class_weight',
    ])
)
# Regression target: the key metric of each run.
y = df.loc[:, KEY_METRIC]

# Scale every feature to [0, 1] before the linear fit.
mdl = LinearRegression()
pipe = make_pipeline(MinMaxScaler(), mdl).fit(X, y)

### Hyperparameter Importance

In [7]:
# One row per input feature paired with its fitted linear coefficient,
# ordered from the most negative to the most positive coefficient.
param_imp = pd.DataFrame({
    'feature': pipe.feature_names_in_,
    'coef': pipe[-1].coef_,
}).sort_values('coef')
param_imp

Unnamed: 0,feature,coef
8,params_lemma,-0.002
11,params_min_df,-0.001638
3,params_ent_counts,-0.000631
5,params_punct,-0.000221
12,params_ngram_min,0.0
9,params_ngram_max,0.000246
2,params_token_count,0.001214
0,params_vectors,0.003294
7,params_pron,0.003604
4,params_pos_counts,0.006164


## Pare Down Best Model
Attempt to simplify the best model's configuration while losing as little performance as possible.

### Get the best model

In [8]:
# `df` is sorted best-first on KEY_METRIC, so the first row is the best run.
best_model = df.iloc[0]
print('Best Model Run Id:', best_model['run_id'])

# Print the metric and parameter entries of that run, each under its own
# heading and preceded by a blank line.
for heading, pattern in (
    ('Best Model Metrics:', r'metrics_mean_'),
    ('Best Model Paramters:', r'params_'),
):
    print()
    print(heading)
    print(best_model.filter(regex=pattern))

Best Model Run Id: 31003435631a414491775ffe99a6a377

Best Model Metrics:
metrics_mean_test_f1                   0.906983
metrics_mean_test_accuracy             0.852706
metrics_mean_test_fn                       36.8
metrics_mean_test_fp                       25.8
metrics_mean_test_tp                       57.2
metrics_mean_test_recall               0.892398
metrics_mean_test_tn                      305.2
metrics_mean_test_balanced_accuracy    0.790777
metrics_mean_test_precision            0.922176
metrics_mean_fit_time                  0.213948
metrics_mean_test_roc_auc              0.862157
metrics_mean_score_time                0.102356
Name: 3194, dtype: object

Best Model Paramters:
params_train_path                        data/processed/train.pkl
params_vectors                                               True
params_mlflow_tracking_dir                               ./mlruns
params_model_c                                               0.01
params_text_var                       

In [9]:
# Collect the best run's parameters into a plain dict, with the 'params_'
# marker removed from every key.
best_model_params = (
    best_model
    .filter(regex=r'params_')
    .rename(lambda name: name.replace('params_', ''))
    .to_dict()
)
def safe_eval(v):
    """Turn a stringified parameter back into the Python literal it spells.

    Strings that spell a literal ('True', '15', '0.01', ...) are converted to
    that value. Anything else, including non-string input, is returned
    unchanged.

    Uses `ast.literal_eval` rather than `eval`, so a value that happens to
    match a name in scope (e.g. 'max') stays a string and no expression
    embedded in a value is ever executed.
    """
    if not isinstance(v, str):
        return v
    try:
        return ast.literal_eval(v)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Everything literal_eval is documented to raise on malformed input:
        # the value is not a literal, so keep the raw string.
        return v
# Convert each stringified parameter value back to a Python value where possible.
best_model_params = {
    name: safe_eval(raw_value)
    for name, raw_value in best_model_params.items()
}
# Nest the parameters under a 'train' key and point the config at a separate
# MLflow experiment name.
conf = DictConfig(content={'train': best_model_params})
conf.train.mlflow_experiment_name = 'hparam_simplification'

### Baseline

In [10]:
# Show the baseline config (the best run's parameters) as a plain container.
pprint(OmegaConf.to_container(conf))

{'train': {'class_weight': 'balanced',
           'conda_yaml_path': './conda.yaml',
           'ent_counts': True,
           'lemma': True,
           'max_df': 0.18723932735537868,
           'max_iter': 10000,
           'min_df': 15,
           'mlflow_experiment_name': 'hparam_simplification',
           'mlflow_tracking_dir': './mlruns',
           'model_c': 0.01,
           'ngram_max': 1,
           'ngram_min': 1,
           'pos_counts': False,
           'pron': True,
           'punct': False,
           'random_seed': 43,
           'stop': False,
           'target_var': 'Q2',
           'text_var': 'comment_spacy',
           'token_count': True,
           'train_path': 'data/processed/train.pkl',
           'vectors': True}}


In [11]:
# Re-run training with the baseline config and keep the first element of the
# result as the baseline score.
# NOTE(review): presumably the mean test balanced accuracy (the printed value
# matches the best run's KEY_METRIC) -- confirm against src.models.train.
bal_acc = train.main(conf)[0]
print(bal_acc)

0.7907771436623687




### Simplification

In [12]:
# Work on a deep copy of the baseline so the edits below cannot leak back into
# `conf`. NOTE(review): `DictConfig.copy()` appears to be shallow, which would
# leave the nested `train` node shared between both configs -- confirm against
# the installed OmegaConf version. `copy.deepcopy` is independent either way.
simple = copy.deepcopy(conf)

# Trailing comments note the percent reduction recorded for each change (see
# `pct_reduction`); commented-out lines are changes not applied to the final
# config.
simple.train.ngram_max = 3 # 0.45% reduction
simple.train.max_df = 1.0 # 0.75% reduction
# simple.train.min_df = 1 # 9.08% reduction
# simple.train.lemma = False # 3.19% reduction
simple.train.ent_counts = False # 0.79% reduction
simple.train.token_count = False # 0.79% reduction
# simple.train.pos_counts = True # 0.98% reduction
# simple.train.pron = False # 0.79% reduction
# simple.train.punct = True # 1.09% reduction
# simple.train.stop = True # 1.42% reduction
# simple.train.vectors = False # 4.039% reduction

In [13]:
# Score the simplified config the same way as the baseline, then express the
# loss as a percentage of the baseline score (positive = worse than baseline).
simple_bal_acc = train.main(simple)[0]
pct_reduction = (1.0 - (simple_bal_acc / bal_acc)) * 100.0

In [14]:
# Report the simplified config's score and its relative drop from the baseline.
print(simple_bal_acc)
print(f'Percent reduction {pct_reduction}%')

0.7844958782498415
Percent reduction 0.7943154987303358%


### Final Simplified Parameters

In [15]:
# Show the final simplified config as a plain container.
pprint(OmegaConf.to_container(simple))

{'train': {'class_weight': 'balanced',
           'conda_yaml_path': './conda.yaml',
           'ent_counts': False,
           'lemma': True,
           'max_df': 1.0,
           'max_iter': 10000,
           'min_df': 15,
           'mlflow_experiment_name': 'hparam_simplification',
           'mlflow_tracking_dir': './mlruns',
           'model_c': 0.01,
           'ngram_max': 3,
           'ngram_min': 1,
           'pos_counts': False,
           'pron': True,
           'punct': False,
           'random_seed': 43,
           'stop': False,
           'target_var': 'Q2',
           'text_var': 'comment_spacy',
           'token_count': False,
           'train_path': 'data/processed/train.pkl',
           'vectors': True}}
