In [1]:
import os
# Move the working directory two levels up; the captured output shows this
# lands on the project root. Presumably this is what makes the `src` package
# import and the relative data/mlruns paths resolve -- confirm.
# NOTE(review): the path is relative, so re-running this cell without a kernel
# restart moves up another two levels.
os.chdir('../../')
# Last expression of the cell: displays the resulting working directory.
os.getcwd()

'/home/maxspad/proj/nlp-qual-max'

In [2]:
import ast
import copy
from pprint import pprint

import mlflow
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from omegaconf import DictConfig, OmegaConf
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler

import src.models.train as train

## Global Parameters

In [3]:
# Name of the MLflow experiment whose runs are loaded and analysed below.
EXPERIMENT_NAME = '220812_123209_q1_10000t_random'
# Column (after '.' -> '_' renaming) used to rank runs and as the regression target.
KEY_METRIC = 'metrics_mean_test_balanced_accuracy'

## Get list of runs

In [4]:
# Pull every run logged under the experiment into one frame, swap the '.' in
# MLflow's column names for '_', and rank the runs best-first on KEY_METRIC.
df = (
    mlflow.search_runs(experiment_names=[EXPERIMENT_NAME])
    .rename(columns=lambda col: col.replace('.', '_'))
    .sort_values(KEY_METRIC, ascending=False)
)
print(df.shape)

(6929, 119)


In [12]:
# Discard runs that have no value for the key metric; they can neither be
# ranked nor used as regression targets. `subset` is passed as a list because
# that is the form accepted by every pandas release (a bare label is only
# accepted by newer ones).
df = df.dropna(subset=[KEY_METRIC])

## Linear regression for parameter importance
Fit a simple linear regression of `KEY_METRIC` (the balanced accuracy score) on the hyperparameters, which are
min-max scaled so that their coefficients are comparable. The coefficients give a rough estimate of hyperparameter importance.

In [13]:
# Split the run table's columns by name: logged hyperparameters vs. the
# cross-validation mean metrics.
param_cols = df.filter(like='params_').columns
metric_cols = df.filter(like='metrics_mean_').columns

In [14]:
# Parameter columns left out of the regression (experiment bookkeeping, paths,
# seed and other settings the author chose not to model).
excluded_params = [
    'params_mlflow_experiment_name',
    'params_random_seed',
    'params_train_path',
    'params_target_var',
    'params_mlflow_tracking_dir',
    'params_max_iter',
    'params_text_var',
    'params_conda_yaml_path',
    'params_class_weight',
]

# Feature matrix: the remaining hyperparameters, with the string flags
# 'True'/'False' mapped to 1/0 so they can be treated as numbers.
X = (
    df[param_cols]
    .copy()
    .replace({'True': 1, 'False': 0})
    .drop(columns=excluded_params)
)
# Target: the key metric of each run.
y = df[KEY_METRIC]

# Scale the features with MinMaxScaler before the linear fit so the learned
# coefficients can be compared with one another.
mdl = LinearRegression()
pipe = make_pipeline(MinMaxScaler(), mdl).fit(X, y)

### Hyperparameter Importance

In [15]:
# Pair every input feature with its fitted regression coefficient and order
# the table from the most negative to the most positive coefficient.
param_imp = pd.DataFrame(
    {'feature': pipe.feature_names_in_, 'coef': pipe[-1].coef_}
).sort_values('coef')
param_imp

Unnamed: 0,feature,coef
5,params_min_df,-0.004492182
12,params_ngram_max,-0.000621143
13,params_lemma,-0.0001343171
2,params_invert_target,-5.2041700000000004e-18
0,params_ngram_min,0.0
1,params_ent_counts,0.001337452
6,params_punct,0.001821434
11,params_token_count,0.001884353
4,params_pron,0.006238946
10,params_pos_counts,0.007369767


## Pare Down Best Model
Attempt to simplify the best model's configuration while losing as little performance as possible.

### Get the best model

In [16]:
# `df` was sorted on KEY_METRIC in descending order when it was loaded, so the
# first row is the best-scoring run.
best_model = df.iloc[0]
print('Best Model Run Id:', best_model['run_id'])
print()
print('Best Model Metrics:')
print(best_model.filter(regex=r'metrics_mean_'))
print()
# Heading typo fixed ("Paramters" -> "Parameters").
print('Best Model Parameters:')
print(best_model.filter(regex=r'params_'))

Best Model Run Id: 059651b7006d4e0099fdf410fad12532

Best Model Metrics:
metrics_mean_test_f1_3                 0.751703
metrics_mean_test_f1                   0.625673
metrics_mean_test_rec_2                0.523388
metrics_mean_test_top_3_acc            0.984941
metrics_mean_test_cm_3_1                   16.8
metrics_mean_test_rec_1                0.512903
metrics_mean_test_recall               0.621537
metrics_mean_test_cm_2_1                   30.2
metrics_mean_test_precision            0.641326
metrics_mean_test_rec_0                  0.7375
metrics_mean_test_cm_2_2                   69.2
metrics_mean_test_top_2_acc            0.871529
metrics_mean_test_rec_3                0.712358
metrics_mean_test_prec_1               0.392545
metrics_mean_test_cm_1_3                    6.0
metrics_mean_test_roc_auc                   NaN
metrics_mean_test_cm_0_0                   11.8
metrics_mean_test_accuracy             0.625412
metrics_mean_test_cm_0_1                    3.0
metrics_mean_te

In [28]:
# Collect the best run's hyperparameters into a plain dict, with the
# 'params_' marker removed from each key.
best_model_params = (
    best_model
    .filter(regex=r'params_')
    .rename(lambda label: label.replace('params_', ''))
    .to_dict()
)
def safe_eval(v):
    """Convert a stringified parameter value back into a Python literal.

    Strings that spell a Python literal (e.g. ``'True'``, ``'6'``, ``'0.82'``)
    are returned as the corresponding object. Anything else -- free text such
    as ``'balanced'`` or a file path, or a value that is not a string at all --
    is returned unchanged.

    ``ast.literal_eval`` is used rather than ``eval`` so that a value can never
    execute code or resolve to an object that happens to be in scope (for
    example a parameter whose text equals the name of an imported module).

    Args:
        v: The raw parameter value, normally a string.

    Returns:
        The parsed literal, or ``v`` itself when it is not a literal.
    """
    if not isinstance(v, str):
        # Nothing to parse (e.g. a missing value stored as NaN).
        return v
    try:
        return ast.literal_eval(v)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a literal: keep the original text.
        return v
# Turn each stored string back into a typed value where possible, then wrap
# the parameters in the config layout (`train` section) handed to the trainer.
best_model_params = {
    name: safe_eval(raw_value) for name, raw_value in best_model_params.items()
}
conf = DictConfig(content={'train': best_model_params})
# Log the simplification runs under their own MLflow experiment.
conf.train.mlflow_experiment_name = 'q1_hparam_simplification'

### Baseline

In [29]:
# Show the baseline configuration as a plain (non-OmegaConf) container.
pprint(OmegaConf.to_container(conf))

{'train': {'class_weight': 'balanced',
           'conda_yaml_path': './conda.yaml',
           'ent_counts': True,
           'invert_target': True,
           'lemma': True,
           'max_df': 0.8203939665297079,
           'max_iter': 10000,
           'min_df': 6,
           'mlflow_experiment_name': 'q1_hparam_simplification',
           'mlflow_tracking_dir': './mlruns',
           'model_c': 0.01,
           'ngram_max': 7,
           'ngram_min': 1,
           'pos_counts': True,
           'pron': False,
           'punct': False,
           'random_seed': 43,
           'stop': True,
           'target_var': 'Q1',
           'text_var': 'comment_spacy',
           'token_count': True,
           'train_path': 'data/processed/train.pkl',
           'vectors': True}}


In [30]:
# Re-train with the best run's configuration to get the baseline score.
# NOTE(review): assumes the first element returned by `train.main` is the
# balanced accuracy (inferred from the variable name) -- confirm in src.models.train.
bal_acc = train.main(conf)[0]
print(bal_acc)

2022/08/12 14:38:48 INFO mlflow.tracking.fluent: Experiment with name 'q1_hparam_simplification' does not exist. Creating a new experiment.
Target Q1 has 4 levels! Metrics will be multi-level.
Cannot invert a multi-level target! Ignoring


0.6215373560758563


### Simplification

### Q1

In [45]:
# Work on an independent deep copy of the baseline. `conf.copy()` is only a
# shallow copy: the nested `train` node would be shared, so every assignment
# below would also change `conf` and leak between re-runs of this cell.
simple = copy.deepcopy(conf)
# Trailing comments record the change in balanced accuracy relative to the
# baseline that the author noted for each setting. Commented-out lines are
# simplifications that were tried and rejected.
simple.train.ngram_max = 3 # 0.045% IMPROVEMENT
simple.train.max_df = 1.0 # 0.045% IMPROVEMENT
# simple.train.min_df = 1 # 2.73% reduction
# simple.train.lemma = False # 1.36% reduction
simple.train.ent_counts = False # 0.06% IMPROVEMENT
simple.train.token_count = False # 0.13% IMPROVEMENT
# simple.train.pos_counts = False # 0.08% reduction
# simple.train.pron = True # 0.46% reduction
# simple.train.punct = True # 0.46% reduction
# simple.train.stop = False # 3.18% reduction
# simple.train.vectors = False # 

In [46]:
# Train with the simplified configuration and express its score relative to
# the baseline; a negative percentage means the simplified model scored higher.
simple_bal_acc = train.main(simple)[0]
relative_score = simple_bal_acc / bal_acc
pct_reduction = (1.0 - relative_score) * 100.0
print(simple_bal_acc)
print(pct_reduction)

Target Q1 has 4 levels! Metrics will be multi-level.
Cannot invert a multi-level target! Ignoring


0.6223438076887595
-0.12975110908777232


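### Q3
*Note: the cells below were run in a separate session (execution counts 12–14) against a different baseline `bal_acc`, so their outputs are not comparable to the Q1 numbers above.*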

In [12]:
# Work on an independent deep copy of the baseline. `conf.copy()` is only a
# shallow copy: the nested `train` node would be shared, so every assignment
# below would also change `conf` and leak between re-runs of this cell.
simple = copy.deepcopy(conf)
# Trailing comments record the change in balanced accuracy relative to the
# baseline that the author noted for each setting. Commented-out lines are
# simplifications that were tried and rejected.
simple.train.ngram_max = 3 # 0.45% reduction
simple.train.max_df = 1.0 # 0.75% reduction
# simple.train.min_df = 1 # 9.08% reduction
# simple.train.lemma = False # 3.19% reduction
simple.train.ent_counts = False # 0.79% reduction
simple.train.token_count = False # 0.79% reduction
# simple.train.pos_counts = True # 0.98% reduction
# simple.train.pron = False # 0.79% reduction
# simple.train.punct = True # 1.09% reduction
# simple.train.stop = True # 1.42% reduction
# simple.train.vectors = False # 4.039% reduction

In [13]:
# Score the simplified configuration, then compute how far (in percent) it
# falls below the baseline score.
simple_bal_acc = train.main(simple)[0]
pct_reduction = 100.0 * (1.0 - simple_bal_acc / bal_acc)

In [14]:
# Report the simplified model's score and its percentage drop from the
# baseline; a negative value would mean the simplified model scored higher.
print(simple_bal_acc)
print(f'Percent reduction {pct_reduction}%')

0.7844958782498415
Percent reduction 0.7943154987303358%


### Final Simplified Parameters

In [47]:
# Show the simplified configuration as a plain (non-OmegaConf) container.
pprint(OmegaConf.to_container(simple))

{'train': {'class_weight': 'balanced',
           'conda_yaml_path': './conda.yaml',
           'ent_counts': False,
           'invert_target': True,
           'lemma': True,
           'max_df': 1.0,
           'max_iter': 10000,
           'min_df': 6,
           'mlflow_experiment_name': 'q1_hparam_simplification',
           'mlflow_tracking_dir': './mlruns',
           'model_c': 0.01,
           'ngram_max': 3,
           'ngram_min': 1,
           'pos_counts': True,
           'pron': False,
           'punct': False,
           'random_seed': 43,
           'stop': True,
           'target_var': 'Q1',
           'text_var': 'comment_spacy',
           'token_count': False,
           'train_path': 'data/processed/train.pkl',
           'vectors': True}}
