In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from event_keywords import event_keywords
from hyperopt import hp, tpe, fmin, Trials

Read in processed data sources and join

In [2]:
df = pd.read_pickle('./data/tokenized.pkl')
df.head(5)

Unnamed: 0,text,sex,age,event,text_edit,token_len
0,57YOM WITH CONTUSION TO FACE AFTER STRIKING IT...,1,57,62,"[contus, face, strike, post, pounder, set, fen...",8
1,A 45YOM FELL ON ARM WHILE WORKING HAD SLIPPED ...,1,45,42,"[fell, arm, work, slip, water, fx, wrist]",7
2,58YOM WITH CERVICAL STRAIN BACK PAIN S P REST...,1,58,26,"[cervic, strain, back, pain, p, restrain, taxi...",16
3,33 YOM LAC TO HAND FROM A RAZOR KNIFE,1,33,60,"[lac, hand, razor, knife]",4
4,53YOM AT WORK IN A WAREHOUSE DOING UNSPECIFIED...,1,53,71,"[work, warehous, unspecifi, lift, strain, lo, ...",8


In [3]:
events = sorted(list(df['event'].drop_duplicates()))

In [4]:
df_keras = pd.read_pickle('data/OOS_keras_preds.pkl')
df_keras.head()

Unnamed: 0,10_prob,11_prob,12_prob,13_prob,20_prob,21_prob,22_prob,23_prob,24_prob,25_prob,...,67_pred,69_pred,70_pred,71_pred,72_pred,73_pred,74_pred,78_pred,79_pred,99_pred
0,4.552343e-12,0.00032,0.000874,4.493075e-05,0.000102589,0.00018586,0.0001986107,2.046019e-06,0.002806777,0.0003718287,...,0,0,0,0,0,0,0,0,0,0
1,1.644099e-16,7e-06,2.5e-05,8.165185e-10,1.285994e-08,8.758091e-07,6.36392e-06,3.621148e-06,1.746725e-06,1.298285e-05,...,0,0,0,0,0,0,0,0,0,0
2,1.983944e-12,0.000206,0.001162,8.482361e-06,1.136537e-06,7.493054e-07,9.664043e-05,4.745345e-06,0.0002491186,5.114411e-07,...,0,0,0,0,0,0,0,0,0,0
3,1.031983e-12,1.2e-05,5e-06,3.102226e-07,1.351664e-06,7.618328e-08,5.062548e-07,1.509698e-10,2.811417e-05,2.878443e-06,...,0,0,0,0,0,0,0,0,0,0
4,8.676676e-13,4e-06,0.000117,6.383927e-07,2.285017e-06,7.146087e-07,1.083008e-06,9.808657e-10,3.01766e-07,1.208545e-06,...,0,0,0,1,0,0,0,0,0,0


In [5]:
prob_cols = [str(e)+'_prob' for e in events]
pred_cols = [str(e)+'_pred' for e in events]

In [6]:
df_keywords = pd.read_pickle('data/event_indicators.pkl')
df_keywords.head()

Unnamed: 0,11_ind,12_ind,13_ind,23_ind,24_ind,25_ind,26_ind,27_ind,31_ind,32_ind,...,55_ind,60_ind,62_ind,63_ind,64_ind,66_ind,70_ind,71_ind,72_ind,73_ind
0,1,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,1,0,1,0,0
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,1,0,0,0,...,1,0,1,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0


In [7]:
event_cols = [k+'_ind' for k in event_keywords.keys() if len(event_keywords[k]) > 0]
print(event_cols)

['11_ind', '12_ind', '13_ind', '23_ind', '24_ind', '25_ind', '26_ind', '27_ind', '31_ind', '32_ind', '41_ind', '42_ind', '43_ind', '44_ind', '51_ind', '52_ind', '53_ind', '55_ind', '60_ind', '62_ind', '63_ind', '64_ind', '66_ind', '70_ind', '71_ind', '72_ind', '73_ind']


In [8]:
df.columns.values

array(['text', 'sex', 'age', 'event', 'text_edit', 'token_len'],
      dtype=object)

In [9]:
dfx = pd.concat([df[['sex', 'age','token_len']],df_keywords,df_keras],axis=1)
print(dfx.shape)
dfx.head()

(153956, 126)


Unnamed: 0,sex,age,token_len,11_ind,12_ind,13_ind,23_ind,24_ind,25_ind,26_ind,...,67_pred,69_pred,70_pred,71_pred,72_pred,73_pred,74_pred,78_pred,79_pred,99_pred
0,1,57,8,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,45,7,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,58,16,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,1,33,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,53,8,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [10]:
dfy = df[['event']]

In [11]:
del df,df_keywords,df_keras

### Lightgbm model

Use same multi-class target format we used with keras

In [12]:
encoder = LabelEncoder()

In [19]:
encoder.fit(dfy.values.ravel())

LabelEncoder()

In [20]:
yencoded = encoder.transform(dfy.values.ravel())

Create train validation split

In [125]:
x_train, x_test, y_train, y_test = train_test_split(dfx,yencoded,
                                                    test_size=0.2,random_state=42)

In [161]:
x_train, x_test, y_train, y_test = train_test_split(dfx,yencoded,
                                                    test_size=0.2,random_state=1)

In [None]:
lgbtrain = lgb.Dataset(x_train, y_train)
lgbeval = lgb.Dataset(x_test, y_test, reference=lgbtrain)

In [142]:
params = {
'objective': 'multiclass',
'metric': {'multi_logloss','multi_error'},
'num_class':48,
'num_leaves': 31,
'learning_rate': 0.01,
'feature_fraction': 0.9,
'bagging_fraction': 0.7,
'bagging_freq': 1,
# 'verbose': 1,
}

In [169]:
params = {
'objective': 'multiclass',
'metric': {'multi_error'},
'num_class':48,
'num_leaves': 31,
# 'num_leaves': 124,
'learning_rate': 0.01,
'feature_fraction': 0.8,
'bagging_fraction': 0.5,
'bagging_freq': 1,
# 'verbose': 1,
}

In [172]:
lgbm = lgb.train(params,lgbtrain,num_boost_round=2000,
                 valid_sets=[lgbtrain,lgbeval],verbose_eval=50,early_stopping_rounds=100)

Training until validation scores don't improve for 100 rounds.
[50]	training's multi_error: 0.204086	valid_1's multi_error: 0.220934
[100]	training's multi_error: 0.1792	valid_1's multi_error: 0.201254
[150]	training's multi_error: 0.174385	valid_1's multi_error: 0.197779
[200]	training's multi_error: 0.171089	valid_1's multi_error: 0.196967
[250]	training's multi_error: 0.167874	valid_1's multi_error: 0.195798
[300]	training's multi_error: 0.165227	valid_1's multi_error: 0.195505
[350]	training's multi_error: 0.162434	valid_1's multi_error: 0.195246
[400]	training's multi_error: 0.160331	valid_1's multi_error: 0.195538
Early stopping, best iteration is:
[342]	training's multi_error: 0.16297	valid_1's multi_error: 0.195213


In [173]:
accuracy_score(y_test,lgbm.predict(x_test).argmax(1))

0.804786957651338

### Hyperopt

In [145]:
dfx.columns.values

array(['sex', 'age', 'token_len', '11_ind', '12_ind', '13_ind', '23_ind',
       '24_ind', '25_ind', '26_ind', '27_ind', '31_ind', '32_ind',
       '41_ind', '42_ind', '43_ind', '44_ind', '51_ind', '52_ind',
       '53_ind', '55_ind', '60_ind', '62_ind', '63_ind', '64_ind',
       '66_ind', '70_ind', '71_ind', '72_ind', '73_ind', '10_prob',
       '11_prob', '12_prob', '13_prob', '20_prob', '21_prob', '22_prob',
       '23_prob', '24_prob', '25_prob', '26_prob', '27_prob', '29_prob',
       '30_prob', '31_prob', '32_prob', '40_prob', '41_prob', '42_prob',
       '43_prob', '44_prob', '45_prob', '49_prob', '50_prob', '51_prob',
       '52_prob', '53_prob', '54_prob', '55_prob', '56_prob', '59_prob',
       '60_prob', '61_prob', '62_prob', '63_prob', '64_prob', '65_prob',
       '66_prob', '67_prob', '69_prob', '70_prob', '71_prob', '72_prob',
       '73_prob', '74_prob', '78_prob', '79_prob', '99_prob', '10_pred',
       '11_pred', '12_pred', '13_pred', '20_pred', '21_pred', '22_pred',


In [146]:
train_cols = ['sex', 'age', 'token_len'] + prob_cols + event_cols
x_train, x_test, y_train, y_test = train_test_split(dfx[train_cols],yencoded,
                                                    test_size=0.2,random_state=42)

In [149]:
lgbtrain = lgb.Dataset(x_train, y_train)
lgbeval = lgb.Dataset(x_test, y_test, reference=lgbtrain)

In [150]:
params = {
    'objective': 'multiclass',
    'metric': {'multi_error'},
    'num_class':48,
    'num_leaves': 31,
    'learning_rate': 0.01,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.5,
    'bagging_freq': 1,
    'verbose':-1
    }

In [155]:
def objective(args):
    # param updates
    params['num_leaves'] = args['num_leaves']
    params['learning_rate'] = args['learning_rate']
    params['feature_fraction'] = args['feature_fraction']
    params['bagging_fraction'] = args['bagging_fraction']
    params['min_data_in_leaf'] = args['min_data_in_leaf']
    lgbm = lgb.train(params,lgbtrain,
                     num_boost_round=500,
                 valid_sets=lgbeval,
                     verbose_eval=-1,
                     early_stopping_rounds=20)
    return 1 - accuracy_score(y_test,lgbm.predict(x_test).argmax(1))

In [156]:
space = dict([('learning_rate', hp.loguniform('learning_rate', 
                                                     np.log(0.0005), np.log(0.05))),
                    ('num_leaves', hp.choice('num_leaves', range(10, 100, 10))),
                    ('min_data_in_leaf', hp.choice('min_data_in_leaf', 
                                                  range(100, 1000, 100))),
                    ('feature_fraction', hp.uniform('feature_fraction', 0.5, 1.0)),
                    ('bagging_fraction', hp.uniform('subsample', 0.5, 1.0))
                    ])

In [157]:
trials = Trials()

In [None]:
best = fmin(objective, space, trials=trials, algo=tpe.suggest, max_evals=100)

Training until validation scores don't improve for 20 rounds.
Early stopping, best iteration is:                   
[139]	valid_0's multi_error: 0.200377
Training until validation scores don't improve for 20 rounds.                    
Did not meet early stopping. Best iteration is:                                  
[497]	valid_0's multi_error: 0.210249
Training until validation scores don't improve for 20 rounds.                    
Did not meet early stopping. Best iteration is:                                  
[500]	valid_0's multi_error: 0.223824
Training until validation scores don't improve for 20 rounds.                     
Early stopping, best iteration is:                                                
[260]	valid_0's multi_error: 0.205346
Training until validation scores don't improve for 20 rounds.                     
Early stopping, best iteration is:                                               
[286]	valid_0's multi_error: 0.205346
Training until validation scores do

Early stopping, best iteration is:                                                  
[203]	valid_0's multi_error: 0.202715
Training until validation scores don't improve for 20 rounds.                       
Early stopping, best iteration is:                                                  
[283]	valid_0's multi_error: 0.202065
Training until validation scores don't improve for 20 rounds.                       
Early stopping, best iteration is:                                                  
[150]	valid_0's multi_error: 0.204761
Training until validation scores don't improve for 20 rounds.                       
Early stopping, best iteration is:                                                  
[116]	valid_0's multi_error: 0.202098
Training until validation scores don't improve for 20 rounds.                       
Early stopping, best iteration is:                                                  
[417]	valid_0's multi_error: 0.211419
Training until validation scores don't improv

In [125]:
trials.results

[{'status': 'new'},
 {'loss': 0.2220057157703299, 'status': 'ok'},
 {'loss': 0.20469602494154326, 'status': 'ok'},
 {'loss': 0.2036243180046765, 'status': 'ok'},
 {'loss': 0.2000194855806703, 'status': 'ok'},
 {'loss': 0.20537802026500385, 'status': 'ok'},
 {'loss': 0.23372954014029623, 'status': 'ok'},
 {'loss': 0.20820342946219794, 'status': 'ok'},
 {'loss': 0.27773447648739935, 'status': 'ok'},
 {'loss': 0.2021628994544037, 'status': 'ok'},
 {'loss': 0.2006365289685632, 'status': 'ok'},
 {'loss': 0.19894777864380353, 'status': 'ok'},
 {'loss': 0.2350935307872175, 'status': 'ok'},
 {'loss': 0.20567030397505848, 'status': 'ok'},
 {'loss': 0.32063522992985194, 'status': 'ok'},
 {'loss': 0.21797869576513385, 'status': 'ok'},
 {'loss': 0.20420888542478566, 'status': 'ok'},
 {'loss': 0.2006690049363471, 'status': 'ok'},
 {'loss': 0.23324240062353863, 'status': 'ok'},
 {'loss': 0.2056378280072746, 'status': 'ok'},
 {'loss': 0.20209794751883603, 'status': 'ok'},
 {'loss': 0.2030722265523512

In [126]:
trials.idxs_vals

({'feature_fraction': [0,
   1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   25,
   26,
   27,
   28,
   29,
   30],
  'learning_rate': [0,
   1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   25,
   26,
   27,
   28,
   29,
   30],
  'min_data_in_leaf': [0,
   1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   25,
   26,
   27,
   28,
   29,
   30],
  'num_leaves': [0,
   1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   25,
   26,
   27,
   28,
   29,
   30],
  'subsample': [0,
   1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   

In [124]:
tpe_results = pd.DataFrame({'loss': np.round([x['loss'] for x in trials.results],3), 
                            'feature_fraction': np.round(trials.idxs_vals[1]['feature_fraction'],2),
                            'learning_rate': np.round(trials.idxs_vals[1]['learning_rate'],4),
                            'min_data_in_leaf': trials.idxs_vals[1]['min_data_in_leaf'],
                            'num_leaves': trials.idxs_vals[1]['num_leaves'],
                            'subsample': np.round(trials.idxs_vals[1]['subsample'],2),
                           })

KeyError: 'loss'

In [123]:
tpe_results

Unnamed: 0,loss,feature_fraction,learning_rate,min_data_in_leaf,num_leaves,subsample
0,0.705,0.35,0.0059,414,65,0.53
1,0.832,0.7,0.0007,49,91,0.38
2,0.265,0.57,0.0662,797,82,0.26
3,0.512,0.67,0.0184,762,77,0.14
4,0.232,1.0,0.1283,346,18,0.99
5,0.358,0.2,0.0428,23,8,0.85
6,0.339,0.14,0.4479,421,60,0.91
7,0.547,0.87,0.0171,959,5,0.72
8,0.269,0.73,0.0356,786,79,0.2
9,0.39,0.32,0.0265,619,27,0.84


In [107]:
trials.idxs_vals[1]

{'feature_fraction': [0.35210393572331,
  0.7002177523515323,
  0.5713359951349863,
  0.6684530595040203,
  0.9990607832032512,
  0.20294354945207738,
  0.1394919808261862,
  0.8684331133888737,
  0.7285455603094689,
  0.31967438991522756],
 'learning_rate': [0.005883690501327409,
  0.0007448534397700522,
  0.06620960451234696,
  0.018374030155032327,
  0.12831422181399435,
  0.04284313340780871,
  0.447937852411587,
  0.01707159050982002,
  0.03558712040482717,
  0.02650223543308471],
 'min_data_in_leaf': [414, 49, 797, 762, 346, 23, 421, 959, 786, 619],
 'num_leaves': [65, 91, 82, 77, 18, 8, 60, 5, 79, 27],
 'subsample': [0.5266841916844415,
  0.3808931059262253,
  0.2595668161088708,
  0.143733536300701,
  0.9901022166394045,
  0.8513870905741188,
  0.9132541570507843,
  0.7154978772239294,
  0.2033319340468047,
  0.8413789214845621]}

In [63]:
sp = hp.uniform('x',0,10)

In [73]:
import hyperopt

In [101]:
hyperopt.pyll.stochastic.sample(sp)

4.401432209829791