In [2]:
import pandas as pd
import sklearn
import random
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn import metrics
from sklearn import model_selection, naive_bayes, svm, metrics
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score


In [3]:
df = pd.read_csv('reports_with_vectors_1000.csv')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112095 entries, 0 to 112094
Columns: 1009 entries, Unnamed: 0 to you
dtypes: float64(1005), int64(1), object(3)
memory usage: 862.9+ MB


In [5]:
df['shape'].value_counts()

light        22762
circle       18504
sphere       16016
other        15607
triangle     10497
oval          7050
cylinder      4404
formation     3456
changing      2716
flash         1963
rectangle     1848
diamond       1650
chevron       1294
cone           468
cross          344
Name: shape, dtype: int64

In [6]:
df = df[pd.notnull(df['shape'])]

In [7]:
cat_cols = ['region']

for col in cat_cols:
    temp = pd.get_dummies(df[col], prefix=col)
    df= pd.concat([df, temp], axis=1)

In [8]:
from sklearn.preprocessing import LabelEncoder
Encoder = LabelEncoder()
y= Encoder.fit_transform(df['shape'])

In [9]:
d = dict(zip(df['shape'], y))

In [10]:
d = pd.DataFrame(
sorted(d.items(), key=lambda x: x[1], reverse=False))

In [11]:
d

Unnamed: 0,0,1
0,changing,0
1,chevron,1
2,circle,2
3,cone,3
4,cross,4
5,cylinder,5
6,diamond,6
7,flash,7
8,formation,8
9,light,9


In [12]:
x = df.drop(columns =['shape', 'Unnamed: 0','citystate','region'])

In [13]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, 
                                                    y, 
                                                    test_size=0.2, 
                                                    random_state=1,stratify=y)

In [14]:
import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV

x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size = 0.2, random_state=1, stratify=y_train)



In [15]:
lgb_mdl_rand = lgb.LGBMClassifier(boosting_type = 'gbdt',
                                n_jobs = -1,
                                objective = 'multiclass',
                                num_iterations = 3,
                                metric = 'multi_logloss',
                                pos_bagging_fraction = 0.5,
                                  
                               )

lgb_grid_params_rand = { 
    'learning_rate': [1, 3, 5],
    'estimator__max_depth' : [10, 20, 30],
    'num_leaves': [5, 10,15,20],
    
}

lgb_gs_rand = RandomizedSearchCV(lgb_mdl_rand, lgb_grid_params_rand,
                                 n_jobs=1, cv=4,
                                 random_state=2)

fit_params={"early_stopping_rounds" : 10,
           "eval_set" : [[x_val, y_val]]}


In [16]:
lgb_gs_rand.fit(x_train, y_train, **fit_params)

In [17]:
lgb_train_predictions =lgb_gs_rand.predict(x_train)
lgb_test_predictions= lgb_gs_rand.predict(x_test)

train_probs_lgb_r = lgb_gs_rand.predict_proba(x_train)
test_probs_lgb_r = lgb_gs_rand.predict_proba(x_test)


In [18]:
accuracy_score(lgb_train_predictions, y_train)

In [19]:
accuracy_score(lgb_test_predictions, y_test)

In [20]:
from sklearn.metrics import log_loss
log_loss(y_train, train_probs_lgb_r)

In [21]:
log_loss(y_test, test_probs_lgb_r)

In [22]:
lgb_gs_rand.best_params_

In [23]:
pd.crosstab(y_test, lgb_test_predictions,
            rownames=['True'], colnames=['Predicted'], margins=True)

In [24]:
lightgbm_hp_test = lgb.Dataset(x_test, y_test)
lightgbm_hp_train = lgb.Dataset(x_train, y_train)
lightgbm_hp_val = lgb.Dataset(x_val, y_val)

In [25]:
# Sets the space to search over and the prior probabilities over the search space 
import hyperopt as hp
from hyperopt import Trials,fmin,STATUS_OK
lgbm_space = {

# hp.choice.choice will select 1 value from the given list  , 'dart', 'goss', 'rf'
    'boosting_type': hp.hp.choice('boosting_type',  ['gbdt']),
    'num_leaves':hp.hp.choice('num_leaves', np.arange(10, 300,1, dtype=int)),

    'subsample':hp.hp.quniform('subsample',0.5,1.0,0.05),
    'colsample_bytree':hp.hp.quniform('colsample_bytree',0.5,1.0,0.05),
    'min_child_weight':hp.hp.quniform('min_child_weight', 100, 1000,100),
    'reg_alpha': hp.hp.uniform('reg_alpha', 0.0, 1000.0),
    'reg_lambda': hp.hp.uniform('reg_lambda', 0.0, 1000.0),
    'learning_rate': hp.hp.loguniform('learning_rate', -4, 0),
    'feature_fraction': hp.hp.loguniform('feature_fraction', -4, 0),
    'bagging_fraction': hp.hp.loguniform('bagging_fraction', -4, 0),
    'bagging_frequency':hp.hp.choice('bagging_frequency', np.arange(5, 100,1, dtype=int)),
    'drop_rate': hp.hp.loguniform('drop_rate', -4, 0),
    'scale_pos_weight': hp.hp.uniform('scale_pos_weight', 6.0, 10.0),
    
    'num_class' : 1, 
    'metric' : 'auc',
    'nthread': 6, 
    'max_bin': 512
    }

In [26]:
# Here we define an objective (loss) function I take 
def objective_m(params, n_folds=5):


    model = lgb.cv(params = params,
              train_set = lightgbm_hp_train,
              num_boost_round = 10000,
              early_stopping_rounds = 10,
             nfold = n_folds)
  
    
  
    # returns the best average loss on validation set 
    #need loss function; need the 1-auc- because try to maximize; 
    
    loss = 1 - (max(model['auc-mean'])) #/max*modedl['std-mean']to normalize it
    return loss


bayes_trials = Trials()
MAX_EVALS = 20 # this controls the runtime 

lgbm_best_m = fmin(fn = objective_m, space = lgbm_space, algo = hp.tpe.suggest, 
max_evals = MAX_EVALS, trials = bayes_trials)

100%|███████████████████████████████████████████████████| 20/20 [09:08<00:00, 20.42s/it, best loss: 0.3754336195154675]


In [35]:
lgbm_best_m

{'bagging_fraction': 0.2023277887592808,
 'bagging_frequency': 35,
 'boosting_type': 0,
 'colsample_bytree': 0.8,
 'drop_rate': 0.054230323092144664,
 'feature_fraction': 0.5469909816575599,
 'learning_rate': 0.22935726079049248,
 'min_child_weight': 300.0,
 'num_leaves': 137,
 'reg_alpha': 108.80094399619455,
 'reg_lambda': 429.1433341293549,
 'scale_pos_weight': 6.912132053630817,
 'subsample': 0.65}

In [36]:
lgb_clf = lgb.LGBMClassifier(params = lgbm_best_m)

In [37]:
lgb_clf = lgb.LGBMClassifier(
 boosting_type='gbdt',
 bagging_fraction= 0.703289623071987,
 bagging_frequency=92,
 colsample_bytree= 0.8500000000000001,
 drop_rate=0.24815050039128092,
 feature_fraction= 0.9333524600272556,
 learning_rate= 0.851438523524139,
 min_child_weight= 800.0,
 num_leaves= 145,
 reg_alpha=332.1387607329755,
 reg_lambda= 991.3161338134053,
 scale_pos_weight= 8.849542137240574,
 subsample =0.75)


In [38]:
lgb_clf.fit(x_train, y_train)

lgb_train_predictions =lgb_clf.predict(x_train)
lgb_test_predictions= lgb_clf.predict(x_test)

train_probs_lgb_r = lgb_clf.predict_proba(x_train)
test_probs_lgb_r = lgb_clf.predict_proba(x_test)


In [40]:
print('train accuracy: ', accuracy_score(lgb_train_predictions, y_train))
print('test accuracy: ',accuracy_score(lgb_test_predictions, y_test))
from sklearn.metrics import log_loss
print('train logloss: ',log_loss(y_train, train_probs_lgb_r))
print('test logloss: ',log_loss(y_test, test_probs_lgb_r))

train accuracy:  0.3807454309972658
test accuracy:  0.36737889114017314
train logloss:  2.0886163988204793
test logloss:  2.117645602746526


In [34]:
import shap

lgb_clf = lgb.LGBMClassifier(boosting_type = 'gbdt',
                                n_jobs = -1,
                                objective = 'multiclass',
                                num_iterations = 3,
                                metric = 'multi_logloss',
                                pos_bagging_fraction = 0.5,
                    num_leaves= 20, learning_rate= 1, estimator__max_depth=30)

lgb_clf.fit(x_train,y_train)
                
shape_shap_explainer = shap.TreeExplainer(lgb_clf)



In [36]:
shape_shap_vals_train = shape_shap_explainer.shap_values(x_train)

In [37]:
shape_shap_vals_test = shape_shap_explainer.shap_values(x_test)

In [38]:
# We can also plot the variable importance of all of our variables in a summary plot
shap.summary_plot(shape_shap_vals_train, x_train)

MemoryError: 