# Model

RF Baseline: 2.25/

## Ideas
- tune xgb

In [1]:
import timeit

import pandas as pd
import numpy as np

from skater.model import InMemoryModel
from skater.core.explanations import Interpretation
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

# Set Env fo XGBoost
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'

seed=16

## Utility Functions

In [2]:
def run_estimators(estimators, X_train, y_train, X_test, y_test, scoring, cv=5, verbose=False):

    scores = []
    
    for clf in estimators:
        print("Running {0}...".format(clf.__class__.__name__))
        start_time = timeit.default_timer()
        clf.fit(X_train, y_train)
        elapsed = timeit.default_timer() - start_time
        print('Time Elapsed: ' + str(elapsed))
                
        test_scores = cross_val_score(clf, X_test, y_test, scoring=scoring, cv=cv)
        mean_test_cv_score = np.mean(test_scores)
        std_def_test_cv_score = np.std(test_scores)
        
        if verbose:
            train_scores = cross_val_score(clf, X_train, y_train, scoring=scoring, cv=cv)
            print("Mean Train CV Score: {0}".format(np.mean(train_scores)))
            print("Std. Dev. Train CV Score: {0}".format(np.std(train_scores)))
            print("Mean Test CV Score: {0}".format(mean_test_cv_score))
            print("Std. Dev. Test CV Score: {0}".format(std_dev_test_cv_score))  
        
        scores.append((clf.__class__.__name__ + " " + str(clf.get_params()), mean_test_cv_score, std_def_test_cv_score))
        
    return sorted(scores, key=lambda x: x[1], reverse=True)


def select_best_subset(model, X_train, y_train, X_test, y_test, fi, scoring, cv=5, early_stopping=25):
    scores = [] # (k, cv_score_mean, cv_score_std)
    best_score = -10000000
    stop_count = 0
    
    for i in range(1, len(fi)+1):
        # Train
        test_features = fi[:i]
        model.fit(X_train[test_features], y_train)
        
        # CV Test
        test_scores = cross_val_score(model, X_test[test_features], y_test, scoring=scoring, cv=cv)
        mean_test_cv_score = np.mean(test_scores)
        std_def_test_cv_score = np.std(test_scores)
        
        print(f'K: {i} \t=> {mean_test_cv_score}, {std_def_test_cv_score}')
        
        scores.append((i, mean_test_cv_score, std_def_test_cv_score))
        
        # Early Stopping Check
        if not mean_test_cv_score > best_score:
            stop_count += 1
        else:
            best_score = mean_test_cv_score
            stop_count = 0
            
        if stop_count >= early_stopping:
            print("Stopping Early")
            break
        
    scores = sorted(scores, key=lambda x: x[1], reverse=True)
    
    k_best, mean_score, std_score = scores[0]
    print(f'Best K: {k_best} \t=> {mean_score}, {std_score}')
    
    return k_best, mean_score, std_score


def missing_percentage(df):
    '''Calculates the percentage of NaNs in a DataFrame
    '''
    total = df.isnull().sum().sort_values(ascending=False)
    percent = (df.isnull().sum()/df.isnull().count()).sort_values(ascending=False)
    missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    
    return missing_data

## Data

In [3]:
df = pd.read_csv('../data/interim/training_fe.csv')
df_test = pd.read_csv('../data/interim/testing_fe.csv')

In [4]:
df.head()

Unnamed: 0,object_id,ra,decl,gal_l,gal_b,ddf,distmod,mwebv,target,hostgal_z,...,flux_by_flux_ratio_sq_max,flux_by_flux_ratio_sq_min,flux_by_flux_ratio_sq_skew,flux_by_flux_ratio_sq_kurtosis,flux_by_flux_ratio_sq_sum,flux_diff,flux_diff_mean,flux_w_mean,flux_diff_w_mean,mjd_det_diff
0,615,349.046051,-61.943836,320.79653,-51.753706,1,31.9961,0.017,92,0.0,...,15002050.0,-29634730.0,-1.414322,2.488978,-960176600.0,1761.066406,-14.306331,-327.742307,-5.373326,873.7903
1,713,53.085938,-27.784405,223.525509,-54.460748,1,45.4063,0.007,88,1.8181,...,735.5214,-2368.542,-3.454554,17.365892,-28750.87,29.506064,-20.730002,-4.884564,-6.040676,873.7903
2,730,33.574219,-6.579593,170.455585,-61.548219,1,40.2561,0.021,42,0.232,...,13065.87,-72.61569,5.989138,37.863683,104650.2,66.46987,29.315018,25.37311,2.619697,873.7903
3,745,0.189873,-45.586655,328.254458,-68.969298,1,40.7951,0.007,90,0.3037,...,4834338.0,-42.69851,11.141069,133.621827,14391250.0,236.289675,26.521968,152.835617,1.546038,873.7903
4,1124,352.711273,-63.823658,316.922299,-51.059403,1,40.4166,0.024,90,0.1934,...,532455.9,-46.13587,7.908174,63.414259,3015599.0,160.143942,22.411225,87.85639,1.822792,873.7903


## Preprocessing

In [5]:
print(missing_percentage(df))

print(missing_percentage(df_test))

                  Total  Percent
mjd_det_diff          0      0.0
4_max                 0      0.0
2_median              0      0.0
3_median              0      0.0
4_median              0      0.0
5_median              0      0.0
0_std                 0      0.0
1_std                 0      0.0
2_std                 0      0.0
3_std                 0      0.0
4_std                 0      0.0
5_std                 0      0.0
0_max                 0      0.0
1_max                 0      0.0
2_max                 0      0.0
3_max                 0      0.0
5_max                 0      0.0
flux_diff_w_mean      0      0.0
0_min                 0      0.0
1_min                 0      0.0
2_min                 0      0.0
3_min                 0      0.0
4_min                 0      0.0
5_min                 0      0.0
0_skew                0      0.0
1_skew                0      0.0
2_skew                0      0.0
3_skew                0      0.0
4_skew                0      0.0
5_skew    

In [6]:
# Drop unneeded features
df = df.drop(['object_id', 'ddf'], axis=1)
test_object_id = df_test['object_id']
df_test = df_test.drop(['object_id', 'ddf'], axis=1)

In [7]:
y = df['target']
X = df.drop(['target'], axis=1)

In [8]:
# Standardize
col_names = X.columns
X = pd.DataFrame(StandardScaler().fit_transform(X), columns=col_names)
df_test = pd.DataFrame(StandardScaler().fit_transform(df_test), columns=col_names)  # redo when you have test data

In [9]:
# Test, Train Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=seed)

## Test Various Models

In [10]:
# Class Weights
w = y.value_counts()
weights = {i : np.sum(w) / w[i] for i in w.index}

weights

{90: 3.3929961089494163,
 42: 6.57837384744342,
 65: 8.0,
 16: 8.493506493506494,
 15: 15.854545454545455,
 62: 16.214876033057852,
 88: 21.21081081081081,
 92: 32.83682008368201,
 67: 37.73076923076923,
 52: 42.885245901639344,
 95: 44.84571428571429,
 6: 51.973509933774835,
 64: 76.94117647058823,
 53: 261.6}

In [11]:
# Best Params pulled from https://www.kaggle.com/rooshroosh/forked-dart-w-ideas-from-kernels-and-discuss

best_params = {
            'device': 'cpu', 
            'objective': 'multiclass', 
            'num_class': 14, 
            'boosting_type': 'gbdt', 
            'n_jobs': -1, 
            'max_depth': 7, 
            'n_estimators': 500, 
            'subsample_freq': 2, 
            'subsample_for_bin': 5000, 
            'min_data_per_group': 100, 
            'max_cat_to_onehot': 4, 
            'cat_l2': 1.0, 
            'cat_smooth': 59.5, 
            'max_cat_threshold': 32, 
            'metric_freq': 10, 
            'verbosity': -1, 
            'metric': 'multi_logloss', 
            'xgboost_dart_mode': False, 
            'uniform_drop': False, 
            'colsample_bytree': 0.5, 
            'drop_rate': 0.173, 
            'learning_rate': 0.0267, 
            'max_drop': 5, 
            'min_child_samples': 10, 
            'min_child_weight': 100.0, 
            'min_split_gain': 0.1, 
            'num_leaves': 7, 
            'reg_alpha': 0.1, 
            'reg_lambda': 0.00023, 
            'skip_drop': 0.44, 
            'subsample': 0.75}

In [12]:
models = [
    # Tree
    #DecisionTreeClassifier(random_state=seed),
    # Ensembles
    #AdaBoostClassifier(random_state=seed),
    #RandomForestClassifier(n_estimators=100, random_state=seed),
    XGBClassifier(random_state=seed, sample_weight=y_train.map(weights), **best_params),
    #KNN
    #KNeighborsClassifier(n_neighbors=2),
    #KNeighborsClassifier(n_neighbors=4),
    #KNeighborsClassifier(n_neighbors=8),
    #KNeighborsClassifier(n_neighbors=16),
    #KNeighborsClassifier(n_neighbors=32),
    #KNeighborsClassifier(n_neighbors=64),
    #KNeighborsClassifier(n_neighbors=128),
    #KNeighborsClassifier(n_neighbors=256),
    #KNeighborsClassifier(n_neighbors=512),
]

scores = run_estimators(models, X_train, y_train, X_test, y_test, scoring="neg_log_loss")
best_score = np.round(scores[0][1], 4)

print("\nBest Scores:")
for clf, mean_score, std_score in scores:
    print(f'{clf}\nf{mean_score} (+/- {std_score})')

Running XGBClassifier...
Time Elapsed: 10.642307027010247

Best Scores:
XGBClassifier {'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.0267, 'max_delta_step': 0, 'max_depth': 7, 'min_child_weight': 100.0, 'missing': None, 'n_estimators': 500, 'n_jobs': -1, 'nthread': None, 'objective': 'multi:softprob', 'random_state': 16, 'reg_alpha': 0.1, 'reg_lambda': 0.00023, 'scale_pos_weight': 1, 'seed': None, 'silent': True, 'subsample': 0.75, 'sample_weight': 4761     6.578374
1709    37.730769
2685     8.493506
4392     8.493506
2312     8.000000
5472    37.730769
6895    15.854545
3064     6.578374
5926     8.000000
2699    42.885246
5184    32.836820
2985    21.210811
3984    51.973510
6364     3.392996
341      3.392996
1821     8.000000
5267     8.493506
2745     3.392996
2785    37.730769
4962    16.214876
5471    32.836820
2469    42.885246
1933     6.578374
6990     6.578374
2554     3.392996
2300    42.885246
4238 

## Feature Importances

In [13]:
# Tree Model Feature Importances (if tree)

best_model = XGBClassifier(random_state=seed, sample_weight=y_train.map(weights), **best_params)
best_model.fit(X_train, y_train)

for feature, importance in sorted(zip(X.columns, best_model.feature_importances_), key=lambda x: x[1], reverse=True):
    print(feature, importance)

hostgal_z 0.093151726
distmod 0.05000476
flux_ratio_sq_median 0.0268597
2_median 0.025335746
2_skew 0.023811791
flux_kurtosis 0.023335556
1_median 0.021906849
1_std 0.021621106
5_mean 0.02114487
flux_diff_mean 0.021049624
flux_by_flux_ratio_sq_skew 0.020668635
0_std 0.018763691
5_max 0.016668255
0_mean 0.01647776
flux_diff_w_mean 0.01647776
flux_skew 0.014858558
1_mean 0.014572817
5_std 0.014572817
3_skew 0.01447757
0_skew 0.014191828
1_kurtosis 0.0138108395
0_max 0.013620345
4_skew 0.013144109
1_min 0.012477378
1_skew 0.012477378
2_max 0.012191637
3_kurtosis 0.012191637
4_kurtosis 0.011715402
2_std 0.01142966
flux_by_flux_ratio_sq_min 0.011334413
4_mean 0.011239165
3_max 0.010381941
5_skew 0.010191447
1_max 0.0100009525
4_std 0.009715211
detected 0.009619963
5_median 0.009334222
flux_by_flux_ratio_sq_median 0.009048481
0_median 0.00876274
flux_by_flux_ratio_sq_kurtosis 0.00876274
3_std 0.008667492
4_median 0.008191256
0_kurtosis 0.0080960095
2_kurtosis 0.008000762
3_min 0.0078102676
4

In [14]:
# Skater Model Feature Importances

model = InMemoryModel(best_model.predict_proba, examples = X)
interpreter = Interpretation()
interpreter.load_data(X)
fi = interpreter.feature_importance.feature_importance(model, ascending=False)

fi

[129/129] features ████████████████████ Time elapsed: 56 seconds

hostgal_z                     0.179904
distmod                       0.053758
flux_by_flux_ratio_sq_skew    0.045722
flux_diff_mean                0.035114
flux_kurtosis                 0.030482
flux_ratio_sq_median          0.030214
2_median                      0.024715
2_skew                        0.021291
flux_skew                     0.020624
0_mean                        0.020197
5_mean                        0.017462
1_kurtosis                    0.016203
1_std                         0.015946
4_kurtosis                    0.015878
1_median                      0.015848
3_kurtosis                    0.015296
0_skew                        0.014764
0_std                         0.014700
flux_diff_w_mean              0.014583
5_max                         0.014466
3_skew                        0.014128
5_std                         0.013625
2_max                         0.013613
1_mean                        0.013159
0_max                         0.012382
flux_by_flux_ratio_sq_min

In [15]:
# Get about a .03 local jump with kbest, However Kbest minimizes kaggle score

# k_best, best_score, _ = select_best_subset(XGBClassifier(random_state=seed),
#                                            X_train, y_train,
#                                            X_test, y_test,
#                                            fi.index.values, # Just a list of features sorted by importance
#                                            scoring="neg_log_loss")

In [16]:
best_model = XGBClassifier(random_state=seed, sample_weight=y.map(weights), **best_params)
# best_model.fit(X[fi.index.values[:k_best]], y)  # do I do a full fit here?
best_model.fit(X, y)  # do I do a full fit here?

XGBClassifier(base_score=0.5, booster='gbtree', boosting_type='gbdt',
       cat_l2=1.0, cat_smooth=59.5, colsample_bylevel=1,
       colsample_bytree=0.5, device='cpu', drop_rate=0.173, gamma=0,
       learning_rate=0.0267, max_cat_threshold=32, max_cat_to_onehot=4,
       max_delta_step=0, max_depth=7, max_drop=5, metric='multi_logloss',
       metric_freq=10, min_child_samples=10, min_child_weight=100.0,
       min_data_per_group=100, min_split_gain=0.1, missing=None,
       n_estimators=500, n_jobs=-1, nthread=None, num_class=14,
       num_leaves=7, objective='multi:softprob', random_state=16,
       reg_alpha=0.1, reg_lambda=0.00023,
       sample_weight=0       32.836820
1       21.210811
2        6.578374
3        3.392996
4        3.392996
5        8.000000
6        3.392996
7        6.578374
8        3.392996
9        8.000000
10       3.392996
11       6.578374
12       6.578374
13       3.392996
14       8.000000
15       8.49350...996
7845     8.493506
7846     8.000000
78

## Generate Submission

In [17]:
# Generate Test Pred and Ouput
# preds = best_model.predict_proba(df_test[fi.index.values[:k_best]])
preds = best_model.predict_proba(df_test)

In [18]:
## Class_99
# Let's try just .1
class_99_prob = .1

class_99 = np.full((preds.shape[0], 1), class_99_prob)
total_class = np.hstack((preds, class_99))
total_class /= np.sum(total_class, axis=1).reshape((-1, 1))

test_object_id_values = test_object_id.values.reshape((-1, 1))
output_vals = np.hstack((test_object_id_values, total_class))

output_cols = np.array(['object_id'] + ["class_" + str(x) for x in sorted(np.unique(y)) + [99]])

sub = pd.DataFrame(output_vals, columns=output_cols)
sub['object_id'] = sub['object_id'].astype(int)
file_name = f'submission_{best_model.__class__.__name__}_{best_score}'
sub.to_csv(f'../data/submissions/{file_name}.csv', index=False)

print(file_name)

sub.head()

submission_XGBClassifier_-1.4849


Unnamed: 0,object_id,class_6,class_15,class_16,class_42,class_52,class_53,class_62,class_64,class_65,class_67,class_88,class_90,class_92,class_95,class_99
0,13,0.015396,0.160802,0.007699,0.179919,0.108278,0.010138,0.145189,0.024936,0.009216,0.033178,0.004542,0.171062,0.011267,0.02747,0.090909
1,14,0.002768,0.536641,0.00311,0.0359,0.036306,0.00322,0.01392,0.007314,0.007196,0.00966,0.0037,0.240618,0.004019,0.004719,0.090909
2,17,0.00263,0.255319,0.002345,0.034675,0.028077,0.002996,0.008469,0.006922,0.007428,0.014274,0.007488,0.528137,0.003528,0.006803,0.090909
3,23,0.004772,0.264243,0.006732,0.024057,0.027858,0.003226,0.010874,0.007485,0.00623,0.036546,0.00427,0.50439,0.003787,0.004623,0.090909
4,34,0.006647,0.220657,0.002836,0.080419,0.044653,0.00393,0.021072,0.008391,0.014164,0.059632,0.001888,0.433152,0.004281,0.007369,0.090909


| Version | Name | Local Score | Kaggle Score | Notes |
| ------- | ---- | ----------- | ------------ | ----- |
| 1 | submission_XGBClassifier_-0.7405 | -0.7405 | 2.212 | flux, flux_ratio, flux_by_flux, detected - No Kbest |
| 2 | submission_XGBClassifier_-0.7307 | -0.7307 | 2.264 | Diff Features, class weights |
| 3 | submission_XGBClassifier_-1.4849 | -1.4849 | 2.215 | tuned xgb with params from kaggle discussion |