# Import

In [1]:
import pandas
from sklearn.ensemble import GradientBoostingClassifier
import evaluation

# Read training data

In [2]:
# Directory containing the input CSV files; file names are appended to it
# directly, so it keeps its trailing slash.
folder = 'tau_data/'
# Training sample, indexed by its 'id' column.
train = pandas.read_csv(
    folder + 'training.csv',
    index_col='id')

In [3]:
# Preview the first five rows (the default for head) of the training frame.
train.head(n=5)

Unnamed: 0_level_0,LifeTime,dira,FlightDistance,FlightDistanceError,IP,IPSig,VertexChi2,pt,DOCAone,DOCAtwo,...,p1_p,p2_p,p0_eta,p1_eta,p2_eta,SPDhits,production,signal,mass,min_ANNmuon
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
18453471,0.001578,0.999999,14.033335,0.681401,0.016039,0.451886,1.900433,1482.037476,0.066667,0.060602,...,12290.760742,39264.398438,3.076006,4.0038,4.031514,458,-99,0,1866.300049,0.277559
5364094,0.000988,0.999705,5.536157,0.302341,0.142163,9.564503,0.865666,3050.720703,0.024022,0.019245,...,16562.667969,7341.257812,3.228553,2.786543,2.975564,406,-99,0,1727.095947,0.225924
11130990,0.000877,0.999984,6.117302,0.276463,0.034746,1.970751,10.975849,3895.908691,0.055044,0.047947,...,22695.388672,10225.30957,3.536903,2.865686,3.05281,196,-99,0,1898.588013,0.36863
15173787,0.000854,0.999903,5.228067,0.220739,0.076389,4.271331,3.276358,4010.781738,0.053779,0.006417,...,16909.515625,9141.426758,3.087461,3.218034,2.375592,137,-99,0,1840.410034,0.246045
1102544,0.001129,0.999995,39.069534,1.898197,0.120936,4.984982,0.468348,4144.546875,0.004491,0.037326,...,97612.804688,47118.785156,4.632295,4.711155,4.296878,477,-99,0,1899.793945,0.22206


# Define training features
Here we use a subset of all the features so that the model passes the agreement check.

In [4]:
# Feature columns used for training and for every predict_proba call below.
variables = ['LifeTime', 'FlightDistance', 'pt']

# Baseline training

In [5]:
# Baseline gradient boosting model. random_state is fixed so that the
# row subsampling (subsample=0.7) gives the same model on every run.
baseline = GradientBoostingClassifier(
    n_estimators=40,
    learning_rate=0.01,
    subsample=0.7,
    min_samples_leaf=10,
    max_depth=7,
    random_state=11)
# Fit on the selected feature columns against the 'signal' label column.
# Kept as the last expression so the fitted estimator's repr is displayed.
baseline.fit(train[variables], train['signal'])

GradientBoostingClassifier(init=None, learning_rate=0.01, loss='deviance',
              max_depth=7, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=10, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=40,
              random_state=11, subsample=0.7, verbose=0, warm_start=False)

# Check agreement test

In [11]:
# Load the agreement-check sample and score it with the baseline model.
check_agreement = pandas.read_csv(folder + 'check_agreement.csv', index_col='id')
# Column 1 of predict_proba is the probability of the second class in baseline.classes_.
agreement_probs = baseline.predict_proba(check_agreement[variables])[:, 1]

# Extract labels and per-row weights once, then split both the predictions
# and the weights into the signal == 0 and signal == 1 groups.
agreement_labels = check_agreement['signal'].values
agreement_weights = check_agreement['weight'].values

ks = evaluation.compute_ks(
    agreement_probs[agreement_labels == 0],
    agreement_probs[agreement_labels == 1],
    agreement_weights[agreement_labels == 0],
    agreement_weights[agreement_labels == 1])
# Report the metric together with the pass/fail verdict against the 0.09
# threshold; without this line the check computes a value nobody sees.
# A single pre-formatted string prints the same text on Python 2 and 3.
print('KS metric %s %s' % (ks, ks < 0.09))


# Check correlation test

In [7]:
check_correlation = pandas.read_csv(folder + 'check_correlation.csv', index_col='id')
correlation_probs = baseline.predict_proba(check_correlation[variables])[:, 1]
cvm = evaluation.compute_cvm(correlation_probs, check_correlation['mass'])
print 'CvM metric', cvm, cvm < 0.002

CvM metric 0.000981509354914 True


# Compute weighted AUC on the training data with min_ANNmuon > 0.4

In [8]:
train_eval = train[train['min_ANNmuon'] > 0.4]
train_probs = baseline.predict_proba(train_eval[variables])[:, 1]
AUC = evaluation.roc_auc_truncated(train_eval['signal'], train_probs)
print 'AUC', AUC

AUC 0.834346386627


In [18]:
# Show the predicted probabilities for the selected training rows.
print(train_probs)

[ 0.62099332  0.61875278  0.58634486 ...,  0.65732514  0.65230287
  0.57613771]


# Predict test, create file for kaggle

In [21]:
# Load the test sample, indexed by its 'id' column.
test = pandas.read_csv(
    folder + 'test.csv',
    index_col='id')
# Two-column frame: the event id and the baseline's predicted probability
# (column 1 of predict_proba) for that event.
result = pandas.DataFrame(
    {'id': test.index,
     'prediction': baseline.predict_proba(test[variables])[:, 1]},
    columns=['id', 'prediction'])

In [22]:
# Show the id/prediction table (pandas truncates the middle of long frames).
print(result)

              id  prediction
0       14711831    0.523875
1       16316387    0.629466
2        6771382    0.637163
3         686045    0.524218
4        8755882    0.655426
5       10247299    0.641789
6        2985764    0.580083
7        7050757    0.620993
8       12921966    0.521370
9       18405499    0.548906
10      13271738    0.540304
11       4989026    0.416308
12      11992549    0.658253
13      12230726    0.621372
14       4369817    0.659353
15       8110882    0.587021
16       9954630    0.641386
17      12257830    0.639896
18      15913215    0.640992
19       9795517    0.651639
20      15657409    0.633142
21       1136451    0.625454
22       3859071    0.537551
23      13542987    0.655576
24      11647267    0.633700
25       8047788    0.589113
26       4657554    0.651410
27      14265844    0.656456
28      12659406    0.644846
29       8297573    0.627869
...          ...         ...
855789  11832618    0.560924
855790   5444909    0.654672
855791   68905