In [43]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostClassifier, Pool
import warnings
import seaborn as sns
from sklearn import preprocessing
import matplotlib.pyplot as plt
%matplotlib inline
warnings.filterwarnings('ignore')

In [44]:
df = pd.read_csv('df.csv')

In [45]:
mean_std_df = df.agg(['mean', 'std'])
norm_columns = ['location_x','location_y','remaining_min','remaining_sec','distance_of_shot','add_loc','sub_loc']
mean_std_df = mean_std_df[norm_columns]

In [46]:
def normalize(df):
    df[norm_columns] = (df[norm_columns] - mean_std_df.loc['mean', :]) / mean_std_df.loc['std', :].apply(lambda x: 1 if x == 0 else x)
    no_variance = mean_std_df.loc['std', mean_std_df.loc['std', :] == 0].index.tolist()
    print('Dropped no_variance columns: {}'.format(len(no_variance)))
    df.drop(no_variance, axis=1, inplace=True)
    return df

df = normalize(df)

Dropped no_variance columns: 0


In [47]:
df.drop(['match_event_id','add_loc','day'],inplace=True,axis=1)

In [48]:
train_df = df.dropna(axis=0, how='any', subset=['is_goal'])
test_df = df[~df.index.isin(train_df.index)]
df.dropna(axis=0, how='any', subset=['is_goal'], inplace=True )

In [49]:
train_labels = train_df['is_goal']
train_df.drop(['is_goal'],axis=1,inplace=True)

In [50]:
columns = [c for c in train_df.columns if c not in ['shot_id_number']]

cat_variable =[]
k =0
for i in train_df[columns].dtypes:
    if(i == 'O'):
        cat_variable.append(k)    
    k =k+1

In [51]:

param = {  "loss_function" : "Logloss",
           "eval_metric":"AUC",
           "task_type":"GPU",
           "learning_rate":   0.07, #4506133538414295,#trial.suggest_loguniform('learning_rate', 1e-2, 1e-1),
           "iterations":70000,
           "l2_leaf_reg":   197, #trial.suggest_int('l2_leaf_reg', 10, 200),
           "random_seed" : 432013,
           "od_type" : "Iter",
          # "depth" : 5,
           "max_depth":   4, #trial.suggest_int('max_depth', 2, 16),
           "early_stopping_rounds" : 500,
           "border_count" :64 , #trial.suggest_int('border_count', 16, 512),
             "bagging_temperature" :   2,   #trial.suggest_int('bagging_temperature', 2, 40)
           "task_type" : 'GPU'
        }

    
print(param)

train_df_columns = columns#df_train.columns#columns[:-1]

cat_feature_inds =cat_variable

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=4590)
oof = np.zeros(len(train_df))
predictions = np.zeros(len(test_df))
feature_importance_df = pd.DataFrame()


for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_df, train_labels)):
    print("fold {}".format(fold_))
    X_train, y_train = train_df.iloc[trn_idx][train_df_columns], train_labels.iloc[trn_idx]
    X_valid, y_valid = train_df.iloc[val_idx][train_df_columns], train_labels.iloc[val_idx]
    _train = Pool(X_train, label=y_train, cat_features = cat_feature_inds)
    _valid = Pool(X_valid, label=y_valid, cat_features = cat_feature_inds)

    clf = CatBoostClassifier(**param)
    clf.fit(
            _train,
            eval_set=_valid,
            use_best_model=True,
            verbose=200,
            #plot=True
    )

    oof[val_idx] = clf.predict_proba(train_df.iloc[val_idx][train_df_columns])[:,1]

    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = train_df_columns
    fold_importance_df["importance"] = clf.feature_importances_
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

    predictions += clf.predict_proba(test_df[train_df_columns])[:,1] / folds.n_splits

{'loss_function': 'Logloss', 'eval_metric': 'AUC', 'task_type': 'GPU', 'learning_rate': 0.07, 'iterations': 70000, 'l2_leaf_reg': 197, 'random_seed': 432013, 'od_type': 'Iter', 'max_depth': 4, 'early_stopping_rounds': 500, 'border_count': 64, 'bagging_temperature': 2}
fold 0
0:	learn: 0.5975412	test: 0.5888380	best: 0.5888380 (0)	total: 37.7ms	remaining: 44m 1s
200:	learn: 0.6605293	test: 0.6399838	best: 0.6399906 (197)	total: 6.55s	remaining: 37m 53s
400:	learn: 0.6748350	test: 0.6443210	best: 0.6443210 (400)	total: 13s	remaining: 37m 39s
600:	learn: 0.6842963	test: 0.6472690	best: 0.6473011 (599)	total: 19.5s	remaining: 37m 28s
800:	learn: 0.6918111	test: 0.6487302	best: 0.6487453 (797)	total: 26.1s	remaining: 37m 35s
1000:	learn: 0.6982959	test: 0.6484368	best: 0.6488041 (802)	total: 32.6s	remaining: 37m 27s
1200:	learn: 0.7036481	test: 0.6490800	best: 0.6491039 (1198)	total: 39.4s	remaining: 37m 36s
1400:	learn: 0.7077955	test: 0.6496868	best: 0.6497034 (1399)	total: 46.4s	remainin

In [52]:
sub = pd.DataFrame({"shot_id_number": test_df.shot_id_number})
sub["is_goal"] = predictions

In [54]:
sub=sub.astype({'shot_id_number': 'int64'},inplace=True)
su = pd.read_csv("sample_submission.csv")
su.drop("is_goal",axis=1,inplace=True)
z=su.merge(sub,how='left',on='shot_id_number')
z.to_csv("submission_c.csv",index=False)

In [56]:
test_df = test_df.drop(['is_goal'],axis=1)

In [57]:
features = [c for c in train_df.columns if c not in ['shot_id_number']]
param = {
    'bagging_freq': 2,          'bagging_fraction': 0.5,   'boost_from_average':'false',   'boost': 'random_forest',
    'feature_fraction': 1,   'learning_rate': 0.1,     'max_depth': -1,                'metric':'auc',
    'min_data_in_leaf': 80,'num_leaves': 14,           'num_threads': 8,
    'tree_learner': 'serial',   'objective': 'binary',      'verbosity': 1
}
folds = StratifiedKFold(n_splits=5, shuffle=False, random_state=2319)
oof = np.zeros(len(train_df))
predictions = np.zeros(len(test_df))
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_df.values, train_labels.values)):
    print("Fold {}".format(fold_))
    trn_data = lgb.Dataset(train_df.iloc[trn_idx][features], label=train_labels.iloc[trn_idx])
    val_data = lgb.Dataset(train_df.iloc[val_idx][features], label=train_labels.iloc[val_idx])
    clf = lgb.train(param, trn_data, 1000000, valid_sets = [trn_data, val_data], verbose_eval=5000, early_stopping_rounds = 4000)
    oof[val_idx] = clf.predict(train_df.iloc[val_idx][features], num_iteration=clf.best_iteration)
    predictions += clf.predict(test_df[features], num_iteration=clf.best_iteration) / folds.n_splits
print("CV score: {:<8.5f}".format(roc_auc_score(train_labels, oof)))
sub = pd.DataFrame({"shot_id_number": test_df.shot_id_number})
sub["is_goal"] = predictions

Fold 0
Training until validation scores don't improve for 4000 rounds.
Early stopping, best iteration is:
[6]	training's auc: 0.643233	valid_1's auc: 0.646731
Fold 1
Training until validation scores don't improve for 4000 rounds.
Early stopping, best iteration is:
[180]	training's auc: 0.658907	valid_1's auc: 0.619488
Fold 2
Training until validation scores don't improve for 4000 rounds.
Early stopping, best iteration is:
[34]	training's auc: 0.657248	valid_1's auc: 0.623512
Fold 3
Training until validation scores don't improve for 4000 rounds.
Early stopping, best iteration is:
[3]	training's auc: 0.650386	valid_1's auc: 0.579897
Fold 4
Training until validation scores don't improve for 4000 rounds.
Early stopping, best iteration is:
[62]	training's auc: 0.65484	valid_1's auc: 0.645395
CV score: 0.61899 


In [58]:
sub=sub.astype({'shot_id_number': 'int64'},inplace=True)
su = pd.read_csv("sample_submission.csv")
su.drop("is_goal",axis=1,inplace=True)
z1=su.merge(sub,how='left',on='shot_id_number')
z1.fillna(z1.is_goal.mean(),inplace=True)
z1.to_csv("submission_l.csv",index=False)

In [59]:
z2 = sub = pd.DataFrame({"shot_id_number": z1.shot_id_number})
z2['is_goal'] = (z['is_goal']+z1['is_goal'])/2
z2.to_csv("combine.csv",index=False)