In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from tqdm import tqdm
import gc
import random

import lightgbm as lgb
import re
from sklearn.metrics import *
from sklearn.model_selection import KFold

import warnings
warnings.filterwarnings(action='ignore')

from pycaret.classification import *

import random as python_random
np.random.seed(42)
python_random.seed(42)
set_config('seed', 42)

In [2]:
PATH = './data/'

train_err  = pd.read_csv(PATH+'train_err_data.csv')
train_qua = pd.read_csv(PATH+'train_quality_data.csv')
train_prob = pd.read_csv(PATH+'train_problem_data.csv')

test_err  = pd.read_csv(PATH+'test_err_data.csv')
test_qua  = pd.read_csv(PATH+'test_quality_data.csv')

sample_submssion = pd.read_csv(PATH+'sample_submission.csv')

In [3]:
train_err['date'] = train_err['time'].astype(str).str.slice(0,8)
train_err['model_fwver'] = train_err.model_nm + train_err.fwver
train_err['errtype_code'] = train_err.errtype.astype(str) + train_err.errcode

train_err_0 = train_err[['user_id','date','model_fwver','errtype_code']].drop_duplicates()
display(train_err_0.head())

test_err['date'] = test_err['time'].astype(str).str.slice(0,8)
test_err['model_fwver'] = test_err.model_nm + test_err.fwver
test_err['errtype_code'] = test_err.errtype.astype(str) + test_err.errcode

test_err_0 = test_err[['user_id','date','model_fwver','errtype_code']].drop_duplicates()
display(test_err_0.head())

Unnamed: 0,user_id,date,model_fwver,errtype_code
0,10000,20201101,model_305.15.2138,151
1,10000,20201101,model_305.15.2138,121
2,10000,20201101,model_305.15.2138,111
3,10000,20201101,model_305.15.2138,161
4,10000,20201101,model_305.15.2138,40


Unnamed: 0,user_id,date,model_fwver,errtype_code
0,30000,20201101,model_104.16.3553,311
1,30000,20201101,model_104.16.3553,332
2,30000,20201101,model_104.16.3553,151
3,30000,20201101,model_104.16.3553,221
4,30000,20201101,model_104.16.3553,111


In [None]:
train_err.shape, train_err_0.shape

In [4]:
train_err_1 = train_err.groupby(['user_id','date']).count().reset_index()[['user_id','date','time']]
train_err_1.columns = ['user_id','date','date_cnt']
train_err_1 = train_err_1.groupby('user_id').agg({'date':'count','date_cnt': 'sum'}).reset_index()
train_err_1.columns = ['user_id','date_cnt', 'date_sum']
display(train_err_1.head())

test_err_1 = test_err.groupby(['user_id','date']).count().reset_index()[['user_id','date','time']]
test_err_1.columns = ['user_id','date','date_cnt']
test_err_1 = test_err_1.groupby('user_id').agg({'date':'count','date_cnt': 'sum'}).reset_index()
test_err_1.columns = ['user_id','date_cnt', 'date_sum']
display(test_err_1.head())

train_err_1.shape

Unnamed: 0,user_id,date_cnt,date_sum
0,10000,30,317
1,10001,30,2365
2,10002,29,306
3,10003,30,306
4,10004,30,777


Unnamed: 0,user_id,date_cnt,date_sum
0,30000,29,2750
1,30001,28,284
2,30002,30,941
3,30003,28,371
4,30004,30,881


(15000, 3)

In [5]:
train = train_err_1.copy()
test = test_err_1.copy()

In [6]:
train_model = set(train_err.model_fwver.unique())
test_model = set(test_err.model_fwver.unique())
models = train_model & test_model
models = list(models)

In [7]:
for model in models:
    train[model] = 0
    test[model] = 0
    
train.shape, test.shape    

((15000, 34), (14998, 34))

In [8]:
train_err_2 = train_err.groupby(['user_id','model_fwver']).count().reset_index()[['user_id','model_fwver','time']]
train_err_2.columns = ['user_id','model_fwver','model_fwver_cnt']
#train_err_2['model_fwver_rank'] = train_err_2.model_fwver_cnt.rank()
display(train_err_2.head())

test_err_2 = test_err.groupby(['user_id','model_fwver']).count().reset_index()[['user_id','model_fwver','time']]
test_err_2.columns = ['user_id','model_fwver','model_fwver_cnt']
#test_err_2['model_fwver_rank'] = test_err_2.model_fwver_cnt.rank()
display(test_err_2.head())

Unnamed: 0,user_id,model_fwver,model_fwver_cnt
0,10000,model_305.15.2138,317
1,10001,model_204.33.1185,379
2,10001,model_204.33.1261,1986
3,10002,model_305.15.2138,306
4,10003,model_204.33.1185,81


Unnamed: 0,user_id,model_fwver,model_fwver_cnt
0,30000,model_104.16.3553,2320
1,30000,model_104.16.3571,367
2,30000,model_204.33.1261,63
3,30001,model_305.15.2138,284
4,30002,model_004.22.1750,733


In [9]:
for idx, col in tqdm(enumerate(train_err_2.values)):
    i = train[train.user_id == train_err_2.loc[idx].user_id].index
    c = train.columns[train.columns == train_err_2.loc[idx].model_fwver]
    
    train.loc[i, c] += train_err_2.loc[idx].model_fwver_cnt    

for idx, col in tqdm(enumerate(test_err_2.values)):
    i = test[test.user_id == test_err_2.loc[idx].user_id].index
    c = test.columns[test.columns == test_err_2.loc[idx].model_fwver]
    
    test.loc[i, c] += test_err_2.loc[idx].model_fwver_cnt    

24062it [01:06, 359.33it/s]
24096it [01:07, 356.32it/s]


In [None]:
train.info()

In [10]:
train_error = set(train_err.errtype.unique())
test_error = set(test_err.errtype.unique())
errors = train_error & test_error
errors = list(errors)

In [11]:
for error in errors:
    train['E'+str(error)] = 0
    test['E'+str(error)] = 0
    
train.shape, test.shape    

((15000, 75), (14998, 75))

In [12]:
# train_err > errtype => rank
train_err_3 = train_err.groupby(['user_id','errtype']).count().reset_index()[['user_id','errtype','time']]
train_err_3.columns = ['user_id','errtype','errtype_cnt']
#train_err_3['errtype_rank'] = train_err_3.errtype_cnt.rank()
display(train_err_3.head())

# test_err > errtype => rank
test_err_3 = test_err.groupby(['user_id','errtype']).count().reset_index()[['user_id','errtype','time']]
test_err_3.columns = ['user_id','errtype','errtype_cnt']
#test_err_3['errtype_rank'] = test_err_3.errtype_cnt.rank()
display(test_err_3.head())

Unnamed: 0,user_id,errtype,errtype_cnt
0,10000,3,8
1,10000,4,104
2,10000,6,1
3,10000,7,1
4,10000,10,7


Unnamed: 0,user_id,errtype,errtype_cnt
0,30000,5,62
1,30000,6,1
2,30000,7,1
3,30000,11,16
4,30000,12,16


In [13]:
train.columns

Index(['user_id', 'date_cnt', 'date_sum', 'model_403.11.1167',
       'model_804.73.2571', 'model_004.22.1750', 'model_610', 'model_68.5.3',
       'model_204.33.1185', 'model_705.66.3237', 'model_104.16.3571',
       'model_004.22.1778', 'model_104.16.3569', 'model_204.33.1171',
       'model_305.15.2120', 'model_504.82.1684', 'model_305.15.3104',
       'model_504.82.1778', 'model_104.16.3553', 'model_504.82.1730',
       'model_403.11.1149', 'model_104.16.3439', 'model_305.15.2092',
       'model_305.15.2138', 'model_004.22.1656', 'model_204.33.1125',
       'model_204.33.1261', 'model_004.22.1666', 'model_403.11.1141',
       'model_705.66.3571', 'model_305.15.2114', 'model_204.33.1149',
       'model_804.73.2237', 'model_004.22.1684', 'E1', 'E2', 'E3', 'E4', 'E5',
       'E6', 'E7', 'E8', 'E9', 'E10', 'E11', 'E12', 'E13', 'E14', 'E15', 'E16',
       'E17', 'E18', 'E19', 'E20', 'E21', 'E22', 'E23', 'E24', 'E25', 'E26',
       'E27', 'E28', 'E30', 'E31', 'E32', 'E33', 'E34', 'E35', 

In [14]:
for idx, col in tqdm(enumerate(train_err_3.values)):
    i = train[train.user_id == train_err_3.loc[idx].user_id].index
    c = train.columns[train.columns == 'E'+str(train_err_3.loc[idx].errtype)]
    
    train.loc[i, c] += train_err_3.loc[idx].errtype_cnt    

231039it [10:22, 371.16it/s]


In [15]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 75 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   user_id            15000 non-null  int64
 1   date_cnt           15000 non-null  int64
 2   date_sum           15000 non-null  int64
 3   model_403.11.1167  15000 non-null  int64
 4   model_804.73.2571  15000 non-null  int64
 5   model_004.22.1750  15000 non-null  int64
 6   model_610          15000 non-null  int64
 7   model_68.5.3       15000 non-null  int64
 8   model_204.33.1185  15000 non-null  int64
 9   model_705.66.3237  15000 non-null  int64
 10  model_104.16.3571  15000 non-null  int64
 11  model_004.22.1778  15000 non-null  int64
 12  model_104.16.3569  15000 non-null  int64
 13  model_204.33.1171  15000 non-null  int64
 14  model_305.15.2120  15000 non-null  int64
 15  model_504.82.1684  15000 non-null  int64
 16  model_305.15.3104  15000 non-null  int64
 17  model_504.82

In [16]:
for idx, col in tqdm(enumerate(test_err_3.values)):
    i = test[test.user_id == test_err_3.loc[idx].user_id].index
    c = test.columns[test.columns == 'E'+str(test_err_3.loc[idx].errtype)]
    
    test.loc[i, c] += test_err_3.loc[idx].errtype_cnt    


231241it [10:20, 372.72it/s]


In [None]:
# train_err > errtype_code => rank
train_err_4 = train_err.groupby(['model_fwver','errtype_code']).count().reset_index()[['model_fwver','errtype_code','user_id']]
train_err_4.columns = ['model_fwver','errtype_code','cnt']
train_err_4['rank'] = train_err_4.cnt.rank()
display(train_err_4.head())

In [None]:
# test_err > errtype_code => rank
test_err_4 = test_err.groupby(['model_fwver','errtype_code']).count().reset_index()[['model_fwver','errtype_code','user_id']]
test_err_4.columns = ['model_fwver','errtype_code','cnt']
test_err_4['rank'] = test_err_4.cnt.rank()
display(test_err_4.head())

In [None]:
train = train_err.merge(train_err_1, on=['user_id'], how='left').merge(train_err_4, on=['model_fwver','errtype_code'], how='left')#.merge(train_err_3, on='errtype_code', how='left')
train = train[['user_id','date_cnt','date_sum','model_fwver','errtype_code','cnt','rank']].drop_duplicates()
train = train.groupby(['user_id','date_cnt','date_sum']).agg({'errtype_code':['count'],'cnt':['sum'],'rank':['max']}).reset_index()
train.columns = ['user_id','date_cnt','date_sum','errtype_code_cnt','cnt_sum','rank_max']
train

In [None]:
test = test_err.merge(test_err_1, on=['user_id'], how='left').merge(test_err_4, on=['model_fwver','errtype_code'], how='left')#.merge(test_err_3, on='errtype_code', how='left')
test = test[['user_id','date_cnt','date_sum','model_fwver','errtype_code','cnt','rank']].drop_duplicates()
test = test.groupby(['user_id','date_cnt','date_sum']).agg({'errtype_code':['count'],'cnt':['sum'],'rank':['max']}).reset_index()
test.columns = ['user_id','date_cnt','date_sum','errtype_code_cnt','cnt_sum','rank_max']
test

In [17]:
train_qua_0 = train_qua.drop_duplicates()
test_qua_0 = test_qua.drop_duplicates()

train_qua_0.drop(['quality_3','quality_4','time','fwver'], axis=1, inplace=True)
test_qua_0.drop(['quality_3','quality_4','time','fwver'], axis=1, inplace=True)

train_qua_0.fillna(0, inplace=True)
test_qua_0.fillna(0, inplace=True)

In [18]:
def chg_qua(x):
    if x == 0:
        return 0
    else:
        return 1

In [19]:
cols = ['quality_0', 'quality_1', 'quality_2', 'quality_5','quality_6', 'quality_7', 'quality_8', 'quality_9', 'quality_10','quality_11', 'quality_12']

for col in cols:
    train_qua_0[col] = train_qua_0[col].apply(chg_qua)
    test_qua_0[col] = test_qua_0[col].apply(chg_qua)

In [20]:
train_qua_1 = train_qua_0.groupby('user_id').sum().reset_index()
test_qua_1 = test_qua_0.groupby('user_id').sum().reset_index()

In [None]:
train_qua_1

In [None]:
%%time
for col in cols:
    train_qua_1[col] = train_qua_1[col].apply(chg_qua)
    test_qua_1[col] = test_qua_1[col].apply(chg_qua)

In [None]:
cols=['quality_0', 'quality_1','quality_2','quality_5',  'quality_6','quality_7', 'quality_10',]

In [None]:
train_qua_1.drop(cols, axis=1, inplace=True)
test_qua_1.drop(cols, axis=1, inplace=True)

In [None]:
plt.figure(figsize=(8,8))
sns.heatmap(train_qua_1.corr(), annot=True, fmt='.2f', cmap='RdYlGn_r')

In [None]:
plt.figure(figsize=(8,8))
sns.heatmap(test_qua_1.corr(), annot=True, fmt='.2f', cmap='RdYlGn_r')

In [21]:
train = train.merge(train_qua_1, on='user_id', how='left').fillna(0)
test = test.merge(test_qua_1, on='user_id', how='left').fillna(0)

In [22]:
train_prob1 = train_prob.groupby('user_id').count().reset_index()
train_prob1.time = train_prob1.time.apply(chg_qua)
train_prob1.columns = ['user_id', 'prob']
train_prob1

Unnamed: 0,user_id,prob
0,10001,1
1,10004,1
2,10005,1
3,10006,1
4,10008,1
...,...,...
4995,24983,1
4996,24984,1
4997,24990,1
4998,24997,1


In [23]:
train = train.merge(train_prob1, on='user_id', how='left').fillna(0)

In [24]:
display(train.head())
display(test.head())

Unnamed: 0,user_id,date_cnt,date_sum,model_403.11.1167,model_804.73.2571,model_004.22.1750,model_610,model_68.5.3,model_204.33.1185,model_705.66.3237,...,quality_2,quality_5,quality_6,quality_7,quality_8,quality_9,quality_10,quality_11,quality_12,prob
0,10000,30,317,0,0,0,0,0,0,0,...,0.0,4.0,0.0,4.0,0.0,4.0,4.0,0.0,0.0,0.0
1,10001,30,2365,0,0,0,0,0,379,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,10002,29,306,0,0,0,0,0,0,0,...,2.0,22.0,4.0,22.0,0.0,22.0,22.0,1.0,0.0,0.0
3,10003,30,306,0,0,0,0,0,81,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10004,30,777,0,0,645,0,0,0,0,...,1.0,6.0,2.0,6.0,0.0,6.0,6.0,1.0,0.0,1.0


Unnamed: 0,user_id,date_cnt,date_sum,model_403.11.1167,model_804.73.2571,model_004.22.1750,model_610,model_68.5.3,model_204.33.1185,model_705.66.3237,...,quality_1,quality_2,quality_5,quality_6,quality_7,quality_8,quality_9,quality_10,quality_11,quality_12
0,30000,29,2750,0,0,0,0,0,0,0,...,0.0,0.0,2.0,1.0,2.0,0.0,2.0,2.0,0.0,0.0
1,30001,28,284,0,0,0,0,0,0,0,...,1.0,1.0,10.0,1.0,10.0,0.0,10.0,10.0,1.0,0.0
2,30002,30,941,0,0,733,0,0,0,0,...,3.0,3.0,26.0,3.0,26.0,0.0,26.0,26.0,3.0,0.0
3,30003,28,371,0,0,246,0,0,0,0,...,0.0,0.0,13.0,5.0,13.0,0.0,13.0,13.0,0.0,0.0
4,30004,30,881,0,0,0,0,0,0,0,...,1.0,1.0,5.0,3.0,5.0,0.0,5.0,5.0,1.0,0.0


In [25]:
train.shape, test.shape

((15000, 87), (14998, 86))

In [26]:
train.to_csv("./train.csv")
test.to_csv("./test.csv")

In [None]:
plt.figure(figsize=(12,12))
sns.heatmap(train.corr(), annot=True, fmt='.2f', cmap='RdYlGn_r')

In [None]:
test.columns

In [None]:
train

In [None]:
test.info()

In [27]:
cols = test.columns[1:]
cols = list(cols)

In [28]:
min = train[cols].min()
max  = train[cols].max()

In [29]:
for i, col in enumerate(cols):
    train[col] = (train[col] - min[i]) / (max[i] - min[i])
    test[col] = (test[col] - min[i]) / (max[i] - min[i])

In [30]:
train.prob = train.prob.astype(int)

In [31]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15000 entries, 0 to 14999
Data columns (total 87 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   user_id            15000 non-null  int64  
 1   date_cnt           15000 non-null  float64
 2   date_sum           15000 non-null  float64
 3   model_403.11.1167  15000 non-null  float64
 4   model_804.73.2571  15000 non-null  float64
 5   model_004.22.1750  15000 non-null  float64
 6   model_610          15000 non-null  float64
 7   model_68.5.3       15000 non-null  float64
 8   model_204.33.1185  15000 non-null  float64
 9   model_705.66.3237  15000 non-null  float64
 10  model_104.16.3571  15000 non-null  float64
 11  model_004.22.1778  15000 non-null  float64
 12  model_104.16.3569  15000 non-null  float64
 13  model_204.33.1171  15000 non-null  float64
 14  model_305.15.2120  15000 non-null  float64
 15  model_504.82.1684  15000 non-null  float64
 16  model_305.15.3104  150

In [32]:
train.columns[1:86]

Index(['date_cnt', 'date_sum', 'model_403.11.1167', 'model_804.73.2571',
       'model_004.22.1750', 'model_610', 'model_68.5.3', 'model_204.33.1185',
       'model_705.66.3237', 'model_104.16.3571', 'model_004.22.1778',
       'model_104.16.3569', 'model_204.33.1171', 'model_305.15.2120',
       'model_504.82.1684', 'model_305.15.3104', 'model_504.82.1778',
       'model_104.16.3553', 'model_504.82.1730', 'model_403.11.1149',
       'model_104.16.3439', 'model_305.15.2092', 'model_305.15.2138',
       'model_004.22.1656', 'model_204.33.1125', 'model_204.33.1261',
       'model_004.22.1666', 'model_403.11.1141', 'model_705.66.3571',
       'model_305.15.2114', 'model_204.33.1149', 'model_804.73.2237',
       'model_004.22.1684', 'E1', 'E2', 'E3', 'E4', 'E5', 'E6', 'E7', 'E8',
       'E9', 'E10', 'E11', 'E12', 'E13', 'E14', 'E15', 'E16', 'E17', 'E18',
       'E19', 'E20', 'E21', 'E22', 'E23', 'E24', 'E25', 'E26', 'E27', 'E28',
       'E30', 'E31', 'E32', 'E33', 'E34', 'E35', 'E36', 'E37

In [33]:
%%time
clf = setup(session_id=42, data=train, target='prob'
           , numeric_features=train.columns[1:86])

Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,42
1,Target Type,Binary
2,Label Encoded,
3,Original Data,"(15000, 87)"
4,Missing Values,False
5,Numeric Features,86
6,Categorical Features,0
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


Wall time: 19 s


In [34]:
best = compare_models(sort = 'AUC', n_select = 5
                     ,exclude=['lr', 'knn', 'nb', 'dt', 'svm', 'rbfsvm', 'ridge','qda']
                     )

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
0,CatBoost Classifier,0.794,0.8164,0.5034,0.8056,0.6195,0.4884,0.5139,12.0787
1,Light Gradient Boosting Machine,0.7926,0.812,0.5163,0.7885,0.6238,0.4892,0.5102,0.5639
2,Gradient Boosting Classifier,0.7865,0.8111,0.4671,0.8131,0.5931,0.4624,0.4951,3.5967
3,Extra Trees Classifier,0.7849,0.8021,0.4937,0.7805,0.6048,0.4672,0.4903,0.6293
4,Extreme Gradient Boosting,0.7817,0.797,0.5343,0.7385,0.6198,0.4722,0.4845,1.8301
5,Ada Boost Classifier,0.7772,0.796,0.4909,0.7552,0.5948,0.4506,0.4706,0.8786
6,Random Forest Classifier,0.7662,0.7737,0.4637,0.7377,0.5692,0.42,0.4415,0.1218
7,Linear Discriminant Analysis,0.7505,0.7378,0.3297,0.8079,0.4682,0.341,0.3995,0.1691


In [35]:
blended = blend_models(estimator_list = best, fold = 5, method = 'soft')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.7819,0.8028,0.4814,0.7801,0.5954,0.4573,0.4823
1,0.8071,0.8295,0.5386,0.8214,0.6506,0.5252,0.5475
2,0.7938,0.8139,0.5029,0.8055,0.6192,0.488,0.5134
3,0.8024,0.8265,0.5014,0.8417,0.6285,0.5054,0.5368
4,0.7866,0.8015,0.4943,0.7864,0.607,0.4708,0.4947
Mean,0.7944,0.8149,0.5037,0.807,0.6201,0.4893,0.515
SD,0.0094,0.0116,0.019,0.0226,0.0189,0.0241,0.0246


In [36]:
pred_holdout = predict_model(blended)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.788,0.8172,0.4867,0.7987,0.6048,0.4714,0.4984


In [37]:
%%time
final_model = finalize_model(blended)

Wall time: 3min 42s


In [38]:
test_x = sample_submssion[['user_id']].merge(test, on='user_id', how='left').fillna(0)
test_x.shape

(14999, 86)

In [39]:
predictions = predict_model(final_model, data = test_x)

In [40]:
sample_submssion['problem'] = predictions['Score']
sample_submssion.head()

Unnamed: 0,user_id,problem
0,30000,0.9249
1,30001,0.1846
2,30002,0.2835
3,30003,0.8403
4,30004,0.8811


In [53]:
train.iloc[:,-1]

0        0
1        1
2        0
3        0
4        1
        ..
14995    0
14996    0
14997    1
14998    1
14999    0
Name: prob, Length: 15000, dtype: int32

In [54]:
from sklearn.model_selection import train_test_split

X_train, X_valid, Y_train, Y_valid = train_test_split(train.iloc[:,0:86], train.iloc[:,-1], test_size=0.25, random_state=42)

X_train.shape, X_valid.shape, Y_train.shape, Y_valid.shape

((11250, 86), (3750, 86), (11250,), (3750,))

In [56]:
train.shape

(15000, 87)

In [131]:
import tensorflow as tf
from tensorflow.keras import datasets
from tensorflow.keras import layers
from tensorflow.keras.backend import mean, maximum
from sklearn.metrics import roc_auc_score
from tensorflow.keras import backend as K

tf.random.set_seed(42)

In [125]:
def auroc(y_true, y_pred):
    return tf.py_func(roc_auc_score, (y_true, y_pred), tf.double)

In [129]:
def auc(y_true, y_pred):
    auc = tf.metrics.auc(y_true, y_pred)[1]
    K.get_session().run(tf.local_variables_initializer())
    return auc

In [213]:
epoch = 100

model = tf.keras.Sequential([
    layers.Dense(128, activation='relu', input_shape=(15000, 86)),
    layers.Dense(58, activation='relu'),
    layers.Dense(26, activation='relu'),
    layers.Dense(8, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

#model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy','roc_curve','auc'])
model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy', tf.keras.metrics.AUC()])
#model.compile(loss="mse", optimizer='adam', metrics=[tf.keras.metrics.AUC()])

#model.compile(loss='mse', optimizer='adam')
hist = model.fit(X_train, Y_train, epochs=epoch, batch_size=48, validation_split=0.25
,callbacks = [tf.keras.callbacks.EarlyStopping(patience=5, mode='min', monitor='val_loss', verbose=1)])
pred = model.predict_proba(test)
print(model.evaluate(X_train, Y_train))

pred[:48]

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 00008: early stopping
[13.209041595458984, 0.679111123085022, 0.5]


array([[4.0395332e-31],
       [2.7287009e-31],
       [3.1669910e-31],
       [3.0885338e-31],
       [3.0361533e-31],
       [2.9101436e-31],
       [2.6286144e-31],
       [2.4902034e-31],
       [3.4377363e-31],
       [2.9481342e-31],
       [2.9645542e-31],
       [2.7367486e-31],
       [4.3241425e-31],
       [2.6233254e-31],
       [2.9641019e-31],
       [2.9563777e-31],
       [3.0243166e-31],
       [3.5398329e-31],
       [2.8929654e-31],
       [2.6948128e-31],
       [3.4000711e-31],
       [3.0875913e-31],
       [2.8699704e-31],
       [2.5693688e-31],
       [2.9562874e-31],
       [3.2011459e-31],
       [2.9661832e-31],
       [2.5875852e-31],
       [2.7650820e-31],
       [2.7424758e-31],
       [2.5722715e-31],
       [2.6163295e-31],
       [2.7656725e-31],
       [2.9008779e-31],
       [2.9682657e-31],
       [2.5244061e-31],
       [2.8394358e-31],
       [2.8029315e-31],
       [2.6905806e-31],
       [2.4111954e-31],
       [2.8577782e-31],
       [2.580724

In [41]:
sample_submssion.to_csv("./submission/submission_20210128-5.csv", index = False)
sample_submssion

Unnamed: 0,user_id,problem
0,30000,0.9249
1,30001,0.1846
2,30002,0.2835
3,30003,0.8403
4,30004,0.8811
...,...,...
14994,44994,0.2264
14995,44995,0.2615
14996,44996,0.4698
14997,44997,0.8369
