In [None]:
# Data Wrangling
import pandas as pd
import numpy as np
# import pandas_profiling as pp
import gc

# Visualization, EDA
# import missingno as msno
import matplotlib.pylab as plt
from matplotlib import font_manager, rc
import seaborn as sns
%matplotlib inline

# Korean font
#font_name = font_manager.FontProperties(fname="c:/Windows/Fonts/malgun.ttf").get_name()
#rc('font', family=font_name)

# Preprocessing & Feature Engineering
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.feature_selection import SelectPercentile

# Hyperparameter Optimization
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import optuna

# Modeling
from lightgbm import LGBMClassifier
# from pycaret.classification import *
from sklearn.dummy import DummyClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import *
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import ClassifierMixin

# Evaluation
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
# from keras.callbacks import ModelCheckpoint

# Utility
import re
import os
import time
import datetime
import random
import tensorflow as tf
import pickle
import joblib
import platform
import warnings; warnings.filterwarnings("ignore")
from itertools import combinations
from tqdm import tqdm, tqdm_notebook
from IPython.display import Image
from scipy.stats.mstats import gmean
from tensorflow import keras
from PIL import Image

# 2. 데이터 불러오기

In [None]:
# Load the four raw tables from ../input (resolved against the current
# working directory).  Original author's note: "when run as a Jupyter file
# (comment out)".
input_dir = os.path.abspath("../input")
panel = pd.read_csv(input_dir + '/panel.csv')
survey = pd.read_csv(input_dir + '/survey.csv')
response_train = pd.read_csv(input_dir + '/response_train.csv')
response_test = pd.read_csv(input_dir + '/response_test.csv')

# Row label 8315: per the author's note, one panelist whose gender and region
# are NaN and who does not appear in test either, so the row is dropped.
panel.drop(index=8315, inplace=True)
# REGION: judged uninformative by the author.
panel.drop(columns=['REGION'], inplace=True)

# CATEGORIES: per the author's note, mostly NaN and uninformative.
survey.drop(columns=['CATEGORIES'], inplace=True)

# 3. Feature Creation

##### (패널 질문 응답수)

In [None]:
# Count how many profile questions each panelist answered (the author's note
# uses this to judge how diligently a panelist responds).
# Columns from position 4 onward are treated as the question columns.
panel_Questions = panel.iloc[:, 4:]

# .copy(): a column is added below, so work on an independent frame rather
# than on a slice of the original table.
panel = panel.iloc[:, :4].copy()
# Row-wise number of non-missing answers, computed in one vectorised pass.
panel_Questions_count = panel_Questions.notna().sum(axis=1).tolist()

# Number of answered profile questions per panelist.
panel['Questions_count'] = panel_Questions_count

# Both merges use pandas' defaults: inner join on the shared column names.
train = response_train.merge(panel).merge(survey)
test = response_test.merge(panel).merge(survey).sort_values(by='ID')

# Split train at the cutoff: rows before it become history used for
# aggregates, rows from the cutoff onward stay as the modelling set.
# Strict '<' keeps a row stamped exactly at the cutoff out of the history,
# so it cannot sit in both partitions.
before_train = train.query("TIME<'2021-04-01 00:00:00'")
train = train.query("TIME>='2021-04-01 00:00:00'")

##### (패널 ID별 누적 포인트)


In [None]:
# Per-user sum of CPI over answered rows (STATUS == 1) in the pre-cutoff
# history; the author's note calls this the accumulated points earned.
Points_earned = (
    before_train.query('STATUS==1')
    .groupby('userID')['CPI']
    .sum()
    .rename('Points_earned')
    .reset_index()
)

# Left merge keeps every row; users with no answered row in the history get 0.
# The result is assigned back instead of calling fillna(inplace=True) on a
# column selection, which is not guaranteed to modify the parent frame.
train = train.merge(Points_earned, how='left')
train['Points_earned'] = train['Points_earned'].fillna(0)
test = test.merge(Points_earned, how='left')
test['Points_earned'] = test['Points_earned'].fillna(0)

##### (일자, 요일, 시간, 시간별 범위, 시간 범위별 응답확률), (리워드 포인트별 범위, 리워드 범위별 응답 확률)

In [None]:
# Calendar features from TIME, bucketed HOUR / CPI columns, and the mean
# STATUS observed in train for each weekday / hour bucket / CPI bucket.
data = [train, test]

for frame in data:
    # pd.to_datetime instead of astype("datetime64"): recent pandas releases
    # reject the unit-less "datetime64" dtype string.
    frame['TIME'] = pd.to_datetime(frame['TIME'])

    frame['DAY'] = frame['TIME'].dt.day
    frame['WEEKDAY'] = frame['TIME'].dt.dayofweek
    frame['HOUR'] = frame['TIME'].dt.hour

    # Fixed 6-hour buckets (labels: dawn, morning, afternoon, night).
    # pd.cut(..., 4) derives its edges from the frame's own min/max hour, so
    # train and test could be bucketed differently; explicit edges give the
    # same 0-5 / 6-11 / 12-17 / 18-23 split for both.
    frame['HOUR_range'] = pd.cut(frame['HOUR'],
                                 bins=[0, 6, 12, 18, 24],
                                 labels=['새벽', '오전', '오후', '야간'],
                                 right=False)

    # CPI values outside [0, 2500] fall in no bin and become NaN.
    frame['CPI_range'] = pd.cut(frame['CPI'],
                                bins=[0, 500, 1000, 1500, 2000, 2500],
                                labels=['0~500', '501~1000', '1001~1500', '1501~2000', '2001~2500'],
                                right=True, include_lowest=True)

# Mean STATUS per group, computed on train only.
WEEKDAY_status_mean = (
    train.groupby('WEEKDAY')['STATUS'].mean()
    .rename('WEEKDAY_status_mean').reset_index()
)

# observed=False keeps every bucket label in the result, including empty ones.
HOUR_range_status_mean = (
    train.groupby('HOUR_range', observed=False)['STATUS'].mean()
    .rename('HOUR_range_status_mean').reset_index()
)

CPI_range_status_mean = (
    train.groupby('CPI_range', observed=False)['STATUS'].mean()
    .rename('CPI_range_status_mean').reset_index()
)

# Attach the three group means to both frames; keys are spelled out so the
# join does not depend on which column names happen to overlap.
train = (train.merge(CPI_range_status_mean, how='left', on='CPI_range')
              .merge(WEEKDAY_status_mean, how='left', on='WEEKDAY')
              .merge(HOUR_range_status_mean, how='left', on='HOUR_range'))
test = (test.merge(CPI_range_status_mean, how='left', on='CPI_range')
            .merge(WEEKDAY_status_mean, how='left', on='WEEKDAY')
            .merge(HOUR_range_status_mean, how='left', on='HOUR_range'))

##### (평균 리워드 포인트)

In [None]:
# Per-user mean CPI over answered rows (STATUS == 1) of train; the author's
# note calls this the average points earned.
# NOTE(review): this aggregates the same train rows whose STATUS is later
# used as the target - confirm the leakage is acceptable.
Points_earned_mean = (
    train.query('STATUS==1')
    .groupby('userID')['CPI']
    .mean()
    .rename('Points_earned_mean')
    .reset_index()
)

# Users without any answered row get 0.  Assign the filled column back rather
# than calling fillna(inplace=True) on a column selection, which is not
# guaranteed to modify the parent frame.
train = train.merge(Points_earned_mean, how='left')
train['Points_earned_mean'] = train['Points_earned_mean'].fillna(0)
test = test.merge(Points_earned_mean, how='left')
test['Points_earned_mean'] = test['Points_earned_mean'].fillna(0)

##### (TITLE 정규표현식)

In [None]:
# Title scoring by word frequency.
# The pattern removes every character that is not a space or Hangul
# (ㄱ-ㅣ jamo, 가-힣 syllables).  The author notes this step is rather slow.
_NON_HANGUL = re.compile(r'[^ ㄱ-ㅣ가-힣]')

# word -> number of occurrences in titles of rows with STATUS == 1.
word_counts = {}


def _title_words(title):
    """Return the whitespace-separated Hangul-only tokens of *title*.

    A non-string title (e.g. NaN or None for a missing value) yields no tokens.
    """
    if not isinstance(title, str):
        return []
    return _NON_HANGUL.sub('', title).split()


def count_word(x):
    """Add the title words of row *x* to ``word_counts`` when STATUS is 1.

    *x* is any mapping-like row exposing ``'STATUS'`` and ``'TITLE'``.
    Rows with another STATUS are ignored.  Returns None.
    """
    if x['STATUS'] == 1:
        for w in _title_words(x['TITLE']):
            word_counts[w] = word_counts.get(w, 0) + 1


def score_word(x):
    """Return the sum of ``word_counts`` over the title words of row *x*.

    Words never counted contribute 0, so an unseen or missing title scores 0.
    """
    return sum(word_counts.get(w, 0) for w in _title_words(x['TITLE']))
            
# Pass 1: fill word_counts from the train rows (return value is discarded).
train.apply(count_word, axis=1)
# Pass 2: overwrite the TITLE text with its numeric score, train first.
for scored_frame in (train, test):
    scored_frame['TITLE'] = scored_frame.apply(score_word, axis=1)

##### (TYPE 별 응답확률)

In [None]:
# Mean STATUS per TYPE (computed on train), attached to both frames.
TYPE_mean = (
    train.groupby('TYPE')['STATUS']
    .agg('mean')
    .rename('TYPE_mean')
    .reset_index()
)
train = pd.merge(train, TYPE_mean, how='left')
test = pd.merge(test, TYPE_mean, how='left')

##### (유저 아이디 별 응답확률)

In [None]:
# Per-user response rate inside train: rows with STATUS == 1 / all rows.
respond = (
    train[train['STATUS'] == 1]
    .groupby('userID')
    .size()
    .reset_index(name='respond')
)
total = train.groupby('userID').size().reset_index(name='total')

# NOTE(review): both merges use the default how='inner'.  Users without any
# STATUS == 1 row are absent from `respond`, so all of their rows are removed
# from train here - confirm this filtering is intended.
train = train.merge(respond).merge(total)

train['Response_Probability'] = train['respond'] / train['total']

# One row per user carries the rate over to test; users not seen get 0.
user_rate = train.drop_duplicates(subset=['userID'])[['userID', 'Response_Probability']]
test = pd.merge(test, user_rate, how='left')
test['Response_Probability'] = test['Response_Probability'].fillna(0)

# The helper count columns are no longer needed.
train.drop(columns=['respond', 'total'], inplace=True)

##### (1st Round 1,2,3등의 모든 피쳐를 추가하고 섞어보며 각각 점수를 확인한 결과, 1등과 3등의 피쳐를 적절히 섞는 방식이 가장 점수가 높았음.)

In [1]:
# Mean IR over answered rows (STATUS == 1) per GENDER, attached as `g_i`.
res_freq = (
    train.query('STATUS == 1')
    .groupby('GENDER')['IR']
    .agg([('g_i', 'mean')])
    .reset_index()
)
train = pd.merge(train, res_freq, how='left')
test = pd.merge(test, res_freq, how='left')
# Fill the merged column itself.  Assigning to `test.FREQ` only sets a Python
# attribute on the frame (no FREQ column exists), which leaves g_i unfilled.
test['g_i'] = test['g_i'].fillna(0)

NameError: name 'train' is not defined

In [None]:
# RESRATE = answered rows / sent rows per user, computed on train.
res_freq = (
    train[train['STATUS'] == 1]
    .groupby('userID')
    .size()
    .reset_index(name='FREQ')
)
send_freq = train.groupby('userID').size().reset_index(name='SEND')

train = train.merge(res_freq, how='left').merge(send_freq, how='left')
train['RESRATE'] = train['FREQ'] / train['SEND']
train.drop(columns=['FREQ', 'SEND'], inplace=True)

# Carry one RESRATE per user over to test; users not present in train get 0.
user_resrate = train.drop_duplicates(subset=['userID'])[['userID', 'RESRATE']]
test = pd.merge(test, user_resrate, how='left')
test['RESRATE'] = test['RESRATE'].fillna(0)

In [None]:
# Make sure TIME is a datetime column before the .dt accessors are used.
# pd.to_datetime replaces astype('datetime64'): recent pandas releases reject
# the unit-less 'datetime64' dtype string.
train['TIME'] = pd.to_datetime(train['TIME'])
test['TIME'] = pd.to_datetime(test['TIME'])

In [None]:
# Hour, minute and day-of-week of TIME, added to train first and then test.
for timed_frame in (train, test):
    stamp = timed_frame.TIME.dt
    timed_frame['TIME_hour'] = stamp.hour
    timed_frame['TIME_min'] = stamp.minute
    timed_frame['dayofweek'] = stamp.dayofweek

# 4. Feature Engineering

##### 무의미한 feature 삭제

In [None]:
# Separate the target from the training features.
y_train = train.pop('STATUS')

# Keep the first column of test as a one-column frame for the submission,
# then remove ID from the features.
# NOTE(review): relies on ID being the first column of test - confirm.
test_id = test.iloc[:, :1]
test.drop(columns='ID', inplace=True)

# Columns the author found not to help performance.
unused_columns = ['userID', 'surveyID', 'TIME', 'BIRTH', 'HOUR_range', 'CPI_range']
for model_frame in (train, test):
    model_frame.drop(columns=unused_columns, inplace=True)

##### Feature 전처리

In [None]:
# Stack train on top of test so preprocessing is applied to both at once.
# (The original note mentions missing-value handling; no imputation is done
# in this cell.)
features = pd.concat([train, test], ignore_index=True)

# Treat GENDER as categorical rather than numeric.
features['GENDER'] = features['GENDER'].astype('object')

_categorical_kinds = ['object', 'category']
cat = features.select_dtypes(include=_categorical_kinds).columns.tolist()  # categorical columns
num = features.select_dtypes(exclude=_categorical_kinds).columns.tolist()  # numeric columns

In [None]:
# Box plots of the numeric features before outlier treatment.
plt.figure(figsize=(8, 4))
features.loc[:, num].boxplot()
plt.xticks(rotation=90, fontsize=10)
plt.show()

In [None]:
# Outlier treatment for numeric features: values are clipped (not removed)
# to each column's own 5th-95th percentile range.
def _clip_to_percentiles(column):
    lower, upper = column.quantile(.05), column.quantile(.95)
    return column.clip(lower, upper)


features[num] = features[num].apply(_clip_to_percentiles, axis=0)

# Box plots of the numeric features after clipping.
plt.figure(figsize=(8, 4))
features.loc[:, num].boxplot()
plt.xticks(rotation=90, fontsize=10)
plt.show()

In [None]:
# Standardise the numeric features.
from sklearn.preprocessing import StandardScaler

# Split the stacked frame back into train / test before scaling: the first
# len(y_train) rows are the training rows.
# .copy() makes both parts independent frames, so the column assignments
# below do not write into slices of `features`.
n_train = y_train.shape[0]
X_train = features.iloc[:n_train, :].copy()
X_test = features.iloc[n_train:, :].copy()

# Fit on train only, then apply the same transform to test.
scaler = StandardScaler()
X_train[num] = scaler.fit_transform(X_train[num])
X_test[num] = scaler.transform(X_test[num])

In [None]:
# One-hot encode the categorical columns on train+test together so both end
# up with the same dummy columns, then split back by row position.
n_train_rows = X_train.shape[0]
df_encoded = pd.get_dummies(pd.concat([X_train, X_test]), columns=cat)
X_train = df_encoded.iloc[:n_train_rows]
X_test = df_encoded.iloc[n_train_rows:]

# 5. 피쳐 셀렉션

In [None]:
# Feature importance from SHAP values of a LightGBM model.
import shap
from lightgbm import LGBMClassifier

# Frame on which the importance is evaluated.
X_importance = X_test

# Fit the model and explain its predictions with SHAP.
model = LGBMClassifier(random_state=0).fit(X_train, y_train)
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_importance)

# Bar summary plot.
shap.summary_plot(shap_values, X_importance, plot_type='bar')

# Pick the SHAP values of class index 1 regardless of the container returned.
# A list is indexed per class, which is what the previous
# `np.abs(shap_values).mean(axis=1)[1, :]` relied on.
# NOTE(review): the array branches assume a single (rows, features) array or
# a (rows, features, classes) array - confirm against the installed shap.
if isinstance(shap_values, list):
    positive_class_shap = np.asarray(shap_values[1])
else:
    positive_class_shap = np.asarray(shap_values)
    if positive_class_shap.ndim == 3:
        positive_class_shap = positive_class_shap[:, :, 1]

# Mean absolute SHAP value per feature.
shap_sum = np.abs(positive_class_shap).mean(axis=0)

# Built from a dict so shap_importance keeps a numeric dtype.
importance_df = pd.DataFrame({
    'column_name': X_importance.columns.tolist(),
    'shap_importance': shap_sum,
})
importance_df = importance_df.sort_values('shap_importance', ascending=False)
importance_df

In [None]:
# Keep features whose SHAP importance is strictly greater than the threshold
# (the original note says "0.1 or more"; the comparison is '>').
SHAP_THRESHOLD = 0.1
above_threshold = importance_df['shap_importance'] > SHAP_THRESHOLD
features_selected = importance_df.loc[above_threshold, 'column_name'].tolist()
X_train = X_train.loc[:, features_selected]
X_test = X_test.loc[:, features_selected]
print(X_train.shape)

In [None]:
# Hold out 20% of the training rows for validation (the author raised the
# held-out share to 0.2); fixed random_state for a repeatable split.
(X_train, X_val,
 y_train, y_val) = train_test_split(
    X_train, y_train,
    test_size=0.2,
    random_state=77,
)

# 6. 딥러닝 모델링

##### 랜덤시드 1,2,3 고정 후 실험

In [None]:
# Keep the random seeds identical so every modelling run gives the same
# result.  (Author's note: searching for a "lucky seed" was not feasible in
# the time available.)

def reset_seeds(reset_graph_with_backend=None):
    """Re-seed NumPy, `random` and TensorFlow with fixed values.

    reset_graph_with_backend: optional Keras backend module; when given, its
    session is cleared and the default TensorFlow graph is reset first.
    """
    backend = reset_graph_with_backend
    if backend is not None:
        backend.clear_session()
        tf.compat.v1.reset_default_graph()
        print("KERAS AND TENSORFLOW GRAPHS RESET")  # optional

    # Fixed seeds: NumPy -> 1, random -> 2, TensorFlow -> 3.
    for seed_fn, seed_value in (
        (np.random.seed, 1),
        (random.seed, 2),
        (tf.compat.v1.set_random_seed, 3),
    ):
        seed_fn(seed_value)
    # Original note: "for GPU".  The variable is set to an empty string.
    os.environ['CUDA_VISIBLE_DEVICES'] = ''
    print("RANDOM SEEDS RESET")  # optional


reset_seeds()

In [None]:
# Small feed-forward network: one hidden layer of 8 ReLU units and a sigmoid
# output.  `inputs` replaces the former name `input`, which shadowed the
# builtin for the rest of the session.
inputs = keras.Input(shape=(X_train.shape[1],))

x = keras.layers.Dense(8, activation='relu')(inputs)  # author's note: hidden layer kept small
output = keras.layers.Dense(1, activation='sigmoid')(x)

DEEP = keras.Model(inputs, output)

DEEP.summary()

# The AUC metric is named explicitly so the training history always exposes
# "auc" / "val_auc"; an unnamed AUC() may get a numbered name when this cell
# is executed again in the same session.
DEEP.compile(loss='binary_crossentropy', optimizer='sgd',
             metrics=['acc', keras.metrics.AUC(name='auc')])

# Stop after 5 epochs without val_loss improvement (author's note: patience
# lowered) and checkpoint the best model.
# NOTE(review): EarlyStopping is created without restore_best_weights, so
# DEEP ends with the weights of the last epoch run; the best weights live
# only in the .h5 checkpoint files.
callbacks = [keras.callbacks.EarlyStopping(monitor='val_loss', patience=5),
             keras.callbacks.ModelCheckpoint(filepath='best_nn_model.h5', monitor='val_loss', save_best_only=True)]

mc = keras.callbacks.ModelCheckpoint('best_model.h5', monitor='val_loss', mode='min', save_best_only=True)

# fit() is given one flat list of callback instances instead of a list
# nested inside another list.
hist = DEEP.fit(X_train, y_train, validation_data=(X_val, y_val),
                batch_size=128, epochs=150, callbacks=[*callbacks, mc], shuffle=False, verbose=2)

In [None]:
# Training vs. validation curves, one figure per metric: loss, then ROC-AUC.
for history_key, plot_title in (("loss", "Loss"), ("auc", "ROC-AUC")):
    plt.plot(hist.history[history_key], label="train")
    plt.plot(hist.history["val_" + history_key], label="validation")
    plt.legend()
    plt.xlabel('epoch')
    plt.title(plot_title)
    plt.show()



In [None]:
# Write DEEP's predictions for X_test to a file stamped with MMDDhhmm.
t = pd.Timestamp.now()
stamp = f"{t.month:02}{t.day:02}{t.hour:02}{t.minute:02}"
fname = f"dnn_submission_{stamp}.csv"
dnn_output = pd.DataFrame({'ID': test_id["ID"], 'STATUS': DEEP.predict(X_test).flatten()})
dnn_output.to_csv(fname, index=False)
print(f"'{fname}' is ready to submit.")

# --------------------------------------------------------------

# Submission Ensemble

In [None]:
# X_train, y_train, X_val, y_val, X_test, test_id = pd.read_pickle('comp_data(SHAP = 0.1이상).pkl')

In [None]:
# Author's note: reset_seeds() has to be changed as below for the ensemble.
def reset_seeds(SEED, reset_graph_with_backend=None):
    """Seed NumPy, `random` and TensorFlow with the same value SEED.

    reset_graph_with_backend: optional Keras backend module; when given, its
    session is cleared and the default TensorFlow graph is reset first.
    This variant does not touch CUDA_VISIBLE_DEVICES.
    """
    backend = reset_graph_with_backend
    if backend is not None:
        backend.clear_session()
        tf.compat.v1.reset_default_graph()
        print("KERAS AND TENSORFLOW GRAPHS RESET")  # optional

    for seed_fn in (np.random.seed, random.seed, tf.compat.v1.set_random_seed):
        seed_fn(SEED)
    print("RANDOM SEEDS RESET")  # optional

In [None]:
# Folder that receives the per-seed prediction files.
folder = 'Ensemble'
# Create it if missing; exist_ok avoids the separate check-then-create step.
os.makedirs(folder, exist_ok=True)

In [None]:
# Train the same small network 20 times with different seeds and save each
# run's test predictions into `folder`.
for i in tqdm(range(20)):
    # The seed is drawn from NumPy's global RNG, which reset_seeds() then
    # re-seeds with that same value.
    SEED = np.random.randint(1, 10000)
    # Drop Keras' global state from the previous iteration so the models
    # built in this loop do not accumulate; done before seeding.
    keras.backend.clear_session()
    reset_seeds(SEED)

    # Define the NN architecture (`inputs`, not `input`, to leave the builtin
    # untouched).
    inputs = keras.Input(shape=(X_train.shape[1],))
    hidden = keras.layers.Dense(8, activation='relu')(inputs)
    output = keras.layers.Dense(1, activation='sigmoid')(hidden)
    model = keras.Model(inputs, output)

    # Choose the optimizer and the cost function
    model.compile(loss='binary_crossentropy', optimizer='sgd', metrics=['acc', keras.metrics.AUC()])

    # Train the model; stop after 5 epochs without val_loss improvement.
    callbacks = [keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)]
    hist = model.fit(X_train, y_train, validation_data=(X_val, y_val), batch_size=128, epochs=150,
                     callbacks=callbacks, shuffle=False, verbose=0)

    # Make submissions.  The file name carries the date and the seed, so a
    # repeated seed on the same day overwrites the earlier file.
    submission = pd.DataFrame({
        "ID": test_id.ID,
        "STATUS": model.predict(X_test).flatten()
    })
    t = pd.Timestamp.now()
    fname = f"{folder}/loop_submission_{t.month:02}{t.day:02}_{SEED:05}.csv"
    submission.to_csv(fname, index=False)

In [None]:
# Blend every two-column submission CSV found in `folder` with a power mean.
nf = 0  # number of prediction files merged so far
for f in os.listdir(folder):
    if os.path.splitext(f)[-1] != '.csv':
        continue
    s = pd.read_csv(folder + "/" + f)
    if len(s.columns) != 2:
        continue
    # Give the prediction column a unique name per file.  Merging identically
    # named columns repeatedly makes pandas generate clashing _x/_y suffixes
    # once more than three files are combined.
    s = s.rename(columns={c: f"STATUS_{nf}" for c in s.columns if c != "ID"})
    if nf == 0:
        slist = s
    else:
        slist = pd.merge(slist, s, on="ID")
    nf += 1

# Author's note: the score depends on this exponent
# (p=0: geometric mean, p=1: arithmetic mean).
p = 1.5
if nf >= 2:
    preds = slist.drop(columns="ID")
    # skipna=False: a missing prediction makes the blended value missing too
    # instead of being silently skipped.
    if p == 0:
        pred = preds.prod(axis=1, skipna=False) ** (1 / nf)
    else:
        pred = ((preds ** p).sum(axis=1, skipna=False) / nf) ** (1 / p)
    submission = pd.DataFrame({'ID': slist.ID, 'STATUS': pred})
    t = pd.Timestamp.now()
    fname = f"p{p}mean_submission_{t.month:02}{t.day:02}_{t.hour:02}{t.minute:02}.csv"
    submission.to_csv(fname, index=False)

# --------------------------------------------------------------

In [None]:
def reset_seeds(reset_graph_with_backend=None):
    """Re-seed NumPy (1), `random` (2) and TensorFlow (3).

    reset_graph_with_backend: optional Keras backend module; when given, its
    session is cleared and the default TensorFlow graph is reset first.
    """
    if reset_graph_with_backend is not None:
        reset_graph_with_backend.clear_session()
        tf.compat.v1.reset_default_graph()
        print("KERAS AND TENSORFLOW GRAPHS RESET")  # optional

    numpy_seed, python_seed, tf_seed = 1, 2, 3
    np.random.seed(numpy_seed)
    random.seed(python_seed)
    tf.compat.v1.set_random_seed(tf_seed)
    # Original note: "for GPU".  The variable is set to an empty string.
    os.environ['CUDA_VISIBLE_DEVICES'] = ''
    print("RANDOM SEEDS RESET")  # optional


reset_seeds()

In [None]:
# Load earlier submission files, keeping every column after the first as a
# 2-D NumPy array.
# NOTE(review): df_9 and df_17 are read from absolute Windows paths, unlike
# the rest, which come from ../submissions.
_submissions_dir = os.path.abspath("../submissions")


def _read_predictions(csv_path):
    """Read a CSV and return all columns after the first as an array."""
    return pd.read_csv(csv_path).to_numpy()[:, 1:]


df_submit = _read_predictions(_submissions_dir + '/p1.5mean_submission_1214_1414.csv')
df_2 = _read_predictions(_submissions_dir + '/0.89319_dnn_submission_12130240.csv')
df_3 = _read_predictions(_submissions_dir + '/p20mean_submission_1214_0345.csv')
df_4 = _read_predictions(_submissions_dir + '/0.89255_dnn_submission_12130026.csv')
df_5 = _read_predictions(_submissions_dir + '/0.89213_dnn_submission_12100106.csv')
df_6 = _read_predictions(_submissions_dir + '/1st_submission.csv')
df_7 = _read_predictions(_submissions_dir + '/0.89198_dnn_submission_12122349.csv')
df_8 = _read_predictions(_submissions_dir + '/0.89180_dnn_submission_12130053.csv')
df_9 = _read_predictions(r'C:\Users\Administrator\Desktop\Competition\notebooks\0.89419_dnn_submission_12140036(submission ensemble).csv')
df_10 = _read_predictions(_submissions_dir + '/0.89175_dnn_submission_12130142.csv')
df_11 = _read_predictions(_submissions_dir + '/p14mean_submission_1214_0318.csv')
df_12 = _read_predictions(_submissions_dir + '/0.89169_dnn_submission_12111512.csv')
df_13 = _read_predictions(_submissions_dir + '/2nd_submission.csv')
df_14 = _read_predictions(_submissions_dir + '/0.89148_dnn_submission_12120154.csv')
df_15 = _read_predictions(_submissions_dir + '/0.89097_dnn_submission_12130301.csv')
df_16 = _read_predictions(_submissions_dir + '/0.89092_dnn_submission_12120037.csv')
df_17 = _read_predictions(r'C:\Users\Administrator\Desktop\Competition\notebooks\0.89387_p1.5mean_submission_1214_1414.csv')
df_20 = _read_predictions(_submissions_dir + '/3rd_submission.csv')
df_25 = _read_predictions(_submissions_dir + '/p1.5mean_submission_1214_0344.csv')

# Files left out of the blend (kept for reference):
#   df_18: 0.88975_dnn_submission_12120201.csv
#   df_19: 0.88954_dnn_submission_12111947.csv
#   df_21: 0.88947_dnn_submission_12111458.csv
#   df_22: 0.88849_dnn_submission_12120123.csv
#   df_23: 0.88845_dnn_submission_12100247.csv
#   df_24: 0.88828_dnn_submission_12120108.csv
#   df_26: 0.88563_dnn_submission_12120220.csv
#   df_27: 0.88511_dnn_submission_12111837.csv




In [None]:
# Sequential blend: start from 0.8 * df_submit + 0.2 * df_2, then fold in
# each further file as 0.6 * running blend + 0.4 * file.
# Each step scales everything blended so far by 0.6, so the last file folded
# in carries weight 0.4 and earlier files carry geometrically less.
# df_18, df_19, df_21-df_24, df_26 and df_27 are left out.
pred = 0.8 * df_submit + 0.2 * df_2
for later_preds in (df_3, df_4, df_5, df_6, df_7, df_8, df_9, df_10,
                    df_11, df_12, df_13, df_14, df_15, df_16, df_17,
                    df_20, df_25):
    pred = 0.6 * pred + 0.4 * later_preds
pred

In [None]:
# Write the blended predictions (first column of `pred`) to a file stamped
# with MMDDhhmm.
t = pd.Timestamp.now()
stamp = f"{t.month:02}{t.day:02}{t.hour:02}{t.minute:02}"
fname = f"dnn_submission_{stamp}.csv"
blend_output = pd.DataFrame({'ID': test_id["ID"], 'STATUS': pred[:, 0]})
blend_output.to_csv(fname, index=False)
print(f"'{fname}' is ready to submit.")