In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from tqdm import tqdm
import gc
import random

import lightgbm as lgb
import re
from sklearn.metrics import *
from sklearn.model_selection import KFold

import warnings
warnings.filterwarnings(action='ignore')

from pycaret.classification import *

import random as python_random
seed = 42
np.random.seed(seed)
python_random.seed(seed)
set_config('seed', seed)

In [2]:
PATH = './data/'

train_err  = pd.read_csv(PATH+'train_err_data.csv')
train_qua = pd.read_csv(PATH+'train_quality_data.csv')
train_prob = pd.read_csv(PATH+'train_problem_data.csv')

test_err  = pd.read_csv(PATH+'test_err_data.csv')
test_qua  = pd.read_csv(PATH+'test_quality_data.csv')

sample_submssion = pd.read_csv(PATH+'sample_submission.csv')

In [3]:
def chg_qua(x):
    if x == 0:
        return 0
    else:
        return 1

In [4]:
train_err['date'] = train_err['time'].astype(str).str.slice(0,8)
train_qua['date'] = train_qua['time'].astype(str).str.slice(0,8)

test_err['date'] = test_err['time'].astype(str).str.slice(0,8)
test_qua['date'] = test_qua['time'].astype(str).str.slice(0,8)

In [5]:
train_qua_0 = train_qua.drop_duplicates()
test_qua_0 = test_qua.drop_duplicates()

train_qua_0.drop(['quality_3','quality_4','time'], axis=1, inplace=True)
test_qua_0.drop(['quality_3','quality_4','time'], axis=1, inplace=True)

train_qua_0.fillna(0, inplace=True)
test_qua_0.fillna(0, inplace=True)

In [6]:
train_err.drop('time', axis=1, inplace=True)
test_err.drop('time', axis=1, inplace=True)

In [7]:
train = train_err.merge(train_qua_0, on=['user_id', 'date', 'fwver'])

In [8]:
test = test_err.merge(test_qua_0, on=['user_id', 'date', 'fwver'])

In [9]:
train_prob1 = train_prob.groupby('user_id').count().reset_index()
train_prob1.time = train_prob1.time.apply(chg_qua)
train_prob1.columns = ['user_id', 'prob']
train_prob1

train = train.merge(train_prob1, on='user_id', how='left').fillna(0)

In [10]:
del train_err, test_err, train_prob, train_prob1, test_qua, train_qua_0, test_qua_0

In [12]:
train = train.drop_duplicates()

In [11]:
test = test.drop_duplicates()

In [None]:
cols = ['quality_0', 'quality_1', 'quality_2', 'quality_5','quality_6', 'quality_7', 'quality_8', 'quality_9', 'quality_10','quality_11', 'quality_12']

for col in cols:
    train_qua_0[col] = train_qua_0[col].apply(chg_qua)
    test_qua_0[col] = test_qua_0[col].apply(chg_qua)

In [None]:
train_qua_1 = train_qua_0.groupby('user_id').sum().reset_index()
test_qua_1 = test_qua_0.groupby('user_id').sum().reset_index()

In [None]:
train = train.merge(train_qua_1, on='user_id', how='left').fillna(0)
test = test.merge(test_qua_1, on='user_id', how='left').fillna(0)

In [None]:
train.to_csv("./train.csv", index=False)
test.to_csv("./test.csv", index=False)

In [None]:
train  = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [None]:
train_err_d1 = train_err.groupby(['user_id','date']).count().groupby('user_id')['time'].max().reset_index()
train_err_d1.columns = ['user_id', 'day_max']
train_err_d1

test_err_d1 = test_err.groupby(['user_id','date']).count().groupby('user_id')['time'].max().reset_index()
test_err_d1.columns = ['user_id', 'day_max']
test_err_d1

train = train.merge(train_err_d1, on='user_id', how='left')
test = test.merge(test_err_d1, on='user_id', how='left')
train

In [None]:
import datetime

train_err['weekday'] = pd.to_datetime(train_err.date).dt.weekday
test_err['weekday'] = pd.to_datetime(test_err.date).dt.weekday

train_err = pd.concat([train_err, pd.get_dummies(train_err['weekday'], prefix='wd')], axis=1)
test_err = pd.concat([test_err, pd.get_dummies(test_err['weekday'], prefix='wd')], axis=1)

train_wd = train_err.groupby('user_id')[['wd_0', 'wd_1', 'wd_2','wd_3', 'wd_4', 'wd_5', 'wd_6']].sum()
test_wd = test_err.groupby('user_id')[['wd_0', 'wd_1', 'wd_2','wd_3', 'wd_4', 'wd_5', 'wd_6']].sum()

train = train.merge(train_wd, on='user_id', how='left').fillna(0)
test = test.merge(test_wd, on='user_id', how='left').fillna(0)

In [None]:
train_err_9 = train_err.groupby('user_id').agg({'err_code_rank':['sum','max']}).reset_index()
train_err_9.columns = ['user_id', 'err_rank_sum', 'err_rank_max']

test_err_9 = test_err.groupby('user_id').agg({'err_code_rank':['sum','max']}).reset_index()
test_err_9.columns = ['user_id', 'err_rank_sum', 'err_rank_max']

train = train.merge(train_err_9, on='user_id', how='left').fillna(0)
test = test.merge(test_err_9, on='user_id', how='left').fillna(0)

In [None]:
train_err.head()

In [None]:
train_err_8 = train_err.groupby('user_id').agg({'date_rank':['sum','max']}).reset_index()
train_err_8.columns = ['user_id', 'date_rank_sum', 'date_rank_max']

test_err_8 = test_err.groupby('user_id').agg({'date_rank':['sum','max']}).reset_index()
test_err_8.columns = ['user_id', 'date_rank_sum', 'date_rank_max']

train = train.merge(train_err_8, on='user_id', how='left').fillna(0)
test = test.merge(test_err_8, on='user_id', how='left').fillna(0)

In [None]:
train_err_7 = train_err.groupby('user_id').agg({'model_fwver_rank':['sum','max']}).reset_index()
train_err_7.columns = ['user_id', 'model_fwver_rank_sum', 'model_fwver_rank_max']

test_err_7 = test_err.groupby('user_id').agg({'model_fwver_rank':['sum','max']}).reset_index()
test_err_7.columns = ['user_id', 'model_fwver_rank_sum', 'model_fwver_rank_max']

train = train.merge(train_err_7, on='user_id', how='left').fillna(0)
test = test.merge(test_err_7, on='user_id', how='left').fillna(0)

In [13]:
display(train.head())
display(test.head())

train.shape, test.shape

Unnamed: 0,user_id,model_nm,fwver,errtype,errcode,date,quality_0,quality_1,quality_2,quality_5,quality_6,quality_7,quality_8,quality_9,quality_10,quality_11,quality_12,prob
0,10000,model_3,05.15.2138,15,1,20201129,0.0,0,0.0,0,0,0,0,0,4,0,0,0.0
1,10000,model_3,05.15.2138,15,1,20201129,0.0,0,0.0,4,0,0,0,0,4,0,0,0.0
2,10000,model_3,05.15.2138,11,1,20201129,0.0,0,0.0,0,0,0,0,0,4,0,0,0.0
3,10000,model_3,05.15.2138,11,1,20201129,0.0,0,0.0,4,0,0,0,0,4,0,0,0.0
4,10000,model_3,05.15.2138,12,1,20201129,0.0,0,0.0,0,0,0,0,0,4,0,0,0.0


Unnamed: 0,user_id,model_nm,fwver,errtype,errcode,date,quality_0,quality_1,quality_2,quality_5,quality_6,quality_7,quality_8,quality_9,quality_10,quality_11,quality_12
0,30000,model_2,04.33.1261,26,1,20201128,0.0,0,0.0,0,0,5,0,0,2,0,0
1,30000,model_2,04.33.1261,26,1,20201128,0.0,0,0.0,2,5,5,0,0,2,0,0
4,30000,model_2,04.33.1261,40,1,20201128,0.0,0,0.0,0,0,5,0,0,2,0,0
5,30000,model_2,04.33.1261,40,1,20201128,0.0,0,0.0,2,5,5,0,0,2,0,0
6,30000,model_2,04.33.1261,40,0,20201128,0.0,0,0.0,0,0,5,0,0,2,0,0


((2411111, 18), (2131719, 17))

In [None]:
train.info()

In [None]:
plt.figure(figsize=(8,8))
sns.heatmap(train_qua_1.corr(), annot=True, fmt='.2f', cmap='RdYlGn_r')

In [None]:
plt.figure(figsize=(12,12))
sns.heatmap(train.corr(), annot=True, fmt='.2f', cmap='RdYlGn_r')

In [None]:
test.columns

In [None]:
train

In [None]:
test.info()

In [None]:
cols = test.columns[1:]
cols = list(cols)

In [None]:
min = train[cols].min()
max  = train[cols].max()

In [None]:
for i, col in enumerate(cols):
    train[col] = (train[col] - min[i]) / (max[i] - min[i])
    test[col] = (test[col] - min[i]) / (max[i] - min[i])

In [None]:
train_prob1 = train_prob.groupby('user_id').count().reset_index()
train_prob1.time = train_prob1.time.apply(chg_qua)
train_prob1.columns = ['user_id', 'prob']
train_prob1

train = train.merge(train_prob1, on='user_id', how='left').fillna(0)
train

train.prob = train.prob.astype(int)

In [None]:
train.info()

In [None]:
train.columns[1:-1]

In [14]:
%%time
clf = setup(session_id=seed, data=train, target='prob'
           #, numeric_features=train.columns[1:-1]
           #, ignore_features=['err_rank_sum']
           )

IntProgress(value=0, description='Processing: ', max=13)

Text(value="Following data types have been inferred automatically, if they are correct press enter to continue…

Unnamed: 0,Data Type
user_id,Numeric
model_nm,Categorical
fwver,Categorical
errtype,Numeric
errcode,Categorical
date,Numeric
quality_0,Numeric
quality_1,Numeric
quality_2,Numeric
quality_5,Categorical





MemoryError: Unable to allocate 214. GiB for an array with shape (2411111, 11925) and data type float64

In [15]:
best = compare_models(sort = 'AUC', n_select = 5
                     ,exclude=['lr', 'knn', 'nb', 'dt', 'svm', 'rbfsvm', 'ridge','qda']
                     )

NameError: name 'y' is not defined

In [None]:
blended = blend_models(estimator_list = best, fold = 5, method = 'soft')

In [None]:
pred_holdout = predict_model(blended)

In [None]:
%%time
final_model = finalize_model(blended)

In [None]:
test_x = sample_submssion[['user_id']].merge(test, on='user_id', how='left').fillna(0)
test_x.shape

In [None]:
predictions = predict_model(final_model, data = test_x)

In [None]:
sample_submssion['problem'] = predictions['Score']
sample_submssion.head()

In [None]:
sample_submssion.to_csv("./submission/submission_20210201-2.csv", index = False)
sample_submssion