In [2]:
import pandas as pd
import numpy as np
import os
import gc
import sys
from tqdm import *
import time
import xgboost as xgb
import matplotlib.pyplot as plt
%matplotlib inline
from collections import Counter, defaultdict
import itertools
from sklearn.metrics import accuracy_score

In [3]:
def precise(y,y_pred,k):
    """Score predictions by exponentially decaying relative error.

    Computes ``exp(-k * |y - y_pred| / max(|y|, 1e-15))`` element-wise, so a
    perfect prediction scores 1 and the score decays towards 0 as the
    relative error grows.

    Parameters
    ----------
    y : scalar or array-like
        Reference values.
    y_pred : scalar or array-like
        Predicted values, broadcastable against ``y``.
    k : number
        Decay rate applied to the relative error.

    Returns
    -------
    Same shape as the broadcast of ``y`` and ``y_pred``.
    """
    # Floor on the denominator so that y == 0 does not divide by zero.
    smallest_denominator = 10 ** (-15)
    denominator = np.maximum(np.abs(y), smallest_denominator)
    scaled_error = -k * np.abs(y - y_pred)
    return np.exp(scaled_error / denominator)

In [4]:
# Load the submission template and derive the column-name groups used by the notebook.
sub = pd.read_csv('../data/template_submit_result.csv', parse_dates=['ts'])
col = list(sub.columns)
index = ['ts', 'wtid']
enum_col = ['var016', 'var020', 'var047']
bool_col = ['var053', 'var066']
# Everything that is neither a key column nor listed as enum/bool is a "double" column.
non_double = set(index) | set(enum_col) | set(bool_col)
double_col = [name for name in col if name not in non_double]

In [5]:
# Load the cached per-turbine / per-feature statistics table; rebuild it from the
# raw turbine files (and re-save it) when the cache cannot be read.
try:
    rule = pd.read_csv('../data/rule33.csv')
except (OSError, ValueError):
    # OSError: cache file missing or unreadable. ValueError: pandas' empty-file and
    # parser errors. Anything else (including KeyboardInterrupt) propagates.
    # tqdm reports the elapsed time of the rebuild.
    freq = defaultdict(list)
    path = '../data/dataset/'
    # os.listdir() gives no ordering guarantee, yet the turbine id is derived from
    # the position in the listing; sorting by (length, name) orders numeric folder
    # names numerically whether or not they are zero-padded.
    # NOTE(review): assumes folders are named after consecutive turbine ids starting at 1 - confirm.
    turbine_dirs = sorted(os.listdir(path), key=lambda name: (len(name), name))
    for i, j in tqdm(enumerate(turbine_dirs)):
        f_path = os.path.join(path, j, '201807.csv')
        wtid_v = i + 1
        data = pd.read_csv(f_path, parse_dates=['ts'])

        # Append the template rows whose timestamps are absent from the raw file.
        sub_1 = sub[sub['wtid'] == wtid_v]
        submit = sub_1[~sub_1['ts'].isin(data['ts'])]
        data = pd.concat([data, submit], axis=0)
        data = data.sort_values(by=['ts']).reset_index(drop=True)

        # The first two columns are skipped; the rest are treated as features.
        # `feature` (not `col`) so the module-level column list `col` is not overwritten.
        for feature in data.columns[2:]:
            values = data[feature]
            # NOTE(review): 359000 is a fixed denominator, presumably the expected
            # row count per turbine - the rule thresholds depend on it, confirm before changing.
            mode_ratio = values.value_counts() / 359000
            v_min = values.min()
            v_mean = values.mean()
            v_median = values.median()
            v_max = values.max()
            v_mode = values.mode()[0]

            freq['wtid'].append(wtid_v)
            freq['feature'].append(feature)
            freq['feature_nunique'].append(values.nunique())
            # value_counts() is sorted descending, so the first entry belongs to the mode.
            freq['mode_ratio'].append(mode_ratio.values[0])
            freq['std'].append(values.std())
            freq['min'].append(v_min)
            freq['mean'].append(v_mean)
            freq['median'].append(v_median)
            freq['max'].append(v_max)
            freq['mode'].append(v_mode)
            # Score obtained when every value is predicted by a single constant.
            freq['mean_score'].append(precise(values, v_mean, 100).mean())
            freq['median_score'].append(precise(values, v_median, 100).mean())
            freq['mode_score'].append(precise(values, v_mode, 100).mean())
    rule = pd.DataFrame(freq)
    save_path = '../data/'
    if not os.path.exists(save_path):
        print(save_path)
        os.makedirs(save_path)
    rule.to_csv('../data/rule33.csv', index=False)

In [7]:
# All 68 feature columns, var001 .. var068, in order.
col_1 = ['var{:03d}'.format(n) for n in range(1, 69)]

# Columns handled by the rolling-mean + linear-interpolation branch of the filling loop
# (unless an earlier branch claims them). Order matches the original hand-written list.
_linear_ids = (
    1, 2, 3, 6, 7,
    11, 12, 14, 15, 18, 21,
    22, 24, 27, 28, 29, 30, 31, 33,
    34, 35, 36, 38, 40, 43,
    45, 51, 52, 56, 57, 61, 67,
    68, 9, 17, 37, 48, 55, 62,
)
linear_col = ['var{:03d}'.format(n) for n in _linear_ids]

In [11]:
# Offline validation of interpolation, bucketed by mode frequency.
# Features with mode_ratio <= 0.055 fall into none of the three buckets.
mode_ratio = rule['mode_ratio']
rule1 = rule[mode_ratio > 0.72]
rule2 = rule[(mode_ratio > 0.275) & (mode_ratio <= 0.72)]
rule3 = rule[(mode_ratio > 0.055) & (mode_ratio <= 0.275)]

In [12]:
%%time
# Fill the missing readings turbine by turbine and collect, for every turbine,
# the rows requested by the submission template into `df`.
path = '../data/dataset/'
st = 0  # running total of template rows visited across turbines
print('start shape: {}'.format(sub.shape))
# os.listdir() gives no ordering guarantee, yet the turbine id is derived from the
# position in the listing; sorting by (length, name) orders numeric folder names
# numerically whether or not they are zero-padded.
# NOTE(review): assumes folders are named after consecutive turbine ids starting at 1 - confirm.
turbine_dirs = sorted(os.listdir(path), key=lambda name: (len(name), name))
filled_parts = []  # per-turbine results, concatenated once after the loop
for i, z in tqdm(enumerate(turbine_dirs)):
    f_path = os.path.join(path, z, '201807.csv')
    wtid_v = i + 1
    data = pd.read_csv(f_path, parse_dates=['ts'])

    sub_1 = sub[sub['wtid'] == wtid_v]
    st += len(sub_1)
    # Append the template rows whose timestamps are absent from the raw file so
    # that the fill operations below produce a value for them.
    submit = sub_1[~sub_1['ts'].isin(data['ts'])]
    data = pd.concat([data, submit], axis=0)
    data = data.sort_values(by=['wtid', 'ts']).reset_index(drop=True)

    # Features of this turbine in each mode-frequency bucket.
    mode_rule1 = set(rule1.loc[rule1['wtid'] == wtid_v, 'feature'])
    mode_rule2 = set(rule2.loc[rule2['wtid'] == wtid_v, 'feature'])
    mode_rule3 = set(rule3.loc[rule3['wtid'] == wtid_v, 'feature'])
    # Bucket 3 plus columns that are always routed to the 2-minute median branch.
    median_2min_cols = mode_rule3 | {'var034', 'var056', 'var059', 'var068'}

    # Time-based rolling windows below need the timestamp as index.
    data = data.set_index('ts')
    # Interpolation-based filling. Branch order matters: the first matching rule wins.
    data['var053'] = data['var053'].interpolate()
    for j in col_1:
        if j in mode_rule1:
            # Strongly mode-dominated: nearest neighbour for short gaps, mode for the rest.
            data[j] = data[j].interpolate(method='nearest', limit=6)
            data[j] = data[j].fillna(data[j].value_counts().index[0])
        elif j in mode_rule2:
            data[j] = data[j].rolling('3min').median()
            data[j] = data[j].interpolate(method='nearest', limit=5)
            data[j] = data[j].fillna(data[j].value_counts().index[0])
        elif j in median_2min_cols:
            data[j] = data[j].rolling('2min').median()
            data[j] = data[j].interpolate(method='nearest')
        elif j in ['var016', 'var020', 'var047', 'var066']:
            data[j] = data[j].interpolate(method='nearest', limit=6)
            data[j] = data[j].fillna(0)
        elif j in linear_col:
            # 'min' is the same one-minute offset as the deprecated 'T' alias.
            data[j] = data[j].rolling('min').mean()
            data[j] = data[j].interpolate()
        else:
            data[j] = data[j].interpolate(method='nearest')

    data = data.reset_index()
    # Keep only the rows the template asks for, now carrying filled values.
    sub_1 = sub_1[['ts', 'wtid']].merge(data, on=['ts', 'wtid'], how='left')
    filled_parts.append(sub_1)
    del data; gc.collect()
# Single concat instead of growing `df` inside the loop; empty listing -> empty frame.
df = pd.concat(filled_parts, axis=0) if filled_parts else pd.DataFrame()
print('end shape: {}'.format(df.shape))
print('nan max number: {}'.format(df.isnull().sum().max()))

start shape: (497191, 70)


33it [06:23, 12.02s/it]


end shape: (497191, 70)
nan max number: 0
Wall time: 6min 23s


In [None]:
# Re-attach the filled values to the template keys, preserving the template's row order.
sub = pd.merge(sub[['ts', 'wtid']], df, how='left', on=['ts', 'wtid'])

In [2]:
# Final clean-up of the filled values before export.
sub['var008'] = 0
# var053 was linearly interpolated, so snap it back to non-negative whole numbers.
sub['var053'] = sub['var053'].abs().round()
for int_name in ['var016', 'var020', 'var047', 'var066', 'var053']:
    sub[int_name] = sub[int_name].astype('int32')
# Round every column after the two leading key columns to two decimals.
value_columns = sub.columns[2:]
for col in value_columns:
    sub[col] = sub[col].round(2)

In [2]:
# Write the submission file, creating the output directory on first use.
save_path = '../data/submit/'
directory_missing = not os.path.exists(save_path)
if directory_missing:
    print(save_path)
    os.makedirs(save_path)
submission_file = save_path + 'pink_1.csv'
sub.to_csv(submission_file, index=False)

../data/submi/
