In [1]:
import re

import numpy as np
import pandas as pd
from tqdm import tqdm

from dankypipe import pipe

# Register tqdm's pandas integration (enables the `progress_apply`-style
# methods on pandas objects). Must run once per kernel before they are used.
tqdm.pandas()

def isfloat(x):
    """Return True if ``float(x)`` succeeds, False otherwise.

    Only conversion failures are treated as "not a float":
    ``TypeError`` (e.g. ``None``), ``ValueError`` (e.g. ``"abc"``) and
    ``OverflowError`` (e.g. an int too large for a float). Anything else,
    notably ``KeyboardInterrupt``/``SystemExit``, propagates so a long-running
    cell can still be interrupted.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        return False
    return True
    
def isint(x):
    """Return True if ``int(x)`` succeeds, False otherwise.

    Only conversion failures are treated as "not an int":
    ``TypeError`` (e.g. ``None``), ``ValueError`` (e.g. ``"4.2"`` or NaN) and
    ``OverflowError`` (e.g. infinity). Anything else, notably
    ``KeyboardInterrupt``/``SystemExit``, propagates so a long-running cell
    can still be interrupted.
    """
    try:
        int(x)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

In [2]:
# Read both pickled frames (train first, then test), each ordered by its
# raw `AvSigVersion` column.
train, test = (
    pd.read_pickle(path).sort_values(by='AvSigVersion')
    for path in ('train.pickle', 'test.pickle')
)

# The label column is referred to as `Target` from here on.
train = train.rename(columns={'HasDetections': 'Target'})

In [3]:
def transform(df, n_bits=4, missing_value=34):
    """Expand ``RtpStateBitfield`` into a bit string plus one boolean column per bit.

    Steps:
      1. Null ``RtpStateBitfield`` entries are replaced by ``missing_value``
         (default 34, i.e. binary 100010, whose lowest four bits are 0010).
      2. Every value is converted with ``int(float(value))`` to its binary
         representation, left-padded with zeros to ``n_bits`` characters and
         truncated to the lowest ``n_bits`` bits. The column is overwritten
         with that string.
      3. Columns ``RtpStateBitfield_1`` .. ``RtpStateBitfield_{n_bits}`` are
         added; column ``i`` is True when bit ``i`` is set, counting from the
         least-significant bit (``i == 1``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``RtpStateBitfield`` column. It is NOT modified:
        the function works on a copy, so it is safe to pass a slice of
        another frame.
    n_bits : int, default 4
        Number of low-order bits to keep and expand.
    missing_value : int, default 34
        Value substituted for null entries before the conversion.

    Returns
    -------
    pandas.DataFrame
        A new frame with the rewritten ``RtpStateBitfield`` column and the
        added per-bit columns.
    """
    # Copy first: callers pass column-subset slices, and assigning into those
    # in place would mutate (or warn about mutating) the caller's data.
    df = df.copy()

    df.loc[df['RtpStateBitfield'].isnull(), 'RtpStateBitfield'] = missing_value

    # The 'nan' guards are kept for values whose string form is 'nan'
    # (e.g. a literal 'nan' string), which would break int(float(...)).
    df['RtpStateBitfield'] = [
        bin(int(float(c)))[2:].zfill(n_bits)[-n_bits:] if str(c) != 'nan' else np.nan
        for c in tqdm(df['RtpStateBitfield'], desc='converting to bit string')
    ]

    for i in tqdm(range(1, n_bits + 1), desc='expanding features'):
        cname = f'RtpStateBitfield_{i}'
        # Bit i counts from the right-hand (least-significant) end of the string.
        df[cname] = df['RtpStateBitfield'].apply(
            lambda x: x[n_bits - i] == '1' if str(x) != 'nan' else np.nan
        )

    return df

In [4]:
# Keep only the raw bitfield and the join key; train additionally keeps its label.
cols = ['RtpStateBitfield', 'MachineIdentifier']
train = train[[*cols, 'Target']]
test = test[cols]

In [5]:
# Run the bitfield expansion on train first, then on test.
train, test = map(transform, (train, test))

converting to bit string: 100%|██████████| 8921483/8921483 [00:19<00:00, 456019.65it/s]
expanding features: 100%|██████████| 4/4 [00:20<00:00,  5.04s/it]
converting to bit string: 100%|██████████| 7853253/7853253 [00:16<00:00, 472419.08it/s]
expanding features: 100%|██████████| 4/4 [00:17<00:00,  4.39s/it]


In [6]:
# Fetch the previously uploaded AvSigVersion_float feature and stack its
# train and validate parts into one lookup frame.
avsig = pipe.download_feature('AvSigVersion_float', cache=True)
avsig_combined = pd.concat(
    [avsig['train'], avsig['validate']],
    sort=False,
).sort_values(by='AvSigVersion_float')

# Attach the float version to every train row; the merge must neither drop
# nor add rows.
merged = train.merge(
    avsig_combined,
    on='MachineIdentifier',
    how='inner',
    sort=False,
)
assert len(merged) == len(train)

# Order train by the float version, then discard the helper column.
train = merged.sort_values(by='AvSigVersion_float').drop(columns='AvSigVersion_float')
del merged

In [7]:
# Preview the first five train rows after the transform and re-sort.
train.head(n=5)

Unnamed: 0,RtpStateBitfield,MachineIdentifier,Target,RtpStateBitfield_1,RtpStateBitfield_2,RtpStateBitfield_3,RtpStateBitfield_4
0,1,c3c4bc04dc5f1c7245a862e52634428e,0,True,False,False,False
37,101,60031444d3ec616c6e9084be521faa04,0,True,False,True,False
38,1,d938abff6012c1488b851247a3098160,0,True,False,False,False
39,0,910ddd20c6d334ca03a46d9f0008fe24,1,False,False,False,False
40,1,5e05d22ab9db72ccbc8e41d4bc632f64,0,True,False,False,False


In [8]:
# Preview the first five test rows after the transform.
test.head(n=5)

Unnamed: 0,RtpStateBitfield,MachineIdentifier,RtpStateBitfield_1,RtpStateBitfield_2,RtpStateBitfield_3,RtpStateBitfield_4
7252423,0,ec6910b4d9e0baae203e9819227659ec,False,False,False,False
6804872,0,ddd66992da9cbb12db76d9d874fedf8b,False,False,False,False
6882538,0,e05db268c5f1e48e5fa63de1f39f02d7,False,False,False,False
6856130,1,df81a38177efaac6b95df42ddef504e6,True,False,False,False
2544324,1,52eb832b198099b467d39481a77afcef,True,False,False,False


In [9]:
# Positional cut-off at 70% of the train rows: rows before it are used as the
# training part, rows from it onward as the validation part.
val_idx = int(0.7 * len(train))
val_idx

6245038

In [10]:
# The column holding the bit string is published under the `_str` suffix.
train = train.rename(columns={'RtpStateBitfield': 'RtpStateBitfield_str'})
test = test.rename(columns={'RtpStateBitfield': 'RtpStateBitfield_str'})

# Every column except the join key and the label is uploaded as a feature.
feature_cols = [
    c for c in train.columns
    if c not in ('MachineIdentifier', 'Target')
]

# The context manager guarantees the bar is closed even if the loop aborts.
with tqdm(total=len(feature_cols)) as pbar:
    for c in feature_cols:
        pbar.set_description(c)

        # Positional split at val_idx: leading rows -> train, rest -> validation.
        train_ = train[['MachineIdentifier', c]].iloc[:val_idx, :]
        val_ = train[['MachineIdentifier', c]].iloc[val_idx:, :]
        test_ = test[['MachineIdentifier', c]]

        try:
            pipe.upload_feature(c, (train_, val_, test_), overwrite=True)
        except ValueError as err:
            # Still best-effort (the loop moves on to the next feature), but
            # the failure is reported instead of being silently discarded.
            # NOTE(review): what makes upload_feature raise ValueError is not
            # visible here -- confirm that skipping is the intended reaction.
            pbar.write(f'upload of feature {c!r} failed: {err}')

        pbar.update(1)

RtpStateBitfield_4: 100%|██████████| 5/5 [11:55<00:00, 142.42s/it]  