In [2]:
import re

import numpy as np
import pandas as pd
from tqdm import tqdm

from dankypipe import pipe

tqdm.pandas()

def isfloat(x):
    try:
        float(x)
        return True
    except:
        return False
    
def isint(x):
    try:
        int(x)
        return True
    except:
        return False

In [3]:
train = pd.read_pickle('train.pickle').sort_values(by='AvSigVersion')
test = pd.read_pickle('test.pickle').sort_values(by='AvSigVersion')

train = train.rename(columns={'HasDetections':'Target'})

In [12]:
def transform(df):
    df = df.copy()
    df['MajorEngineVersion'] = df.EngineVersion.apply(lambda x: int(x.split('.')[2]))
    df['MinorEngineVersion'] = df.EngineVersion.apply(lambda x: int(x.split('.')[3]))
    df['EngineVersion_float'] = df.EngineVersion.apply(lambda x: float('.'.join(x.split('.')[2:])))

    df['MajorAppVersion'] = df.AppVersion.apply(lambda x: int(x.split('.')[1]))
    df['MinorAppVersion'] = df.AppVersion.apply(lambda x: x.split('.')[2])

    mlen = np.max([len(v) for v in df['MinorAppVersion']])
    df['MinorAppVersion'] = df.MinorAppVersion.apply(lambda x: int(f'1{x.zfill(mlen)}'))

    df['FinestAppVersion'] = df.AppVersion.apply(lambda x: x.split('.')[3])

    mlen = np.max([len(v) for v in df['FinestAppVersion']])
    df['FinestAppVersion'] = df.FinestAppVersion.apply(lambda x: int(f'1{x.zfill(mlen)}'))

    df['AppVersion_float'] = [
        float(f'{t[0]}.{t[1]}{t[2]}') for t in df[
            ['MajorAppVersion', 'MinorAppVersion', 'FinestAppVersion']
        ].itertuples()
    ]
    
    def intx(x, i):
        x = x.split('.')[i]
        return int(x) if isint(x) else np.nan
    
    def floatx(x, i, j=None):
        if j is not None:
            x = '.'.join(x.split('.')[i:j])
        else:
             x = '.'.join(x.split('.')[i:])
                          
        return float(x) if isfloat(x) else np.nan
    
    
    df['MajorAvSigVersion'] = df.AvSigVersion.apply(intx, i=1)
    df['MinorAvSigVersion'] = df.AvSigVersion.apply(intx, i=2)
    df['AvSigVersion_float'] = df.AvSigVersion.apply(floatx, i=1, j=3)

    df['Census_MajorOSVersion'] = df.Census_OSVersion.apply(intx, i=2)
    df['Census_MinorOSVersion'] = df.Census_OSVersion.apply(intx, i=3)
    df['Census_OSVersion_float'] = df.Census_OSVersion.apply(floatx, i=2)
    
    return df

In [4]:
cols  = ['EngineVersion', 'AppVersion', 'AvSigVersion', 'Census_OSVersion', 'MachineIdentifier']
train = train[cols+['Target']]
test  = test[cols]

In [13]:
train = transform(train)
test  = transform(test)

In [19]:
train.head()

Unnamed: 0,MachineIdentifier,Target,MajorEngineVersion,MinorEngineVersion,EngineVersion_float,MajorAppVersion,MinorAppVersion,FinestAppVersion,AppVersion_float,MajorAvSigVersion,MinorAvSigVersion,AvSigVersion_float,Census_MajorOSVersion,Census_MinorOSVersion,Census_OSVersion_float
6822125,c3c4bc04dc5f1c7245a862e52634428e,0,15100,1,15100.1,9,110586,101106,6822126.0,0.0,0,0.0,10586,1176,10586.1176
7285638,d106fcb0c6482265956c05ffbaf60744,0,15100,1,15100.1,12,116299,100015,7285638.0,0.0,0,0.0,16299,371,16299.371
5050150,90eeb2d77a5f58c0afe71de24f29bb50,0,15200,1,15200.1,9,110586,101106,5050151.0,0.0,0,0.0,10586,1176,10586.1176
165482,04c1c463cbb6e2bfae34c4c66fd3242c,0,13701,0,13701.0,13,117134,100228,165482.1,0.0,0,0.0,17134,285,17134.285
735046,151dd3600408f025207073d09cbc6d5d,0,15200,1,15200.1,9,110586,101106,735046.9,0.0,0,0.0,10586,1176,10586.1176


In [20]:
test.head()

Unnamed: 0,MachineIdentifier,MajorEngineVersion,MinorEngineVersion,EngineVersion_float,MajorAppVersion,MinorAppVersion,FinestAppVersion,AppVersion_float,MajorAvSigVersion,MinorAvSigVersion,AvSigVersion_float,Census_MajorOSVersion,Census_MinorOSVersion,Census_OSVersion_float
7252423,ec6910b4d9e0baae203e9819227659ec,15400,5,15400.5,13,117134,100001,7252423.0,0,0,0.0,17134,1,17134.1
6804872,ddd66992da9cbb12db76d9d874fedf8b,14500,5,14500.5,13,117134,100320,6804872.0,0,0,0.0,17134,286,17134.286
6882538,e05db268c5f1e48e5fa63de1f39f02d7,14600,4,14600.4,13,117134,100228,6882538.0,0,0,0.0,17134,285,17134.285
6856130,df81a38177efaac6b95df42ddef504e6,15400,4,15400.4,18,101809,100002,6856130.0,0,0,0.0,17134,345,17134.345
2544324,52eb832b198099b467d39481a77afcef,15400,4,15400.4,18,101807,118075,2544324.0,0,0,0.0,17763,55,17763.55


In [21]:
val_idx = int(len(train)*.7)
val_idx

6245038

In [18]:
train.drop(columns=['EngineVersion', 'AppVersion', 'Census_OSVersion', 'AvSigVersion'], inplace=True, errors='ignore')
test.drop(columns=['EngineVersion', 'AppVersion', 'Census_OSVersion', 'AvSigVersion'], inplace=True, errors='ignore')

In [22]:
pbar = tqdm(total=len(train.columns)-2)

for c in train.columns:
    if c == 'MachineIdentifier' or c == 'Target':
        continue
        
    pbar.set_description(c)
    
    train_ = train[['MachineIdentifier', c]].iloc[:val_idx, :]
    val_   = train[['MachineIdentifier', c]].iloc[val_idx:, :]
    test_  =  test[['MachineIdentifier', c]]
    
    try:
        pipe.upload_feature(c, (train_, val_, test_))
    except ValueError:
        pass
    
    pbar.update(1)


  0%|          | 0/13 [00:00<?, ?it/s][A
MajorEngineVersion:   0%|          | 0/13 [00:00<?, ?it/s][A
MajorEngineVersion:   8%|▊         | 1/13 [1:12:10<14:26:05, 4330.43s/it][A
MinorEngineVersion:   8%|▊         | 1/13 [1:12:10<14:26:05, 4330.43s/it][A
MinorEngineVersion:  15%|█▌        | 2/13 [2:29:29<13:30:52, 4422.95s/it][A
EngineVersion_float:  15%|█▌        | 2/13 [2:29:29<13:30:52, 4422.95s/it][A
EngineVersion_float:  23%|██▎       | 3/13 [3:49:44<12:36:46, 4540.65s/it][A
MajorAppVersion:  23%|██▎       | 3/13 [3:49:44<12:36:46, 4540.65s/it]    [A
MajorAppVersion:  31%|███       | 4/13 [5:09:55<11:33:15, 4621.68s/it][A
MinorAppVersion:  31%|███       | 4/13 [5:09:55<11:33:15, 4621.68s/it][A
MinorAppVersion:  38%|███▊      | 5/13 [5:57:24<9:05:18, 4089.82s/it] [A
FinestAppVersion:  38%|███▊      | 5/13 [5:57:24<9:05:18, 4089.82s/it][A
FinestAppVersion:  46%|████▌     | 6/13 [6:29:58<6:42:24, 3449.16s/it][A
AppVersion_float:  46%|████▌     | 6/13 [6:29:58<6:42:24, 34