In [1]:
import re

import multiprocessing as mul
from multiprocessing.dummy import Pool
import numpy as np
import pandas as pd
from tqdm import tqdm

from dankypipe import pipe

tqdm.pandas()

def isfloat(x):
    try:
        float(x)
        return True
    except:
        return False
    
def isint(x):
    try:
        int(x)
        return True
    except:
        return False

In [2]:
train = pd.read_pickle('train.pickle').sort_values(by='AvSigVersion')
test = pd.read_pickle('test.pickle').sort_values(by='AvSigVersion')

train = train.rename(columns={'HasDetections':'Target'})

In [3]:
def transform(df):
    df = df.copy()
    df['MajorEngineVersion'] = df.EngineVersion.apply(lambda x: int(x.split('.')[2]))
    df['MinorEngineVersion'] = df.EngineVersion.apply(lambda x: int(x.split('.')[3]))
    df['EngineVersion_float'] = df.EngineVersion.apply(lambda x: float('.'.join(x.split('.')[2:])))

    df['MajorAppVersion'] = df.AppVersion.apply(lambda x: int(x.split('.')[1]))
    df['MinorAppVersion'] = df.AppVersion.apply(lambda x: x.split('.')[2])

    mlen = np.max([len(v) for v in df['MinorAppVersion']])
    df['MinorAppVersion'] = df.MinorAppVersion.apply(lambda x: int(f'1{x.zfill(mlen)}'))

    df['FinestAppVersion'] = df.AppVersion.apply(lambda x: x.split('.')[3])

    mlen = np.max([len(v) for v in df['FinestAppVersion']])
    df['FinestAppVersion'] = df.FinestAppVersion.apply(lambda x: int(f'1{x.zfill(mlen)}'))

    df['AppVersion_float'] = [
        float(f'{t[0]}.{t[1]}{t[2]}') for t in df[
            ['MajorAppVersion', 'MinorAppVersion', 'FinestAppVersion']
        ].itertuples()
    ]
    
    def intx(x, i):
        x = x.split('.')[i]
        return int(x) if isint(x) else np.nan
    
    def floatx(x, i, j=None):
        if j is not None:
            x = '.'.join(x.split('.')[i:j])
        else:
             x = '.'.join(x.split('.')[i:])
                          
        return float(x) if isfloat(x) else np.nan
    
    
    df['MajorAvSigVersion'] = df.AvSigVersion.apply(intx, i=1)
    df['MinorAvSigVersion'] = df.AvSigVersion.apply(intx, i=2)
    df['AvSigVersion_float'] = df.AvSigVersion.apply(floatx, i=1, j=3)

    df['Census_MajorOSVersion'] = df.Census_OSVersion.apply(intx, i=2)
    df['Census_MinorOSVersion'] = df.Census_OSVersion.apply(intx, i=3)
    df['Census_OSVersion_float'] = df.Census_OSVersion.apply(floatx, i=2)
    
    return df

In [4]:
cols  = ['EngineVersion', 'AppVersion', 'AvSigVersion', 'Census_OSVersion', 'MachineIdentifier']
train = train[cols+['Target']]
test  = test[cols]

In [5]:
train = transform(train)
test  = transform(test)

In [6]:
train.drop(columns=['EngineVersion', 'AppVersion', 'Census_OSVersion', 'AvSigVersion'], inplace=True, errors='ignore')
test.drop(columns=['EngineVersion', 'AppVersion', 'Census_OSVersion', 'AvSigVersion'], inplace=True, errors='ignore')

In [7]:
train = train.sort_values(by='AvSigVersion_float')
test  = test.sort_values(by='AvSigVersion_float')

In [8]:
train.head()

Unnamed: 0,MachineIdentifier,Target,MajorEngineVersion,MinorEngineVersion,EngineVersion_float,MajorAppVersion,MinorAppVersion,FinestAppVersion,AppVersion_float,MajorAvSigVersion,MinorAvSigVersion,AvSigVersion_float,Census_MajorOSVersion,Census_MinorOSVersion,Census_OSVersion_float
6822125,c3c4bc04dc5f1c7245a862e52634428e,0,15100,1,15100.1,9,110586,101106,6822126.0,0.0,0,0.0,10586,1176,10586.1176
3345950,60031444d3ec616c6e9084be521faa04,0,15200,1,15200.1,13,117134,100001,3345950.0,0.0,0,0.0,17134,1,17134.1
7570941,d938abff6012c1488b851247a3098160,0,14600,4,14600.4,13,117134,100228,7570941.0,0.0,0,0.0,17134,285,17134.285
5054364,910ddd20c6d334ca03a46d9f0008fe24,1,14306,0,14306.0,13,117134,100228,5054364.0,0.0,0,0.0,17134,228,17134.228
3277045,5e05d22ab9db72ccbc8e41d4bc632f64,0,15200,1,15200.1,8,110240,117443,3277046.0,0.0,0,0.0,10240,17443,10240.17443


In [9]:
test.head()

Unnamed: 0,MachineIdentifier,MajorEngineVersion,MinorEngineVersion,EngineVersion_float,MajorAppVersion,MinorAppVersion,FinestAppVersion,AppVersion_float,MajorAvSigVersion,MinorAvSigVersion,AvSigVersion_float,Census_MajorOSVersion,Census_MinorOSVersion,Census_OSVersion_float
7252423,ec6910b4d9e0baae203e9819227659ec,15400,5,15400.5,13,117134,100001,7252423.0,0,0,0.0,17134,1,17134.1
7626698,f899923f0a42719f588608018611c8c3,15400,5,15400.5,12,116299,100015,7626698.0,0,0,0.0,17134,376,17134.376
5153700,a7ebe733cd84fa1e029e641c8f3f14fd,15400,4,15400.4,13,117134,100320,5153700.0,0,0,0.0,17134,345,17134.345
6444125,d2137b74e1993cbaf04b365b0130df53,15300,6,15300.6,13,117134,100320,6444125.0,0,0,0.0,17134,320,17134.32
3735425,79bc7126412e62b121be8ad4d5f4097d,14600,4,14600.4,12,117007,118022,3735425.0,0,0,0.0,16299,431,16299.431


In [10]:
val_idx = int(len(train)*.7)
val_idx

6245038

In [11]:
pbar = tqdm(total=len(train.columns)-2)

#for c in train.columns:
def fx(c):
    if c == 'MachineIdentifier' or c == 'Target':
        return
    
    train_ = train[['MachineIdentifier', c]].iloc[:val_idx, :]
    val_   = train[['MachineIdentifier', c]].iloc[val_idx:, :]
    test_  =  test[['MachineIdentifier', c]]
    
    try:
        pipe.upload_feature(c, (train_, val_, test_), overwrite=False)
    except ValueError:
        pass
    
    pbar.update(1)
    
pool = Pool(mul.cpu_count())
pool.map(fx, train.columns.tolist())
pool.close()
pool.join()

100%|██████████| 13/13 [16:18<00:00, 23.44s/it]  