In [1]:
import re

import numpy as np
import pandas as pd
from tqdm import tqdm

from dankypipe import pipe

def isfloat(x):
    """Return True if ``float(x)`` succeeds, False otherwise.

    Only the errors ``float`` raises for unsuitable input (``TypeError``,
    ``ValueError``, ``OverflowError``) count as "not a float". Anything else,
    such as ``KeyboardInterrupt``, propagates to the caller instead of being
    silently turned into ``False``.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        return False
    return True
    
def isint(x):
    """Return True if ``int(x)`` succeeds, False otherwise.

    Only the errors ``int`` raises for unsuitable input (``TypeError``,
    ``ValueError``, ``OverflowError``) count as "not an int". Anything else,
    such as ``KeyboardInterrupt``, propagates to the caller instead of being
    silently turned into ``False``.
    """
    try:
        int(x)
    except (TypeError, ValueError, OverflowError):
        return False
    return True
    
# Lower-cased form-factor / chassis-type name -> numeric code.
# Codes 1-36 follow the SMBIOS "System Enclosure or Chassis Types" table;
# 0 ('invalid') is this notebook's own sentinel, the value transform() writes
# for entries it cannot recognise.
ff_int = {
    'invalid': 0,
    'other': 1,
    'unknown': 2,
    'desktop': 3,
    'lowprofiledesktop': 4,
    'pizzabox': 5,
    'minitower': 6,
    'tower': 7,
    'portable': 8,
    'laptop': 9,
    'notebook': 10,
    'handheld': 11,
    'dockingstation': 12,
    'allinone': 13,
    'subnotebook': 14,
    'spacesaving': 15,
    'lunchbox': 16,
    'mainserverchassis': 17,
    'expansionchassis': 18,
    'subchassis': 19,
    'busexpansionchassis': 20,
    'peripheralchassis': 21,
    'raidchassis': 22,
    'rackmountchassis': 23,
    'sealedcasepc': 24,
    'multisystemchassis': 25,
    'compactpci': 26,
    'advancedtca': 27,
    'blade': 28,
    'bladeenclosure': 29,
    'tablet': 30,
    'convertible': 31,
    'detachable': 32,
    'iotgateway': 33,
    'embeddedpc': 34,
    'minipc': 35,
    'stickpc': 36,
}
# Reverse lookup: numeric code -> lower-cased name.
int_ff = {code: name for name, code in ff_int.items()}

In [2]:
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
# NOTE(review): if AvSigVersion is stored as plain strings, this ordering is
# lexicographic rather than numeric per version component — confirm.
train = (
    pd.read_pickle('train.pickle')
    .sort_values(by='AvSigVersion')
    .rename(columns={'HasDetections': 'Target'})
)
test = pd.read_pickle('test.pickle').sort_values(by='AvSigVersion')

In [3]:
def _is_known_form_factor(value):
    """Return True if ``value`` is a known form-factor name or numeric code.

    A value is accepted when it parses as an integer found in ``int_ff`` or
    when it is a string found in ``ff_int``.
    """
    # transform() passes lower-cased strings, so a numeric code such as '30'
    # must go through int() before it can match int_ff's integer keys.
    return (isint(value) and int(value) in int_ff) or (
        isinstance(value, str) and value in ff_int
    )


def _validity_flags(column, desc):
    """Return a boolean array flagging which entries of ``column`` are known form factors."""
    # Classify each distinct value once; the per-row pass is then a dict lookup.
    known = {value: _is_known_form_factor(value) for value in column.unique()}
    flags = [known[value] for value in tqdm(column, desc=desc)]
    # Explicit bool dtype keeps ``~`` well-defined even when the frame is empty.
    return np.asarray(flags, dtype=bool)


def transform(df):
    """Validate the two form-factor columns and flag rows where they agree.

    Works on a copy of ``df`` and returns it with:

    * ``Census_MDC2FormFactor`` / ``Census_ChassisTypeName`` cast to str,
      lower-cased, unknown values replaced by ``'invalid'``, and finally
      converted to ``category`` dtype;
    * ``Census_MDC2_FormFactor_isValid`` / ``Census_ChassisTypeName_isValid``:
      whether the lower-cased value was a known name (``ff_int``) or a known
      numeric code (``int_ff``);
    * ``Census_FFMatch``: True when the two cleaned values are equal or when
      any 5-character window of the form factor occurs in the chassis type.
    """
    df = df.copy()
    for col in ('Census_ChassisTypeName', 'Census_MDC2FormFactor'):
        df[col] = df[col].astype(str).str.lower()

    df['Census_MDC2_FormFactor_isValid'] = _validity_flags(
        df['Census_MDC2FormFactor'],
        'checking for valid ff in form factor',
    )
    df['Census_ChassisTypeName_isValid'] = _validity_flags(
        df['Census_ChassisTypeName'],
        'checking for valid ff in chassis type',
    )

    df.loc[~df['Census_MDC2_FormFactor_isValid'], 'Census_MDC2FormFactor'] = 'invalid'
    df.loc[~df['Census_ChassisTypeName_isValid'], 'Census_ChassisTypeName'] = 'invalid'

    # Every 5-character window of each distinct form-factor value. Values
    # shorter than 5 characters (e.g. numeric codes) yield an empty set and
    # are matched by the equality test below instead.
    step = 5
    subsets = {
        s: {s[i:i + step] for i in range(len(s) - step + 1)}
        for s in df['Census_MDC2FormFactor'].unique()
    }

    df['Census_FFMatch'] = np.asarray(
        [
            y == z or any(x in z for x in subsets[y])
            for y, z in tqdm(
                zip(df['Census_MDC2FormFactor'], df['Census_ChassisTypeName']),
                desc='checking ff matches',
                total=len(df)
            )
        ],
        dtype=bool,
    )

    df['Census_MDC2FormFactor'] = df['Census_MDC2FormFactor'].astype('category')
    df['Census_ChassisTypeName'] = df['Census_ChassisTypeName'].astype('category')
    return df

In [4]:
# Keep only the two form-factor columns and the row identifier
# (plus the label column for the training frame).
cols = ['Census_MDC2FormFactor', 'Census_ChassisTypeName', 'MachineIdentifier']
test = test.loc[:, cols]
train = train.loc[:, cols + ['Target']]

In [5]:
# Run the form-factor validation / matching on the training frame.
train = train.pipe(transform)

checking for valid ff in form factor: 100%|██████████| 8921483/8921483 [00:14<00:00, 616050.04it/s]
checking for valid ff in chassis type: 100%|██████████| 8921483/8921483 [00:14<00:00, 605302.77it/s]
checking ff matches: 100%|██████████| 8921483/8921483 [00:12<00:00, 703480.91it/s]


In [6]:
# Run the same validation / matching on the test frame.
test = test.pipe(transform)

checking for valid ff in form factor: 100%|██████████| 7853253/7853253 [00:12<00:00, 618757.16it/s]
checking for valid ff in chassis type: 100%|██████████| 7853253/7853253 [00:12<00:00, 620501.97it/s]
checking ff matches: 100%|██████████| 7853253/7853253 [00:10<00:00, 714860.01it/s]


In [14]:
# The two cleaned columns get a "_validated" suffix in both frames.
validated_names = {
    'Census_MDC2FormFactor': 'Census_MDC2FormFactor_validated',
    'Census_ChassisTypeName': 'Census_ChassisTypeName_validated',
}
train = train.rename(columns=validated_names)
test = test.rename(columns=validated_names)

In [15]:
# Preview the first five rows of the training frame.
train.head(5)

Unnamed: 0,Census_MDC2FormFactor_validated,Census_ChassisTypeName_validated,MachineIdentifier,Target,Census_MDC2_FormFactor_isValid,Census_ChassisTypeName_isValid,Census_FFMatch
6822125,notebook,notebook,c3c4bc04dc5f1c7245a862e52634428e,0,True,True,True
7285638,notebook,notebook,d106fcb0c6482265956c05ffbaf60744,0,True,True,True
5050150,desktop,desktop,90eeb2d77a5f58c0afe71de24f29bb50,0,True,True,True
165482,desktop,minitower,04c1c463cbb6e2bfae34c4c66fd3242c,0,True,True,False
735046,desktop,desktop,151dd3600408f025207073d09cbc6d5d,0,True,True,True


In [16]:
# Preview the first five rows of the test frame.
test.head(5)

Unnamed: 0,Census_MDC2FormFactor_validated,Census_ChassisTypeName_validated,MachineIdentifier,Census_MDC2_FormFactor_isValid,Census_ChassisTypeName_isValid,Census_FFMatch
7252423,notebook,portable,ec6910b4d9e0baae203e9819227659ec,True,True,False
6804872,desktop,desktop,ddd66992da9cbb12db76d9d874fedf8b,True,True,True
6882538,allinone,allinone,e05db268c5f1e48e5fa63de1f39f02d7,True,True,True
6856130,desktop,desktop,df81a38177efaac6b95df42ddef504e6,True,True,True
2544324,desktop,desktop,52eb832b198099b467d39481a77afcef,True,True,True


In [10]:
# Row position that splits `train` 70 / 30: rows before it go to the training
# part, rows from it onward to the validation part.
train_fraction = .7
val_idx = int(train_fraction * len(train))
val_idx

6245038

In [17]:
# Upload every engineered column (everything except the identifier and the
# label) as a separate feature, each split into train / validation / test.
feature_cols = [
    c for c in train.columns if c not in ('MachineIdentifier', 'Target')
]
# Columns whose upload raised ValueError, mapped to the error, for inspection.
failed_uploads = {}

# The context manager closes the progress bar even if an upload raises.
with tqdm(total=len(feature_cols)) as pbar:
    for c in feature_cols:
        pbar.set_description(c)

        train_ = train[['MachineIdentifier', c]].iloc[:val_idx, :]
        val_   = train[['MachineIdentifier', c]].iloc[val_idx:, :]
        test_  =  test[['MachineIdentifier', c]]

        try:
            pipe.upload_feature(c, (train_, val_, test_))
        except ValueError as err:
            # Still best-effort (the loop continues), but the failure is
            # recorded and reported rather than silently dropped.
            # NOTE(review): presumably raised for an already-uploaded
            # feature — confirm against dankypipe.
            failed_uploads[c] = err
            tqdm.write('upload of {!r} failed: {}'.format(c, err))

        pbar.update(1)

Census_FFMatch: 100%|██████████| 5/5 [6:12:42<00:00, 4357.17s/it]                    