In [1]:
import re

import numpy as np
import pandas as pd
from tqdm import tqdm

from dankypipe import pipe

def isfloat(x):
    """Return True when ``float(x)`` succeeds, False otherwise.

    Note that strings such as ``'nan'`` and ``'inf'`` are accepted by
    ``float`` and therefore count as floats here.

    Only the errors a failed conversion raises are treated as "not a float";
    anything else (e.g. ``KeyboardInterrupt``) propagates to the caller.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported type (None, list, ...)
        # ValueError: unparsable string
        # OverflowError: integer too large to be represented as a float
        return False
    return True
    
def isint(x):
    """Return True when ``int(x)`` succeeds, False otherwise.

    Strings must be integer literals (``'30'`` passes, ``'3.5'`` does not).

    Only the errors a failed conversion raises are treated as "not an int";
    anything else (e.g. ``KeyboardInterrupt``) propagates to the caller.
    """
    try:
        int(x)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported type (None, list, ...)
        # ValueError: unparsable string, or float NaN
        # OverflowError: float infinity
        return False
    return True
    
# Lower-cased chassis / form-factor name -> integer code.
# Codes 1-36 follow the SMBIOS "System Enclosure or Chassis Types"
# enumeration (1 = Other ... 36 = Stick PC); 0 is this notebook's own
# sentinel for values that could not be recognised.
# The codes are contiguous: a gap or a duplicate would corrupt the inverse
# table `int_ff` below.
ff_int = {
    'invalid': 0,
    'other': 1,
    'unknown': 2,
    'desktop': 3,
    'lowprofiledesktop': 4,
    'pizzabox': 5,
    'minitower': 6,
    'tower': 7,
    'portable': 8,
    'laptop': 9,
    'notebook': 10,
    'handheld': 11,
    'dockingstation': 12,
    'allinone': 13,
    'subnotebook': 14,
    'spacesaving': 15,
    'lunchbox': 16,
    'mainserverchassis': 17,
    'expansionchassis': 18,
    'subchassis': 19,
    'busexpansionchassis': 20,
    'peripheralchassis': 21,
    'raidchassis': 22,
    'rackmountchassis': 23,
    'sealedcasepc': 24,
    'multisystemchassis': 25,
    'compactpci': 26,
    'advancedtca': 27,
    'blade': 28,
    'bladeenclosure': 29,
    'tablet': 30,
    'convertible': 31,
    'detachable': 32,
    'iotgateway': 33,
    'embeddedpc': 34,
    'minipc': 35,
    'stickpc': 36
}
# Inverse lookup: integer code -> lower-cased name.
int_ff = { v:k for k, v in ff_int.items() }

In [2]:
# Load both splits ordered by AvSigVersion. The label column of the training
# split is exposed as `Target` from here on.
# NOTE: unpickling can execute arbitrary code -- only read trusted files.
train = (
    pd.read_pickle('train.pickle')
    .sort_values(by='AvSigVersion')
    .rename(columns={'HasDetections': 'Target'})
)
test = pd.read_pickle('test.pickle').sort_values(by='AvSigVersion')

In [3]:
def _is_known_form_factor(value):
    """Return True when `value` is a known form-factor name or numeric code.

    `value` is expected to be a lower-cased string. It is accepted when it is
    a key of `ff_int`, or when it parses as an integer that is a key of
    `int_ff`.
    """
    if value in ff_int:
        return True
    # The columns are cast to str before this is called, so a numeric code
    # arrives as e.g. '30' and must be converted before the lookup.
    return isint(value) and int(value) in int_ff


def _substrings(text, size):
    """Return the set of all length-`size` substrings of `text`.

    A non-empty text shorter than `size` yields itself, so short values can
    still match. An empty text yields an empty set, because '' is a substring
    of every string.
    """
    if not text:
        return set()
    if len(text) <= size:
        return {text}
    return {text[i:i + size] for i in range(len(text) - size + 1)}


def transform(df, step=5):
    """Clean the two form-factor columns and derive validity/match features.

    Works on a copy of `df`, which must contain the columns
    `Census_MDC2FormFactor` and `Census_ChassisTypeName`.

    Steps:
      1. Cast both columns to lower-cased strings.
      2. Add `Census_MDC2_FormFactor_isValid` and
         `Census_ChassisTypeName_isValid`: True when the value is a known
         name (`ff_int`) or a known numeric code (`int_ff`).
      3. Replace every value that is not valid with 'invalid'.
      4. Add `Census_FFMatch`: True when any length-`step` substring of the
         form factor occurs in the chassis type name.
      5. Convert both cleaned columns to the 'category' dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    step : int, default 5
        Length of the substrings used for the fuzzy match. Expected to be a
        positive integer.

    Returns
    -------
    pandas.DataFrame
        The transformed copy.
    """
    df = df.copy()
    for col in ('Census_ChassisTypeName', 'Census_MDC2FormFactor'):
        df[col] = df[col].astype(str).str.lower()

    # Explicit bool dtype keeps the `~` masks below well defined even for an
    # empty frame.
    df['Census_MDC2_FormFactor_isValid'] = np.array(
        [
            _is_known_form_factor(x)
            for x in tqdm(
                df['Census_MDC2FormFactor'],
                desc='checking for valid ff in form factor'
            )
        ],
        dtype=bool
    )

    df['Census_ChassisTypeName_isValid'] = np.array(
        [
            _is_known_form_factor(x)
            for x in tqdm(
                df['Census_ChassisTypeName'],
                desc='checking for valid ff in chassis type'
            )
        ],
        dtype=bool
    )

    df.loc[~df['Census_MDC2_FormFactor_isValid'], 'Census_MDC2FormFactor'] = 'invalid'
    df.loc[~df['Census_ChassisTypeName_isValid'], 'Census_ChassisTypeName'] = 'invalid'

    # Pre-compute the substrings once per distinct form factor instead of
    # once per row.
    subsets = {
        s: _substrings(s, step)
        for s in df['Census_MDC2FormFactor'].unique()
    }

    df['Census_FFMatch'] = np.array(
        [
            any(part in chassis for part in subsets[form_factor])
            for form_factor, chassis in tqdm(
                zip(df['Census_MDC2FormFactor'], df['Census_ChassisTypeName']),
                desc='checking ff matches',
                total=len(df)
            )
        ],
        dtype=bool
    )

    df['Census_MDC2FormFactor'] = df['Census_MDC2FormFactor'].astype('category')
    df['Census_ChassisTypeName'] = df['Census_ChassisTypeName'].astype('category')
    return df

In [4]:
# Keep only the columns this notebook works with. `Target` exists only in
# the training split.
cols = [
    'Census_MDC2FormFactor',
    'Census_ChassisTypeName',
    'MachineIdentifier',
]
train = train[[*cols, 'Target']]
test = test[cols]

In [5]:
# Run the form-factor cleaning step on the training frame; the result replaces `train`.
train = transform(train)

checking for valid ff in form factor: 100%|██████████| 8921483/8921483 [00:19<00:00, 460861.14it/s]
checking for valid ff in chassis type: 100%|██████████| 8921483/8921483 [00:19<00:00, 466603.59it/s]
checking ff matches: 100%|██████████| 8921483/8921483 [00:15<00:00, 587706.58it/s]


In [6]:
# Run the same cleaning step on the test frame; the result replaces `test`.
test  = transform(test)

checking for valid ff in form factor: 100%|██████████| 7853253/7853253 [00:17<00:00, 453965.74it/s]
checking for valid ff in chassis type: 100%|██████████| 7853253/7853253 [00:17<00:00, 460527.31it/s]
checking ff matches: 100%|██████████| 7853253/7853253 [00:13<00:00, 588461.19it/s]


In [7]:
# The two cleaned columns get a `_clean` suffix. One shared mapping keeps the
# train and test frames in step.
clean_names = {
    'Census_MDC2FormFactor': 'Census_MDC2_FormFactor_clean',
    'Census_ChassisTypeName': 'Census_ChassisTypeName_clean',
}
train = train.rename(columns=clean_names)
test = test.rename(columns=clean_names)

In [8]:
# Fetch the AvSigVersion_float feature and stack its train and validate parts.
avsig = pipe.download_feature('AvSigVersion_float', cache=True)
avsig_combined = pd.concat(
    [avsig[part] for part in ('train', 'validate')],
    sort=False
).sort_values(by='AvSigVersion_float')

# Attach the value to every training row. The inner join must neither drop
# nor duplicate rows, which the length check guards.
merged = train.merge(
    avsig_combined,
    on='MachineIdentifier',
    how='inner',
    sort=False
)
assert len(merged) == len(train)

# Order the training frame by the numeric version, then drop the helper column.
train = merged.sort_values(by='AvSigVersion_float').drop(columns='AvSigVersion_float')
del merged

In [9]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,Census_MDC2_FormFactor_clean,Census_ChassisTypeName_clean,MachineIdentifier,Target,Census_MDC2_FormFactor_isValid,Census_ChassisTypeName_isValid,Census_FFMatch
0,notebook,notebook,c3c4bc04dc5f1c7245a862e52634428e,0,True,True,True
37,notebook,laptop,60031444d3ec616c6e9084be521faa04,0,True,True,False
38,notebook,notebook,d938abff6012c1488b851247a3098160,0,True,True,True
39,desktop,minitower,910ddd20c6d334ca03a46d9f0008fe24,1,True,True,False
40,allinone,desktop,5e05d22ab9db72ccbc8e41d4bc632f64,0,True,True,False


In [10]:
# Preview the first rows of the test frame.
test.head()

Unnamed: 0,Census_MDC2_FormFactor_clean,Census_ChassisTypeName_clean,MachineIdentifier,Census_MDC2_FormFactor_isValid,Census_ChassisTypeName_isValid,Census_FFMatch
7252423,notebook,portable,ec6910b4d9e0baae203e9819227659ec,True,True,False
6804872,desktop,desktop,ddd66992da9cbb12db76d9d874fedf8b,True,True,True
6882538,allinone,allinone,e05db268c5f1e48e5fa63de1f39f02d7,True,True,True
6856130,desktop,desktop,df81a38177efaac6b95df42ddef504e6,True,True,True
2544324,desktop,desktop,52eb832b198099b467d39481a77afcef,True,True,True


In [11]:
# Row position at 70% of `train`; rows before it are uploaded as the train
# part and the rest as the validation part.
val_idx = int(len(train)*.7)
val_idx

6245038

In [12]:
# Upload every derived column as its own feature, split into
# (train, validate, test) parts; the first `val_idx` rows of `train` form the
# train part and the remainder the validate part.
feature_cols = [
    col for col in train.columns
    if col not in ('MachineIdentifier', 'Target')
]

# Features whose upload raised ValueError, mapped to the error, so a skipped
# upload is visible instead of silently ignored.
failed = {}

# The context manager closes the progress bar even if an upload raises.
with tqdm(total=len(feature_cols)) as pbar:
    for c in feature_cols:
        pbar.set_description(c)

        train_ = train[['MachineIdentifier', c]].iloc[:val_idx, :]
        val_   = train[['MachineIdentifier', c]].iloc[val_idx:, :]
        test_  =  test[['MachineIdentifier', c]]

        try:
            pipe.upload_feature(c, (train_, val_, test_))
        except ValueError as err:
            # Best effort: keep going with the remaining features.
            # NOTE(review): presumably raised when the feature already
            # exists upstream -- confirm against dankypipe.
            failed[c] = err
            tqdm.write('skipped {}: {}'.format(c, err))

        pbar.update(1)

Census_FFMatch: 100%|██████████| 5/5 [03:14<00:00, 33.55s/it]                