In [1]:
import re

import multiprocessing as mul
from multiprocessing.dummy import Pool
import numpy as np
import pandas as pd
from tqdm import tqdm

from dankypipe import pipe

tqdm.pandas()

def isfloat(x):
    try:
        float(x)
        return True
    except:
        return False
    
def isint(x):
    try:
        int(x)
        return True
    except:
        return False

def cateval(df, c):
    print('percent na: ', df[c].isnull().mean())
    t = pd.crosstab(df[c], df.Target, normalize='index').sort_values(c)
    t['total_count'] = df[c].value_counts()
    t['normalized'] = t.total_count/t.total_count.sum()
    return t

identifiers = [
    'Census_ProcessorModelIdentifier',
    'Census_FirmwareManufacturerIdentifier',
    'Census_FirmwareVersionIdentifier',
    'Census_OEMNameIdentifier',
    'Census_OEMModelIdentifier',
    'Census_OSInstallLanguageIdentifier',
    'IeVerIdentifier',
    'Census_ProcessorManufacturerIdentifier',
    'Census_ProcessorModelIdentifier',
    'AVProductStatesIdentifier',
    'OrganizationIdentifier',
    'CityIdentifier',
    'Census_FlightRing',
    'Census_PowerPlatformRoleName',
    'Census_OSWUAutoUpdateOptionsName',
    'Census_GenuineStateName',
]

In [2]:
train = pd.read_pickle('train.pickle').sort_values(by='AvSigVersion')
test = pd.read_pickle('test.pickle').sort_values(by='AvSigVersion')

train = train.rename(columns={'HasDetections':'Target'})

train = train.reindex(columns=identifiers + ['MachineIdentifier', 'Target', 'SmartScreen'])
test = test.reindex(columns=identifiers + ['MachineIdentifier', 'SmartScreen'])

In [3]:
for c in tqdm(identifiers):
    train[c] = train[c].astype(str)
    train[c] = train[c].fillna('missing')
    
    test[c] = test[c].astype(str)
    test[c] = test[c].fillna('missing')

100%|██████████| 16/16 [03:00<00:00, 12.18s/it]


In [4]:
cols  = ['MachineIdentifier', 'SmartScreen'] + identifiers
train = train[cols+['Target']]
test  = test[cols]

In [5]:
rename = {
    i: i+'_filled'
    for i in identifiers
}
rename['SmartScreen'] = 'SmartScreen_filled'

train = train.rename(columns=rename)
test = test.rename(columns=rename)

In [6]:
train.loc[
    (train.Census_PowerPlatformRoleName_filled == 'UNKNOWN') |
    (train.Census_PowerPlatformRoleName_filled == 'Unspecified') |
    (train.Census_PowerPlatformRoleName_filled == 'nan') |
    (train.Census_PowerPlatformRoleName_filled.isnull()), 'Census_PowerPlatformRoleName_filled'
] = 'missing'

test.loc[
    (test.Census_PowerPlatformRoleName_filled == 'UNKNOWN') |
    (test.Census_PowerPlatformRoleName_filled == 'Unspecified') |
    (test.Census_PowerPlatformRoleName_filled == 'nan') |
    (test.Census_PowerPlatformRoleName_filled.isnull()), 'Census_PowerPlatformRoleName_filled'
] = 'missing'

In [7]:
def transform(df):
    df.SmartScreen_filled = df.SmartScreen_filled.astype(str)
    
    mask = \
        (df.SmartScreen_filled == '&#x01;')  | \
        (df.SmartScreen_filled == '&#x02;')  | \
        (df.SmartScreen_filled == '&#x03;')  | \
        (df.SmartScreen_filled == '0')       | \
        (df.SmartScreen_filled == '00000000')| \
        (df.SmartScreen_filled == 'nan')
    
    df.loc[mask, 'SmartScreen_filled'] = 'invalid'
    
    df.loc[
        (df.SmartScreen_filled == 'promt') | 
        (df.SmartScreen_filled == 'enabled'), 
        'SmartScreen_filled'
    ] = 'prompt'
    
    df.SmartScreen_filled = df.SmartScreen_filled.fillna('missing')
    return df

train = transform(train)
test = transform(test)

In [8]:
avsig = pipe.download_feature('AvSigVersion_float', cache=True)
avsig_combined = pd.concat([avsig['train'], avsig['validate']], sort=False).sort_values(by='AvSigVersion_float')

train_ = train.merge(avsig_combined, on='MachineIdentifier', how='inner', sort=False)
assert len(train_) == len(train)

train = train_
del train_

train = train.sort_values(by='AvSigVersion_float').drop(columns='AvSigVersion_float')

In [9]:
train.head()

Unnamed: 0,MachineIdentifier,SmartScreen_filled,Census_ProcessorModelIdentifier_filled,Census_ProcessorModelIdentifier_filled.1,Census_FirmwareManufacturerIdentifier_filled,Census_FirmwareVersionIdentifier_filled,Census_OEMNameIdentifier_filled,Census_OEMModelIdentifier_filled,Census_OSInstallLanguageIdentifier_filled,IeVerIdentifier_filled,...,Census_ProcessorModelIdentifier_filled.2,Census_ProcessorModelIdentifier_filled.3,AVProductStatesIdentifier_filled,OrganizationIdentifier_filled,CityIdentifier_filled,Census_FlightRing_filled,Census_PowerPlatformRoleName_filled,Census_OSWUAutoUpdateOptionsName_filled,Census_GenuineStateName_filled,Target
0,c3c4bc04dc5f1c7245a862e52634428e,ExistsNotSet,1984.0,1984.0,142.0,9125.0,1002.0,290911.0,26.0,74.0,...,1984.0,1984.0,3371.0,18.0,68423.0,Retail,Mobile,UNKNOWN,IS_GENUINE,0
37,60031444d3ec616c6e9084be521faa04,invalid,2742.0,2742.0,152.0,8386.0,666.0,264574.0,37.0,137.0,...,2742.0,2742.0,59792.0,,142358.0,Unknown,Mobile,UNKNOWN,IS_GENUINE,0
38,d938abff6012c1488b851247a3098160,invalid,3082.0,3082.0,142.0,33041.0,2102.0,275359.0,7.0,137.0,...,3082.0,3082.0,59680.0,,87917.0,Retail,Mobile,FullAuto,IS_GENUINE,0
39,910ddd20c6d334ca03a46d9f0008fe24,invalid,2616.0,2616.0,355.0,20988.0,1443.0,275893.0,8.0,137.0,...,2616.0,2616.0,56914.0,27.0,2073.0,Retail,Desktop,FullAuto,IS_GENUINE,1
40,5e05d22ab9db72ccbc8e41d4bc632f64,RequireAdmin,2756.0,2756.0,355.0,4320.0,1443.0,275839.0,37.0,53.0,...,2756.0,2756.0,53742.0,18.0,78681.0,NOT_SET,Desktop,UNKNOWN,IS_GENUINE,0


In [10]:
test.head()

Unnamed: 0,MachineIdentifier,SmartScreen_filled,Census_ProcessorModelIdentifier_filled,Census_ProcessorModelIdentifier_filled.1,Census_FirmwareManufacturerIdentifier_filled,Census_FirmwareVersionIdentifier_filled,Census_OEMNameIdentifier_filled,Census_OEMModelIdentifier_filled,Census_OSInstallLanguageIdentifier_filled,IeVerIdentifier_filled,Census_ProcessorManufacturerIdentifier_filled,Census_ProcessorModelIdentifier_filled.2,Census_ProcessorModelIdentifier_filled.3,AVProductStatesIdentifier_filled,OrganizationIdentifier_filled,CityIdentifier_filled,Census_FlightRing_filled,Census_PowerPlatformRoleName_filled,Census_OSWUAutoUpdateOptionsName_filled,Census_GenuineStateName_filled
7252423,ec6910b4d9e0baae203e9819227659ec,Off,2302.0,2302.0,355.0,20317.0,1443.0,256581.0,29.0,137.0,5.0,2302.0,2302.0,56014.0,,139986.0,Retail,Mobile,UNKNOWN,IS_GENUINE
6804872,ddd66992da9cbb12db76d9d874fedf8b,invalid,837.0,837.0,142.0,52485.0,4908.0,317701.0,29.0,,1.0,837.0,837.0,15320.0,,128117.0,Retail,Desktop,Off,IS_GENUINE
6882538,e05db268c5f1e48e5fa63de1f39f02d7,Off,2412.0,2412.0,628.0,50361.0,2668.0,212717.0,29.0,137.0,5.0,2412.0,2412.0,14130.0,18.0,1933.0,Retail,Desktop,FullAuto,IS_GENUINE
6856130,df81a38177efaac6b95df42ddef504e6,RequireAdmin,3484.0,3484.0,142.0,35257.0,1980.0,317708.0,7.0,137.0,5.0,3484.0,3484.0,34178.0,,97401.0,Retail,Desktop,FullAuto,IS_GENUINE
2544324,52eb832b198099b467d39481a77afcef,RequireAdmin,2276.0,2276.0,93.0,16250.0,2206.0,343299.0,8.0,205.0,5.0,2276.0,2276.0,56060.0,27.0,91646.0,Retail,Workstation,FullAuto,IS_GENUINE


In [11]:
val_idx = int(len(train)*.7)
val_idx

6245038

In [12]:
cols = list(set(train.columns)-{'MachineIdentifier', 'Target'})

for c in tqdm(cols):
    train[c] = train[c].astype('category')
    test[c] = test[c].astype('category')

100%|██████████| 16/16 [01:12<00:00,  2.96s/it]


In [13]:
pbar = tqdm(total=len(cols))
#for c in cols:        
def fx(c):
    #pbar.set_description(c)
    
    train_ = train[['MachineIdentifier', c]].iloc[:val_idx, :]
    val_   = train[['MachineIdentifier', c]].iloc[val_idx:, :]
    test_  =  test[['MachineIdentifier', c]]
    
    try:
        pipe.upload_feature(c, (train_, val_, test_), overwrite=True)
    except ValueError:
        pass
    
    pbar.update(1)
    
pool = Pool(mul.cpu_count())
pool.map(fx, cols)
pool.close()
pool.join()

100%|██████████| 16/16 [15:40<00:00, 11.92s/it]  