In [1]:
%matplotlib inline

import gc
import os, sys
import multiprocessing as mul
from multiprocessing.dummy import Pool
import time

sys.path.insert(0, os.path.abspath('..'))
sys.path.insert(0, os.path.abspath('../ctrNet'))

from ctrNet import ctrNet
import pandas as pd
import numpy as np
import random
import tensorflow as tf
from ctrNet import misc_utils as utils
import gc
from tqdm import tqdm_notebook as tqdm
from IPython.core.interactiveshell import InteractiveShell
import seaborn as sns

import dankypipe.pipe as pipe

InteractiveShell.ast_node_interactivity = 'all'
random.seed(42)
np.random.seed(42)

In [2]:
dtypes = {
        'MachineIdentifier':                                    'category',
        'ProductName':                                          'category',
        'EngineVersion_float':                                  'float16',
        'AppVersion_float':                                     'float16',
        'AvSigVersion_float':                                   'float16',
        'IsBeta':                                               'int8',
        'RtpStateBitfield_1':                                   'int8',
        'RtpStateBitfield_2':                                   'int8',
        'RtpStateBitfield_3':                                   'int8',
        'RtpStateBitfield_4':                                   'int8',
        'IsSxsPassiveMode':                                     'int8',
        'DefaultBrowsersIdentifier':                            'category',
        'AVProductStatesIdentifier_filled':                     'category',
        'AVProductsInstalled':                                  'float16',
        'AVProductsEnabled':                                    'float16',
        'HasTpm':                                               'int8',
        'CountryIdentifier':                                    'category',
        'CityIdentifier_filled':                                'category',
        'OrganizationIdentifier_filled':                        'category',
        'LocaleEnglishNameIdentifier':                          'category',
        'Platform':                                             'category',
        'Processor':                                            'category',
        'OsVer':                                                'category',
        'OsBuild':                                              'int16',
        'OsSuite':                                              'int16',
        'OsPlatformSubRelease':                                 'category',
        'OsBuildLab':                                           'category',
        'SkuEdition':                                           'category',
        'IsProtected':                                          'float16',
        'AutoSampleOptIn':                                      'int8',
        'PuaMode':                                              'category',
        'SMode':                                                'float16',
        'IeVerIdentifier_filled':                               'float16',
        'SmartScreen_filled':                                   'category',
        'Firewall':                                             'float16',
        'UacLuaenable':                                         'float32',
        'Census_MDC2_FormFactor_clean':                         'category',
        'Census_MDC2_FormFactor_isValid':                       'int8',
        'Census_FFMatch':                                       'int8',
        'Census_DeviceFamily':                                  'category',
        'Census_OEMNameIdentifier_filled':                      'float16',
        'Census_OEMModelIdentifier_filled':                     'float32',
        'Census_ProcessorCoreCount':                            'float16',
        'Census_ProcessorManufacturerIdentifier_filled':        'category',
        'Census_ProcessorModelIdentifier_filled':               'category',
        'Census_ProcessorClass':                                'category',
        'Census_PrimaryDiskTotalCapacity':                      'float32',
        'Census_PrimaryDiskTypeName':                           'category',
        'Census_SystemVolumeTotalCapacity':                     'float32',
        'Census_HasOpticalDiskDrive':                           'int8',
        'Census_TotalPhysicalRAM':                              'float32',
        'Census_ChassisTypeName_clean':                         'category',
        'Census_ChassisTypeName_isValid':                       'int8',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches':    'float16',
        'Census_InternalPrimaryDisplayResolutionHorizontal':    'float16',
        'Census_InternalPrimaryDisplayResolutionVertical':      'float16',
        'Census_PowerPlatformRoleName_filled':                  'category',
        'Census_InternalBatteryType_reduced':                   'category',
        'Census_InternalBatteryNumberOfCharges':                'float32',
        'Census_OSVersion':                                     'category',
        'Census_OSArchitecture':                                'category',
        'Census_OSBuildNumber':                                 'int16',
        'Census_OSBuildRevision':                               'int32',
        'Census_OSEdition_clean':                               'category',
        'OS_Reduced_Media':                                     'int8',
        'Census_OSInstallTypeName':                             'category',
        'Census_OSInstallLanguageIdentifier_filled':            'float16',
        'Census_OSUILocaleIdentifier':                          'int16',
        'Census_OSEdSkuMatch':                                  'int8',
        'Census_OSVersion_float':                               'float16',
        'Census_OSWUAutoUpdateOptionsName_filled':              'category',
        'Census_IsPortableOperatingSystem':                     'int8',
        'Census_GenuineStateName_filled':                       'category',
        'Census_ActivationChannel':                             'category',
        'Census_IsFlightingInternal':                           'float16',
        'Census_IsFlightsDisabled':                             'float16',
        'Census_FlightRing_filled':                             'category',
        'Census_ThresholdOptIn':                                'float16',
        'Census_FirmwareManufacturerIdentifier_filled':         'float16',
        'Census_FirmwareVersionIdentifier_filled':              'float32',
        'Census_IsSecureBootEnabled':                           'int8',
        'Census_IsWIMBootEnabled':                              'float16',
        'Census_IsVirtualDevice':                               'float16',
        'Census_IsTouchEnabled':                                'int8',
        'Census_IsPenCapable':                                  'int8',
        'Census_IsAlwaysOnAlwaysConnectedCapable':              'float16',
        'Wdft_IsGamer':                                         'float16',
        'Wdft_RegionIdentifier':                                'float16',
        'branch_ver':                                           'category',
        'subrel_ver':                                           'category',
        'subrel_ver_num':                                       'int',
        'branch_release_num':                                   'int',
        'is_svc_release':                                       'int8',
        'is_escrow_release':                                    'int8',
        'is_sec_release':                                       'int8',
        'is_st1_release':                                       'int8',
        'is_prerelease':                                        'int8',
        'is_special_release':                                   'int8',
        'HasDetections':                                        'int8'
}

Which features have been engineered but not being used in this model?

In [3]:
available_features = list(set(pipe.get_feature_names())-set(dtypes.keys()))
t = [print(f) for f in sorted(available_features)]

Census_MajorOSVersion
Census_MinorOSVersion
FinestAppVersion
MajorAppVersion
MajorAvSigVersion
MajorEngineVersion
MinorAppVersion
MinorAvSigVersion
MinorEngineVersion
RtpStateBitfield_str
Target


In [4]:
train_fn = 'train_r1.csv'
test_fn = 'test_r1.csv'

try:    
    train = pd.read_csv(train_fn, dtype=dtypes)
    test = pd.read_csv_csv(test_fn, dtype=dtypes)
    pre_loaded = True
except FileNotFoundError:
    train = pd.read_pickle('train.pickle').sort_values(by='AvSigVersion')
    test = pd.read_pickle('test.pickle').sort_values(by='AvSigVersion')

    train = train.rename(columns={'HasDetections':'Target'})

drop the columns that we will be replacing with cleaned

In [5]:
cols = sorted(list(set(train.columns) - set(dtypes.keys()) - {'Target'}))
cols

['AVProductStatesIdentifier',
 'AppVersion',
 'AvSigVersion',
 'Census_ChassisTypeName',
 'Census_FirmwareManufacturerIdentifier',
 'Census_FirmwareVersionIdentifier',
 'Census_FlightRing',
 'Census_GenuineStateName',
 'Census_InternalBatteryType',
 'Census_MDC2FormFactor',
 'Census_OEMModelIdentifier',
 'Census_OEMNameIdentifier',
 'Census_OSBranch',
 'Census_OSEdition',
 'Census_OSInstallLanguageIdentifier',
 'Census_OSSkuName',
 'Census_OSWUAutoUpdateOptionsName',
 'Census_PowerPlatformRoleName',
 'Census_ProcessorManufacturerIdentifier',
 'Census_ProcessorModelIdentifier',
 'CityIdentifier',
 'EngineVersion',
 'GeoNameIdentifier',
 'IeVerIdentifier',
 'OrganizationIdentifier',
 'RtpStateBitfield',
 'SmartScreen']

In [6]:
if not_preloaded:
    train = train.drop(columns=cols)
    test = test.drop(columns=cols)

get the columns we dont't have

In [38]:
cols = sorted(list(set(dtypes.keys())-set(train.columns)-{'HasDetections'}))
len(cols), cols

(0, [])

Before doing this, I downloaded all the features into the tmp directory using the AWS CLI. The call to pipe to download the feature checks and pulls from the tmp directory before making the call to S3.

In [35]:
if not pre_loaded:
    joined, failed = [], []

    def fx(c):
        try:
            t = pipe.download_feature(c, cache=True)

            combined = pd.concat([t['train'], t['validate']], axis=0, sort=False)
            test = t['test']

            return combined, test
        except Exception as e:
            failed.append((c, e))

    step = 5
    pbar = tqdm(total=len(cols), desc='joining')
    
    for i in range(0, len(cols), step):
        pool = Pool(step)        
        
        cols_ = list(set(cols[i:i+step]) - set(train.columns))
        pbar.update(step-len(cols_))
        
        if len(cols_) == 0:
            continue
        
        features = list(pool.map(fx, cols_))
        
        for j, t in enumerate(features):
            try:
                combined, ti = t
                c = cols[i+j]
                
                if c in train.columns:
                    pbar.update(1)
                    continue
                    
                train_ = train.merge(combined, on='MachineIdentifier', how='left')
                test_ = test.merge(ti, on='MachineIdentifier', how='left')

                assert len(train) == len(train_)
                train = train_
                del train_

                assert len(test) == len(test_)
                test = test_
                del test_

                joined.append(c)
                
                tmp = features[j]
                features[j] = None
                del tmp, ti, combined
                x = gc.collect()
            except Exception as e:
                failed.append((c, e))

            pbar.update(1)
        
        pool.close(); pool.join()
        del pool, features
        x = gc.collect()
        time.sleep(5)

HBox(children=(IntProgress(value=0, description='joining', max=1, style=ProgressStyle(description_width='initi…

Convert to data types

In [44]:
if not pre_loaded:
    t = [print(ti[0] + ': ' + str(ti[1])) for ti in failed]; del t
    failed_cols = [ti[0] for ti in failed]

    for k, v in tqdm(dtypes.items(), desc='converting dtypes'):
        if k == 'HasDetections' or k == 'MachineIdentifier' or k in failed_cols:
            continue

        try:
            train.loc[:, k] = train.loc[:, k].astype(v)
            test.loc[:, k] = test.loc[:, k].astype(v)
        except KeyError:
            print(f'{k} is {"not " if k not in train.columns else " "}in the cols')

[]


HBox(children=(IntProgress(value=0, description='converting dtypes', max=99, style=ProgressStyle(description_w…

Write to disk

In [48]:
if not preloaded:
    p1 = mul.Process(target=train.to_csv, args=(train_fn,), kwargs=dict(index=None))
    p1.start()

    p2 = mul.Process(target=test.to_csv, args=(test_fn,), kwargs=dict(index=None))
    p2.start()

    p1.join(); p1.terminate(); del p1
    p2.join(); p2.terminate(); del p2

AttributeError: 'Process' object has no attribute 'close'

The model

In [63]:
try:
    del dtypes['MachineIdentifier'], dtypes['HasDetections']
except KeyError:
    pass

features = dtypes.keys()

hparam = tf.contrib.training.HParams(
    model='nffm',
    norm=True,
    batch_norm_decay=0.9,
    hidden_size=[128,128],
    k=8,
    hash_ids=int(2e5),
    batch_size=1024,
    optimizer="adam",
    learning_rate=0.001,
    num_display_steps=1000,
    num_eval_steps=1000,
    epoch=1,
    metric='auc',
    init_method='uniform',
    init_value=0.1,
    feature_nums=len(features),
    kfold=5
)

In [65]:
index = set(range(train.shape[0]))
K_fold=[]

for i in range(hparam.kfold):
    if i == hparam.kfold-1:
        tmp = index 
    else:
        tmp = random.sample(index,int(1.0/hparam.kfold*train.shape[0]))
    index = index-set(tmp)
    
    print("Number:",len(tmp))
    K_fold.append(tmp)
    

for i in range(hparam.kfold):
    print("Fold", i)
    dev_index = K_fold[i]
    dev_index = random.sample(dev_index,int(0.1*len(dev_index)))
    train_index = []
    
    for j in range(hparam.kfold):
        if j != i:
            train_index += K_fold[j]
            
    model = ctrNet.build_model(hparam)
    model.train(
        train_data=(
            train.iloc[train_index][features], 
            train.iloc[train_index]['Target']
        ),
        dev_data=(
            train.iloc[dev_index][features], 
            train.iloc[dev_index]['Target']
        )
    )
    
    print("Training Done! Inference...")
    test['Target'] = np.nan
    if i == 0:
        preds = model.infer(dev_data=(test[features], test['Target']))/hparam.kfold
    else:
        preds += model.infer(dev_data=(test[features], test['Target']))/hparam.kfold

Number: 1784296
Number: 1784296
Number: 1784296
Number: 1784296
Number: 1784299
Fold 0
# Trainable variables
  emb_v1:0, (200000, 1), 
  emb_v2:0, (200000, 97, 8), 
  Variable:0, (4656, 128), 
  norm_0/beta:0, (128,), 
  norm_0/gamma:0, (128,), 
  Variable_1:0, (128, 128), 
  norm_1/beta:0, (128,), 
  norm_1/gamma:0, (128,), 
  Variable_2:0, (128, 1), 
  Variable_3:0, (), 


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


  epoch 0 step 1000 lr 0.001 logloss 0.640351 gN 0.25, Fri Feb 15 22:12:52 2019
# Epcho-time 1562.66s Eval AUC 0.719883. Best AUC 0.719883.
  epoch 0 step 2000 lr 0.001 logloss 0.606850 gN 0.16, Fri Feb 15 22:40:15 2019
# Epcho-time 3205.13s Eval AUC 0.727680. Best AUC 0.727680.
  epoch 0 step 3000 lr 0.001 logloss 0.603540 gN 0.15, Fri Feb 15 23:07:49 2019
# Epcho-time 4858.80s Eval AUC 0.730085. Best AUC 0.730085.
  epoch 0 step 4000 lr 0.001 logloss 0.601547 gN 0.14, Fri Feb 15 23:35:23 2019
# Epcho-time 6512.98s Eval AUC 0.731329. Best AUC 0.731329.
  epoch 0 step 5000 lr 0.001 logloss 0.600385 gN 0.14, Sat Feb 16 00:03:02 2019
# Epcho-time 8171.71s Eval AUC 0.733851. Best AUC 0.733851.
  epoch 0 step 6000 lr 0.001 logloss 0.605957 gN 0.16, Sat Feb 16 00:30:35 2019
# Epcho-time 9825.56s Eval AUC 0.730014. Best AUC 0.733851.
# Epcho-time 11426.89s Eval AUC 0.733374. Best AUC 0.733851.
INFO:tensorflow:Restoring parameters from model_tmp/model
# Epcho-time 11513.86s Eval AUC 0.733861.

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


  epoch 0 step 1000 lr 0.001 logloss 0.641090 gN 0.26, Sat Feb 16 02:29:55 2019
# Epcho-time 1547.81s Eval AUC 0.722583. Best AUC 0.722583.
  epoch 0 step 2000 lr 0.001 logloss 0.608125 gN 0.17, Sat Feb 16 02:57:07 2019
# Epcho-time 3179.59s Eval AUC 0.729086. Best AUC 0.729086.
  epoch 0 step 3000 lr 0.001 logloss 0.603919 gN 0.16, Sat Feb 16 03:24:27 2019
# Epcho-time 4819.58s Eval AUC 0.731141. Best AUC 0.731141.
  epoch 0 step 4000 lr 0.001 logloss 0.601708 gN 0.15, Sat Feb 16 03:51:50 2019
# Epcho-time 6462.53s Eval AUC 0.733335. Best AUC 0.733335.
  epoch 0 step 5000 lr 0.001 logloss 0.600553 gN 0.14, Sat Feb 16 04:19:12 2019
# Epcho-time 8104.36s Eval AUC 0.735840. Best AUC 0.735840.
  epoch 0 step 6000 lr 0.001 logloss 0.605928 gN 0.17, Sat Feb 16 04:46:35 2019
# Epcho-time 9747.92s Eval AUC 0.732510. Best AUC 0.735840.
# Epcho-time 11340.71s Eval AUC 0.734865. Best AUC 0.735840.
INFO:tensorflow:Restoring parameters from model_tmp/model
# Epcho-time 11426.97s Eval AUC 0.735847.

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


  epoch 0 step 1000 lr 0.001 logloss 0.637920 gN 0.25, Sat Feb 16 06:45:31 2019
# Epcho-time 1570.28s Eval AUC 0.721901. Best AUC 0.721901.
  epoch 0 step 2000 lr 0.001 logloss 0.607631 gN 0.17, Sat Feb 16 07:12:48 2019
# Epcho-time 3206.72s Eval AUC 0.727320. Best AUC 0.727320.
  epoch 0 step 3000 lr 0.001 logloss 0.603643 gN 0.16, Sat Feb 16 07:40:27 2019
# Epcho-time 4865.91s Eval AUC 0.731639. Best AUC 0.731639.
  epoch 0 step 4000 lr 0.001 logloss 0.601302 gN 0.15, Sat Feb 16 08:07:49 2019
# Epcho-time 6508.14s Eval AUC 0.733951. Best AUC 0.733951.
  epoch 0 step 5000 lr 0.001 logloss 0.600195 gN 0.14, Sat Feb 16 08:35:27 2019
# Epcho-time 8166.25s Eval AUC 0.735670. Best AUC 0.735670.
  epoch 0 step 6000 lr 0.001 logloss 0.605953 gN 0.17, Sat Feb 16 09:02:52 2019
# Epcho-time 9811.21s Eval AUC 0.732267. Best AUC 0.735670.
# Epcho-time 11404.56s Eval AUC 0.734599. Best AUC 0.735670.
INFO:tensorflow:Restoring parameters from model_tmp/model
# Epcho-time 11491.31s Eval AUC 0.735649.

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


  epoch 0 step 1000 lr 0.001 logloss 0.636444 gN 0.24, Sat Feb 16 11:01:57 2019
# Epcho-time 1565.00s Eval AUC 0.720947. Best AUC 0.720947.
  epoch 0 step 2000 lr 0.001 logloss 0.607656 gN 0.17, Sat Feb 16 11:29:23 2019
# Epcho-time 3210.37s Eval AUC 0.726227. Best AUC 0.726227.
  epoch 0 step 3000 lr 0.001 logloss 0.603925 gN 0.16, Sat Feb 16 11:56:57 2019
# Epcho-time 4864.65s Eval AUC 0.729027. Best AUC 0.729027.
  epoch 0 step 4000 lr 0.001 logloss 0.601326 gN 0.15, Sat Feb 16 12:24:26 2019
# Epcho-time 6513.73s Eval AUC 0.729393. Best AUC 0.729393.
  epoch 0 step 5000 lr 0.001 logloss 0.599996 gN 0.14, Sat Feb 16 12:51:59 2019
# Epcho-time 8166.34s Eval AUC 0.733116. Best AUC 0.733116.
  epoch 0 step 6000 lr 0.001 logloss 0.606247 gN 0.17, Sat Feb 16 13:19:27 2019
# Epcho-time 9814.86s Eval AUC 0.731242. Best AUC 0.733116.
# Epcho-time 11412.44s Eval AUC 0.732414. Best AUC 0.733116.
INFO:tensorflow:Restoring parameters from model_tmp/model
# Epcho-time 11498.58s Eval AUC 0.733126.

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


  epoch 0 step 1000 lr 0.001 logloss 0.628425 gN 0.23, Sat Feb 16 15:19:44 2019
# Epcho-time 1594.51s Eval AUC 0.721445. Best AUC 0.721445.
  epoch 0 step 2000 lr 0.001 logloss 0.607623 gN 0.17, Sat Feb 16 15:47:22 2019
# Epcho-time 3252.15s Eval AUC 0.726726. Best AUC 0.726726.
  epoch 0 step 3000 lr 0.001 logloss 0.603730 gN 0.16, Sat Feb 16 16:15:18 2019
# Epcho-time 4927.73s Eval AUC 0.730599. Best AUC 0.730599.
  epoch 0 step 4000 lr 0.001 logloss 0.601213 gN 0.15, Sat Feb 16 16:43:00 2019
# Epcho-time 6590.49s Eval AUC 0.731544. Best AUC 0.731544.
  epoch 0 step 5000 lr 0.001 logloss 0.599833 gN 0.14, Sat Feb 16 17:10:58 2019
# Epcho-time 8268.22s Eval AUC 0.734618. Best AUC 0.734618.
  epoch 0 step 6000 lr 0.001 logloss 0.598988 gN 0.14, Sat Feb 16 17:38:39 2019
# Epcho-time 9929.33s Eval AUC 0.735126. Best AUC 0.735126.
# Epcho-time 11557.71s Eval AUC 0.736626. Best AUC 0.736626.
INFO:tensorflow:Restoring parameters from model_tmp/model
# Epcho-time 11649.99s Eval AUC 0.736628.

In [None]:
df = pd.DataFrame()
df['MachineIdentifier'] = test.MachineIdentifier
df['HasDetections'] = preds

df.to_csv('submission.csv', index=None)
df.head()