In [1]:
# Fetch the ctrNet-tool sources and copy them into the working directory so the
# later `import ctrNet` and `from src import misc_utils` statements can resolve.
# NOTE(review): the clone is not pinned to a commit or tag, so a re-run may pull
# different code than the one that produced the recorded outputs.
!git clone https://github.com/guoday/ctrNet-tool.git
!cp -r ctrNet-tool/* ./

Cloning into 'ctrNet-tool'...
remote: Enumerating objects: 168, done.[K
remote: Counting objects: 100% (168/168), done.[K
remote: Compressing objects: 100% (127/127), done.[K
remote: Total 168 (delta 86), reused 92 (delta 38), pack-reused 0[K
Receiving objects: 100% (168/168), 8.42 MiB | 782.00 KiB/s, done.
Resolving deltas: 100% (86/86), done.
Checking connectivity... done.


In [2]:
import ctrNet
import pandas as pd
import numpy as np
import random
import tensorflow as tf
from src import misc_utils as utils
import os
import gc
# One seed shared by the stdlib and NumPy generators so fold sampling is repeatable.
SEED = 2019
random.seed(SEED)
np.random.seed(SEED)

In [None]:
# Directory prefix (with trailing slash) for train.csv, test.csv,
# sample_submission.csv and the written submission file.
# NOTE(review): this cell has no execution count in the saved notebook, yet the
# loading and submission cells read `data_path` -- make sure it runs before them.
data_path = './data/'

In [3]:
# Column name -> dtype map passed to pd.read_csv for both train.csv and test.csv.
# String-valued columns are read as 'category' and numeric columns use narrow
# int/float widths (presumably to keep the two large frames within memory).
# 'MachineIdentifier' is overwritten with the row index right after loading, and
# 'HasDetections' is added to the test frame by hand after loading.
dtypes = {
        'MachineIdentifier':                                    'category',
        'ProductName':                                          'category',
        'EngineVersion':                                        'category',
        'AppVersion':                                           'category',
        'AvSigVersion':                                         'category',
        'IsBeta':                                               'int8',
        'RtpStateBitfield':                                     'float16',
        'IsSxsPassiveMode':                                     'int8',
        'DefaultBrowsersIdentifier':                            'float16',
        'AVProductStatesIdentifier':                            'float32',
        'AVProductsInstalled':                                  'float16',
        'AVProductsEnabled':                                    'float16',
        'HasTpm':                                               'int8',
        'CountryIdentifier':                                    'int16',
        'CityIdentifier':                                       'float32',
        'OrganizationIdentifier':                               'float16',
        'GeoNameIdentifier':                                    'float16',
        'LocaleEnglishNameIdentifier':                          'int8',
        'Platform':                                             'category',
        'Processor':                                            'category',
        'OsVer':                                                'category',
        'OsBuild':                                              'int16',
        'OsSuite':                                              'int16',
        'OsPlatformSubRelease':                                 'category',
        'OsBuildLab':                                           'category',
        'SkuEdition':                                           'category',
        'IsProtected':                                          'float16',
        'AutoSampleOptIn':                                      'int8',
        'PuaMode':                                              'category',
        'SMode':                                                'float16',
        'IeVerIdentifier':                                      'float16',
        'SmartScreen':                                          'category',
        'Firewall':                                             'float16',
        'UacLuaenable':                                         'float32',
        'Census_MDC2FormFactor':                                'category',
        'Census_DeviceFamily':                                  'category',
        'Census_OEMNameIdentifier':                             'float16',
        'Census_OEMModelIdentifier':                            'float32',
        'Census_ProcessorCoreCount':                            'float16',
        'Census_ProcessorManufacturerIdentifier':               'float16',
        'Census_ProcessorModelIdentifier':                      'float16',
        'Census_ProcessorClass':                                'category',
        'Census_PrimaryDiskTotalCapacity':                      'float32',
        'Census_PrimaryDiskTypeName':                           'category',
        'Census_SystemVolumeTotalCapacity':                     'float32',
        'Census_HasOpticalDiskDrive':                           'int8',
        'Census_TotalPhysicalRAM':                              'float32',
        'Census_ChassisTypeName':                               'category',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches':    'float16',
        'Census_InternalPrimaryDisplayResolutionHorizontal':    'float16',
        'Census_InternalPrimaryDisplayResolutionVertical':      'float16',
        'Census_PowerPlatformRoleName':                         'category',
        'Census_InternalBatteryType':                           'category',
        'Census_InternalBatteryNumberOfCharges':                'float32',
        'Census_OSVersion':                                     'category',
        'Census_OSArchitecture':                                'category',
        'Census_OSBranch':                                      'category',
        'Census_OSBuildNumber':                                 'int16',
        'Census_OSBuildRevision':                               'int32',
        'Census_OSEdition':                                     'category',
        'Census_OSSkuName':                                     'category',
        'Census_OSInstallTypeName':                             'category',
        'Census_OSInstallLanguageIdentifier':                   'float16',
        'Census_OSUILocaleIdentifier':                          'int16',
        'Census_OSWUAutoUpdateOptionsName':                     'category',
        'Census_IsPortableOperatingSystem':                     'int8',
        'Census_GenuineStateName':                              'category',
        'Census_ActivationChannel':                             'category',
        'Census_IsFlightingInternal':                           'float16',
        'Census_IsFlightsDisabled':                             'float16',
        'Census_FlightRing':                                    'category',
        'Census_ThresholdOptIn':                                'float16',
        'Census_FirmwareManufacturerIdentifier':                'float16',
        'Census_FirmwareVersionIdentifier':                     'float32',
        'Census_IsSecureBootEnabled':                           'int8',
        'Census_IsWIMBootEnabled':                              'float16',
        'Census_IsVirtualDevice':                               'float16',
        'Census_IsTouchEnabled':                                'int8',
        'Census_IsPenCapable':                                  'int8',
        'Census_IsAlwaysOnAlwaysConnectedCapable':              'float16',
        'Wdft_IsGamer':                                         'float16',
        'Wdft_RegionIdentifier':                                'float16',
        'HasDetections':                                        'int8'
        }
# Load both CSVs with the shared `dtypes` map.
print('Loading Train and Test Data.\n')
train = pd.read_csv(data_path+'train.csv', dtype=dtypes, low_memory=True)
# Replace the string identifier with the positional row number of each frame.
train['MachineIdentifier'] = train.index.astype('uint32')
test  = pd.read_csv(data_path+'test.csv',  dtype=dtypes, low_memory=True)
test['MachineIdentifier']  = test.index.astype('uint32')
# Placeholder label so train and test expose the same columns. Assigning a
# scalar lets pandas broadcast it instead of first building a Python list with
# one element per test row; the resulting column is the same all-zero int64.
test['HasDetections'] = 0

Loading Train and Test Data.



In [4]:
def make_bucket(data,num=10):
    """Return `num` equal-frequency bin edges for the values in `data`.

    Edge ``i`` is the largest value of the i-th of `num` equally sized slices
    of the sorted values, so the last edge is always the maximum. The result
    is non-decreasing and can be passed as ``bins`` to ``np.digitize``.

    Args:
        data: iterable of mutually comparable values. It is NOT modified;
            a sorted copy is used internally.
        num: number of edges to produce. Values <= 0 yield an empty list.

    Returns:
        A list of `num` edges in non-decreasing order.

    Raises:
        ValueError: if `data` is empty while `num` is positive.
    """
    if num <= 0:
        return []
    ordered = sorted(data)
    total = len(ordered)
    if total == 0:
        raise ValueError("make_bucket() needs at least one value to derive bin edges")
    # total*(i+1)//num - 1 is the index of the last element in slice i. When
    # there are fewer values than buckets it would be -1 for the first slices
    # (wrapping around to the maximum), so clamp it to the first element.
    return [ordered[max(total * (i + 1) // num - 1, 0)] for i in range(num)]
# Columns whose raw values are replaced by equal-frequency bucket ids.
float_features=['Census_SystemVolumeTotalCapacity','Census_PrimaryDiskTotalCapacity']
for col_name in float_features:
    # Missing values are mapped to a large sentinel before bucketing.
    for frame in (train, test):
        frame[col_name] = frame[col_name].fillna(1e10)
    # Bin edges are derived from train and test values pooled together.
    edges = make_bucket(list(train[col_name]) + list(test[col_name]), num=50)
    for frame in (train, test):
        frame[col_name] = np.digitize(frame[col_name], bins=edges)

# Every column except the first and the last one
# (presumably MachineIdentifier and HasDetections -- verify against the CSV layout).
all_columns = train.columns.tolist()
features = all_columns[1:-1]

In [5]:
# Remember where the training rows end so the stacked frame can be split again
# without hard-coding the row counts of one particular dataset.
n_train = len(train)
data=pd.concat([train,test])

del train
del test

print(data.shape)
for col in features:
    # Frequency of every distinct value of `col` over train+test. groupby skips
    # missing keys, so rows where `col` is NaN receive NaN in both new columns
    # after the left merge.
    counts = data.groupby([col]).size().reset_index(name=col+'count')
    # Share of the (non-missing) rows that carry each value.
    counts[col+'rate'] = counts[col+'count'] / counts[col+'count'].sum()
    data = pd.merge(data, counts, on=col, how='left')

print(data.shape)
# A left merge keeps the row order of `data`, so the first n_train rows are
# still the training rows and the remainder is the test set.
train = data.iloc[:n_train]
test = data.iloc[n_train:]

del data
gc.collect()

(16774736, 83)
(16774736, 164)


201824

In [6]:
# Preview the first training rows, including the appended `<column>count` features.
train.head()

Unnamed: 0,MachineIdentifier,ProductName,EngineVersion,AppVersion,AvSigVersion,IsBeta,RtpStateBitfield,IsSxsPassiveMode,DefaultBrowsersIdentifier,AVProductStatesIdentifier,...,Census_FirmwareManufacturerIdentifiercount,Census_FirmwareVersionIdentifiercount,Census_IsSecureBootEnabledcount,Census_IsWIMBootEnabledcount,Census_IsVirtualDevicecount,Census_IsTouchEnabledcount,Census_IsPenCapablecount,Census_IsAlwaysOnAlwaysConnectedCapablecount,Wdft_IsGamercount,Wdft_RegionIdentifiercount
0,0,win8defender,1.1.15100.1,4.18.1807.18075,1.273.1735.0,0,7.0,0,,53447.0,...,2300471.0,841.0,8894050,,16614168.0,14725738,16137100,15682103.0,11476755.0,3251921.0
1,1,win8defender,1.1.14600.4,4.13.17134.1,1.263.48.0,0,7.0,0,,53447.0,...,2300471.0,1681.0,8894050,,16614168.0,14725738,16137100,15682103.0,11476755.0,504963.0
2,2,win8defender,1.1.15100.1,4.18.1807.18075,1.273.1341.0,0,7.0,0,,53447.0,...,5141158.0,12904.0,8894050,,16614168.0,14725738,16137100,15682103.0,11476755.0,2549287.0
3,3,win8defender,1.1.15100.1,4.18.1807.18075,1.273.1527.0,0,7.0,0,,53447.0,...,1789480.0,61193.0,8894050,,16614168.0,14725738,16137100,15682103.0,11476755.0,2549287.0
4,4,win8defender,1.1.15100.1,4.18.1807.18075,1.273.1379.0,0,7.0,0,,53447.0,...,1789480.0,18540.0,8894050,5592324.0,16614168.0,14725738,16137100,15682103.0,11476755.0,2341900.0


In [7]:
# Model inputs: every column of the enriched frame except the label and the row id.
# This replaces the earlier `features` list, so the count/rate columns are included.
features2 = train.columns.drop(['HasDetections', 'MachineIdentifier'])
features = features2.tolist()

In [8]:
# Hyper-parameters handed to ctrNet.build_model() in the training cell.
# NOTE(review): the saved output of this cell prints hidden_size=[128, 128] and the
# training log lists 128-wide layers, so the recorded results were not produced
# with the [256, 256] written here -- confirm the intended value and re-run.
hparam=tf.contrib.training.HParams(
            model='nffm',
            norm=True,
            batch_norm_decay=0.9,
            hidden_size=[256,256],
            k=8,
            hash_ids=int(2e5),
            batch_size=1024,
            optimizer="adam",
            learning_rate=0.001,
            num_display_steps=1000,
            num_eval_steps=1000,
            epoch=1,
            metric='auc',
            init_method='uniform',
            init_value=0.1,
            # Depends on the `features` list rebuilt after the count/rate columns were added.
            feature_nums=len(features),
            # Number of cross-validation folds; read by the fold-splitting and training loops.
            kfold=5)
# Echo the configuration so it is stored next to the results.
utils.print_hparams(hparam)

  batch_norm_decay=0.9
  batch_size=1024
  epoch=1
  feature_nums=162
  hash_ids=200000
  hidden_size=[128, 128]
  init_method=uniform
  init_value=0.1
  k=8
  kfold=5
  learning_rate=0.001
  metric=auc
  model=nffm
  norm=True
  num_display_steps=1000
  num_eval_steps=1000
  optimizer=adam


In [9]:
# ---- Split the training row positions into hparam.kfold disjoint folds ----
n_rows = train.shape[0]
# Integer division avoids float rounding when computing the fold size.
fold_size = n_rows // hparam.kfold
remaining = set(range(n_rows))
K_fold=[]
for i in range(hparam.kfold):
    # random.sample() requires a sequence: passing a set was deprecated in
    # Python 3.9 and raises TypeError from 3.11 on, so sample from a sorted list.
    candidates = sorted(remaining)
    if i == hparam.kfold-1:
        # The last fold takes whatever is left (fold_size plus the remainder).
        fold = candidates
    else:
        fold = random.sample(candidates, fold_size)
    remaining.difference_update(fold)
    print("Number:",len(fold))
    # Every fold is stored as a list so it can be sampled and concatenated later.
    K_fold.append(fold)


# ---- Train one model per fold and average the test predictions ----
preds = None
for i in range(hparam.kfold):
    print("Fold",i)
    # Only a random 10% of the held-out fold is used as the dev set during training.
    held_out = K_fold[i]
    dev_index = random.sample(held_out, int(0.1*len(held_out)))
    # All remaining folds, in fold order, form the training rows.
    train_index=[]
    for j in range(hparam.kfold):
        if j!=i:
            train_index+=K_fold[j]
    model=ctrNet.build_model(hparam)
    model.train(train_data=(train.iloc[train_index][features],train.iloc[train_index]['HasDetections']),\
                dev_data=(train.iloc[dev_index][features],train.iloc[dev_index]['HasDetections']))
    print("Training Done! Inference...")
    # Each fold contributes 1/kfold of the final averaged prediction.
    fold_preds = model.infer(dev_data=(test[features],test['HasDetections']))/hparam.kfold
    preds = fold_preds if preds is None else preds + fold_preds

Number: 1784296
Number: 1784296
Number: 1784296
Number: 1784296
Number: 1784299
Fold 0
# Trainable variables
  emb_v1:0, (200000, 1), 
  emb_v2:0, (200000, 162, 8), 
  Variable:0, (13041, 128), 
  norm_0/beta:0, (128,), 
  norm_0/gamma:0, (128,), 
  Variable_1:0, (128, 128), 
  norm_1/beta:0, (128,), 
  norm_1/gamma:0, (128,), 
  Variable_2:0, (128, 1), 
  Variable_3:0, (), 


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


  epoch 0 step 1000 lr 0.001 logloss 0.638497 gN 0.26, Tue Mar  5 11:35:44 2019
# Epcho-time 450.74s Eval AUC 0.722382. Best AUC 0.722382.
  epoch 0 step 2000 lr 0.001 logloss 0.603216 gN 0.16, Tue Mar  5 11:45:01 2019
# Epcho-time 1007.97s Eval AUC 0.731769. Best AUC 0.731769.
  epoch 0 step 3000 lr 0.001 logloss 0.599453 gN 0.15, Tue Mar  5 11:54:37 2019
# Epcho-time 1583.11s Eval AUC 0.736219. Best AUC 0.736219.
  epoch 0 step 4000 lr 0.001 logloss 0.596255 gN 0.14, Tue Mar  5 12:05:14 2019
# Epcho-time 2220.14s Eval AUC 0.737925. Best AUC 0.737925.
  epoch 0 step 5000 lr 0.001 logloss 0.594547 gN 0.14, Tue Mar  5 12:16:01 2019
# Epcho-time 2867.55s Eval AUC 0.739280. Best AUC 0.739280.
  epoch 0 step 6000 lr 0.001 logloss 0.593547 gN 0.13, Tue Mar  5 12:26:40 2019
# Epcho-time 3506.59s Eval AUC 0.740426. Best AUC 0.740426.
# Epcho-time 4138.97s Eval AUC 0.741141. Best AUC 0.741141.
INFO:tensorflow:Restoring parameters from model_tmp/model
# Epcho-time 4231.08s Eval AUC 0.741147. Be

In [11]:
# Fill the sample submission with the fold-averaged predictions and save it.
sample_path = data_path + 'sample_submission.csv'
output_path = data_path+'nffm_submission_add_feature_count_rate.csv'

submission = pd.read_csv(sample_path)
# Predictions are assigned by position, so they must be in the sample file's row order.
submission['HasDetections'] = preds
print(submission['HasDetections'].head())
submission.to_csv(output_path, index=False)

0    0.511876
1    0.559362
2    0.613352
3    0.335103
4    0.440912
Name: HasDetections, dtype: float32


In [1]:
# Mean of five AUC values typed in by hand -- presumably the final eval AUC of each
# fold (the first one matches the Fold 0 log above); confirm against the training logs.
(0.741147 + 0.744572 + 0.744233 + 0.741577 + 0.743965)/5

0.7430988