In [2]:
import gc
import multiprocessing as mul
from multiprocessing.dummy import Pool as TPool
import os

import cache_magic
import lightgbm as lgb
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
#import modin.pandas as pd
import pandas as pd
import seaborn as sns
from tqdm import tqdm_notebook as tqdm
from IPython.core.interactiveshell import InteractiveShell

import dankypipe.pipe as pipe
from dankypipe.utils import *

# Echo the value of every top-level expression statement in a cell, not only
# the last one (later cells rely on this to show values from mid-cell lines).
InteractiveShell.ast_node_interactivity = 'all'

In [3]:
# Column name -> pandas dtype for the cleaned feature CSVs. The mapping is
# wrapped in DTypes (not defined in this notebook; presumably provided by the
# dankypipe.utils star import) and handed to pd.read_csv via `dtypes.dict`
# inside load().
# NOTE(review): several *Identifier*/*_float columns are stored as float16,
# which represents integers exactly only up to 2048 -- confirm the value
# ranges of those columns fit before relying on them as ids.
dtypes = DTypes({
        'MachineIdentifier':                                    'category',
        'ProductName':                                          'category',
        'EngineVersion_float':                                  'float16',
        'AppVersion_float':                                     'float16',
        'AvSigVersion_float':                                   'float16',
        'IsBeta':                                               'int8',
        'RtpStateBitfield_1':                                   'int8',
        'RtpStateBitfield_2':                                   'int8',
        'RtpStateBitfield_3':                                   'int8',
        'RtpStateBitfield_4':                                   'int8',
        'IsSxsPassiveMode':                                     'int8',
        'DefaultBrowsersIdentifier':                            'category',
        'AVProductStatesIdentifier_filled':                     'category',
        'AVProductsInstalled':                                  'float16',
        'AVProductsEnabled':                                    'float16',
        'HasTpm':                                               'int8',
        'CountryIdentifier':                                    'category',
        'CityIdentifier_filled':                                'category',
        'OrganizationIdentifier_filled':                        'category',
        'LocaleEnglishNameIdentifier':                          'category',
        'Platform':                                             'category',
        'Processor':                                            'category',
        'OsVer':                                                'category',
        'OsBuild':                                              'int16',
        'OsSuite':                                              'int16',
        'OsPlatformSubRelease':                                 'category',
        'OsBuildLab':                                           'category',
        'SkuEdition':                                           'category',
        'IsProtected':                                          'float16',
        'AutoSampleOptIn':                                      'int8',
        'PuaMode':                                              'category',
        'SMode':                                                'float16',
        'IeVerIdentifier_filled':                               'float16',
        'SmartScreen_filled':                                   'category',
        'Firewall':                                             'float16',
        'UacLuaenable':                                         'float32',
        'Census_MDC2_FormFactor_clean':                         'category',
        'Census_MDC2_FormFactor_isValid':                       'int8',
        'Census_FFMatch':                                       'int8',
        'Census_DeviceFamily':                                  'category',
        'Census_OEMNameIdentifier_filled':                      'float16',
        'Census_OEMModelIdentifier_filled':                     'float32',
        'Census_ProcessorCoreCount':                            'float16',
        'Census_ProcessorManufacturerIdentifier_filled':        'category',
        'Census_ProcessorModelIdentifier_filled':               'category',
        'Census_ProcessorClass':                                'category',
        'Census_PrimaryDiskTotalCapacity':                      'float32',
        'Census_PrimaryDiskTypeName':                           'category',
        'Census_SystemVolumeTotalCapacity':                     'float32',
        'Census_HasOpticalDiskDrive':                           'int8',
        'Census_TotalPhysicalRAM':                              'float32',
        'Census_ChassisTypeName_clean':                         'category',
        'Census_ChassisTypeName_isValid':                       'int8',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches':    'float16',
        'Census_InternalPrimaryDisplayResolutionHorizontal':    'float16',
        'Census_InternalPrimaryDisplayResolutionVertical':      'float16',
        'Census_PowerPlatformRoleName_filled':                  'category',
        'Census_InternalBatteryType_reduced':                   'category',
        'Census_InternalBatteryNumberOfCharges':                'float32',
        'Census_OSVersion':                                     'category',
        'Census_OSArchitecture':                                'category',
        'Census_OSBuildNumber':                                 'int16',
        'Census_OSBuildRevision':                               'int32',
        'Census_OSEdition_clean':                               'category',
        'OS_Reduced_Media':                                     'int8',
        'Census_OSInstallTypeName':                             'category',
        'Census_OSInstallLanguageIdentifier_filled':            'float16',
        'Census_OSUILocaleIdentifier':                          'int16',
        'Census_OSEdSkuMatch':                                  'int8',
        'Census_OSVersion_float':                               'float16',
        'Census_OSWUAutoUpdateOptionsName':                     'category',
        'Census_IsPortableOperatingSystem':                     'int8',
        'Census_GenuineStateName_filled':                       'category',
        'Census_ActivationChannel':                             'category',
        'Census_IsFlightingInternal':                           'float16',
        'Census_IsFlightsDisabled':                             'float16',
        'Census_FlightRing_filled':                             'category',
        'Census_ThresholdOptIn':                                'float16',
        'Census_FirmwareManufacturerIdentifier_filled':         'float16',
        'Census_FirmwareVersionIdentifier_filled':              'float32',
        'Census_IsSecureBootEnabled':                           'int8',
        'Census_IsWIMBootEnabled':                              'float16',
        'Census_IsVirtualDevice':                               'float16',
        'Census_IsTouchEnabled':                                'int8',
        'Census_IsPenCapable':                                  'int8',
        'Census_IsAlwaysOnAlwaysConnectedCapable':              'float16',
        'Wdft_IsGamer':                                         'float16',
        'Wdft_RegionIdentifier':                                'float16',
        'branch_ver':                                           'category',
        'subrel_ver':                                           'category',
        'subrel_ver_num':                                       'int',
        'branch_release_num':                                   'int',
        'is_svc_release':                                       'int8',
        'is_escrow_release':                                    'int8',
        'is_sec_release':                                       'int8',
        'is_st1_release':                                       'int8',
        'is_prerelease':                                        'int8',
        'is_special_release':                                   'int8',
        'HasDetections':                                        'int8'
    }
)

def get_feat(n, t):
    """Read a feature CSV and record its dtype in the notebook-level ``dtypes``.

    Parameters
    ----------
    n : str or path-like
        Location of the CSV, passed straight to ``pd.read_csv``.
    t : str, type or dict
        Forwarded as ``read_csv``'s ``dtype`` argument (a single dtype for
        every column, or a column -> dtype mapping).

    Returns
    -------
    pandas.DataFrame
        The parsed CSV.
    """
    # The read_csv keyword is `dtype`; the previous `dtypes=` spelling is not a
    # read_csv parameter and raised TypeError before anything was read.
    df = pd.read_csv(n, dtype=t)
    # NOTE(review): the feature-loading loop elsewhere in this notebook calls
    # add_type with a single {column: dtype} dict, while this call passes the
    # CSV location and the dtype as two arguments -- confirm which form
    # DTypes.add_type expects.
    dtypes.add_type(n, t)

    return df

In [4]:
def load(s):
    """Assemble the working frame for the data split named ``s``.

    Reads the cleaned CSV ``{s}_r1.csv`` with the registered dtypes, left-joins
    onto it (by ``MachineIdentifier``) the columns that exist only in the raw
    pickle ``{s}.pickle``, derives ``avsig_dt`` by applying ``mx`` to
    ``AvSigVersion``, discards rows where that value is missing, and returns
    the remainder ordered by ``avsig_dt``.
    """
    cleaned = pd.read_csv(dp(f'{s}_r1.csv'), dtype=dtypes.dict)
    raw = pd.read_pickle(dp(f'{s}.pickle'))

    # Columns present only in the raw frame, plus the join key.
    raw_only = list(set(raw.columns) - set(cleaned.columns))
    raw_only.append('MachineIdentifier')
    merged = cleaned.merge(raw[raw_only], how='left', on='MachineIdentifier')

    # mx comes from outside this cell; presumably it maps a signature version
    # to a date -- rows where it yields a missing value are dropped.
    merged['avsig_dt'] = merged['AvSigVersion'].apply(mx)
    return merged.dropna(subset=['avsig_dt'], axis=0).sort_values(by='avsig_dt')

# Bind `train` / `test` through the %cache line magic: when a cached value
# exists for the variable it is loaded instead of calling load() again (see
# the "Loading cached value" messages in this cell's output). Presumably the
# expression is evaluated and stored when no cache entry exists.
%cache train = load('train')
%cache test = load('test')

Loading cached value for variable 'train'. Time since caching: 4:05:30.520683
Loading cached value for variable 'test'. Time since caching: 3:58:00.001885


In [None]:
# Columns to strip from both frames; names that are absent are skipped
# silently (errors='ignore'), so this cell can be re-run safely.
to_drop = ['Census_OSWUAutoUpdateOptionsName_filled']

train, test = (
    frame.drop(columns=to_drop, errors='ignore')
    for frame in (train, test)
)

In [None]:
# Extra feature files to merge into train/test: one single-entry
# {column name: dtype} dict per feature, read from `<column name>.csv`.
to_load = [
    {'Census_OSWUAutoUpdateOptionsName_filled': 'category'}
]

for feat in tqdm(to_load):
    # Unpack the (column, dtype) pair. dict has no `.key()` method, so the
    # previous `feat.key()` raised AttributeError on the first iteration.
    c, t = next(iter(feat.items()))

    in_train = c in train.columns
    in_test = c in test.columns

    # Only load the feature when it is missing from BOTH frames; a one-sided
    # presence is reported and left for manual inspection.
    if in_train and in_test:
        continue
    if in_test:
        print(f'{c} in test but not train')
        continue
    if in_train:
        print(f'{c} in train but not test')
        continue

    x = pd.read_csv(dp(f'{c}.csv'), dtype=t)
    # NOTE(review): get_feat() calls add_type with two positional arguments
    # while this call passes the whole dict -- confirm which form
    # DTypes.add_type expects.
    dtypes.add_type(feat)

    n_train, n_test = len(train), len(test)
    train_ = train.merge(x, on='MachineIdentifier', how='inner')
    test_ = test.merge(x, on='MachineIdentifier', how='inner')

    # The inner join must neither drop nor duplicate rows. The check has to
    # look at the merged frames; comparing against the un-merged train/test
    # (as before) was always true and verified nothing.
    assert n_train == len(train_) and n_test == len(test_), \
        f'merging {c} changed the row count'
    train = train_
    test = test_

    del train_, test_
    gc.collect()

In [5]:
# Positional 80/20 cut of `train` into a fitting part (x) and a validation
# part (val). The bare `split_idx` line echoes the cut position.
split_idx = int(len(train) * .8)
split_idx

# Pull the label out before it is removed from the feature frame; `y` covers
# ALL rows of train, not just the first split_idx.
# NOTE: not re-runnable as-is -- once 'HasDetections' has been dropped from
# `train`, executing this cell again fails on the lookup below.
y = train['HasDetections']
train = train.drop(columns='HasDetections')

x, val = train.iloc[:split_idx], train.iloc[split_idx:]

7137132

In [6]:
# Print the column names of `train` (the pasted output lists them one per
# line, apparently in alphabetical order). pcols is not defined in this
# notebook; presumably it comes from the dankypipe.utils star import.
pcols(train)

AVProductStatesIdentifier
AVProductStatesIdentifier_filled
AVProductsEnabled
AVProductsInstalled
AppVersion
AppVersion_float
AutoSampleOptIn
AvSigVersion
AvSigVersion_float
Census_ActivationChannel
Census_ChassisTypeName
Census_ChassisTypeName_clean
Census_ChassisTypeName_isValid
Census_DeviceFamily
Census_FFMatch
Census_FirmwareManufacturerIdentifier
Census_FirmwareManufacturerIdentifier_filled
Census_FirmwareVersionIdentifier
Census_FirmwareVersionIdentifier_filled
Census_FlightRing
Census_FlightRing_filled
Census_GenuineStateName
Census_GenuineStateName_filled
Census_HasOpticalDiskDrive
Census_InternalBatteryNumberOfCharges
Census_InternalBatteryType
Census_InternalBatteryType_reduced
Census_InternalPrimaryDiagonalDisplaySizeInInches
Census_InternalPrimaryDisplayResolutionHorizontal
Census_InternalPrimaryDisplayResolutionVertical
Census_IsAlwaysOnAlwaysConnectedCapable
Census_IsFlightingInternal
Census_IsFlightsDisabled
Census_IsPenCapable
Census_IsPortableOperatingSystem
Census_IsS

In [None]:
# LightGBM training set: the first `split_idx` rows of the feature frame
# together with the labels for those same rows. `y` was taken from `train`
# before the split, so slicing it positionally keeps it aligned with `x`.
# The previous call passed no label at all, which leaves the Dataset unusable
# for supervised training.
# NOTE(review): `x` still carries every column of `train` (including the raw
# string columns listed by pcols) -- confirm which columns should be features.
lgb_train = lgb.Dataset(x, label=y.iloc[:split_idx])