In [1]:
import os
import gc
import re
import datetime
import warnings

import numpy as np
import pandas as pd
import lightgbm as lgbm
from scipy.sparse import vstack, csr_matrix, save_npz, load_npz
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# Notebook-wide runtime settings.
# Show every column when a DataFrame is displayed (no truncation of wide frames).
pd.set_option('display.max_columns', None)
# Silence all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')
# Make sure automatic garbage collection is switched on.
gc.enable()

In [2]:
# Seed handed to the cross-validation splitter.
SEED = 2019
# Directory holding the input CSVs and the generated .npz / submission files.
# Defaults to the original location; set the MS_MALWARE_DATA_DIR environment
# variable to run the notebook on a machine with a different layout.
DATA_DIR = os.environ.get('MS_MALWARE_DATA_DIR', '/disk/ms_malware/')

In [3]:
# Read the preprocessed train / test tables from their gzip-compressed CSV files.
train = pd.read_csv(
    os.path.join(DATA_DIR, 'preprocessed_train.csv.gz'),
    compression='gzip',
)
test = pd.read_csv(
    os.path.join(DATA_DIR, 'preprocessed_test.csv.gz'),
    compression='gzip',
)

# Split the target off the training frame and keep it as a standalone numpy array.
y_train = np.array(train.pop('HasDetections'))

# The machine identifier is not used as a feature: remove it from both frames.
del train['MachineIdentifier']
del test['MachineIdentifier']

# Row indices of both frames (unchanged by the column removals above); later
# cells use them to build the CV folds and to size the prediction buffer.
train_ids, test_ids = train.index, test.index

gc.collect()
train.head()

Unnamed: 0,ProductName,EngineVersion,AppVersion,AvSigVersion,IsBeta,RtpStateBitfield,IsSxsPassiveMode,DefaultBrowsersIdentifier,AVProductStatesIdentifier,AVProductsInstalled,AVProductsEnabled,HasTpm,CountryIdentifier,CityIdentifier,OrganizationIdentifier,GeoNameIdentifier,LocaleEnglishNameIdentifier,Platform,Processor,OsVer,OsBuild,OsSuite,OsPlatformSubRelease,OsBuildLab,SkuEdition,IsProtected,AutoSampleOptIn,PuaMode,SMode,IeVerIdentifier,SmartScreen,Firewall,UacLuaenable,Census_MDC2FormFactor,Census_DeviceFamily,Census_OEMNameIdentifier,Census_OEMModelIdentifier,Census_ProcessorCoreCount,Census_ProcessorManufacturerIdentifier,Census_ProcessorModelIdentifier,Census_ProcessorClass,Census_PrimaryDiskTotalCapacity,Census_PrimaryDiskTypeName,Census_SystemVolumeTotalCapacity,Census_HasOpticalDiskDrive,Census_TotalPhysicalRAM,Census_ChassisTypeName,Census_InternalPrimaryDiagonalDisplaySizeInInches,Census_InternalPrimaryDisplayResolutionHorizontal,Census_InternalPrimaryDisplayResolutionVertical,Census_PowerPlatformRoleName,Census_InternalBatteryType,Census_InternalBatteryNumberOfCharges,Census_OSVersion,Census_OSArchitecture,Census_OSBranch,Census_OSBuildNumber,Census_OSBuildRevision,Census_OSEdition,Census_OSSkuName,Census_OSInstallTypeName,Census_OSInstallLanguageIdentifier,Census_OSUILocaleIdentifier,Census_OSWUAutoUpdateOptionsName,Census_IsPortableOperatingSystem,Census_GenuineStateName,Census_ActivationChannel,Census_IsFlightingInternal,Census_IsFlightsDisabled,Census_FlightRing,Census_ThresholdOptIn,Census_FirmwareManufacturerIdentifier,Census_FirmwareVersionIdentifier,Census_IsSecureBootEnabled,Census_IsWIMBootEnabled,Census_IsVirtualDevice,Census_IsTouchEnabled,Census_IsPenCapable,Census_IsAlwaysOnAlwaysConnectedCapable,Wdft_IsGamer,Wdft_RegionIdentifier,CosSim,w2vApp_0,w2vApp_1,w2vApp_2,w2vApp_3,w2vApp_4,w2vEng_0,w2vEng_1,w2vEng_2,w2vEng_3,w2vEng_4,ProductNamePosition,EngineVersionPosition,AppVersionPosition,OsBuildLabPosition,AvSigVersion2Position,Census_OSVersi
on3Position,SmartScreen*Platform,SmartScreen*Processor,SmartScreen*Wdft_IsGamer,SmartScreen*AVProductsInstalled,Platform*Processor,Platform*Wdft_IsGamer,Platform*AVProductsInstalled,Processor*Wdft_IsGamer,Processor*AVProductsInstalled,Wdft_IsGamer*AVProductsInstalled
0,5,0,0,0,1,7,1,2178,26978,2,2,2,145,24112,10,226,112,1,2,1,7,4,5,307,7,2,1,2,0,28,21,2,2,3,2,1231,0,31,6,1190,4,55,1,28627,1,6,29,26,8,6,2,72,519,0,1,13,122,0,22,18,7,19,20,6,1,2,3,3,1,8,3,544,0,1,3,1,1,1,1,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,38,36,41,70,2,1,2,4,4,1
1,5,0,30,6466,1,7,1,2178,26978,2,2,2,216,41795,10,23,216,1,2,1,7,4,5,307,7,2,1,2,0,28,21,2,2,9,2,1231,0,31,6,0,4,55,1,8055,1,6,43,10,6,3,4,72,1,412,1,13,122,2,22,18,3,38,82,6,1,3,3,3,1,5,3,544,0,1,3,1,1,1,1,1,14,0,4,2,10,5,3,0,0,0,0,0,0,17,0,51,0,0,38,36,41,70,2,1,2,4,4,1
2,5,0,0,0,1,7,1,2178,26978,2,2,2,208,46444,10,257,199,1,2,1,7,13,5,307,5,2,1,2,0,28,18,2,2,3,2,2230,162982,31,6,955,4,25,2,10640,1,6,29,32,12,9,2,72,519,0,1,13,122,0,5,3,7,37,81,3,1,2,2,3,1,8,3,110,39165,1,3,1,1,1,1,1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,25,0,31,29,34,55,2,1,2,4,4,1
3,5,0,0,0,1,7,1,2178,26978,2,2,2,210,62100,52,21,147,1,2,1,7,4,5,307,7,2,1,2,0,28,10,2,2,3,2,397,131543,31,6,1151,4,42,3,0,1,6,41,25,6,3,2,72,519,0,1,13,122,0,22,18,7,9,115,3,1,2,2,3,1,8,3,294,9298,1,3,1,1,1,1,1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,14,16,17,23,2,1,2,4,4,1
4,5,0,0,0,1,7,1,2178,26978,2,2,2,90,76743,52,198,228,1,2,1,7,13,5,307,5,2,1,2,0,28,18,2,2,9,2,397,0,31,6,1278,4,55,1,7811,1,8,47,11,6,3,4,62,1,0,1,13,122,0,5,3,8,38,82,3,1,2,3,1,1,8,1,294,9115,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,31,29,34,55,2,1,2,4,4,1


In [4]:
# Fit the one-hot encoder on train and test together so that every category
# value present in either frame gets its own output column.
# The encoder emits a sparse matrix by default, so the explicit `sparse=True`
# flag (renamed in newer scikit-learn releases) is left out to stay compatible
# with both old and new versions.
ohe = OneHotEncoder(categories='auto', dtype='uint8').fit(pd.concat([train, test], axis=0))

# Transform the data in row chunks of size m to reduce peak memory usage.
# Stepping through range(0, n_rows, m) never produces an empty trailing chunk,
# unlike a `n_rows // m + 1` chunk count, which yields a zero-row slice whenever
# n_rows is an exact multiple of m.
m = 100000
train = vstack([ohe.transform(train.iloc[start:start + m])
                for start in range(0, train.shape[0], m)])
test = vstack([ohe.transform(test.iloc[start:start + m])
               for start in range(0, test.shape[0], m)])

# Persist the encoded matrices; the training cell reloads them from disk.
save_npz(os.path.join(DATA_DIR, 'train.npz'), train, compressed=True)
save_npz(os.path.join(DATA_DIR, 'test.npz'), test, compressed=True)

# The fitted encoder is no longer needed once both matrices are written.
del ohe
gc.collect()
print(train.shape)

(8921483, 7633)


In [None]:
# 5-fold stratified cross-validation: one LightGBM model is trained per fold,
# scored on the held-out rows, and its test-set probabilities are added to
# `test_result` (a running sum over folds, not yet averaged).
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

counter = 0                                 # number of completed folds
auc_metrics = []                            # validation AUC of each fold
test_result = np.zeros(test_ids.shape[0])   # summed test probabilities

# Note: `test_index` holds the validation rows of the fold, not the test set.
for train_index, test_index in skf.split(train_ids, y_train):
    print('Fold {}'.format(counter+1))

    # Reload the encoded training matrix for this fold and gather the fold's
    # rows in chunks of m indices; the full matrix is released right after.
    train = load_npz(os.path.join(DATA_DIR, 'train.npz'))
    X_fit = vstack([train[train_index[start:start + m]]
                    for start in range(0, train_index.shape[0], m)])
    X_val = vstack([train[test_index[start:start + m]]
                    for start in range(0, test_index.shape[0], m)])
    X_fit, X_val = csr_matrix(X_fit, dtype='float32'), csr_matrix(X_val, dtype='float32')
    y_fit, y_val = y_train[train_index], y_train[test_index]

    del train
    gc.collect()

    lgb_model = lgbm.LGBMClassifier(max_depth=-1,
                                    n_estimators=30000,
                                    learning_rate=0.05,
                                    num_leaves=2**12-1,
                                    colsample_bytree=.19,
                                    objective='binary',
                                    n_jobs=10)

    # Early stopping monitors logloss on the validation fold (eval_metric);
    # the AUC reported below is computed separately after training.
    lgb_model.fit(X_fit, y_fit, eval_metric='logloss',
                  eval_set=[(X_val, y_val)],
                  verbose=100, early_stopping_rounds=100)

    auc_ = roc_auc_score(y_val, lgb_model.predict_proba(X_val)[:, 1])
    auc_metrics.append(auc_)
    # Report the score BEFORE the cleanup below: auc_ is deleted there, and
    # formatting it afterwards would raise NameError.
    print('valid_0\'s auc: {0:.6f}'.format(auc_))

    del X_fit, X_val, y_fit, y_val, train_index, test_index, auc_
    gc.collect()

    # The test matrix is loaded only for the duration of the prediction so it
    # does not occupy memory while the next fold is training.
    test = load_npz(os.path.join(DATA_DIR, 'test.npz'))
    test = csr_matrix(test, dtype='float32')
    test_result += lgb_model.predict_proba(test)[:, 1]
    del test
    gc.collect()

    counter += 1

print('Done')

In [None]:
# Mean validation AUC over the folds; used only to tag the output file name.
val_auc = np.mean(auc_metrics)
# Month and day as 'MMDD'.
today_date = datetime.date.today().strftime('%m%d')

# Build the file name once so the written file and the printed name cannot drift apart.
submit_name = 'lgbm_{0}_{1:.4f}.csv.gz'.format(today_date, val_auc)
submit_dir = os.path.join(DATA_DIR, 'submit')
# to_csv does not create missing directories; make sure the target exists so
# the write cannot fail after the whole training run has completed.
os.makedirs(submit_dir, exist_ok=True)

sub = pd.read_csv(os.path.join(DATA_DIR, 'sample_submission.csv'))
# Average the summed fold predictions.
# NOTE(review): assumes sample_submission.csv rows are in the same order as the
# preprocessed test set — confirm.
sub['HasDetections'] = test_result / counter
sub.to_csv(os.path.join(submit_dir, submit_name),
           index=False, compression='gzip')
print(submit_name)