In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import lightgbm as lgb

from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
def display_missing(df):
    print('Missing values in training data:')
    for col in df.columns.tolist():
        num_null = df[col].isnull().sum()
        if num_null > 0:
            print(f'{col}: {num_null} missing values')

def fill_missing(df):
    for col in df.columns:
        if df[col].dtype == 'category':
            df.fillna({col: df[col].mode()[0]}, inplace=True)
        else:
            df.fillna({col: df[col].median()}, inplace=True)

def display_tSNE(X, Y):
    tsne = TSNE(n_components=2, random_state=42)
    X_tsne = tsne.fit_transform(X)
    print(f't-SNE  dimension reduced to {X_tsne.shape[1]}')

    # Create a scatter plot with colorization based on labels
    plt.figure(figsize=(8, 6))
    scatter = plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=Y, cmap='viridis', s=50, edgecolor='k')

    # Add a color bar to the plot for clarity
    plt.colorbar(scatter)

    # Add labels and title
    plt.xlabel('t-SNE 1')
    plt.ylabel('t-SNE 2')
    plt.title('2D t-SNE with Colorization')

    # Show the plot
    plt.show()

In [None]:
def test_LGBM(X, Y, test_size):
     X_train, X_val, y_train, y_val = train_test_split(X, Y, test_size=test_size, random_state=42)

     # LightGBM model for standard scaled data
     lgb_model_scaled = lgb.LGBMClassifier(random_state=42)
     lgb_model_scaled.fit(X_train, y_train)

     # Make predictions on validation set
     y_pred_scaled = lgb_model_scaled.predict_proba(X_val)[:, 1]
     auc_scaled = roc_auc_score(y_val, y_pred_scaled)

     print(f"---- AUC score LGBM: {auc_scaled} ----")

def test_DTree(X, Y, test_size):
     X_train, X_val, y_train, y_val = train_test_split(X, Y, test_size=test_size, random_state=42)

     # LightGBM model for standard scaled data
     lgb_model_scaled = DecisionTreeClassifier(random_state=42)
     lgb_model_scaled.fit(X_train, y_train)

     # Make predictions on validation set
     y_pred_scaled = lgb_model_scaled.predict_proba(X_val)[:, 1]
     auc_scaled = roc_auc_score(y_val, y_pred_scaled)

     print(f"---- AUC score DTree: {auc_scaled} ----")

def test_RForest(X, Y, test_size):
     X_train, X_val, y_train, y_val = train_test_split(X, Y, test_size=test_size, random_state=42)

     # LightGBM model for standard scaled data
     lgb_model_scaled = RandomForestClassifier(n_estimators=100, random_state=42)
     lgb_model_scaled.fit(X_train, y_train)

     # Make predictions on validation set
     y_pred_scaled = lgb_model_scaled.predict_proba(X_val)[:, 1]
     auc_scaled = roc_auc_score(y_val, y_pred_scaled)

     print(f"---- AUC score RForest: {auc_scaled} ----")

In [4]:
INDEX='MachineIdentifier'
LABELS='HasDetections'

In [5]:
dtypes = {
          'MachineIdentifier':                                    'category',
          'ProductName':                                          'category',
          'EngineVersion':                                        'category',
          'AppVersion':                                           'category',
          'AvSigVersion':                                         'category',
          'IsBeta':                                               'Int8',
          'RtpStateBitfield':                                     'float32',
          'IsSxsPassiveMode':                                     'Int8',
          'DefaultBrowsersIdentifier':                            'float32',
          'AVProductStatesIdentifier':                            'float32',
          'AVProductsInstalled':                                  'float32',
          'AVProductsEnabled':                                    'float32',
          'HasTpm':                                               'Int8',
          'CountryIdentifier':                                    'int32',
          'CityIdentifier':                                       'float32',
          'OrganizationIdentifier':                               'float32',
          'GeoNameIdentifier':                                    'float32',
          'LocaleEnglishNameIdentifier':                          'int32',
          'Platform':                                             'category',
          'Processor':                                            'category',
          'OsVer':                                                'category',
          'OsBuild':                                              'int16',
          'OsSuite':                                              'int16',
          'OsPlatformSubRelease':                                 'category',
          'OsBuildLab':                                           'category',
          'SkuEdition':                                           'category',
          'IsProtected':                                          'float32',
          'AutoSampleOptIn':                                      'Int8',
          'PuaMode':                                              'category',
          'SMode':                                                'float32',
          'IeVerIdentifier':                                      'float32',
          'SmartScreen':                                          'category',
          'Firewall':                                             'float32',
          'UacLuaenable':                                         'float64',
          'Census_MDC2FormFactor':                                'category',
          'Census_DeviceFamily':                                  'category',
          'Census_OEMNameIdentifier':                             'float32',
          'Census_OEMModelIdentifier':                            'float32',
          'Census_ProcessorCoreCount':                            'float32',
          'Census_ProcessorManufacturerIdentifier':               'float32',
          'Census_ProcessorModelIdentifier':                      'float32',
          'Census_ProcessorClass':                                'category',
          'Census_PrimaryDiskTotalCapacity':                      'float32',
          'Census_PrimaryDiskTypeName':                           'category',
          'Census_SystemVolumeTotalCapacity':                     'float32',
          'Census_HasOpticalDiskDrive':                           'Int8',
          'Census_TotalPhysicalRAM':                              'float32',
          'Census_ChassisTypeName':                               'category',
          'Census_InternalPrimaryDiagonalDisplaySizeInInches':    'float32',
          'Census_InternalPrimaryDisplayResolutionHorizontal':    'float32',
          'Census_InternalPrimaryDisplayResolutionVertical':      'float32',
          'Census_PowerPlatformRoleName':                         'category',
          'Census_InternalBatteryType':                           'category',
          'Census_InternalBatteryNumberOfCharges':                'float32',
          'Census_OSVersion':                                     'category',
          'Census_OSArchitecture':                                'category',
          'Census_OSBranch':                                      'category',
          'Census_OSBuildNumber':                                 'int32',
          'Census_OSBuildRevision':                               'int32',
          'Census_OSEdition':                                     'category',
          'Census_OSSkuName':                                     'category',
          'Census_OSInstallTypeName':                             'category',
          'Census_OSInstallLanguageIdentifier':                   'float32',
          'Census_OSUILocaleIdentifier':                          'int32',
          'Census_OSWUAutoUpdateOptionsName':                     'category',
          'Census_IsPortableOperatingSystem':                     'Int8',
          'Census_GenuineStateName':                              'category',
          'Census_ActivationChannel':                             'category',
          'Census_IsFlightingInternal':                           'float32',
          'Census_IsFlightsDisabled':                             'float32',
          'Census_FlightRing':                                    'category',
          'Census_ThresholdOptIn':                                'float32',
          'Census_FirmwareManufacturerIdentifier':                'float32',
          'Census_FirmwareVersionIdentifier':                     'float32',
          'Census_IsSecureBootEnabled':                           'Int8',
          'Census_IsWIMBootEnabled':                              'float32',
          'Census_IsVirtualDevice':                               'float32',
          'Census_IsTouchEnabled':                                'Int8',
          'Census_IsPenCapable':                                  'Int8',
          'Census_IsAlwaysOnAlwaysConnectedCapable':              'float32',
          'Wdft_IsGamer':                                         'float32',
          'Wdft_RegionIdentifier':                                'float32',
          'HasDetections':                                        'Int8'
     }

In [6]:
print("loading training data")
train_df = pd.read_csv('train.csv', dtype=dtypes, nrows=1000000)
train_df.drop(columns=INDEX, axis=1, inplace=True)                      # ignore machine identifier
has_detections = train_df.pop(LABELS)

loading training data


In [7]:
# Fill missing
print("filling missing entries")
fill_missing(train_df)
display_missing(train_df)

filling missing entries
Missing values in training data:


In [8]:
# Label encode categorical variables
print("label encoding categorical features")
categorical = [col for col in train_df.columns if train_df[col].dtype == 'category']
for col in categorical:
     le = LabelEncoder()
     train_df[col] = le.fit_transform(train_df[col].astype(str))

label encoding categorical features


In [None]:
# Set features and labels
X = train_df
Y = has_detections

In [10]:
# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
del X

Test with all features

In [17]:
test_DTree(X_scaled, Y, test_size=0.2)
test_LGBM(X_scaled, Y, test_size=0.2)
test_RForest(X_scaled, Y, test_size=0.2)

---- AUC score DTree: 0.5720658455326338 ----
[LightGBM] [Info] Number of positive: 399833, number of negative: 400167
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.067232 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5214
[LightGBM] [Info] Number of data points in the train set: 800000, number of used features: 75
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499791 -> initscore=-0.000835
[LightGBM] [Info] Start training from score -0.000835
---- AUC score LGBM: 0.71345124843805 ----


NameError: name 'test_RForest' is not defined

# Dimensionality Reduction

In [13]:
# Auto encoder
import tensorflow as tf
from tensorflow.keras import layers, models
import numpy as np

print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
# Define the Autoencoder model
def build_autoencoder(input_dim, encoding_dim):
    # Input layer
    input_layer = layers.Input(shape=(input_dim,))

    # Encoder: Dense layers to compress input
    encoded = layers.Dense(encoding_dim, activation='relu')(input_layer)

    # Decoder: Dense layers to reconstruct input
    decoded = layers.Dense(input_dim, activation='sigmoid')(encoded)

    # Create the autoencoder model
    autoencoder = models.Model(input_layer, decoded)

    # Create the encoder model (only encoding part)
    encoder = models.Model(input_layer, encoded)

    # Compile the autoencoder model
    autoencoder.compile(optimizer='adam', loss='mean_squared_error')

    return autoencoder, encoder

# Set input and encoding dimensions
input_dim = X_scaled.shape[1]
encoding_dim = 40

# Build the model
autoencoder, encoder = build_autoencoder(input_dim, encoding_dim)

with tf.device('gpu'):
     # Train the model
     autoencoder.fit(X_scaled, X_scaled, epochs=50, batch_size=256, shuffle=True)

     # To get the encoded representation of some input data:
     X_low = encoder.predict(X_scaled)

Num GPUs Available:  0
Epoch 1/50
[1m3907/3907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - loss: 0.7950
Epoch 2/50
[1m3907/3907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - loss: 0.7027
Epoch 3/50
[1m3907/3907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - loss: 0.6811
Epoch 4/50
[1m3907/3907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - loss: 0.6623
Epoch 5/50
[1m3907/3907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - loss: 0.6885
Epoch 6/50
[1m3907/3907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - loss: 0.6933
Epoch 7/50
[1m3907/3907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - loss: 0.6723
Epoch 8/50
[1m3907/3907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - loss: 0.6661
Epoch 9/50
[1m3907/3907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - loss: 0.7067
Epoch 10/50
[1m3907/3907[0m [32m━━━━━━━━━━━━━━━

In [14]:
# PCA
# pca = PCA(n_components=0.95)
# X_low = pca.fit_transform(X_scaled)
# print(f'PCA dimension reduced to {X_low.shape[1]}')

In [15]:
# This is way too slow with a million points
# display t-SNE
# display_tSNE(X_low, Y)

In [16]:
test_DTree(X_low, Y, test_size=0.2)
test_LGBM(X_low, Y, test_size=0.2)

---- AUC score DTree: 0.5553193943627759 ----
[LightGBM] [Info] Number of positive: 399833, number of negative: 400167
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.072754 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10200
[LightGBM] [Info] Number of data points in the train set: 800000, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499791 -> initscore=-0.000835
[LightGBM] [Info] Start training from score -0.000835
---- AUC score LGBM: 0.6788518569040742 ----
