# Setup before analysis

Mount Google Drive to access the dataset.

In [None]:
# Colab-only step: attach Google Drive under /content/drive so files stored
# there can be read by later cells.
# force_remount=True is passed explicitly; presumably it redoes the mount even
# when a previous one is still active — confirm against the Colab docs.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


Import libraries.

In [None]:
import os

import dask.dataframe as dd
import numpy as np
import pandas as pd

Declare data types to save memory

In [None]:
# Explicit per-column dtypes handed to the CSV reader so that each column is
# parsed straight into a compact type instead of the default int64/float64/object.
#
# NOTE(review): float16 represents integers exactly only up to 2048. Identifier
# columns typed as float16 here may therefore be rounded to a neighbouring value
# if their codes exceed that — confirm the value ranges before relying on them.
# NOTE(review): columns typed as float presumably contain missing values
# (NumPy integer dtypes cannot hold NaN) — TODO confirm per column.
# NOTE(review): 'MachineIdentifier' looks like a per-row unique key; if so,
# 'category' gives no memory saving for it — verify.
dtypes = {
        'MachineIdentifier':                                    'category',
        'ProductName':                                          'category',
        'EngineVersion':                                        'category',
        'AppVersion':                                           'category',
        'AvSigVersion':                                         'category',
        'IsBeta':                                               'int8',
        'RtpStateBitfield':                                     'float16',
        'IsSxsPassiveMode':                                     'int8',
        'DefaultBrowsersIdentifier':                            'float16',
        'AVProductStatesIdentifier':                            'float32',
        'AVProductsInstalled':                                  'float16',
        'AVProductsEnabled':                                    'float16',
        'HasTpm':                                               'int8',
        'CountryIdentifier':                                    'int16',
        'CityIdentifier':                                       'float32',
        'OrganizationIdentifier':                               'float16',
        'GeoNameIdentifier':                                    'float16',
        'LocaleEnglishNameIdentifier':                          'int8',
        'Platform':                                             'category',
        'Processor':                                            'category',
        'OsVer':                                                'category',
        'OsBuild':                                              'int16',
        'OsSuite':                                              'int16',
        'OsPlatformSubRelease':                                 'category',
        'OsBuildLab':                                           'category',
        'SkuEdition':                                           'category',
        'IsProtected':                                          'float16',
        'AutoSampleOptIn':                                      'int8',
        'PuaMode':                                              'category',
        'SMode':                                                'float16',
        'IeVerIdentifier':                                      'float16',
        'SmartScreen':                                          'category',
        'Firewall':                                             'float16',
        'UacLuaenable':                                         'float32',
        'Census_MDC2FormFactor':                                'category',
        'Census_DeviceFamily':                                  'category',
        'Census_OEMNameIdentifier':                             'float16',
        'Census_OEMModelIdentifier':                            'float32',
        'Census_ProcessorCoreCount':                            'float16',
        'Census_ProcessorManufacturerIdentifier':               'float16',
        'Census_ProcessorModelIdentifier':                      'float16',
        'Census_ProcessorClass':                                'category',
        'Census_PrimaryDiskTotalCapacity':                      'float32',
        'Census_PrimaryDiskTypeName':                           'category',
        'Census_SystemVolumeTotalCapacity':                     'float32',
        'Census_HasOpticalDiskDrive':                           'int8',
        'Census_TotalPhysicalRAM':                              'float32',
        'Census_ChassisTypeName':                               'category',
        'Census_InternalPrimaryDiagonalDisplaySizeInInches':    'float16',
        'Census_InternalPrimaryDisplayResolutionHorizontal':    'float16',
        'Census_InternalPrimaryDisplayResolutionVertical':      'float16',
        'Census_PowerPlatformRoleName':                         'category',
        'Census_InternalBatteryType':                           'category',
        'Census_InternalBatteryNumberOfCharges':                'float32',
        'Census_OSVersion':                                     'category',
        'Census_OSArchitecture':                                'category',
        'Census_OSBranch':                                      'category',
        'Census_OSBuildNumber':                                 'int16',
        'Census_OSBuildRevision':                               'int32',
        'Census_OSEdition':                                     'category',
        'Census_OSSkuName':                                     'category',
        'Census_OSInstallTypeName':                             'category',
        'Census_OSInstallLanguageIdentifier':                   'float16',
        'Census_OSUILocaleIdentifier':                          'int16',
        'Census_OSWUAutoUpdateOptionsName':                     'category',
        'Census_IsPortableOperatingSystem':                     'int8',
        'Census_GenuineStateName':                              'category',
        'Census_ActivationChannel':                             'category',
        'Census_IsFlightingInternal':                           'float16',
        'Census_IsFlightsDisabled':                             'float16',
        'Census_FlightRing':                                    'category',
        'Census_ThresholdOptIn':                                'float16',
        'Census_FirmwareManufacturerIdentifier':                'float16',
        'Census_FirmwareVersionIdentifier':                     'float32',
        'Census_IsSecureBootEnabled':                           'int8',
        'Census_IsWIMBootEnabled':                              'float16',
        'Census_IsVirtualDevice':                               'float16',
        'Census_IsTouchEnabled':                                'int8',
        'Census_IsPenCapable':                                  'int8',
        'Census_IsAlwaysOnAlwaysConnectedCapable':              'float16',
        'Wdft_IsGamer':                                         'float16',
        'Wdft_RegionIdentifier':                                'float16',
        'HasDetections':                                        'int8'
        }

Define a function that converts each feature to a more compact data type.

In [None]:
def convert_types(df):
    """Shrink column dtypes of ``df`` to reduce memory usage.

    - ``object`` columns are converted to ``category``.
    - Signed-integer and float columns are narrowed to the smallest type of
      the same family whose representable range contains the column's
      observed minimum and maximum (bounds are inclusive).
    - A column is never widened: if no narrower type fits (including the
      case where min/max are NaN or infinite), its dtype is left untouched.

    The float check is range-based only, so narrowing a float column can
    reduce its precision.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to compact. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.
    """
    # Hoisted out of the loop: these never change between columns.
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32, np.float64)

    for c in df:
        col_type = str(df[c].dtypes)

        # Convert objects to category
        if col_type == 'object':
            df[c] = df[c].astype('category')

        # numerics
        elif col_type in numerics:
            c_min = df[c].min()
            c_max = df[c].max()
            if col_type[:3] == 'int':
                candidates, limits_of = int_candidates, np.iinfo
            else:
                candidates, limits_of = float_candidates, np.finfo

            current_size = df[c].dtype.itemsize
            # Candidates are ordered narrowest first; take the first that fits.
            for candidate in candidates:
                limits = limits_of(candidate)
                # Inclusive bounds: a value equal to the type's min/max is
                # still representable. NaN min/max compare False everywhere,
                # which leaves the column unchanged.
                if c_min >= limits.min and c_max <= limits.max:
                    # Only cast when it actually saves memory.
                    if np.dtype(candidate).itemsize < current_size:
                        df[c] = df[c].astype(candidate)
                    break

    return df

# Load data

In [None]:
# Load the training set from the mounted Drive.
# dask reads the CSV with the explicit dtypes above; .compute() then
# materialises the whole result as an in-memory pandas DataFrame.
# NOTE(review): the path is relative, so it resolves against the current
# working directory. The drive is mounted at /content/drive, so this
# presumably relies on the working directory being /content — confirm.
file = './drive/MyDrive/train.csv'
ddf = dd.read_csv(file, dtype = dtypes)
df = ddf.compute()

In [None]:
# Run the dtype conversion on the loaded frame and rebind `df` to the result.
df = convert_types(df)

# Preparation for output

Generate output dataframe to describe the relevance of the features.

In [None]:
# One row per feature: every feature starts out as relevant (1) with a
# placeholder feature type that is filled in by a later step.
output_df = pd.DataFrame(
    {'Relevant': 1, 'FeatureType': "NA"},
    index=df.columns,
)

Classify each feature as Category, Numeric, or Boolean.

In [None]:
# Features treated as continuous measurements; listed by hand.
numeric_type = [
    'Census_InternalBatteryNumberOfCharges',
    'Census_InternalPrimaryDiagonalDisplaySizeInInches',
    'Census_PrimaryDiskTotalCapacity',
    'Census_ProcessorCoreCount',
    'Census_SystemVolumeTotalCapacity',
    'Census_TotalPhysicalRAM',
]

for col in df.columns:
    # Precedence: hand-listed numeric features first, then any feature with
    # exactly two distinct non-null values, and everything else is a category.
    if col in numeric_type:
        feature_type = 'Numeric'
    elif df[col].nunique() == 2:
        feature_type = 'Boolean'
    else:
        feature_type = 'Category'
    output_df.loc[col, 'FeatureType'] = feature_type

Define a function that marks features as irrelevant.

In [None]:
def change_to_irrelevant(df1, df2):
    """Set ``Relevant`` to 0 in ``df1`` for every index label found in ``df2``.

    Labels of ``df2`` that do not occur in ``df1``'s index are skipped.
    ``df1`` is modified in place; nothing is returned.
    """
    for label in df2.index:
        if label not in df1.index:
            continue
        df1.loc[label, 'Relevant'] = 0


# Features with many NaN

Count the NaNs in each feature and compute their frequency. We treat a feature whose NaN frequency is above 0.5 as invalid and ignore it.

In [None]:
# Fraction of missing values per feature. Taking the mean of the boolean mask
# divides by the frame's actual row count instead of a hard-coded dataset size.
nan_ratio = df.isnull().mean()
# Rounded to two decimals for display only.
nan_count = nan_ratio.round(2).to_frame('count')
# Apply the threshold to the unrounded ratio so values just above 0.5
# (e.g. 0.503) are not rounded down to 0.50 and missed.
irrelevant_feature = nan_count[nan_ratio > 0.5]
irrelevant_feature

Unnamed: 0,count
DefaultBrowsersIdentifier,0.95
PuaMode,1.0
Census_ProcessorClass,1.0
Census_InternalBatteryType,0.71
Census_IsFlightingInternal,0.83
Census_ThresholdOptIn,0.64
Census_IsWIMBootEnabled,0.63


Change output feature relevance according to NaN frequency.

In [None]:
# Set Relevant = 0 in output_df for every feature listed in irrelevant_feature.
change_to_irrelevant(output_df, irrelevant_feature)

# Unbalanced feature

Define a function that measures how balanced a feature is. The balance ratio is the count of the most frequent value divided by the total number of rows. A ratio close to 1 means the feature is dominated by a single value, i.e. it is more imbalanced.

In [None]:
def assess_balance(df, column):
    """Return the share of rows taken by the most frequent value of ``column``.

    The numerator is the largest entry of ``value_counts()`` (which leaves
    out missing values by default); the denominator is the total number of
    rows in ``df``, missing or not.
    """
    top_count = df[column].value_counts().max()
    return top_count / len(df)

Calculate the balance ratio for each feature and store it in a DataFrame.

In [None]:
# Balance ratio of every feature, indexed by feature name.
# Built in one step so the column gets a float dtype (not object), and without
# the leftover standalone assess_balance(df, 'IsBeta') call whose result was
# discarded.
balance_ratios_df = pd.DataFrame(
    {'balance_ratio': [assess_balance(df, col) for col in df.columns]},
    index=df.columns,
)

We treat a feature as imbalanced when a single value accounts for more than 98% of its rows.

In [None]:
# Keep only the features whose balance ratio exceeds 0.98.
unbalanced_df = balance_ratios_df[balance_ratios_df['balance_ratio'] > 0.98]

In [None]:
# Display the features flagged as unbalanced.
unbalanced_df

Unnamed: 0,balance_ratio
ProductName,0.989356
IsBeta,0.999992
IsSxsPassiveMode,0.982666
HasTpm,0.987971
AutoSampleOptIn,0.999971
UacLuaenable,0.992718
Census_DeviceFamily,0.998383
Census_IsPortableOperatingSystem,0.999455
Census_IsFlightsDisabled,0.981997
Census_IsVirtualDevice,0.991185


Change output feature relevance according to balance ratio.

In [None]:
# Set Relevant = 0 in output_df for every feature listed in unbalanced_df.
change_to_irrelevant(output_df, unbalanced_df)

# Export EDA output

In [None]:
# Name the index so the feature names are written under a 'Feature' header.
output_df.index = output_df.index.rename('Feature')
# Create the target folder if it is missing; otherwise writing the file
# fails on a fresh runtime where ./data has never been created.
os.makedirs('./data', exist_ok=True)
output_df.to_excel('./data/EDA.xlsx')