The goal of this competition is to predict a Windows machine’s probability of getting infected by various families of malware, based on different properties of that machine. The telemetry data containing these properties and the machine infections was generated by combining heartbeat and threat reports collected by Microsoft's endpoint protection solution, Windows Defender.

Each row in this dataset corresponds to a machine, uniquely identified by a MachineIdentifier. HasDetections is the ground truth and indicates that Malware was detected on the machine. Using the information and labels in train.csv, you must predict the value for HasDetections for each machine in test.csv.

The sampling methodology used to create this dataset was designed to meet certain business constraints, both in regards to user privacy as well as the time period during which the machine was running. Malware detection is inherently a time-series problem, but it is made complicated by the introduction of new machines, machines that come online and offline, machines that receive patches, machines that receive new operating systems, etc. While the dataset provided here has been roughly split by time, the complications and sampling requirements mentioned above may mean you may see imperfect agreement between your cross validation, public, and private scores! Additionally, this dataset is not representative of Microsoft customers’ machines in the wild; it has been sampled to include a much larger proportion of malware machines.

In [31]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

from sklearn.experimental import enable_hist_gradient_boosting
import sklearn.ensemble as ske
from sklearn.model_selection import train_test_split
from sklearn import tree, linear_model
from sklearn.feature_selection import SelectFromModel
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn import metrics

In [32]:
# set up display area to show dataframe in jupyter qtconsole
#pd.set_option('display.height', 1000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [33]:
# We need to explicitly specify data types when reading csv, otherwise it is very memory consuming
# and we will get the warning "Specify dtype option on import or set low_memory=False"
# So, we will manually defined the data types

# P.S. I have loaded the sample data and exported train_data.dtypes
# these are the data types for fast loading

datatypes = {
    'ProductName': str,
    'EngineVersion': str,
    'AppVersion': str,
    'AvSigVersion': str,
    'IsBeta': np.int8,
    'RtpStateBitfield': str,
    'IsSxsPassiveMode': np.int8,
    'DefaultBrowsersIdentifier': str,
    'AVProductStatesIdentifier': str,
    'AVProductsInstalled': str,
    'AVProductsEnabled': str,
    'HasTpm': np.int8,
    'CountryIdentifier': str,
    'CityIdentifier': str,
    'OrganizationIdentifier': str,
    'GeoNameIdentifier': str,
    'LocaleEnglishNameIdentifier': str,
    'Platform': str,
    'Processor': str,
    'OsVer': str,
    'OsBuild': str,
    'OsSuite': str,
    'OsPlatformSubRelease': str,
    'OsBuildLab': str,
    'SkuEdition': str,
    'IsProtected': str,
    'AutoSampleOptIn': np.int8,
    'PuaMode': str,
    'SMode': str,
    'IeVerIdentifier': str,
    'SmartScreen': str,
    'Firewall': str,
    'UacLuaenable': str,
    'Census_MDC2FormFactor': str,
    'Census_DeviceFamily': str,
    'Census_OEMNameIdentifier': str,
    'Census_OEMModelIdentifier': str, 
    'Census_ProcessorCoreCount': str,
    'Census_ProcessorManufacturerIdentifier': str,
    'Census_ProcessorModelIdentifier': str,
    'Census_ProcessorClass': str,
    'Census_PrimaryDiskTotalCapacity': np.float64,
    'Census_PrimaryDiskTypeName': str,
    'Census_SystemVolumeTotalCapacity': np.float64,
    'Census_HasOpticalDiskDrive': np.int8,
    'Census_TotalPhysicalRAM': np.float64,
    'Census_ChassisTypeName': str,
    'Census_InternalPrimaryDiagonalDisplaySizeInInches': str,
    'Census_InternalPrimaryDisplayResolutionHorizontal': str,
    'Census_InternalPrimaryDisplayResolutionVertical': str,
    'Census_PowerPlatformRoleName': str,
    'Census_InternalBatteryType': str,
    'Census_InternalBatteryNumberOfCharges': str,
    'Census_OSVersion': str,
    'Census_OSArchitecture': str,
    'Census_OSBranch': str,
    'Census_OSBuildNumber': str,
    'Census_OSBuildRevision': str,
    'Census_OSEdition': str,
    'Census_OSSkuName': str,
    'Census_OSInstallTypeName': str,
    'Census_OSInstallLanguageIdentifier': str,
    'Census_OSUILocaleIdentifier': str,
    'Census_OSWUAutoUpdateOptionsName': str,
    'Census_IsPortableOperatingSystem': np.int8,
    'Census_GenuineStateName': str,
    'Census_ActivationChannel': str,
    'Census_IsFlightingInternal': str,
    'Census_IsFlightsDisabled': str,
    'Census_FlightRing': str,
    'Census_ThresholdOptIn': str,
    'Census_FirmwareManufacturerIdentifier': str,
    'Census_FirmwareVersionIdentifier': str,
    'Census_IsSecureBootEnabled': np.int8,
    'Census_IsWIMBootEnabled': str,
    'Census_IsVirtualDevice': str,
    'Census_IsTouchEnabled': np.int8,
    'Census_IsPenCapable': np.int8,
    'Census_IsAlwaysOnAlwaysConnectedCapable': str,
    'Wdft_IsGamer': str,
    'Wdft_RegionIdentifier': str,
    'HasDetections': np.int8
}

#full_features = pd.read_csv("./csv/train.csv", dtype=datatypes, index_col="MachineIdentifier")
full_features = pd.read_csv("./csv/train.csv", dtype=datatypes, nrows=200000, index_col="MachineIdentifier")

In [34]:
# Shuffle the data
#np.random.seed(0)

shuffle = np.random.permutation(np.arange(full_features.shape[0]))[:500000]
indexes = full_features.index[shuffle]

full_features = full_features.loc[indexes,:]

In [35]:
full_labels = full_features["HasDetections"]

# Dropping labels ["HasDetections"] from training dataset
full_features = full_features.drop(["HasDetections"], axis=1)

In [36]:
print (full_features.shape)

(200000, 81)


In [37]:
# Checking the columns with the most NULL values
print((full_features.isnull().sum()).sort_values(ascending=False).head(20))

PuaMode                                  199947
Census_ProcessorClass                    199127
DefaultBrowsersIdentifier                190297
Census_IsFlightingInternal               166068
Census_InternalBatteryType               142088
Census_ThresholdOptIn                    127008
Census_IsWIMBootEnabled                  126839
SmartScreen                               71123
OrganizationIdentifier                    61537
SMode                                     11863
CityIdentifier                             7215
Wdft_IsGamer                               6737
Wdft_RegionIdentifier                      6737
Census_InternalBatteryNumberOfCharges      6065
Census_FirmwareManufacturerIdentifier      4101
Census_FirmwareVersionIdentifier           3586
Census_IsFlightsDisabled                   3541
Census_OEMModelIdentifier                  2245
Census_OEMNameIdentifier                   2082
Firewall                                   2078
dtype: int64


In [38]:
full_features['PuaMode'].unique()

array([nan, 'on'], dtype=object)

In [39]:
full_features['Census_IsFlightingInternal'].unique()

array([nan, '0', '1'], dtype=object)

In [40]:
full_features['Census_InternalBatteryType'].unique()

array([nan, 'lion', 'li-i', '#', 'nimh', 'lip', 'liio', 'real', 'li',
       'pbac', 'unkn', 'li p', 'vbox', 'batt', 'lhp0', 'bq20', 'ithi',
       'lipp', 'lgi0', '4cel', 'ram', 'lipo', '\x04lio', 'lit'],
      dtype=object)

In [41]:
full_features['Census_ThresholdOptIn'].unique()

array([nan, '0', '1'], dtype=object)

In [42]:
full_features['Census_IsWIMBootEnabled'].unique()

array([nan, '0'], dtype=object)

In [43]:
full_features['SMode'].unique()

array(['0', nan, '1'], dtype=object)

In [44]:
full_features['OrganizationIdentifier'].unique()

array(['18', '27', nan, '37', '48', '46', '11', '1', '39', '14', '49',
       '5', '50', '32', '19', '52', '36', '33', '10', '40', '2', '28',
       '51', '3', '29', '6', '31', '4', '20', '47', '44', '8', '7', '41',
       '22', '30', '16', '42', '21', '26', '43'], dtype=object)

In [45]:
full_features['Wdft_IsGamer'].unique()

array(['0', '1', nan], dtype=object)

In [46]:
full_features['Wdft_RegionIdentifier'].unique()

array(['1', '10', '11', '7', '15', '3', '2', '13', '5', '12', '4', nan,
       '9', '8', '6', '14'], dtype=object)

In [47]:
full_features['CityIdentifier'].unique()

array(['144987', '165673', '167', ..., '122317', '137964', '22473'],
      dtype=object)

In [48]:
full_features['Census_InternalBatteryNumberOfCharges'].unique()

array(['0', '14', '4294967295', ..., '47596', '9674', '2664'],
      dtype=object)

In [49]:
# Cleaning up some data

# PuaMode - Potentially Unwanted Applications, if NA, then it is disabled. 99% are NA. So, better to drop it
# Census_ProcessorClass - According to the description - "No longer maintained and updated"
# DefaultBrowsersIdentifier - Almost all values are empty. Therefore we will drop this column
# Census_IsFlightingInternal - whether this is internal or "external" testing ring. Column mostly unused. Will have to drop it
# Census_InternalBatteryType - comtains mostly garbage. Besides, it should not be relevant to attack surface.
# Census_ThresholdOptIn - also mostly unused. Googled it and Threshold was used in first versions of Windows 10. Looks like unused now
# Census_IsWIMBootEnabled - Is it possible to boot from Windows Image? Not relevant to identification of the attacks when 70% of data is emtpy
# SmartScreen - Whether smart screen in explorer is enabled. Should be important. "ExistsNotSet" when null, according to the description
# SMode - Quite relevant field. Will be keeping it
# OrganizationIdentifier - Attacks by organizations should be analyzed. If not filled, will assign "0". 
# Census_InternalBatteryNumberOfCharges - Not relevant. Will drop this column in order not to overtrain
# Census_OSSkuName -  OS edition friendly name (currently Windows only). - Can be removed. Duplicate field
# Census_ChassisTypeName - Census_MDC2FormFactor gives better information. Let's remove this field

full_features['PuaMode'] = full_features['PuaMode'].fillna('off')
full_features['SmartScreen'] = full_features['SmartScreen'].fillna('ExistsNotSet')
full_features['SMode'] = full_features['SMode'].fillna('0').astype('int8')
full_features['OrganizationIdentifier'] = full_features['OrganizationIdentifier'].fillna('0').astype('int32')
full_features['Wdft_IsGamer'] = full_features['Wdft_IsGamer'].fillna('0').astype('int8')
full_features['Wdft_RegionIdentifier'] = full_features['Wdft_RegionIdentifier'].fillna('0').astype('int32')
full_features['CityIdentifier'] = full_features['CityIdentifier'].fillna('0').astype('int32')

full_features = full_features.drop([
    'PuaMode',
    'Census_OSEdition',
    'Census_ProcessorClass',
    'DefaultBrowsersIdentifier',
    'Census_IsFlightingInternal',
    'Census_InternalBatteryType'], axis=1)

In [50]:
# Now let us check the string columns

string_columns = []

for colname in full_features.dtypes.keys():
    if full_features[colname].dtypes.name == "object":
        string_columns.append(colname)
        
string_columns

['ProductName',
 'EngineVersion',
 'AppVersion',
 'AvSigVersion',
 'RtpStateBitfield',
 'AVProductStatesIdentifier',
 'AVProductsInstalled',
 'AVProductsEnabled',
 'CountryIdentifier',
 'GeoNameIdentifier',
 'LocaleEnglishNameIdentifier',
 'Platform',
 'Processor',
 'OsVer',
 'OsBuild',
 'OsSuite',
 'OsPlatformSubRelease',
 'OsBuildLab',
 'SkuEdition',
 'IsProtected',
 'IeVerIdentifier',
 'SmartScreen',
 'Firewall',
 'UacLuaenable',
 'Census_MDC2FormFactor',
 'Census_DeviceFamily',
 'Census_OEMNameIdentifier',
 'Census_OEMModelIdentifier',
 'Census_ProcessorCoreCount',
 'Census_ProcessorManufacturerIdentifier',
 'Census_ProcessorModelIdentifier',
 'Census_PrimaryDiskTypeName',
 'Census_ChassisTypeName',
 'Census_InternalPrimaryDiagonalDisplaySizeInInches',
 'Census_InternalPrimaryDisplayResolutionHorizontal',
 'Census_InternalPrimaryDisplayResolutionVertical',
 'Census_PowerPlatformRoleName',
 'Census_InternalBatteryNumberOfCharges',
 'Census_OSVersion',
 'Census_OSArchitecture',
 'Cen

In [51]:
full_features[string_columns].head(10)

Unnamed: 0_level_0,ProductName,EngineVersion,AppVersion,AvSigVersion,RtpStateBitfield,AVProductStatesIdentifier,AVProductsInstalled,AVProductsEnabled,CountryIdentifier,GeoNameIdentifier,LocaleEnglishNameIdentifier,Platform,Processor,OsVer,OsBuild,OsSuite,OsPlatformSubRelease,OsBuildLab,SkuEdition,IsProtected,IeVerIdentifier,SmartScreen,Firewall,UacLuaenable,Census_MDC2FormFactor,Census_DeviceFamily,Census_OEMNameIdentifier,Census_OEMModelIdentifier,Census_ProcessorCoreCount,Census_ProcessorManufacturerIdentifier,Census_ProcessorModelIdentifier,Census_PrimaryDiskTypeName,Census_ChassisTypeName,Census_InternalPrimaryDiagonalDisplaySizeInInches,Census_InternalPrimaryDisplayResolutionHorizontal,Census_InternalPrimaryDisplayResolutionVertical,Census_PowerPlatformRoleName,Census_InternalBatteryNumberOfCharges,Census_OSVersion,Census_OSArchitecture,Census_OSBranch,Census_OSBuildNumber,Census_OSBuildRevision,Census_OSSkuName,Census_OSInstallTypeName,Census_OSInstallLanguageIdentifier,Census_OSUILocaleIdentifier,Census_OSWUAutoUpdateOptionsName,Census_GenuineStateName,Census_ActivationChannel,Census_IsFlightsDisabled,Census_FlightRing,Census_ThresholdOptIn,Census_FirmwareManufacturerIdentifier,Census_FirmwareVersionIdentifier,Census_IsWIMBootEnabled,Census_IsVirtualDevice,Census_IsAlwaysOnAlwaysConnectedCapable
MachineIdentifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1
00a3689ca72f3143d5581bde26274795,win8defender,1.1.15200.1,4.18.1807.18075,1.275.720.0,7,53447,1,1,214,277,75,windows10,x64,10.0.0.0,17134,256,rs4,17134.1.amd64fre.rs4_release.180410-1804,Pro,1,,ExistsNotSet,1,1,Notebook,Windows.Desktop,1443,256463,4,5,2697,HDD,Laptop,14.0,1366,768,Mobile,0,10.0.17134.228,amd64,rs4_release,17134,228,PROFESSIONAL,Reset,8,31,FullAuto,IS_GENUINE,Volume:GVLK,0,Retail,,355,2123,,0,0
04ea4e6d5476dcb1842b0476c3c3ccad,win8defender,1.1.12902.0,4.9.10586.1106,1.225.2482.0,7,53447,1,1,141,167,227,windows10,x64,10.0.0.0,10586,768,th2,10586.1176.amd64fre.th2_release_sec.170913-1848,Home,1,74.0,ExistsNotSet,1,1,Notebook,Windows.Desktop,525,331196,2,5,1992,HDD,Notebook,15.5,1366,768,Mobile,14,10.0.10586.1176,amd64,th2_release_sec,10586,1176,CORE_SINGLELANGUAGE,Update,9,34,Notify,IS_GENUINE,OEM:DM,0,Retail,0.0,142,69812,0.0,0,0
04e3249c245991c6c7c7e08c435a4a05,win8defender,1.1.15100.1,4.18.1807.18075,1.273.1826.0,7,23657,2,1,100,277,75,windows10,x64,10.0.0.0,17134,768,rs4,17134.1.amd64fre.rs4_release.180410-1804,Home,1,137.0,ExistsNotSet,1,1,Notebook,Windows.Desktop,1443,256682,4,5,3026,HDD,Laptop,15.5,1366,768,Mobile,0,10.0.17134.165,amd64,rs4_release,17134,165,CORE,UUPUpgrade,8,31,FullAuto,IS_GENUINE,OEM:DM,0,Retail,,355,4348,,0,0
049b0b1c03547d39a816143e7873b40e,win8defender,1.1.15100.1,4.18.1807.18075,1.273.1420.0,5,53447,1,1,43,53,42,windows10,x64,10.0.0.0,15063,768,rs2,15063.0.amd64fre.rs2_release.170317-1834,Home,1,108.0,RequireAdmin,1,1,Desktop,Windows.Desktop,2102,111446,4,5,3396,HDD,Desktop,23.0,1920,1080,Desktop,4294967295,10.0.15063.1206,amd64,rs2_release,15063,1206,CORE_COUNTRYSPECIFIC,Upgrade,37,158,Notify,IS_GENUINE,OEM:DM,0,Retail,0.0,93,19805,0.0,0,0
00a2a84731342e964ef33cc57d6a2c5f,win8defender,1.1.15100.1,4.18.1806.18062,1.273.511.0,7,32113,2,1,44,57,218,windows10,x64,10.0.0.0,17134,768,rs4,17134.1.amd64fre.rs4_release.180410-1804,Home,1,137.0,ExistsNotSet,1,1,AllInOne,Windows.Desktop,585,189292,4,5,2393,HDD,AllinOne,21.5,1920,1080,Desktop,4294967295,10.0.17134.165,amd64,rs4_release,17134,165,CORE_SINGLELANGUAGE,Refresh,9,34,FullAuto,IS_GENUINE,OEM:DM,0,Retail,,142,56688,,0,0
01562540a99c95e7ebea9dd7c25c2273,win8defender,1.1.15100.1,4.18.1807.18075,1.273.1589.0,7,53447,1,1,188,102,75,windows10,x64,10.0.0.0,17134,768,rs4,17134.1.amd64fre.rs4_release.180410-1804,Home,1,137.0,ExistsNotSet,1,1,Notebook,Windows.Desktop,2102,241812,2,1,313,HDD,Notebook,13.9,1366,768,Mobile,268,10.0.17134.165,amd64,rs4_release,17134,165,CORE_SINGLELANGUAGE,UUPUpgrade,8,31,FullAuto,IS_GENUINE,OEM:DM,0,Retail,,142,33108,,0,0
01c9f1c5db88dc9ec5ac9b3e97b4c3d0,win8defender,1.1.15200.1,4.18.1807.18075,1.275.1332.0,7,53447,1,1,41,52,75,windows10,x64,10.0.0.0,17134,768,rs4,17134.1.amd64fre.rs4_release.180410-1804,Home,1,137.0,ExistsNotSet,1,1,Convertible,Windows.Desktop,2206,244535,4,5,3392,HDD,Notebook,11.6,1366,768,Mobile,0,10.0.17134.285,amd64,rs4_release,17134,285,CORE,IBSClean,8,31,FullAuto,IS_GENUINE,Retail,0,Retail,,554,33041,,0,0
04f2266f5a4ef87e167768a3bbbdf8e4,win8defender,1.1.15100.1,4.11.15063.1155,1.273.841.0,7,7945,2,1,164,205,172,windows10,x86,10.0.0.0,15063,768,rs2,15063.0.x86fre.rs2_release.170317-1834,Home,1,108.0,RequireAdmin,1,1,PCOther,Windows.Desktop,2427,190554,4,5,1848,SSD,Notebook,13.9,1366,768,Slate,17,10.0.15063.1155,x86,rs2_release,15063,1155,CORE,Update,27,120,Notify,IS_GENUINE,OEM:DM,0,Retail,,513,23642,,0,1
01ed71be4779796e2883ce5f92640d38,win8defender,1.1.15000.2,4.18.1806.18062,1.271.105.0,7,53447,1,1,201,267,251,windows10,x86,10.0.0.0,17134,256,rs4,17134.1.x86fre.rs4_release.180410-1804,Pro,1,137.0,RequireAdmin,1,1,Notebook,Windows.Desktop,4142,298069,2,1,640,Unspecified,Laptop,14.0,1366,768,Mobile,0,10.0.17134.112,x86,rs4_release,17134,112,PROFESSIONAL,UUPUpgrade,35,148,Notify,IS_GENUINE,Retail,0,Retail,0.0,142,3031,0.0,0,0
02c3bac7094c4b29260bc8287550373b,win8defender,1.1.14600.4,4.13.17134.1,1.263.48.0,7,53447,1,1,171,211,182,windows10,x64,10.0.0.0,17134,768,rs4,17134.1.amd64fre.rs4_release.180410-1804,Home,1,137.0,ExistsNotSet,1,1,Notebook,Windows.Desktop,585,189448,4,5,2574,HDD,Notebook,15.5,1024,768,Mobile,0,10.0.17134.1,amd64,rs4_release,17134,1,CORE_SINGLELANGUAGE,IBSClean,29,125,UNKNOWN,OFFLINE,Retail,0,NOT_SET,,556,63086,,0,0


At first glance at the data, it becomes obvious, that the stings are either classifiers, or versions that contain 4 classifiers in them. So. in order to use the algorithms that support only numeric values we will convert classifiers like "ProductName" to integer range and the fields like AppVersion

In [52]:
def df_replacevalues(df, colname, oldvalues, newvalues):
    # First, we need to get the most frequent value of the column
    topvalue = df[colname].value_counts().idxmax()
    
    # Replace NaN values with the popular value
    df[colname].fillna(topvalue, inplace=True)
    
    # We need to make sure no other value than oldvalues exists
    indexes = df[~df[colname].isin(oldvalues)].index
    
    # If the "Garbage" values are more than 1%, then raise an error
    if len(indexes) > len(df) / 100:
        raise Exception("Not all neccessary values are present in oldvalues array")
    
    # Replace "Garbage" with the top value
    df.loc[indexes,[colname]] = topvalue
    
    print ("Previous values", df[colname].unique())
    df[colname] = pd.to_numeric(df[colname].replace(oldvalues, newvalues), errors='raise', downcast='integer')
    print ("New values", df[colname].unique())
    
#full_features["Platform"].unique()
#full_features["Platform"].value_counts()
#full_features[~full_features["ProductName"].isin(['win8defender', 'mse'])].index

In [53]:
colname = "ProductName"
oldvalues = ['win8defender','mse','mseprerelease','windowsintune','fep','scep']
newvalues = [i+1 for i in range(len(oldvalues))]

df_replacevalues(full_features, colname, oldvalues, newvalues)

Previous values ['win8defender' 'mse']
New values [1 2]


In [54]:
colname = "Platform"
oldvalues = ['windows10','windows7','windows8','windows2016']
newvalues = [10,7,8,2016]

df_replacevalues(full_features, colname, oldvalues, newvalues)

Previous values ['windows10' 'windows8' 'windows7' 'windows2016']
New values [  10    8    7 2016]


In [55]:
colname = "Processor"
oldvalues = ['x64','arm64','x86']
newvalues = [i+1 for i in range(len(oldvalues))]

df_replacevalues(full_features, colname, oldvalues, newvalues)

Previous values ['x64' 'x86' 'arm64']
New values [1 3 2]


In [56]:
colname = "OsPlatformSubRelease"
oldvalues = ['rs4','rs1','rs3','windows7','windows8.1','th1','rs2','th2','prers5']
newvalues = [504,501,503,507,508,201,502,202,405]

df_replacevalues(full_features, colname, oldvalues, newvalues)

Previous values ['rs4' 'th2' 'rs2' 'rs3' 'windows8.1' 'windows7' 'prers5' 'th1' 'rs1']
New values [504 202 502 503 508 507 405 201 501]


In [57]:
colname = "SkuEdition"
oldvalues = ['Pro','Home','Invalid','Enterprise LTSB','Enterprise','Education','Cloud','Server']
newvalues = [55,52,0,71,70,20,90,80]

df_replacevalues(full_features, colname, oldvalues, newvalues)

Previous values ['Pro' 'Home' 'Invalid' 'Education' 'Enterprise LTSB' 'Enterprise'
 'Server' 'Cloud']
New values [55 52  0 20 71 70 80 90]


In [58]:
colname = "SmartScreen"
oldvalues = ['Off','off','OFF','On','on','Warn','Prompt','ExistsNotSet','Block','RequireAdmin']
newvalues = [0,0,0,1,1,2,3,4,5,6]

df_replacevalues(full_features, colname, oldvalues, newvalues)

Previous values ['ExistsNotSet' 'RequireAdmin' 'Prompt' 'Warn' 'Block' 'Off' 'on' 'On'
 'off']
New values [4 6 3 2 5 0 1]


In [59]:
colname = "Census_MDC2FormFactor"
oldvalues = ['Desktop','Notebook','Detachable','PCOther','AllInOne','Convertible','SmallTablet','LargeTablet','SmallServer','LargeServer','MediumServer','ServerOther','IoTOther']
newvalues = [i+1 for i in range(len(oldvalues))]

df_replacevalues(full_features, colname, oldvalues, newvalues)

Previous values ['Notebook' 'Desktop' 'AllInOne' 'Convertible' 'PCOther' 'Detachable'
 'MediumServer' 'SmallTablet' 'LargeTablet' 'SmallServer' 'LargeServer'
 'ServerOther']
New values [ 2  1  5  6  4  3 11  7  8  9 10 12]


In [60]:
# Census_DeviceFamily ['Windows.Desktop' 'Windows.Server' 'Windows']

colname = "Census_DeviceFamily"
oldvalues = ['Windows.Desktop','Windows.Server','Windows']
newvalues = [i+1 for i in range(len(oldvalues))]

df_replacevalues(full_features, colname, oldvalues, newvalues)

Previous values ['Windows.Desktop' 'Windows.Server']
New values [1 2]


In [61]:
# Census_PrimaryDiskTypeName ['HDD' 'SSD' 'UNKNOWN' 'Unspecified' nan]

colname = "Census_PrimaryDiskTypeName"
oldvalues = ['HDD','SSD','UNKNOWN','Unspecified']
newvalues = [i+1 for i in range(len(oldvalues))]

df_replacevalues(full_features, colname, oldvalues, newvalues)

Previous values ['HDD' 'SSD' 'Unspecified' 'UNKNOWN']
New values [1 2 4 3]


In [62]:
# Census_ChassisTypeName Index(['Notebook', 'Desktop', 'Laptop', 'Portable', 'AllinOne', 'MiniTower', 'Convertible', 'Other', 'UNKNOWN', 'Detachable', 'LowProfileDesktop', 'HandHeld', 'SpaceSaving', 'Tablet', 'Tower', 'Unknown', 'MainServerChassis', 'MiniPC', 'LunchBox', 'RackMountChassis', 'SubNotebook', 'BusExpansionChassis', '30', 'StickPC', '0', 'MultisystemChassis', 'Blade', '35', 'PizzaBox', 'SealedCasePC', 'SubChassis', 'ExpansionChassis', '31', '32', '88', '127', '25', '44', '36', 'DockingStation', 'BladeEnclosure', 'CompactPCI', '81', '45', 'EmbeddedPC', '28', '82', '112', 'IoTGateway', '49', '76', '39'], dtype='object')

colname = "Census_ChassisTypeName"
oldvalues = ['Notebook', 'Desktop', 'Laptop', 'Portable', 'AllinOne', 'MiniTower', 'Convertible', 'Other', 'UNKNOWN', 'Detachable', 
             'LowProfileDesktop', 'HandHeld', 'SpaceSaving', 'Tablet', 'Tower', 'Unknown', 'MainServerChassis', 'MiniPC', 'LunchBox', 
             'RackMountChassis', 'SubNotebook', 'BusExpansionChassis']
newvalues = [i+1 for i in range(len(oldvalues))]

df_replacevalues(full_features, colname, oldvalues, newvalues)

Previous values ['Laptop' 'Notebook' 'Desktop' 'AllinOne' 'Portable' 'Convertible' 'Tower'
 'MainServerChassis' 'Other' 'SpaceSaving' 'LowProfileDesktop' 'MiniTower'
 'UNKNOWN' 'Detachable' 'HandHeld' 'Tablet' 'Unknown' 'LunchBox' 'MiniPC'
 'RackMountChassis' 'BusExpansionChassis' 'SubNotebook']
New values [ 3  1  2  5  4  7 15 17  8 13 11  6  9 10 12 14 16 19 18 20 22 21]


In [63]:
# Census_PowerPlatformRoleName Index(['Mobile', 'Desktop', 'Slate', 'Workstation', 'SOHOServer', 'UNKNOWN', 'EnterpriseServer', 'AppliancePC', 'PerformanceServer', 'Unspecified']

colname = "Census_PowerPlatformRoleName"
full_features[colname] = full_features[colname].fillna('UNKNOWN')
oldvalues = ['Mobile', 'Desktop', 'Slate', 'Workstation', 'SOHOServer', 'UNKNOWN', 'EnterpriseServer', 'AppliancePC', 'PerformanceServer', 'Unspecified']
newvalues = [i+1 for i in range(len(oldvalues))]

df_replacevalues(full_features, colname, oldvalues, newvalues)

Previous values ['Mobile' 'Desktop' 'Slate' 'Workstation' 'SOHOServer' 'AppliancePC'
 'UNKNOWN' 'EnterpriseServer' 'PerformanceServer']
New values [1 2 3 4 5 8 6 7 9]


In [64]:
# Census_OSArchitecture Index(['amd64', 'x86', 'arm64'], dtype='object')

colname = "Census_OSArchitecture"
oldvalues = ['amd64', 'x86', 'arm64']
newvalues = [1,3,2]

df_replacevalues(full_features, colname, oldvalues, newvalues)

Previous values ['amd64' 'x86' 'arm64']
New values [1 3 2]


In [65]:
# Census_OSBranch Index(['rs4_release', 'rs3_release', 'rs3_release_svc_escrow', 'rs2_release', 'rs1_release', 'th2_release', 'th2_release_sec', 'th1_st1', 'th1', 'rs5_release', 'rs3_release_svc_escrow_im', 'rs_prerelease', 'rs_prerelease_flt', 'rs5_release_sigma', 'rs1_release_srvmedia', 'winblue_ltsb_escrow', 'win7sp1_ldr', 'winblue_ltsb', 'win8_gdr', 'rs_xbox', 'rs5_release_edge', 'rs5_release_sigma_dev', 'win7sp1_ldr_escrow', 'rs1_release_sec', 'rs_shell', 'rs1_release_svc', 'win8_ldr', 'rs_onecore_base_cobalt', 'rs_onecore_stack_per1', 'rs5_release_sign', 'rs3_release_svc', 'Khmer OS'], dtype='object')

colname = "Census_OSBranch"
oldvalues = ['rs4_release', 'rs3_release', 'rs3_release_svc_escrow', 'rs2_release', 'rs1_release', 'th2_release', 'th2_release_sec', 'th1_st1', 'th1', 'rs5_release', 'rs3_release_svc_escrow_im', 'rs_prerelease', 'rs_prerelease_flt', 'rs5_release_sigma']
newvalues = [i+1 for i in range(len(oldvalues))]

df_replacevalues(full_features, colname, oldvalues, newvalues)

Previous values ['rs4_release' 'th2_release_sec' 'rs2_release' 'rs3_release' 'th2_release'
 'rs3_release_svc_escrow' 'rs5_release' 'th1_st1' 'rs1_release' 'th1'
 'rs3_release_svc_escrow_im' 'rs5_release_sigma' 'rs_prerelease'
 'rs_prerelease_flt']
New values [ 1  7  4  2  6  3 10  8  5  9 11 14 12 13]


In [66]:
# Census_OSSkuName Index(['CORE', 'PROFESSIONAL', 'CORE_SINGLELANGUAGE', 'CORE_COUNTRYSPECIFIC', 'EDUCATION', 'ENTERPRISE', 'PROFESSIONAL_N', 'ENTERPRISE_S', 'STANDARD_SERVER', 'CLOUD', 'CORE_N', 'STANDARD_EVALUATION_SERVER', 'EDUCATION_N', 'ENTERPRISE_S_N', 'DATACENTER_EVALUATION_SERVER', 'SB_SOLUTION_SERVER', 'ENTERPRISE_N', 'PRO_WORKSTATION', 'UNLICENSED', 'DATACENTER_SERVER', 'PRO_WORKSTATION_N', 'CLOUDN', 'PRO_CHINA', 'SERVERRDSH', 'ULTIMATE', 'PRO_FOR_EDUCATION', 'PRO_SINGLE_LANGUAGE', 'UNDEFINED', 'STARTER', 'ENTERPRISEG'], dtype='object')

colname = "Census_OSSkuName"
oldvalues = ['CORE', 'CORE_SINGLELANGUAGE', 'CORE_COUNTRYSPECIFIC', 'CORE_N',
             'EDUCATION', 'EDUCATION_N',
             'PROFESSIONAL', 'PROFESSIONAL_N', 'PRO_WORKSTATION',
             'ENTERPRISE',  'ENTERPRISE_S', 'ENTERPRISE_S_N', 'ENTERPRISE_N', 
             'CLOUD',
             'SB_SOLUTION_SERVER', 'STANDARD_SERVER', 'STANDARD_EVALUATION_SERVER', 'DATACENTER_EVALUATION_SERVER', 'UNLICENSED']
newvalues = [i+1 for i in range(len(oldvalues))]

# Group this feature by values

full_features['CORE'] = 1 if 'CORE' in full_features['Census_OSSkuName'] else 0
full_features['EDUCATION'] = 1 if 'EDUCATION' in full_features['Census_OSSkuName'] else 0
full_features['PRO'] = 1 if 'PRO' in full_features['Census_OSSkuName'] else 0
full_features['ENTERPRISE'] = 1 if 'ENTERPRISE' in full_features['Census_OSSkuName'] else 0
full_features['CLOUD'] = 1 if 'CLOUD' in full_features['Census_OSSkuName'] else 0
full_features['SERVER'] = 1 if 'SERVER' in full_features['Census_OSSkuName'] else 0
full_features['EVALUATION'] = 1 if 'EVALUATION' in full_features['Census_OSSkuName'] else 0

full_features.drop([colname], axis=1, inplace=True)


In [67]:
# Census_OSInstallTypeName Index(['UUPUpgrade', 'IBSClean', 'Update', 'Upgrade', 'Other', 'Reset', 'Refresh', 'Clean', 'CleanPCRefresh'], dtype='object')

colname = "Census_OSInstallTypeName"
oldvalues = ['UUPUpgrade', 'IBSClean', 'Update', 'Upgrade', 'Other', 'Reset', 'Refresh', 'Clean', 'CleanPCRefresh']
newvalues = [i+1 for i in range(len(oldvalues))]

df_replacevalues(full_features, colname, oldvalues, newvalues)

Previous values ['Reset' 'Update' 'UUPUpgrade' 'Upgrade' 'Refresh' 'IBSClean'
 'CleanPCRefresh' 'Other' 'Clean']
New values [6 3 1 4 7 2 9 5 8]


In [68]:
# Census_OSWUAutoUpdateOptionsName Index(['FullAuto', 'UNKNOWN', 'Notify', 'AutoInstallAndRebootAtMaintenanceTime', 'Off', 'DownloadNotify'], dtype='object')

colname = "Census_OSWUAutoUpdateOptionsName"
oldvalues = ['FullAuto', 'UNKNOWN', 'Notify', 'AutoInstallAndRebootAtMaintenanceTime', 'Off', 'DownloadNotify']
newvalues = [i+1 for i in range(len(oldvalues))]

df_replacevalues(full_features, colname, oldvalues, newvalues)

Previous values ['FullAuto' 'Notify' 'UNKNOWN' 'DownloadNotify'
 'AutoInstallAndRebootAtMaintenanceTime' 'Off']
New values [1 3 2 6 4 5]


In [69]:
# Census_GenuineStateName Index(['IS_GENUINE', 'INVALID_LICENSE', 'OFFLINE', 'UNKNOWN', 'TAMPERED'], dtype='object')

colname = "Census_GenuineStateName"
oldvalues = ['IS_GENUINE', 'INVALID_LICENSE', 'OFFLINE', 'UNKNOWN', 'TAMPERED']
newvalues = [i+1 for i in range(len(oldvalues))]

df_replacevalues(full_features, colname, oldvalues, newvalues)

Previous values ['IS_GENUINE' 'OFFLINE' 'INVALID_LICENSE' 'UNKNOWN']
New values [1 3 2 4]


In [70]:
# Census_ActivationChannel Index(['Retail', 'OEM:DM', 'Volume:GVLK', 'OEM:NONSLP', 'Volume:MAK', 'Retail:TB:Eval'], dtype='object')

colname = "Census_ActivationChannel"
oldvalues = ['Retail', 'OEM:DM', 'Volume:GVLK', 'OEM:NONSLP', 'Volume:MAK', 'Retail:TB:Eval']
newvalues = [i+1 for i in range(len(oldvalues))]

df_replacevalues(full_features, colname, oldvalues, newvalues)

Previous values ['Volume:GVLK' 'OEM:DM' 'Retail' 'OEM:NONSLP' 'Volume:MAK'
 'Retail:TB:Eval']
New values [3 2 1 4 5 6]


In [71]:
# Census_FlightRing Index(['Retail', 'NOT_SET', 'Unknown', 'WIS', 'WIF', 'RP', 'Disabled', 'OSG', 'Canary', 'Invalid', 'CBCanary'], dtype='object')

colname = "Census_FlightRing"
oldvalues = ['Retail', 'NOT_SET', 'Unknown', 'WIS', 'WIF', 'RP', 'Disabled', 'OSG', 'Canary', 'Invalid', 'CBCanary']
newvalues = [i+1 for i in range(len(oldvalues))]

df_replacevalues(full_features, colname, oldvalues, newvalues)

Previous values ['Retail' 'NOT_SET' 'Unknown' 'WIF' 'Disabled' 'RP' 'WIS']
New values [1 2 3 5 7 6 4]


In [72]:
# PuaMode Index(['off', 'on', 'audit'], dtype='object')

#colname = "PuaMode"
#oldvalues = ['off', 'on', 'audit']
#newvalues = [0,1,2]

#df_replacevalues(full_features, colname, oldvalues, newvalues)

In [73]:
# Now let us check the string columns again

string_columns = []

for colname in full_features.dtypes.keys():
    if full_features[colname].dtypes.name == "object":
        string_columns.append(colname)
        
string_columns

['EngineVersion',
 'AppVersion',
 'AvSigVersion',
 'RtpStateBitfield',
 'AVProductStatesIdentifier',
 'AVProductsInstalled',
 'AVProductsEnabled',
 'CountryIdentifier',
 'GeoNameIdentifier',
 'LocaleEnglishNameIdentifier',
 'OsVer',
 'OsBuild',
 'OsSuite',
 'OsBuildLab',
 'IsProtected',
 'IeVerIdentifier',
 'Firewall',
 'UacLuaenable',
 'Census_OEMNameIdentifier',
 'Census_OEMModelIdentifier',
 'Census_ProcessorCoreCount',
 'Census_ProcessorManufacturerIdentifier',
 'Census_ProcessorModelIdentifier',
 'Census_InternalPrimaryDiagonalDisplaySizeInInches',
 'Census_InternalPrimaryDisplayResolutionHorizontal',
 'Census_InternalPrimaryDisplayResolutionVertical',
 'Census_InternalBatteryNumberOfCharges',
 'Census_OSVersion',
 'Census_OSBuildNumber',
 'Census_OSBuildRevision',
 'Census_OSInstallLanguageIdentifier',
 'Census_OSUILocaleIdentifier',
 'Census_IsFlightsDisabled',
 'Census_ThresholdOptIn',
 'Census_FirmwareManufacturerIdentifier',
 'Census_FirmwareVersionIdentifier',
 'Census_IsWIM

In [74]:
full_features[string_columns].head(10)

Unnamed: 0_level_0,EngineVersion,AppVersion,AvSigVersion,RtpStateBitfield,AVProductStatesIdentifier,AVProductsInstalled,AVProductsEnabled,CountryIdentifier,GeoNameIdentifier,LocaleEnglishNameIdentifier,OsVer,OsBuild,OsSuite,OsBuildLab,IsProtected,IeVerIdentifier,Firewall,UacLuaenable,Census_OEMNameIdentifier,Census_OEMModelIdentifier,Census_ProcessorCoreCount,Census_ProcessorManufacturerIdentifier,Census_ProcessorModelIdentifier,Census_InternalPrimaryDiagonalDisplaySizeInInches,Census_InternalPrimaryDisplayResolutionHorizontal,Census_InternalPrimaryDisplayResolutionVertical,Census_InternalBatteryNumberOfCharges,Census_OSVersion,Census_OSBuildNumber,Census_OSBuildRevision,Census_OSInstallLanguageIdentifier,Census_OSUILocaleIdentifier,Census_IsFlightsDisabled,Census_ThresholdOptIn,Census_FirmwareManufacturerIdentifier,Census_FirmwareVersionIdentifier,Census_IsWIMBootEnabled,Census_IsVirtualDevice,Census_IsAlwaysOnAlwaysConnectedCapable
MachineIdentifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1
00a3689ca72f3143d5581bde26274795,1.1.15200.1,4.18.1807.18075,1.275.720.0,7,53447,1,1,214,277,75,10.0.0.0,17134,256,17134.1.amd64fre.rs4_release.180410-1804,1,,1,1,1443,256463,4,5,2697,14.0,1366,768,0,10.0.17134.228,17134,228,8,31,0,,355,2123,,0,0
04ea4e6d5476dcb1842b0476c3c3ccad,1.1.12902.0,4.9.10586.1106,1.225.2482.0,7,53447,1,1,141,167,227,10.0.0.0,10586,768,10586.1176.amd64fre.th2_release_sec.170913-1848,1,74.0,1,1,525,331196,2,5,1992,15.5,1366,768,14,10.0.10586.1176,10586,1176,9,34,0,0.0,142,69812,0.0,0,0
04e3249c245991c6c7c7e08c435a4a05,1.1.15100.1,4.18.1807.18075,1.273.1826.0,7,23657,2,1,100,277,75,10.0.0.0,17134,768,17134.1.amd64fre.rs4_release.180410-1804,1,137.0,1,1,1443,256682,4,5,3026,15.5,1366,768,0,10.0.17134.165,17134,165,8,31,0,,355,4348,,0,0
049b0b1c03547d39a816143e7873b40e,1.1.15100.1,4.18.1807.18075,1.273.1420.0,5,53447,1,1,43,53,42,10.0.0.0,15063,768,15063.0.amd64fre.rs2_release.170317-1834,1,108.0,1,1,2102,111446,4,5,3396,23.0,1920,1080,4294967295,10.0.15063.1206,15063,1206,37,158,0,0.0,93,19805,0.0,0,0
00a2a84731342e964ef33cc57d6a2c5f,1.1.15100.1,4.18.1806.18062,1.273.511.0,7,32113,2,1,44,57,218,10.0.0.0,17134,768,17134.1.amd64fre.rs4_release.180410-1804,1,137.0,1,1,585,189292,4,5,2393,21.5,1920,1080,4294967295,10.0.17134.165,17134,165,9,34,0,,142,56688,,0,0
01562540a99c95e7ebea9dd7c25c2273,1.1.15100.1,4.18.1807.18075,1.273.1589.0,7,53447,1,1,188,102,75,10.0.0.0,17134,768,17134.1.amd64fre.rs4_release.180410-1804,1,137.0,1,1,2102,241812,2,1,313,13.9,1366,768,268,10.0.17134.165,17134,165,8,31,0,,142,33108,,0,0
01c9f1c5db88dc9ec5ac9b3e97b4c3d0,1.1.15200.1,4.18.1807.18075,1.275.1332.0,7,53447,1,1,41,52,75,10.0.0.0,17134,768,17134.1.amd64fre.rs4_release.180410-1804,1,137.0,1,1,2206,244535,4,5,3392,11.6,1366,768,0,10.0.17134.285,17134,285,8,31,0,,554,33041,,0,0
04f2266f5a4ef87e167768a3bbbdf8e4,1.1.15100.1,4.11.15063.1155,1.273.841.0,7,7945,2,1,164,205,172,10.0.0.0,15063,768,15063.0.x86fre.rs2_release.170317-1834,1,108.0,1,1,2427,190554,4,5,1848,13.9,1366,768,17,10.0.15063.1155,15063,1155,27,120,0,,513,23642,,0,1
01ed71be4779796e2883ce5f92640d38,1.1.15000.2,4.18.1806.18062,1.271.105.0,7,53447,1,1,201,267,251,10.0.0.0,17134,256,17134.1.x86fre.rs4_release.180410-1804,1,137.0,1,1,4142,298069,2,1,640,14.0,1366,768,0,10.0.17134.112,17134,112,35,148,0,0.0,142,3031,0.0,0,0
02c3bac7094c4b29260bc8287550373b,1.1.14600.4,4.13.17134.1,1.263.48.0,7,53447,1,1,171,211,182,10.0.0.0,17134,768,17134.1.amd64fre.rs4_release.180410-1804,1,137.0,1,1,585,189448,4,5,2574,15.5,1024,768,0,10.0.17134.1,17134,1,29,125,0,,556,63086,,0,0


In [75]:
# Now we need to process the columns that contain version numbers
# We will split them in 4-5 different columns

versions = ['EngineVersion','AppVersion','AvSigVersion','OsVer','OsBuildLab','Census_OSVersion']
newcolumnnames = []

for colname in versions:
    data = full_features[colname].str.split(r"\.|-",expand=True) # Split if '.' or '-'
    for i in range(data.shape[1]):
        newcolumnname = "%s_%d" % (colname, i+1)
        newcolumnnames.append(newcolumnname)
        full_features[newcolumnname] = data[i]

In [76]:
full_features[newcolumnnames].head(10)

Unnamed: 0_level_0,EngineVersion_1,EngineVersion_2,EngineVersion_3,EngineVersion_4,AppVersion_1,AppVersion_2,AppVersion_3,AppVersion_4,AvSigVersion_1,AvSigVersion_2,AvSigVersion_3,AvSigVersion_4,OsVer_1,OsVer_2,OsVer_3,OsVer_4,OsBuildLab_1,OsBuildLab_2,OsBuildLab_3,OsBuildLab_4,OsBuildLab_5,OsBuildLab_6,Census_OSVersion_1,Census_OSVersion_2,Census_OSVersion_3,Census_OSVersion_4
MachineIdentifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
00a3689ca72f3143d5581bde26274795,1,1,15200,1,4,18,1807,18075,1,275,720,0,10,0,0,0,17134,1,amd64fre,rs4_release,180410,1804,10,0,17134,228
04ea4e6d5476dcb1842b0476c3c3ccad,1,1,12902,0,4,9,10586,1106,1,225,2482,0,10,0,0,0,10586,1176,amd64fre,th2_release_sec,170913,1848,10,0,10586,1176
04e3249c245991c6c7c7e08c435a4a05,1,1,15100,1,4,18,1807,18075,1,273,1826,0,10,0,0,0,17134,1,amd64fre,rs4_release,180410,1804,10,0,17134,165
049b0b1c03547d39a816143e7873b40e,1,1,15100,1,4,18,1807,18075,1,273,1420,0,10,0,0,0,15063,0,amd64fre,rs2_release,170317,1834,10,0,15063,1206
00a2a84731342e964ef33cc57d6a2c5f,1,1,15100,1,4,18,1806,18062,1,273,511,0,10,0,0,0,17134,1,amd64fre,rs4_release,180410,1804,10,0,17134,165
01562540a99c95e7ebea9dd7c25c2273,1,1,15100,1,4,18,1807,18075,1,273,1589,0,10,0,0,0,17134,1,amd64fre,rs4_release,180410,1804,10,0,17134,165
01c9f1c5db88dc9ec5ac9b3e97b4c3d0,1,1,15200,1,4,18,1807,18075,1,275,1332,0,10,0,0,0,17134,1,amd64fre,rs4_release,180410,1804,10,0,17134,285
04f2266f5a4ef87e167768a3bbbdf8e4,1,1,15100,1,4,11,15063,1155,1,273,841,0,10,0,0,0,15063,0,x86fre,rs2_release,170317,1834,10,0,15063,1155
01ed71be4779796e2883ce5f92640d38,1,1,15000,2,4,18,1806,18062,1,271,105,0,10,0,0,0,17134,1,x86fre,rs4_release,180410,1804,10,0,17134,112
02c3bac7094c4b29260bc8287550373b,1,1,14600,4,4,13,17134,1,1,263,48,0,10,0,0,0,17134,1,amd64fre,rs4_release,180410,1804,10,0,17134,1


In [77]:
#colname = "OsBuildLab_4"
#print (full_features[colname].value_counts())
#print (colname, full_features[colname].value_counts().keys())

In [78]:
# After splitting the columns, the only values we need to remap are OsBuildLab_3 and OsBuildLab_4
# Other values are already numeric

# OsBuildLab_3 Index(['amd64fre', 'x86fre', 'arm64fre'], dtype='object')

colname = "OsBuildLab_3"
oldvalues = ['amd64fre', 'x86fre', 'arm64fre']
newvalues = [1,3,2]

df_replacevalues(full_features, colname, oldvalues, newvalues)

Previous values ['amd64fre' 'x86fre' 'arm64fre']
New values [1 3 2]


In [79]:
# OsBuildLab_4 Index(['rs4_release', 'rs3_release_svc_escrow', 'rs3_release', 'rs2_release', 'rs1_release', 'th2_release_sec', 'th1', 'winblue_ltsb_escrow', 'th2_release', 'rs1_release_inmarket', 'winblue_ltsb', 'win7sp1_ldr', 'rs3_release_svc', 'rs1_release_1', 'win7sp1_ldr_escrow', 'rs1_release_sec', 'th1_st1', 'rs5_release', 'rs1_release_inmarket_aim', 'rs3_release_svc_escrow_im', 'th2_release_inmarket', 'rs_prerelease', 'rs_prerelease_flt', 'win7sp1_gdr', 'winblue_gdr', 'th1_escrow', 'win7_gdr', 'winblue_r4', 'rs1_release_inmarket_rim', 'rs1_release_d', 'winblue_r9', 'winblue_r5', 'win7_rtm', 'win7sp1_rtm', 'winblue_r7', 'winblue_r3', 'winblue_r8', 'rs5_release_sigma', 'win7_ldr', 'rs5_release_sigma_dev', 'rs_xbox', 'rs5_release_edge', 'winblue_rtm', 'win7sp1_rc', 'rs3_release_svc_sec', 'rs_onecore_base_cobalt', 'rs6_prerelease', 'rs_onecore_sigma_grfx_dev', 'rs_onecore_stack_per1', 'rs5_release_sign', 'rs_shell']

colname = "OsBuildLab_4"
oldvalues = ['rs4_release', 'rs3_release_svc_escrow', 'rs3_release', 'rs2_release', 'rs1_release', 'th2_release_sec', 'th1', 'winblue_ltsb_escrow', 'th2_release', 'rs1_release_inmarket', 'winblue_ltsb', 'win7sp1_ldr', 'rs3_release_svc', 'rs1_release_1', 'win7sp1_ldr_escrow', 'rs1_release_sec', 'th1_st1', 'rs5_release', 'rs1_release_inmarket_aim', 'rs3_release_svc_escrow_im', 'th2_release_inmarket', 'rs_prerelease', 'rs_prerelease_flt', 'win7sp1_gdr', 'winblue_gdr', 'th1_escrow', 'win7_gdr', 'winblue_r4', 'rs1_release_inmarket_rim', 'rs1_release_d', 'winblue_r9', 'winblue_r5', 'win7_rtm', 'win7sp1_rtm', 'winblue_r7', 'winblue_r3', 'winblue_r8', 'rs5_release_sigma', 'win7_ldr', 'rs5_release_sigma_dev', 'rs_xbox', 'rs5_release_edge', 'winblue_rtm', 'win7sp1_rc', 'rs3_release_svc_sec', 'rs_onecore_base_cobalt', 'rs6_prerelease', 'rs_onecore_sigma_grfx_dev', 'rs_onecore_stack_per1', 'rs5_release_sign', 'rs_shell']
newvalues = [i+1 for i in range(len(oldvalues))]

df_replacevalues(full_features, colname, oldvalues, newvalues)

Previous values ['rs4_release' 'th2_release_sec' 'rs2_release' 'rs3_release'
 'winblue_ltsb_escrow' 'win7sp1_ldr' 'rs3_release_svc_escrow'
 'th2_release' 'rs5_release' 'th1' 'win7sp1_ldr_escrow' 'rs3_release_svc'
 'winblue_ltsb' 'rs1_release' 'th1_st1' 'rs1_release_inmarket'
 'rs1_release_1' 'rs1_release_sec' 'rs3_release_svc_escrow_im'
 'win7sp1_gdr' 'rs5_release_sigma' 'th1_escrow' 'rs1_release_inmarket_aim'
 'rs_prerelease' 'th2_release_inmarket' 'winblue_gdr'
 'rs1_release_inmarket_rim' 'rs_prerelease_flt' 'winblue_r4' 'win7sp1_rtm'
 'rs5_release_edge' 'rs1_release_d' 'win7_gdr' 'winblue_r7' 'winblue_r5'
 'winblue_r9' 'winblue_r8' 'winblue_r3' 'win7_ldr' 'win7_rtm']
New values [ 1  6  4  3  8 12  2  9 18  7 15 13 11  5 17 10 14 16 20 24 38 26 19 22
 21 25 29 23 28 34 42 30 27 35 32 31 37 36 39 33]


In [80]:
versions = ['EngineVersion','AppVersion','AvSigVersion','OsVer','OsBuildLab','Census_OSVersion']

full_features = full_features.drop(versions, axis=1)

In [81]:
for colname in full_features.columns:
    if full_features[colname].dtypes.name not in ["int8","int16","int32"]:
        full_features[colname] = pd.to_numeric(full_features[colname])
        topvalue = full_features[colname].value_counts().idxmax()
        full_features[colname].fillna(topvalue, inplace=True)

In [82]:
full_features.head(10)

Unnamed: 0_level_0,ProductName,IsBeta,RtpStateBitfield,IsSxsPassiveMode,AVProductStatesIdentifier,AVProductsInstalled,AVProductsEnabled,HasTpm,CountryIdentifier,CityIdentifier,OrganizationIdentifier,GeoNameIdentifier,LocaleEnglishNameIdentifier,Platform,Processor,OsBuild,OsSuite,OsPlatformSubRelease,SkuEdition,IsProtected,AutoSampleOptIn,SMode,IeVerIdentifier,SmartScreen,Firewall,UacLuaenable,Census_MDC2FormFactor,Census_DeviceFamily,Census_OEMNameIdentifier,Census_OEMModelIdentifier,Census_ProcessorCoreCount,Census_ProcessorManufacturerIdentifier,Census_ProcessorModelIdentifier,Census_PrimaryDiskTotalCapacity,Census_PrimaryDiskTypeName,Census_SystemVolumeTotalCapacity,Census_HasOpticalDiskDrive,Census_TotalPhysicalRAM,Census_ChassisTypeName,Census_InternalPrimaryDiagonalDisplaySizeInInches,Census_InternalPrimaryDisplayResolutionHorizontal,Census_InternalPrimaryDisplayResolutionVertical,Census_PowerPlatformRoleName,Census_InternalBatteryNumberOfCharges,Census_OSArchitecture,Census_OSBranch,Census_OSBuildNumber,Census_OSBuildRevision,Census_OSInstallTypeName,Census_OSInstallLanguageIdentifier,Census_OSUILocaleIdentifier,Census_OSWUAutoUpdateOptionsName,Census_IsPortableOperatingSystem,Census_GenuineStateName,Census_ActivationChannel,Census_IsFlightsDisabled,Census_FlightRing,Census_ThresholdOptIn,Census_FirmwareManufacturerIdentifier,Census_FirmwareVersionIdentifier,Census_IsSecureBootEnabled,Census_IsWIMBootEnabled,Census_IsVirtualDevice,Census_IsTouchEnabled,Census_IsPenCapable,Census_IsAlwaysOnAlwaysConnectedCapable,Wdft_IsGamer,Wdft_RegionIdentifier,CORE,EDUCATION,PRO,ENTERPRISE,CLOUD,SERVER,EVALUATION,EngineVersion_1,EngineVersion_2,EngineVersion_3,EngineVersion_4,AppVersion_1,AppVersion_2,AppVersion_3,AppVersion_4,AvSigVersion_1,AvSigVersion_2,AvSigVersion_3,AvSigVersion_4,OsVer_1,OsVer_2,OsVer_3,OsVer_4,OsBuildLab_1,OsBuildLab_2,OsBuildLab_3,OsBuildLab_4,OsBuildLab_5,OsBuildLab_6,Census_OSVersion_1,Census_OSVersion_2,Census_OSVersion_3,Census_OSVersion_4
MachineIdentifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1
00a3689ca72f3143d5581bde26274795,1,0,7.0,0,53447.0,1.0,1.0,1,214,144987,18,277.0,75,10,1,17134,256,504,55,1.0,0,0,137.0,4,1.0,1.0,2,1,1443.0,256463.0,4.0,5.0,2697.0,953869.0,1,162467.0,0,4096.0,3,14.0,1366.0,768.0,1,0.0,1,1,17134,228,6,8.0,31,1,0,1,3,0.0,1,0.0,355.0,2123.0,1,0.0,0.0,0,0,0.0,0,1,0,0,0,0,0,0,0,1,1,15200,1,4,18,1807,18075,1,275,720,0,10,0,0,0,17134.0,1.0,1,1,180410.0,1804.0,10,0,17134,228
04ea4e6d5476dcb1842b0476c3c3ccad,1,0,7.0,0,53447.0,1.0,1.0,1,141,165673,27,167.0,227,10,1,10586,768,202,52,1.0,0,0,74.0,4,1.0,1.0,2,1,525.0,331196.0,2.0,5.0,1992.0,476940.0,1,190001.0,0,2048.0,1,15.5,1366.0,768.0,1,14.0,1,7,10586,1176,3,9.0,34,3,0,1,2,0.0,1,0.0,142.0,69812.0,1,0.0,0.0,0,0,0.0,0,10,0,0,0,0,0,0,0,1,1,12902,0,4,9,10586,1106,1,225,2482,0,10,0,0,0,10586.0,1176.0,1,6,170913.0,1848.0,10,0,10586,1176
04e3249c245991c6c7c7e08c435a4a05,1,0,7.0,0,23657.0,2.0,1.0,1,100,167,0,277.0,75,10,1,17134,768,504,52,1.0,0,0,137.0,4,1.0,1.0,2,1,1443.0,256682.0,4.0,5.0,3026.0,953869.0,1,476287.0,0,8192.0,3,15.5,1366.0,768.0,1,0.0,1,1,17134,165,1,8.0,31,1,0,1,2,0.0,1,0.0,355.0,4348.0,1,0.0,0.0,0,0,0.0,0,11,0,0,0,0,0,0,0,1,1,15100,1,4,18,1807,18075,1,273,1826,0,10,0,0,0,17134.0,1.0,1,1,180410.0,1804.0,10,0,17134,165
049b0b1c03547d39a816143e7873b40e,1,0,5.0,0,53447.0,1.0,1.0,1,43,78906,18,53.0,42,10,1,15063,768,502,52,1.0,0,0,108.0,6,1.0,1.0,1,1,2102.0,111446.0,4.0,5.0,3396.0,476940.0,1,83310.0,0,4096.0,2,23.0,1920.0,1080.0,2,4294967000.0,1,4,15063,1206,4,37.0,158,3,0,1,2,0.0,1,0.0,93.0,19805.0,1,0.0,0.0,0,0,0.0,1,7,0,0,0,0,0,0,0,1,1,15100,1,4,18,1807,18075,1,273,1420,0,10,0,0,0,15063.0,0.0,1,4,170317.0,1834.0,10,0,15063,1206
00a2a84731342e964ef33cc57d6a2c5f,1,0,7.0,0,32113.0,2.0,1.0,1,44,34686,27,57.0,218,10,1,17134,768,504,52,1.0,0,0,137.0,4,1.0,1.0,5,1,585.0,189292.0,4.0,5.0,2393.0,953869.0,1,952728.0,0,4096.0,5,21.5,1920.0,1080.0,2,4294967000.0,1,1,17134,165,7,9.0,34,1,0,1,2,0.0,1,0.0,142.0,56688.0,1,0.0,0.0,0,0,0.0,0,10,0,0,0,0,0,0,0,1,1,15100,1,4,18,1806,18062,1,273,511,0,10,0,0,0,17134.0,1.0,1,1,180410.0,1804.0,10,0,17134,165
01562540a99c95e7ebea9dd7c25c2273,1,0,7.0,0,53447.0,1.0,1.0,1,188,129332,27,102.0,75,10,1,17134,768,504,52,1.0,0,0,137.0,4,1.0,1.0,2,1,2102.0,241812.0,2.0,1.0,313.0,953869.0,1,937586.0,0,8192.0,1,13.9,1366.0,768.0,1,268.0,1,1,17134,165,1,8.0,31,1,0,1,2,0.0,1,0.0,142.0,33108.0,1,0.0,0.0,0,0,0.0,1,10,0,0,0,0,0,0,0,1,1,15100,1,4,18,1807,18075,1,273,1589,0,10,0,0,0,17134.0,1.0,1,1,180410.0,1804.0,10,0,17134,165
01c9f1c5db88dc9ec5ac9b3e97b4c3d0,1,0,7.0,0,53447.0,1.0,1.0,1,41,130775,18,52.0,75,10,1,17134,768,504,52,1.0,0,0,137.0,4,1.0,1.0,6,1,2206.0,244535.0,4.0,5.0,3392.0,476940.0,1,454138.0,0,4096.0,1,11.6,1366.0,768.0,1,0.0,1,1,17134,285,2,8.0,31,1,0,1,1,0.0,1,0.0,554.0,33041.0,1,0.0,0.0,0,0,0.0,0,10,0,0,0,0,0,0,0,1,1,15200,1,4,18,1807,18075,1,275,1332,0,10,0,0,0,17134.0,1.0,1,1,180410.0,1804.0,10,0,17134,285
04f2266f5a4ef87e167768a3bbbdf8e4,1,0,7.0,0,7945.0,2.0,1.0,1,164,131029,27,205.0,172,10,3,15063,768,502,52,1.0,0,0,108.0,6,1.0,1.0,4,1,2427.0,190554.0,4.0,5.0,1848.0,29820.0,2,29110.0,0,2048.0,1,13.9,1366.0,768.0,3,17.0,3,4,15063,1155,3,27.0,120,3,0,1,2,0.0,1,0.0,513.0,23642.0,0,0.0,0.0,0,0,1.0,0,15,0,0,0,0,0,0,0,1,1,15100,1,4,11,15063,1155,1,273,841,0,10,0,0,0,15063.0,0.0,3,4,170317.0,1834.0,10,0,15063,1155
01ed71be4779796e2883ce5f92640d38,1,0,7.0,0,53447.0,1.0,1.0,1,201,66202,0,267.0,251,10,3,17134,256,504,55,1.0,0,0,137.0,6,1.0,1.0,2,1,4142.0,298069.0,2.0,1.0,640.0,305245.0,4,99450.0,0,2048.0,3,14.0,1366.0,768.0,1,0.0,3,1,17134,112,1,35.0,148,3,0,1,1,0.0,1,0.0,142.0,3031.0,0,0.0,0.0,0,0,0.0,0,11,0,0,0,0,0,0,0,1,1,15000,2,4,18,1806,18062,1,271,105,0,10,0,0,0,17134.0,1.0,3,1,180410.0,1804.0,10,0,17134,112
02c3bac7094c4b29260bc8287550373b,1,0,7.0,0,53447.0,1.0,1.0,1,171,43405,0,211.0,182,10,1,17134,768,504,52,1.0,0,0,137.0,4,1.0,1.0,2,1,585.0,189448.0,4.0,5.0,2574.0,953869.0,1,299450.0,0,6144.0,1,15.5,1024.0,768.0,1,0.0,1,1,17134,1,2,29.0,125,2,0,3,1,0.0,2,0.0,556.0,63086.0,0,0.0,0.0,0,0,0.0,0,3,0,0,0,0,0,0,0,1,1,14600,4,4,13,17134,1,1,263,48,0,10,0,0,0,17134.0,1.0,1,1,180410.0,1804.0,10,0,17134,1


In [83]:
# Let's see some details of the loaded data
full_features.describe()

Unnamed: 0,ProductName,IsBeta,RtpStateBitfield,IsSxsPassiveMode,AVProductStatesIdentifier,AVProductsInstalled,AVProductsEnabled,HasTpm,CountryIdentifier,CityIdentifier,OrganizationIdentifier,GeoNameIdentifier,LocaleEnglishNameIdentifier,Platform,Processor,OsBuild,OsSuite,OsPlatformSubRelease,SkuEdition,IsProtected,AutoSampleOptIn,SMode,IeVerIdentifier,SmartScreen,Firewall,UacLuaenable,Census_MDC2FormFactor,Census_DeviceFamily,Census_OEMNameIdentifier,Census_OEMModelIdentifier,Census_ProcessorCoreCount,Census_ProcessorManufacturerIdentifier,Census_ProcessorModelIdentifier,Census_PrimaryDiskTotalCapacity,Census_PrimaryDiskTypeName,Census_SystemVolumeTotalCapacity,Census_HasOpticalDiskDrive,Census_TotalPhysicalRAM,Census_ChassisTypeName,Census_InternalPrimaryDiagonalDisplaySizeInInches,Census_InternalPrimaryDisplayResolutionHorizontal,Census_InternalPrimaryDisplayResolutionVertical,Census_PowerPlatformRoleName,Census_InternalBatteryNumberOfCharges,Census_OSArchitecture,Census_OSBranch,Census_OSBuildNumber,Census_OSBuildRevision,Census_OSInstallTypeName,Census_OSInstallLanguageIdentifier,Census_OSUILocaleIdentifier,Census_OSWUAutoUpdateOptionsName,Census_IsPortableOperatingSystem,Census_GenuineStateName,Census_ActivationChannel,Census_IsFlightsDisabled,Census_FlightRing,Census_ThresholdOptIn,Census_FirmwareManufacturerIdentifier,Census_FirmwareVersionIdentifier,Census_IsSecureBootEnabled,Census_IsWIMBootEnabled,Census_IsVirtualDevice,Census_IsTouchEnabled,Census_IsPenCapable,Census_IsAlwaysOnAlwaysConnectedCapable,Wdft_IsGamer,Wdft_RegionIdentifier,CORE,EDUCATION,PRO,ENTERPRISE,CLOUD,SERVER,EVALUATION,EngineVersion_1,EngineVersion_2,EngineVersion_3,EngineVersion_4,AppVersion_1,AppVersion_2,AppVersion_3,AppVersion_4,AvSigVersion_1,AvSigVersion_2,AvSigVersion_3,AvSigVersion_4,OsVer_1,OsVer_2,OsVer_3,OsVer_4,OsBuildLab_1,OsBuildLab_2,OsBuildLab_3,OsBuildLab_4,OsBuildLab_5,OsBuildLab_6,Census_OSVersion_1,Census_OSVersion_2,Census_OSVersion_3,Census_OSVersion_4
count,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0
mean,1.010805,0.0,6.848755,0.01709,47889.756405,1.32361,1.021055,0.98768,107.989305,78355.72557,17.21851,169.91366,123.033595,13.114225,1.183005,15719.0964,575.22736,480.16143,52.61569,0.94549,3.5e-05,0.000325,126.62632,4.853005,0.97803,0.99634,2.198125,1.00159,2223.75131,239811.393815,3.98942,4.532185,2372.340935,512680.5,1.45158,380622.2,0.076475,6098.545055,2.107755,16.678684,1547.25791,897.162795,1.41431,1091523000.0,1.182875,2.643305,15833.98193,969.11591,2.94717,14.5761,60.51303,1.885515,0.000685,1.14548,1.59681,1e-05,1.103505,8e-05,397.019835,33017.131025,0.485065,0.0,0.00701,0.125465,0.03785,0.056785,0.27307,7.61229,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,15075.00075,1.292845,4.0,15.878295,5615.28596,14115.46011,0.999985,272.361735,934.677605,0.0,9.87074,0.075575,0.010895,0.00058,15719.078125,1416.25576,1.182995,3.08056,176493.941285,1776.871665,10.0,0.0,15833.98193,969.11376
std,0.103384,0.0,1.014813,0.129607,13979.860055,0.520632,0.165957,0.11031,62.960237,50373.154228,12.389333,89.285825,69.445434,79.929553,0.57659,2189.429061,248.071764,80.108778,5.968989,0.227022,0.005916,0.018025,42.514752,1.268718,0.146586,0.324356,1.316211,0.039843,1310.47575,72127.920727,2.081557,1.28657,841.160471,359628.4,0.718935,330498.4,0.265757,5062.057175,2.284896,5.924042,367.805509,214.071618,0.751299,1869933000.0,0.576406,2.022969,1959.736244,2916.946097,1.818663,10.17595,44.994523,0.936217,0.026164,0.431273,0.758209,0.003162,0.452805,0.008944,222.576164,21029.631339,0.499778,0.0,0.083432,0.331246,0.190834,0.231432,0.445538,4.698093,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,278.07536,1.017218,0.0,3.318989,6239.168507,7363.971541,0.003873,6.122284,533.229164,0.0,0.707343,0.446816,0.126081,0.188679,2189.427127,4604.132043,0.576576,3.130222,6037.889617,210.566228,0.0,0.0,1959.736244,2916.946652
min,1.0,0.0,0.0,0.0,39.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,7.0,1.0,7600.0,16.0,201.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,74.0,23.0,1.0,1.0,10.0,10680.0,1.0,9385.0,0.0,512.0,1.0,4.9,-1.0,-1.0,1.0,0.0,1.0,1.0,10240.0,0.0,1.0,1.0,5.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,9.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,10401.0,0.0,4.0,4.0,204.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,7600.0,0.0,1.0,1.0,90713.0,100.0,10.0,0.0,10240.0,0.0
25%,1.0,0.0,7.0,0.0,49480.0,1.0,1.0,1.0,51.0,31368.0,0.0,89.0,74.0,10.0,1.0,15063.0,256.0,503.0,52.0,1.0,0.0,0.0,111.0,4.0,1.0,1.0,2.0,1.0,1443.0,189788.0,2.0,5.0,1998.0,239372.0,1.0,120826.0,0.0,4096.0,1.0,13.9,1366.0,768.0,1.0,0.0,1.0,1.0,15063.0,165.0,1.0,8.0,31.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,142.0,13299.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,15100.0,1.0,4.0,13.0,1807.0,17443.0,1.0,273.0,498.0,0.0,10.0,0.0,0.0,0.0,15063.0,1.0,1.0,1.0,170928.0,1804.0,10.0,0.0,15063.0,165.0
50%,1.0,0.0,7.0,0.0,53447.0,1.0,1.0,1.0,97.0,77866.0,18.0,181.0,88.0,10.0,1.0,16299.0,768.0,503.0,52.0,1.0,0.0,0.0,135.0,4.0,1.0,1.0,2.0,1.0,2102.0,248045.0,4.0,5.0,2503.0,476940.0,1.0,252791.0,0.0,4096.0,1.0,15.5,1366.0,768.0,1.0,0.0,1.0,2.0,16299.0,285.0,3.0,9.0,34.0,2.0,0.0,1.0,1.0,0.0,1.0,0.0,486.0,33075.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,15100.0,1.0,4.0,18.0,1807.0,18075.0,1.0,273.0,941.0,0.0,10.0,0.0,0.0,0.0,16299.0,1.0,1.0,2.0,180410.0,1804.0,10.0,0.0,16299.0,285.0
75%,1.0,0.0,7.0,0.0,53447.0,2.0,1.0,1.0,160.0,121351.25,27.0,267.0,182.0,10.0,1.0,17134.0,768.0,504.0,55.0,1.0,0.0,0.0,137.0,6.0,1.0,1.0,2.0,1.0,2668.0,308562.0,4.0,5.0,2874.0,953869.0,2.0,476135.2,0.0,8192.0,2.0,17.2,1920.0,1080.0,2.0,4294967000.0,1.0,4.0,17134.0,547.0,4.0,20.0,92.0,3.0,0.0,1.0,2.0,0.0,1.0,0.0,556.0,52246.25,1.0,0.0,0.0,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,15200.0,1.0,4.0,18.0,10586.0,18075.0,1.0,275.0,1379.0,0.0,10.0,0.0,0.0,0.0,17134.0,431.0,1.0,4.0,180410.0,1834.0,10.0,0.0,17134.0,547.0
max,2.0,0.0,8.0,1.0,70486.0,5.0,4.0,1.0,222.0,167962.0,52.0,296.0,283.0,2016.0,3.0,18242.0,784.0,508.0,90.0,1.0,1.0,1.0,429.0,6.0,1.0,48.0,12.0,2.0,6144.0,345494.0,88.0,10.0,4471.0,22892660.0,4.0,22892040.0,1.0,524288.0,22.0,142.0,5760.0,3840.0,9.0,4294967000.0,3.0,14.0,18242.0,17976.0,9.0,39.0,162.0,6.0,1.0,4.0,6.0,1.0,7.0,1.0,1087.0,72091.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,15300.0,6.0,4.0,18.0,17686.0,20063.0,1.0,277.0,4320.0,0.0,10.0,3.0,32.0,72.0,18242.0,24231.0,3.0,42.0,180914.0,2340.0,10.0,0.0,18242.0,17976.0


In [84]:
full_features['UacLuaenable'].unique()

array([ 1.,  0., 48.,  3.,  2.])

In [88]:
full_features.to_csv('./csv/train_v10.csv')

In [90]:
train_count = int(len(full_features) * 0.8)

train_features = full_features.values[:train_count]
test_features  = full_features.values[train_count:]

train_labels = full_labels.values[:train_count]
test_labels = full_labels.values[train_count:]

scaler = StandardScaler()
scaler.fit(train_features)
normalized_train_features = scaler.transform(train_features)
normalized_test_features = scaler.transform(test_features)

clf = ske.HistGradientBoostingClassifier(random_state=123)
clf.fit(normalized_train_features, train_labels)
all_columns_score = clf.score(normalized_test_features, test_labels)
    
print ("All columns (normalized)", train_features.shape, test_features.shape, train_labels.shape, test_labels.shape, "HistGradientBoostingClassifier", all_columns_score*100)


All columns (normalized) (160000, 101) (40000, 101) (160000,) (40000,) HistGradientBoostingClassifier 64.1
