In [None]:
%matplotlib inline

import os
import pickle
import re

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.metrics import roc_curve, auc

from scipy.sparse import hstack 
import statsmodels.api as sm
import statsmodels.formula.api as smf
from tqdm import tqdm, tqdm_pandas

tqdm.pandas()

def nanp(df, show_zero=False):
    """Summarise the share of missing values in every column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    show_zero : bool, default False
        When False, columns without any missing value are left out.

    Returns
    -------
    pandas.DataFrame
        One row per reported column with ``Variable`` (column name),
        ``DataType`` (column dtype) and ``PercentNA`` (missing share in
        percent, rounded to one decimal), sorted by ``PercentNA`` descending.
    """
    n_rows = len(df)
    records = []

    for col in df.columns:
        # Series.isnull().sum() counts in vectorised code; the builtin sum()
        # walks the column one Python object at a time.
        n_missing = df[col].isnull().sum()
        # An empty frame has nothing missing; guard the 0/0 division.
        percent = np.round(n_missing / n_rows * 100, 1) if n_rows else 0.0
        records.append([col, df[col].dtype, percent])

    summary = pd.DataFrame(records, columns=['Variable', 'DataType', 'PercentNA'])

    if not show_zero:
        summary = summary.loc[summary['PercentNA'] > 0]
    return summary.sort_values(by='PercentNA', ascending=False)
    
def isfloat(x):
    """Return True when ``float(x)`` succeeds, False otherwise.

    Only the conversion errors raised by ``float`` are treated as "not a
    float"; anything else (for example KeyboardInterrupt) propagates instead
    of being silently swallowed.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        return False
    return True
    
def isint(x):
    """Return True when ``int(x)`` succeeds, False otherwise.

    Note that this tests convertibility, not integrality: ``isint(3.7)`` is
    True because ``int(3.7)`` truncates, while ``isint('3.7')`` is False.
    Only the conversion errors raised by ``int`` are caught; other exceptions
    (for example KeyboardInterrupt) propagate.
    """
    try:
        int(x)
    except (TypeError, ValueError, OverflowError):
        return False
    return True
    
def dfcols(frame=None):
    """Print the column names of a DataFrame in alphabetical order, one per line.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Frame whose columns are listed. Defaults to the notebook-level ``df``,
        so the original zero-argument call keeps working.
    """
    if frame is None:
        frame = df
    # A plain loop: nothing is collected, the call exists only for its output.
    for c in sorted(frame.columns):
        print(c)
    
def printcats(c, frame=None):
    """Normalise one categorical column in place and print its distinct values.

    Non-null entries are converted to lower-case strings; the placeholders
    'unknown' and 'unspecified' are replaced by NaN. Afterwards the column
    name, the number of distinct values (NaN counts as one) and the values
    themselves are printed; with 20 or more distinct values only the first
    five are shown.

    Parameters
    ----------
    c : str
        Name of the column to clean.
    frame : pandas.DataFrame, optional
        Frame to modify. Defaults to the notebook-level ``df``, so the
        original one-argument call keeps working.
    """
    if frame is None:
        frame = df

    frame[c] = frame[c].apply(lambda x: str(x).lower() if not pd.isnull(x) else np.nan)

    # Placeholder labels carry no information; treat them as missing.
    placeholder = frame[c].isin(['unknown', 'unspecified'])
    frame.loc[placeholder, c] = np.nan

    un = frame[c].unique()
    # Report the number of distinct values (previously the length of the
    # column *name* was printed by mistake).
    if len(un) < 20:
        print(c, len(un), ':', un)
    else:
        print(c, len(un), ':', ', '.join([str(x) for x in un[:5]]) + ', ...')

def cateval(df, c):
    """Tabulate the detection rate for each level of categorical column ``c``.

    Prints the fraction of missing values in ``df[c]`` and returns a table
    indexed by the levels of ``c`` (sorted by level) holding the row-normalised
    distribution of ``HasDetections``, the number of rows per level
    (``total_count``) and each level's share of all counted rows
    (``normalized``).
    """
    missing_share = df[c].isnull().mean()
    print('percent na: ', missing_share)

    table = pd.crosstab(df[c], df.HasDetections, normalize='index')
    table = table.sort_values(c)

    level_counts = df[c].value_counts()
    table['total_count'] = level_counts
    table['normalized'] = table['total_count'] / table['total_count'].sum()
    return table

In [None]:
# Load the pre-sampled training extract written by the commented-out line below.
df = pd.read_csv('traing_sample.csv')

# One-off sampling step (int(10e5) = 1,000,000 rows) that produced the file above:
# pd.read_csv('train.csv').sample(int(10e5)).to_csv('traing_sample.csv', index=None)

In [None]:
# Columns treated as boolean flags.
binary_cols = [
    'IsBeta',
    'IsSxsPassiveMode',
    'HasTpm',
    'IsProtected',
    'AutoSampleOptIn',
    'SMode',
    'Firewall',
    'UacLuaenable',
    'Census_HasOpticalDiskDrive',
    'Census_IsPortableOperatingSystem',
    'Census_IsFlightingInternal',
    'Census_IsFlightsDisabled',
    'Census_IsSecureBootEnabled',
    'Census_IsWIMBootEnabled',
    'Census_IsVirtualDevice',
    'Census_IsTouchEnabled',
    'Census_IsPenCapable',
    'Census_IsAlwaysOnAlwaysConnectedCapable',
    'Wdft_IsGamer'
]

for c in tqdm(binary_cols):
    # astype(bool) on its own maps NaN to True, which silently erases every
    # missing value before the imputation steps further down can see it.
    # Cast first, then blank out the entries that were missing; the result is
    # an object column holding True / False / NaN.
    observed = df[c].notnull()
    df[c] = df[c].astype(bool).where(observed)

In [None]:
# Columns treated as unordered categories (stored with object dtype).
categorical_cols = [
    'ProductName',
    'AVProductStatesIdentifier',
    'CountryIdentifier',
    'CityIdentifier',
    'OrganizationIdentifier',
    'GeoNameIdentifier',
    'LocaleEnglishNameIdentifier',
    'Platform',
    'Processor',
    'OsSuite',
    'OsBuildLab',
    'SkuEdition',
    'IeVerIdentifier',
    'SmartScreen',
    'Census_MDC2FormFactor',
    'Census_DeviceFamily',
    'Census_OEMNameIdentifier',
    'Census_OEMModelIdentifier',
    'Census_ProcessorManufacturerIdentifier',
    'Census_ProcessorModelIdentifier',
    'Census_PrimaryDiskTypeName',
    'Census_ChassisTypeName',
    'Census_PowerPlatformRoleName',
    'Census_InternalBatteryType',
    'Census_OSArchitecture',
    'Census_OSBranch',
    'Census_OSEdition',
    'Census_OSSkuName',
    'Census_OSInstallTypeName',
    'Census_OSInstallLanguageIdentifier',
    'Census_OSUILocaleIdentifier',
    'Census_OSWUAutoUpdateOptionsName',
    'Census_GenuineStateName',
    'Census_ActivationChannel',
    'Census_FlightRing',
    'Census_FirmwareManufacturerIdentifier',
    'Census_FirmwareVersionIdentifier',
]

for c in tqdm(categorical_cols):
    # A listed column that is absent from the frame is reported and skipped.
    if c not in df.columns:
        print(c)
        continue

    try:
        df[c] = df[c].astype('object')
    except (TypeError, ValueError) as err:
        # Best effort: report the column and the reason, then carry on.
        # Catching only conversion errors keeps KeyboardInterrupt and real
        # programming errors visible.
        print(c, err)

In [None]:
# Columns treated as numeric features; later cells append derived columns.
numeric_cols = [
    'Census_ThresholdOptIn',
    'Census_InternalBatteryNumberOfCharges',
    'Census_TotalPhysicalRAM',
    'Census_OSBuildNumber',
    'Census_PrimaryDiskTotalCapacity',
    'Census_SystemVolumeTotalCapacity',
    'Census_ProcessorCoreCount',
    'Census_InternalPrimaryDiagonalDisplaySizeInInches',
    'Census_InternalPrimaryDisplayResolutionHorizontal',
    'Census_InternalPrimaryDisplayResolutionVertical',
    'AVProductsEnabled',
    'AVProductsInstalled',
    'RtpStateBitfield',
]

# Coerce each listed column to a numeric dtype; pd.to_numeric raises on
# values it cannot parse.
for c in tqdm(numeric_cols):
    df[c] = df[c].pipe(pd.to_numeric)

In [None]:
# Columns removed from the frame before any feature engineering.
drop = [
    'PuaMode',
    'Census_ProcessorClass',
    'DefaultBrowsersIdentifier',
    'Wdft_RegionIdentifier',
]
# errors='ignore' skips labels that are not present, so the cell can be re-run.
df = df.drop(columns=drop, errors='ignore')

### Extract major and minor versions from hierarchical version strings

In [None]:
# Split every version string once; fields 2 and 3 are used as the
# major / minor engine version.
engine_fields = [version.split('.') for version in df.EngineVersion]
engine_major = [fields[2] for fields in engine_fields]
engine_minor = [fields[3] for fields in engine_fields]

df['MajorEngineVersion'] = [int(field) for field in engine_major]
df['MinorEngineVersion'] = [int(field) for field in engine_minor]

# Combined "major.minor" float. The minor field is zero-padded to a common
# width first: without padding, minor versions such as '1' and '10' collapse
# to the same float (x.1 == x.10) and '2' would sort above '10'.
engine_minor_width = max((len(field) for field in engine_minor), default=0)
df['EngineVersion'] = [
    float(f'{major}.{minor.zfill(engine_minor_width)}')
    for major, minor in zip(engine_major, engine_minor)
]

numeric_cols.append('MajorEngineVersion')
numeric_cols.append('MinorEngineVersion')

In [None]:
# The second dot-separated field of AppVersion is used as the major version.
df['MajorAppVersion'] = [int(version.split('.')[1]) for version in df.AppVersion]

numeric_cols += ['MajorAppVersion']

In [None]:
# Third dot-separated field of AppVersion, still as text.
minor_fields = [version.split('.')[2] for version in df.AppVersion]

# Pad every field to the longest width and prefix a '1' so the leading zeros
# survive the conversion to int and the numeric order follows the text order.
mlen = np.max([len(field) for field in minor_fields])
df['MinorAppVersion'] = [int('1' + field.zfill(mlen)) for field in minor_fields]

numeric_cols += ['MinorAppVersion']

In [None]:
# Fourth dot-separated field of AppVersion, still as text.
finest_fields = [version.split('.')[3] for version in df.AppVersion]

# Same encoding as MinorAppVersion: pad to the longest width, prefix a '1'
# so the padding zeros are kept, then convert to int.
mlen = np.max([len(field) for field in finest_fields])
df['FinestAppVersion'] = [int('1' + field.zfill(mlen)) for field in finest_fields]

numeric_cols += ['FinestAppVersion']

In [None]:
# Collapse the three derived parts into one float "major.<minor><finest>".
# zip() pairs the columns directly. The previous itertuples() call yielded
# the row index as element 0, so the float was built from index, major and
# minor, and the finest part was never used.
# Note: minor and finest still carry the leading '1' added when they were
# zero-padded, so the digits after the decimal point include those markers.
df['AppVersion'] = [
    float(f'{major}.{minor}{finest}')
    for major, minor, finest in zip(
        df['MajorAppVersion'], df['MinorAppVersion'], df['FinestAppVersion']
    )
]

In [None]:
# Split every signature version once; fields 1 and 2 are used as the
# major / minor signature version.
avsig_fields = [version.split('.') for version in df.AvSigVersion]
avsig_major = [fields[1] for fields in avsig_fields]
avsig_minor = [fields[2] for fields in avsig_fields]

df['MajorAvSigVersion'] = [int(field) for field in avsig_major]
df['MinorAvSigVersion'] = [int(field) for field in avsig_minor]

# Combined "major.minor" float. Zero-pad the minor field to a common width
# so that, for example, minor '99' does not sort above minor '1735' and
# minors '1' / '10' do not collapse to the same value.
avsig_minor_width = max((len(field) for field in avsig_minor), default=0)
df['AvSigVersion'] = [
    float(f'{major}.{minor.zfill(avsig_minor_width)}')
    for major, minor in zip(avsig_major, avsig_minor)
]

numeric_cols.append('MajorAvSigVersion')
numeric_cols.append('MinorAvSigVersion')

In [None]:
# Split every OS version once; fields 2 and 3 are used as the
# major / minor OS version.
osver_fields = [version.split('.') for version in df.Census_OSVersion]
osver_major = [fields[2] for fields in osver_fields]
osver_minor = [fields[3] for fields in osver_fields]

df['Census_MajorOSVersion'] = [int(field) for field in osver_major]
df['Census_MinorOSVersion'] = [int(field) for field in osver_minor]

# Combined "major.minor" float. Zero-pad the minor field to a common width;
# unpadded, minors such as '1' and '10' map to the same float and the
# numeric order no longer follows the version order.
osver_minor_width = max((len(field) for field in osver_minor), default=0)
df['Census_OSVersion'] = [
    float(f'{major}.{minor.zfill(osver_minor_width)}')
    for major, minor in zip(osver_major, osver_minor)
]

numeric_cols.append('Census_MajorOSVersion')
numeric_cols.append('Census_MinorOSVersion')

### Clean

In [None]:
# Lower-case every categorical column, turn the 'unknown' / 'unspecified'
# placeholders into NaN (done in place by printcats) and print the levels.
for c in categorical_cols:
    printcats(c)

#### `Identifiers`

In [None]:
# Identifier columns whose missing values become their own 'missing' level.
# ('Census_ProcessorModelIdentifier' used to be listed twice.)
identifiers = [
    'Census_ProcessorModelIdentifier',
    'Census_FirmwareManufacturerIdentifier',
    'Census_FirmwareVersionIdentifier',
    'Census_OEMNameIdentifier',
    'Census_OEMModelIdentifier',
    'Census_OSInstallLanguageIdentifier',
    'IeVerIdentifier',
    'Census_ProcessorManufacturerIdentifier',
    'AVProductStatesIdentifier',
    'OrganizationIdentifier',
    'CityIdentifier'
]
for c in identifiers:
    df[c] = df[c].fillna('missing')

In [None]:
# Columns that still contain missing values, with their percentage.
nanp(df)

#### `Census_FlightRing`

In [None]:
# Missing flight rings become an explicit 'missing' level.
df['Census_FlightRing'] = df['Census_FlightRing'].fillna('missing')
cateval(df, 'Census_FlightRing')

#### `Census_PowerPlatformRoleName`

In [None]:
# Missing power-platform roles become an explicit 'missing' level.
df['Census_PowerPlatformRoleName'] = df['Census_PowerPlatformRoleName'].fillna('missing')
cateval(df, 'Census_PowerPlatformRoleName')

#### `Census_OSWUAutoUpdateOptionsName`

In [None]:
# Missing auto-update options become an explicit 'missing' level.
df['Census_OSWUAutoUpdateOptionsName'] = df['Census_OSWUAutoUpdateOptionsName'].fillna('missing')
cateval(df, 'Census_OSWUAutoUpdateOptionsName')

#### `Census_GenuineStateName`

In [None]:
# Missing genuine-state names become an explicit 'missing' level.
df['Census_GenuineStateName'] = df['Census_GenuineStateName'].fillna('missing')
cateval(df, 'Census_GenuineStateName')

#### `SmartScreen`

Fix the ascii characters in smart screen. Why did these report as 'start of heading' and 'start of text'?

Could possibly convert this to an ordinal variable using a bit of logic, where 0 is the least secure setting and n is the most secure.

In [None]:
# The two control-character entities are not real settings; relabel them.
garbled_smartscreen = df['SmartScreen'].isin(['&#x01;', '&#x02;'])
df.loc[garbled_smartscreen, 'SmartScreen'] = 'invalid'

# Remaining gaps become an explicit 'missing' level.
df['SmartScreen'] = df['SmartScreen'].fillna('missing')

cateval(df, 'SmartScreen')

#### `Census_InternalBatteryType`

I first replace any non-alphanumeric characters and then group the outlying minor battery types into one category - 'others'.

I want to take a closer look at https://batteryuniversity.com/learn/article/types_of_battery_cells and estimate a device lifespan

In [None]:
def _normalise_battery_type(raw):
    """Spell '#' as 'pnd' and squash each run of other non-alphanumerics to '_'; NaN stays NaN."""
    if pd.isna(raw):
        return np.nan
    spelled = str(raw).replace('#', 'pnd')
    return re.sub('[^0-9a-zA-Z]+', '_', spelled)


df.Census_InternalBatteryType = df.Census_InternalBatteryType.progress_apply(
    _normalise_battery_type
)

In [None]:
# Battery types seen fewer than 184 times are pooled into a single level.
battery_counts = df.Census_InternalBatteryType.value_counts()
others = battery_counts[battery_counts < 184].index.tolist()

mask = df.Census_InternalBatteryType.isin(others).tolist()
df.loc[mask, 'Census_InternalBatteryType'] = 'other'

# Whatever is still null becomes an explicit 'missing' level.
df['Census_InternalBatteryType'] = df['Census_InternalBatteryType'].fillna('missing')

cateval(df, 'Census_InternalBatteryType')

#### `Census_OSEdition` and `Census_OSSkuName`

These two features express the same information but differ on occasion. I first remove any non-alphanumeric characters from both features. Next, I check to see if they match by seeing if any substring of length four exists in the other. I do it like this because many are the same but with different orderings (datacenterserver and server_datacenter for example).

I also extract the OS versions that have reduced media applications by default.
https://www.howtogeek.com/322112/what-is-an-n-or-kn-edition-of-windows/

In [None]:
# Keep letters only, so that spellings which differ just in separators or
# digits compare equal.
df['Census_OSSkuName'] = [re.sub(r'[^a-zA-Z]+', '', s) for s in df.Census_OSSkuName]
df['Census_OSEdition'] = [re.sub(r'[^a-zA-Z]+', '', s) for s in df.Census_OSEdition]

# extract the media reduced OS versions
OS_Reduced_Media = [
    'professionaln',
    'coren',
    'enterprisesn',
    'enterprisen',
    'professionalworkstationn',
    'cloudn',
    'educationn',
    'professionaleducationn'
]

# Flag a row when EITHER column names a reduced-media edition.
# (The previous itertuples() loop read c[0] and c[1]; itertuples() yields the
# row index first, so it compared the index and the SKU name and never
# looked at the edition column.)
mask = (
    df.Census_OSSkuName.isin(OS_Reduced_Media) |
    df.Census_OSEdition.isin(OS_Reduced_Media)
).tolist()
df['OS_Reduced_Media'] = mask

# Now that the flag is stored separately, drop the trailing 'n' so the
# reduced-media editions share a level with their regular counterpart.
for c in OS_Reduced_Media:
    df.loc[df.Census_OSSkuName == c, 'Census_OSSkuName'] = c[:-1]
    df.loc[df.Census_OSEdition == c, 'Census_OSEdition'] = c[:-1]

# replace the obvious typo
df.loc[
    (df.Census_OSEdition == 'enterprises') |
    (df.Census_OSSkuName == 'enterprises'),
    ['Census_OSEdition', 'Census_OSSkuName']
] = 'enterprise'


# There is only one of each of these in the entire dataset
df.loc[
    (df.Census_OSEdition == 'professionalsinglelanguage') |
    (df.Census_OSSkuName == 'professionalsinglelanguage'),
    ['Census_OSEdition', 'Census_OSSkuName']
] = 'professional'

# (This assignment used to appear twice; the second copy was a no-op.)
df.loc[
    (df.Census_OSEdition == 'professionalcountryspecific') |
    (df.Census_OSSkuName == 'professionalcountryspecific'),
    ['Census_OSEdition', 'Census_OSSkuName']
] = 'professional'

# look for substring matches: every length-`step` slice of each edition name
step, subsets = 4, {}
for s in df.Census_OSEdition.unique():
    subsets[s] = {s[i:i+step] for i in range(len(s)-step+1)}

# A row matches when any slice of its edition name occurs in its SKU name.
df['Census_OSEdSkuMatch'] = [
    any([
        x in z for x in subsets[y]
    ])
    for y, z in zip(df.Census_OSEdition, df.Census_OSSkuName)
]
t = df[['Census_OSEdition', 'Census_OSSkuName', 'Census_OSEdSkuMatch', 'OS_Reduced_Media']]

# Show each distinct combination once, split by whether the names match.
print('no match')
t.loc[~t.duplicated() & ~t.Census_OSEdSkuMatch]

print()
print('matches')
t.loc[~t.duplicated() & t.Census_OSEdSkuMatch].head(10)

We see that there are very few non-matching rows, and I assume that in each of them one of the two values was entered incorrectly. To fix this, I look up how frequent each of the two values is in its own column and pick one of them at random, with probability proportional to those frequencies. After updating the first column, I drop the second.

In [None]:
# Share of rows where either name is missing.
(df.Census_OSEdition.isnull() | df.Census_OSSkuName.isnull()).mean()
# Share of rows where edition and SKU name were judged to match.
df.Census_OSEdSkuMatch.mean()
# Relative frequency of every level, per column.
osed_props = df.Census_OSEdition.value_counts(normalize=True)
ossku_props = df.Census_OSSkuName.value_counts(normalize=True)

In [None]:
# Resolve EVERY row whose edition and SKU name disagree. The previous loop
# walked the de-duplicated preview table, so only the first row of each
# distinct combination was touched; it also fed index labels to .iloc and
# relied on `t` still being the table built in an earlier cell.
mismatch_idx = df.index[~df.Census_OSEdSkuMatch]

# Relative frequency of each candidate value within its own column.
a = df.loc[mismatch_idx, 'Census_OSEdition'].map(osed_props)
b = df.loc[mismatch_idx, 'Census_OSSkuName'].map(ossku_props)

# Probability of taking the SKU name: proportional to how common it is.
p = b / (a + b)

# NOTE(review): no random seed is set in the visible cells before this draw,
# so the outcome may differ between runs - confirm whether that is intended.
choice = np.random.binomial(1, p.values) == 1

# Rows drawn as 1 take the SKU name; all others keep their edition.
take_sku = mismatch_idx[choice]
df.loc[take_sku, 'Census_OSEdition'] = df.loc[take_sku, 'Census_OSSkuName']

df.drop(columns=['Census_OSSkuName'], inplace=True)
categorical_cols.remove('Census_OSSkuName')

In [None]:
# Detection rate and counts per OS edition after the clean-up above.
cateval(df, 'Census_OSEdition')

#### `OsPlatformSubRelease` and `Census_OSBranch`

In [None]:
def branch_ver(x):
    """Return the leading part of ``x`` that precedes its first digit or underscore.

    When ``x`` contains neither, the whole string is returned.
    """
    first_break = re.search(r'[0-9_]', x)
    if first_break is None:
        return x
    return x[:first_break.start()]

# Work on a two-column copy so the originals stay untouched until the end.
t = df[['Census_OSBranch', 'OsPlatformSubRelease']].copy()
t.columns = ['branch', 'subrel']

# Remove the word 'release' from the branch name; note that this turns
# '...prerelease' into '...pre', which the 'pre' flag below relies on.
t.branch = t.branch.apply(lambda x: x.replace('release', ''))
# Alphabetic prefix (text before the first digit or underscore) of each name.
t['branch_ver'] = [branch_ver(x) for x in t.branch]
t['subrel_ver'] = [branch_ver(x) for x in t.subrel]

# Sub-release number: keep digits and dots only, parse as float rounded to
# one decimal; NaN when nothing parseable remains.
t['subrel_ver_num'] = [re.sub(r'[^0-9.]', '', c) for c in t.subrel]
t['subrel_ver_num'] = [
    np.round(float(x), 1) if isfloat(x) else np.nan for x in t.subrel_ver_num
]

# Branch release number: keep digits and dots only, then parse ONLY the first
# remaining character (x[0]); NaN when there is none or it is not numeric.
t['branch_release_num'] = [re.sub(r'[^0-9.]', '', c) for c in t.branch] 
t['branch_release_num'] = [
    np.round(float(x[0]), 1) if len(x) > 0 and isfloat(x[0]) else np.nan for x in t.branch_release_num
]

# Flags derived from substrings of the (already shortened) branch name.
t['is_svc_release'] = ['svc' in c for c in t.branch]
t['is_escrow_release'] = ['escrow' in c for c in t.branch]
t['is_sec_release'] = ['sec' in c for c in t.branch]
t['is_st1_release'] = ['st1' in c for c in t.branch]
t['is_prerelease'] = ['pre' in c for c in t.branch]
t['is_special_release'] = [
    any([y in c for y in ['flt', 'sigma', 'edge']]) 
    for c in t.branch
]

# A 'prers' sub-release prefix is recorded as a pre-release of 'rs'.
t.loc[t.subrel_ver == 'prers', 'is_prerelease'] = True
t.loc[t.subrel_ver == 'prers', 'subrel_ver'] = 'rs'

# Every branch prefix containing 'win' is pooled into a single 'win' level.
t.loc[['win' in c for c in t.branch_ver], 'branch_ver'] = 'win'

# Branches without a parseable release number get 0.
t.loc[t.branch_release_num.isnull(), 'branch_release_num'] = 0

# Drop the raw helper columns and inspect the result.
t.drop(columns=['branch', 'subrel'], inplace=True)
t.head()
t.branch_ver.value_counts()
t.subrel_ver.value_counts()

# Copy the derived columns into the main frame.
for c in t.columns:
    df[c] = t[c]

# Register the new columns with the feature-type lists.
categorical_cols += ['branch_ver', 'subrel_ver']
numeric_cols += ['subrel_ver_num', 'branch_release_num']
binary_cols += ['is_svc_release', 'is_escrow_release', 'is_sec_release', 'is_st1_release', 'is_prerelease', 'is_special_release']

# The source columns are now redundant.
df = df.drop(columns=['Census_OSBranch', 'OsPlatformSubRelease'])
categorical_cols.remove('Census_OSBranch')

#### `Census_MDC2FormFactor` and `Census_ChassisTypeName`

https://www.dmtf.org/sites/default/files/standards/documents/DSP0134_3.2.0.pdf

In [None]:
t = df[['Census_MDC2FormFactor', 'Census_ChassisTypeName']]

# Chassis-type codes as listed in the SMBIOS specification linked above
# ("System Enclosure or Chassis Types"). The codes are contiguous from 1 to
# 36; the earlier table skipped 18 and was therefore shifted by one from
# 'expansionchassis' onward, and spelled 'unknown' as 'unkown'.
ff_int = {
    'other':1,
    'unknown':2,
    'desktop':3,
    'lowprofiledesktop':4,
    'pizzabox':5,
    'minitower':6,
    'tower':7,
    'portable':8,
    'laptop':9,
    'notebook':10,
    'handheld':11,
    'dockingstation':12,
    'allinone':13,
    'subnotebook':14,
    'spacesaving':15,
    'lunchbox':16,
    'mainserverchassis':17,
    'expansionchassis':18,
    'subchassis':19,
    'busexpansionchassis':20,
    'peripheralchassis':21,
    'raidchassis':22,
    'rackmountchassis':23,
    'sealedcasepc':24,
    'multisystemchassis':25,
    'compactpci':26,
    'advancedtca':27,
    'blade':28,
    'bladeenclosure':29,
    'tablet':30,
    'convertible':31,
    'detachable':32,
    'iotgateway':33,
    'embeddedpc':34,
    'minipc':35,
    'stickpc':36
}
int_ff = { v:k for k, v in ff_int.items() }

for col in ['Census_MDC2FormFactor', 'Census_ChassisTypeName']:
    # Translate numeric codes into their label. The values are text at this
    # point, so they must be converted with int() before the lookup; the
    # previous test compared the raw string with the integer keys and
    # therefore never matched.
    mask = [isint(x) and int(x) in int_ff for x in df[col]]
    df.loc[mask, col] = [int_ff[int(x)] for x in df.loc[mask, col]]

    # Anything that is still not a known label (including NaN) is 'invalid'.
    df.loc[[c not in ff_int for c in df[col]], col] = 'invalid'

In [None]:
# Every length-`step` slice of each form-factor label.
step = 5
subsets = {
    s: {s[i:i+step] for i in range(len(s)-step+1)}
    for s in df.Census_MDC2FormFactor.unique()
}

# A row matches when any slice of its form factor occurs in its chassis type.
df['Census_FFMatch'] = [
    any(piece in str(chassis) for piece in subsets[form_factor])
    for form_factor, chassis in zip(df.Census_MDC2FormFactor, df.Census_ChassisTypeName)
]
t = df[['Census_MDC2FormFactor', 'Census_ChassisTypeName', 'Census_FFMatch']]

# Show each distinct combination once, split by whether the labels match.
first_seen = ~t.duplicated()

print('no match')
t.loc[first_seen & ~t.Census_FFMatch].head(10)

print()
print('matches')
t.loc[first_seen & t.Census_FFMatch].head(10)

In [None]:
# Detection rate and counts per level for both cleaned columns.
cateval(df, 'Census_MDC2FormFactor')
cateval(df, 'Census_ChassisTypeName')

#### `OsVer`

OsVer has an ordering we can take advantage of: https://docs.microsoft.com/en-us/windows/desktop/sysinfo/operating-system-version

In [None]:
# Rank the OS versions from newest (0) to oldest. The strings are compared
# as tuples of integers: a plain string sort is lexicographic and would
# place '10.x' before '6.x'.
os_versions = {
    k:v for v, k in enumerate(
        sorted(
            df.OsVer.unique(),
            key=lambda version: tuple(int(part) for part in str(version).split('.')),
            reverse=True
        )
    )
}

# One vectorised lookup replaces the per-value .loc loop and leaves the
# column with a numeric dtype instead of object.
df['OsVer'] = df['OsVer'].map(os_versions)

numeric_cols.append('OsVer')

In [None]:
# Fraction of rows with a missing OsVer after the encoding.
df.OsVer.isnull().mean()

#### `Census_PrimaryDiskTypeName`

In [None]:
# Numeric code for each disk type label.
disk_types = {
    'hdd':0,
    'ssd':1
}
for label, code in disk_types.items():
    is_label = df['Census_PrimaryDiskTypeName'] == label
    df.loc[is_label, 'Census_PrimaryDiskTypeName'] = code

# Raises if any label other than the ones coded above is left in the column.
df['Census_PrimaryDiskTypeName'] = pd.to_numeric(df['Census_PrimaryDiskTypeName'])

In [None]:
# Detection rate and counts per encoded disk type.
cateval(df, 'Census_PrimaryDiskTypeName')

In [None]:
# Correlate numeric and boolean columns only. The frame still holds text
# columns, and recent pandas versions raise on them instead of skipping them.
numeric_view = df.select_dtypes(include=[np.number, 'bool'])
t = numeric_view.corr().Census_PrimaryDiskTypeName.sort_values()

# Features whose absolute correlation with the disk type exceeds 0.1.
t.loc[np.abs(t) > .1]

In [None]:
disk_col = 'Census_PrimaryDiskTypeName'

# Modelling copy: no machine id, and complete in every column except the target.
df_ = df.drop(columns=['MachineIdentifier'], errors='ignore').copy()
df_ = df_.dropna(subset=[name for name in df_.columns if name != disk_col])

# Categorical predictors (the target itself excluded) are cast to text.
categorical_cols_ = list(set(categorical_cols) - {'Census_PrimaryDiskTypeName'})
for c in tqdm(categorical_cols_):
    df_[c] = df_[c].astype(str)

# Rows with a known disk type train the model; the others get predicted.
mask = df_[disk_col].isnull()

x_pre = df_.loc[mask]
x_train = df_.loc[~mask]
x_pre_idx = x_pre.index.tolist()

y_train = x_train[disk_col]

x_train = x_train.drop(columns=[disk_col])
x_pre = x_pre.drop(columns=[disk_col])

In [None]:
# Fit one label encoder per categorical predictor on the full modelling copy,
# so both subsets share the same integer codes.
laEncs = {
    c: LabelEncoder().fit(df_[c])
    for c in tqdm(categorical_cols_)
}

# Replace the text levels by their integer codes in both subsets.
for c, enc in laEncs.items():
    x_train[c] = enc.transform(x_train[c])
    x_pre[c] = enc.transform(x_pre[c])

In [None]:
name = 'census_primary_disk_type.model'

if not os.path.exists(name):
    # setup the cross-validation scheme (single-point grid; the ranges that
    # were explored are kept in the trailing comments)
    params = {
        'learning_rate':     [0.2],  # np.linspace(0.1, .3, 5),
        'n_estimators':      [1500], # [100, 1000, 1500, 2000, 3000],
        'max_depth':         [10],   # range(5, 15, 5),
        'min_samples_split': [600],  # range(200, 1000, 200),
        'min_samples_leaf':  [10],   # range(10, 50, 8)
    }

    search = GridSearchCV(
        GradientBoostingClassifier(max_features='sqrt'),
        params,
        cv=3,
        n_jobs=-1,
        verbose=10,
        refit=True
    )

    # fit
    cv = search.fit(x_train, y_train)

    # save, so later runs can skip the fit
    with open(name, 'wb') as f:
        pickle.dump(cv, f)
else:
    # Reuse the cached search result.
    # NOTE: unpickling executes code from the file - only load a cache that
    # this notebook wrote itself.
    with open(name, 'rb') as f:
        cv = pickle.load(f)

print(cv.best_params_)
print(cv.best_score_)

In [None]:
# Predict the disk type for the rows where it was unknown ...
y_pre = cv.predict(x_pre)
# ... and write the predictions back into the main frame at those row labels.
df.loc[x_pre_idx, 'Census_PrimaryDiskTypeName'] = y_pre

In [None]:
# The original assignment had no right-hand side, so the cell did not parse.
# NOTE(review): the intended fill value is not recoverable from the notebook;
# the most frequent class is used here as a conservative default - confirm.
still_missing = df.Census_PrimaryDiskTypeName.isnull()
if still_missing.any():
    df.loc[still_missing, 'Census_PrimaryDiskTypeName'] = (
        df.Census_PrimaryDiskTypeName.mode().iloc[0]
    )

cateval(df, 'Census_PrimaryDiskTypeName')

#### `RtpStateBitfield`

In [None]:
# Missing bitfields are replaced by the constant 34.
# NOTE(review): the reason for choosing 34 is not visible in this notebook - confirm.
df.loc[df.RtpStateBitfield.isnull(), 'RtpStateBitfield'] = 34
cateval(df, 'RtpStateBitfield')

In [None]:
# Rewrite every value as its binary digit string (the '0b' prefix is cut off),
# left-padded with zeros to at least four characters. Values of 16 or more
# produce strings longer than four characters. NaN stays NaN.
df.RtpStateBitfield = [
    str(bin(int(float(c))))[2:].zfill(4) if str(c) != 'nan' else np.nan 
    for c in df.RtpStateBitfield
]

df.RtpStateBitfield.head()

In [None]:
# One boolean column per bit, RtpStateBitfield_1 being the lowest bit.
for i in range(1, 5):
    cname = f'RtpStateBitfield_{i}'
    # Compare the digit with '1': bool() of any non-empty string, including
    # '0', is True, so the previous bool(x[...]) flagged every bit as set.
    # Counting from the end (x[-i]) picks the i-th lowest bit even when the
    # digit string is longer than four characters.
    df[cname] = df.RtpStateBitfield.apply(
        lambda x: (x[-i] == '1') if str(x) != 'nan' else np.nan
    )
    binary_cols.append(cname)

# The digit string itself is no longer needed.
df = df.drop(columns=['RtpStateBitfield'])

In [None]:
# Columns excluded when the feature set (`in_set`) is assembled further down.
ex_set = [
    'Census_InternalBatteryType',
    'Census_ThresholdOptIn',
    'SmartScreen',
    'OrganizationIdentifier',
    'MachineIdentifier'
]

# Leftover experiment: object-typed variables that are not in ex_set.
#list(set(t.loc[t.DataType == 'object'].Variable.tolist()) - set(ex_set))

### Fill missing values

#### Binary Columns - fill with logistic regression
https://www.sciencedirect.com/science/article/pii/S0166218X11000503

Make sure I have all the features subsetted into _categorical_, _binary_, or _numeric_

In [None]:
# Print every column that is in none of the three feature-type lists.
classified = set(categorical_cols) | set(binary_cols) | set(numeric_cols)
for c in df.columns:
    if c not in classified:
        print(c)

In [None]:
# Pairwise correlations of the numeric and boolean columns. Text columns are
# filtered out explicitly because recent pandas versions raise on them
# instead of skipping them.
crr = df.select_dtypes(include=[np.number, 'bool']).corr()

In [None]:
# Correlation of every column in `crr` with the disk type.
crr.Census_PrimaryDiskTypeName

In [None]:
# One-hot encode the categorical columns (only the first one for now).
encoders = {}
for c in categorical_cols[:1]:
    # OneHotEncoder expects 2-D input, hence the one-column frame df[[c]];
    # a bare Series or 1-D array is rejected.
    encoders[c] = OneHotEncoder(handle_unknown='ignore').fit(df[[c]])
    print(encoders[c].transform(df[[c]]))

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import GridSearchCV


np.random.seed(42)


# Every column except the explicitly excluded ones.
in_set = [c for c in df.columns if c not in ex_set]

# `mask` comes from the disk-type imputation cell and is indexed on the
# NaN-free copy df_, not on df. Align it with df first - an unaligned boolean
# Series cannot be used as an indexer. Rows absent from df_ are treated as
# "label not observed".
# NOTE(review): this still depends on `mask` from that earlier cell - confirm
# that training only on originally observed disk types is what is wanted.
observed = ~mask.reindex(df.index, fill_value=True).astype(bool)

x = df.loc[observed, in_set].dropna(how='any')
y = x.Census_PrimaryDiskTypeName.values

# NOTE(review): x presumably still contains text-valued columns at this
# point, which the classifier below cannot consume - verify before fitting.
x = x.drop(columns=['Census_PrimaryDiskTypeName']).values


#     enc = OneHotEncoder().fit(f)
#     encoders[c] = enc
#     df[c] = enc.transform(f)

In [None]:
# Grid for the random forest: only the number of trees is searched; the
# other ranges are kept commented out.
params = {
    'n_estimators':range(50, 150, 10),
#     'max_depth':range(5, 16, 2),
#     'min_samples_leaf':range(10, 50, 10),
#     'min_samples_split':range(100, 400, 100),
}

# 3-fold grid search on all cores; the classifier uses library defaults
# (the fixed values tried earlier are left commented out).
cv = GridSearchCV(
    RandomForestClassifier(
        #n_estimators=90,
        #min_samples_split=200,
        #min_samples_leaf=20,
        #max_depth=15
    ),
    params, 
    cv=3, 
    n_jobs=-1, 
    verbose=10
)
cv = cv.fit(x, y)

# print the best parameters and score
cv.best_params_, cv.best_score_

In [None]:
# Row count per battery-type level.
df.Census_InternalBatteryType.value_counts()