In [1]:
import re

import numpy as np
import pandas as pd
from tqdm import tqdm

from dankypipe import pipe

tqdm.pandas()

def isfloat(x):
    """Return True if ``float(x)`` succeeds, False otherwise.

    Only the exceptions that signal "not convertible" are treated as a
    negative answer; anything else (e.g. KeyboardInterrupt) propagates
    instead of being silently swallowed.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported type (e.g. None); ValueError: unparsable
        # string; OverflowError: integer too large for a float.
        return False
    return True
    
def isint(x):
    """Return True if ``int(x)`` succeeds, False otherwise.

    Only the exceptions that signal "not convertible" are treated as a
    negative answer; anything else (e.g. KeyboardInterrupt) propagates
    instead of being silently swallowed.
    """
    try:
        int(x)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported type (e.g. None); ValueError: unparsable
        # string or NaN; OverflowError: infinite float.
        return False
    return True

In [2]:
# Columns needed for the OS branch / sub-release features, plus the join key.
cols = ['Census_OSBranch', 'OsPlatformSubRelease', 'MachineIdentifier']

# Load both splits ordered by AvSigVersion, then keep only the columns above
# (the training frame also keeps its label, renamed to 'Target').
# NOTE(review): read_pickle executes arbitrary code on load - only use trusted files.
train = (
    pd.read_pickle('train.pickle')
    .sort_values(by='AvSigVersion')
    .rename(columns={'HasDetections': 'Target'})
    .reindex(columns=cols + ['Target'])
)
test = (
    pd.read_pickle('test.pickle')
    .sort_values(by='AvSigVersion')
    .reindex(columns=cols)
)

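I first replace any non-alphanumeric characters and then group the outlying minor battery types into a single category, 'others'.

I want to take a closer look at https://batteryuniversity.com/learn/article/types_of_battery_cells and estimate a device lifespan.

**Note:** the two paragraphs above describe a battery-type feature and do not match the cell that follows, which derives OS branch / sub-release features from `Census_OSBranch` and `OsPlatformSubRelease`. They appear to have been carried over from another notebook (TODO: confirm and update).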

In [3]:
def transform(df):
    """Derive OS branch / sub-release features and drop the raw columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the string columns ``Census_OSBranch`` and
        ``OsPlatformSubRelease``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index, the two raw columns removed and
        these columns appended (in this order): ``branch_ver``,
        ``subrel_ver``, ``subrel_ver_num``, ``branch_release_num``,
        ``is_svc_release``, ``is_escrow_release``, ``is_sec_release``,
        ``is_st1_release``, ``is_prerelease``, ``is_special_release``.
    """
    def _alpha_prefix(text):
        # Everything before the first digit or underscore (whole string if none).
        hit = re.search(r'[0-9_]', text)
        return text if hit is None else text[:hit.start()]

    def _numeric_chars(text):
        # Keep only digits and dots.
        return re.sub(r'[^0-9.]', '', text)

    def _to_float(text):
        # Parse to a float rounded to one decimal; NaN when not parsable.
        try:
            return np.round(float(text), 1)
        except ValueError:
            return np.nan

    # The word 'release' is stripped first so that 'prerelease' reduces to
    # 'pre' and the substring flags below do not trip on it.
    branches = [b.replace('release', '') for b in df['Census_OSBranch']]
    subrels = list(df['OsPlatformSubRelease'])

    subrel_prefixes = [_alpha_prefix(s) for s in subrels]
    # A 'prers' sub-release is a pre-release of the 'rs' line.
    is_prers = [p == 'prers' for p in subrel_prefixes]

    # Only the first numeric character of the branch is used as its release number.
    branch_first_digit = [_to_float(_numeric_chars(b)[:1]) for b in branches]

    # Work on a fresh frame so the caller's DataFrame is left untouched.
    out = df.drop(columns=['Census_OSBranch', 'OsPlatformSubRelease'])

    # Any branch prefix containing 'win' is collapsed to plain 'win'.
    out['branch_ver'] = [
        'win' if 'win' in p else p for p in (_alpha_prefix(b) for b in branches)
    ]
    out['subrel_ver'] = [
        'rs' if pre else p for p, pre in zip(subrel_prefixes, is_prers)
    ]
    out['subrel_ver_num'] = [_to_float(_numeric_chars(s)) for s in subrels]
    # Branches without a usable digit get release number 0.
    out['branch_release_num'] = [
        0.0 if np.isnan(v) else v for v in branch_first_digit
    ]

    out['is_svc_release'] = ['svc' in b for b in branches]
    out['is_escrow_release'] = ['escrow' in b for b in branches]
    out['is_sec_release'] = ['sec' in b for b in branches]
    out['is_st1_release'] = ['st1' in b for b in branches]
    out['is_prerelease'] = [
        ('pre' in b) or pre for b, pre in zip(branches, is_prers)
    ]
    out['is_special_release'] = [
        any(tag in b for tag in ('flt', 'sigma', 'edge')) for b in branches
    ]

    return out

# Replace the raw OS branch / sub-release columns with the engineered features.
# NOTE: transform() returns a frame without the raw columns it reads, so
# re-running these lines without first reloading train/test raises a KeyError.
train = transform(train)
test = transform(test)

In [4]:
# Fetch the previously uploaded numeric AvSigVersion feature and stack its
# train and validate parts into one frame ordered by that value.
avsig = pipe.download_feature('AvSigVersion_float', cache=True)
avsig_combined = pd.concat(
    [avsig['train'], avsig['validate']],
    sort=False,
).sort_values(by='AvSigVersion_float')

# Attach the numeric version to every training row; the inner join must not
# lose or duplicate rows, which the length check guards.
train_ = train.merge(
    avsig_combined,
    on='MachineIdentifier',
    how='inner',
    sort=False,
)
assert len(train_) == len(train)

train = train_
del train_

# Order the training rows by the numeric version; the helper column is only
# needed for the ordering, so it is dropped again afterwards.
train = train.sort_values(by='AvSigVersion_float')
train = train.drop(columns='AvSigVersion_float')

In [5]:
# Preview the first rows of the transformed training frame.
train.head()

Unnamed: 0,MachineIdentifier,Target,branch_ver,subrel_ver,subrel_ver_num,branch_release_num,is_svc_release,is_escrow_release,is_sec_release,is_st1_release,is_prerelease,is_special_release
0,c3c4bc04dc5f1c7245a862e52634428e,0,th,th,2.0,2.0,False,False,True,False,False,False
37,60031444d3ec616c6e9084be521faa04,0,rs,rs,4.0,4.0,False,False,False,False,False,False
38,d938abff6012c1488b851247a3098160,0,rs,rs,4.0,4.0,False,False,False,False,False,False
39,910ddd20c6d334ca03a46d9f0008fe24,1,rs,rs,4.0,4.0,False,False,False,False,False,False
40,5e05d22ab9db72ccbc8e41d4bc632f64,0,th,th,1.0,1.0,False,False,False,True,False,False


In [6]:
# Preview the first rows of the transformed test frame.
test.head()

Unnamed: 0,MachineIdentifier,branch_ver,subrel_ver,subrel_ver_num,branch_release_num,is_svc_release,is_escrow_release,is_sec_release,is_st1_release,is_prerelease,is_special_release
7252423,ec6910b4d9e0baae203e9819227659ec,rs,rs,4.0,4.0,False,False,False,False,False,False
6804872,ddd66992da9cbb12db76d9d874fedf8b,rs,rs,4.0,4.0,False,False,False,False,False,False
6882538,e05db268c5f1e48e5fa63de1f39f02d7,rs,rs,4.0,4.0,False,False,False,False,False,False
6856130,df81a38177efaac6b95df42ddef504e6,rs,rs,4.0,4.0,False,False,False,False,False,False
2544324,52eb832b198099b467d39481a77afcef,rs,rs,5.0,5.0,False,False,False,False,True,False


In [7]:
# Row position that splits `train` into its first 70% (training part) and the
# remaining 30% (validation part).
val_idx = int(.7 * len(train))
val_idx

6245038

In [8]:
# Upload every engineered feature as a (train, validate, test) triple keyed by
# MachineIdentifier. The first `val_idx` rows of `train` form the training
# part, the rest the validation part.
feature_cols = [
    name for name in train.columns
    if name not in ('MachineIdentifier', 'Target')
]

# Features whose upload raised ValueError, mapped to the error, so failures
# stay visible instead of being silently ignored.
failed_uploads = {}

# The context manager closes the progress bar even if the loop is interrupted.
with tqdm(total=len(feature_cols)) as pbar:
    for c in feature_cols:
        pbar.set_description(c)

        train_ = train[['MachineIdentifier', c]].iloc[:val_idx, :]
        val_   = train[['MachineIdentifier', c]].iloc[val_idx:, :]
        test_  =  test[['MachineIdentifier', c]]

        try:
            pipe.upload_feature(c, (train_, val_, test_), overwrite=True)
        except ValueError as err:
            # Still best-effort: keep going with the remaining features, but
            # record and report the failure.
            # NOTE(review): the conditions under which upload_feature raises
            # ValueError are not visible here - confirm skipping is intended.
            failed_uploads[c] = err
            tqdm.write('upload of {!r} failed: {}'.format(c, err))

        pbar.update(1)

if failed_uploads:
    print('features not uploaded: {}'.format(sorted(failed_uploads)))

is_special_release: 100%|██████████| 10/10 [21:07<00:00, 123.24s/it]