In [1]:
import re

import numpy as np
import pandas as pd
from tqdm import tqdm

from dankypipe import pipe

tqdm.pandas()

def isfloat(x):
    try:
        float(x)
        return True
    except:
        return False
    
def isint(x):
    try:
        int(x)
        return True
    except:
        return False

In [2]:
train = pd.read_pickle('train.pickle').sort_values(by='AvSigVersion')
test = pd.read_pickle('test.pickle').sort_values(by='AvSigVersion')

train = train.rename(columns={'HasDetections':'Target'})

cols = ['Census_InternalBatteryType', 'MachineIdentifier']
train = train.reindex(columns=cols+['Target'])
test = test.reindex(columns=cols)

I first replace any non-alphanumeric characters and then group the outlying minor battery types into one category - 'others'.

I want to take a closer look at https://batteryuniversity.com/learn/article/types_of_battery_cells and estimate a device lifespan

In [3]:
def transform(df):
    df.Census_InternalBatteryType = df.Census_InternalBatteryType.progress_apply(
        lambda x: re.sub('[^0-9a-zA-Z]+', '_', str(x).replace('#', 'pnd')) if pd.notna(x) else np.nan
    )
    return df

train = transform(train)
test = transform(test)

  0%|          | 78/8921483 [00:00<10:57:21, 226.19it/s]
  0%|          | 63/7853253 [00:00<10:18:06, 211.75it/s]


In [4]:
def transform(df):
    others = df.Census_InternalBatteryType.value_counts()
    others = others[others < 184].index.tolist()

    mask = [c in others for c in df.Census_InternalBatteryType]
    df.loc[mask, 'Census_InternalBatteryType'] = 'other'

    df.loc[df.Census_InternalBatteryType.isnull(), 'Census_InternalBatteryType'] = 'missing'
    
    return df

train = transform(train)
test = transform(test)

In [5]:
avsig = pipe.download_feature('AvSigVersion_float', cache=True)
avsig_combined = pd.concat([avsig['train'], avsig['validate']], sort=False).sort_values(by='AvSigVersion_float')

train_ = train.merge(avsig_combined, on='MachineIdentifier', how='inner', sort=False)
assert len(train_) == len(train)

train = train_
del train_

train = train.sort_values(by='AvSigVersion_float').drop(columns='AvSigVersion_float')

In [6]:
cols = {'Census_InternalBatteryType': 'Census_InternalBatteryType_reduced'}
train = train.rename(columns=cols)
test = test.rename(columns=cols)

In [7]:
train.head()

Unnamed: 0,Census_InternalBatteryType_reduced,MachineIdentifier,Target
0,lion,c3c4bc04dc5f1c7245a862e52634428e,0
37,missing,60031444d3ec616c6e9084be521faa04,0
38,missing,d938abff6012c1488b851247a3098160,0
39,missing,910ddd20c6d334ca03a46d9f0008fe24,1
40,missing,5e05d22ab9db72ccbc8e41d4bc632f64,0


In [8]:
test.head()

Unnamed: 0,Census_InternalBatteryType_reduced,MachineIdentifier
7252423,missing,ec6910b4d9e0baae203e9819227659ec
6804872,missing,ddd66992da9cbb12db76d9d874fedf8b
6882538,missing,e05db268c5f1e48e5fa63de1f39f02d7
6856130,missing,df81a38177efaac6b95df42ddef504e6
2544324,missing,52eb832b198099b467d39481a77afcef


In [9]:
val_idx = int(len(train)*.7)
val_idx

6245038

In [11]:
pbar = tqdm(total=len(train.columns)-2)

for c in train.columns:
    if c == 'MachineIdentifier' or c == 'Target':
        continue
        
    pbar.set_description(c)
    
    train_ = train[['MachineIdentifier', c]].iloc[:val_idx, :]
    val_   = train[['MachineIdentifier', c]].iloc[val_idx:, :]
    test_  =  test[['MachineIdentifier', c]]
    
    try:
        pipe.upload_feature(c, (train_, val_, test_), overwrite=True)
    except ValueError:
        pass
    
    pbar.update(1)

Census_InternalBatteryType_reduced: 100%|██████████| 1/1 [02:05<00:00, 125.71s/it]