In [1]:
import re

import numpy as np
import pandas as pd
from tqdm import tqdm

from dankypipe import pipe

In [2]:
# Load the raw train/test frames and order each by the AvSigVersion column.
# NOTE(review): if AvSigVersion is stored as plain strings this ordering is
# lexicographic rather than numeric — confirm the dtype in the pickles.
train = pd.read_pickle('train.pickle').sort_values(by='AvSigVersion')
test = pd.read_pickle('test.pickle').sort_values(by='AvSigVersion')

# The label column is published under the name 'Target'.
train = train.rename(columns={'HasDetections': 'Target'})

# Keep only the identifier (and the label for train). Plain column selection
# raises KeyError when a column is absent, whereas reindex(columns=...) would
# silently fabricate an all-NaN column (e.g. if the rename above matched nothing).
train = train[['MachineIdentifier', 'Target']]
test = test[['MachineIdentifier']]

In [3]:
# Preview the first rows of the trimmed training frame (MachineIdentifier, Target).
train.head()

Unnamed: 0,MachineIdentifier,Target
6822125,c3c4bc04dc5f1c7245a862e52634428e,0
7285638,d106fcb0c6482265956c05ffbaf60744,0
5050150,90eeb2d77a5f58c0afe71de24f29bb50,0
165482,04c1c463cbb6e2bfae34c4c66fd3242c,0
735046,151dd3600408f025207073d09cbc6d5d,0


In [4]:
# Preview the first rows of the trimmed test frame (MachineIdentifier only).
test.head()

Unnamed: 0,MachineIdentifier
7252423,ec6910b4d9e0baae203e9819227659ec
6804872,ddd66992da9cbb12db76d9d874fedf8b
6882538,e05db268c5f1e48e5fa63de1f39f02d7
6856130,df81a38177efaac6b95df42ddef504e6
2544324,52eb832b198099b467d39481a77afcef


In [5]:
# Fetch the 'AvSigVersion_float' feature through the dankypipe helper. The
# result is iterated with .items() in the next cell, yielding a split name and
# a DataFrame for each of 'train', 'validate' and 'test'.
# NOTE(review): cache=True presumably reuses a local copy instead of
# downloading again — confirm against dankypipe.
avsig = pipe.download_feature('AvSigVersion_float', cache=True)

In [6]:
# Dump each split's name followed by its first rows and a blank separator line.
# A single print call with a newline separator emits the same text as three
# consecutive print calls would.
for k, v in avsig.items():
    print(k, v.head(), '', sep='\n')

train
                  MachineIdentifier  AvSigVersion_float
0  c3c4bc04dc5f1c7245a862e52634428e                 0.0
1  60031444d3ec616c6e9084be521faa04                 0.0
2  d938abff6012c1488b851247a3098160                 0.0
3  910ddd20c6d334ca03a46d9f0008fe24                 0.0
4  5e05d22ab9db72ccbc8e41d4bc632f64                 0.0

validate
                  MachineIdentifier  AvSigVersion_float
0  d5632e2ad42c13b8e63f356f20d7d8bf            275.1509
1  22e6c7ead3ec93be353261c3112eb8e6            275.1509
2  ed1588afa4a556104e476772e396ed5c            275.1509
3  92f9f2cb919773e0758a7760e3f96114            275.1509
4  7b4d9b37aff8cc4c50d25731cd65a47e            275.1509

test
                  MachineIdentifier  AvSigVersion_float
0  ec6910b4d9e0baae203e9819227659ec                 0.0
1  f899923f0a42719f588608018611c8c3                 0.0
2  a7ebe733cd84fa1e029e641c8f3f14fd                 0.0
3  d2137b74e1993cbaf04b365b0130df53                 0.0
4  79bc7126412e62b121be8ad

In [7]:
# Stack the 'train' and 'validate' feature frames (in that order) and order the
# result by the numeric signature version.
avsig_combined = (
    pd.concat([avsig[split] for split in ('train', 'validate')], sort=False)
      .sort_values(by='AvSigVersion_float')
)

In [8]:
# Row count of the labelled training frame, for comparison with the feature splits.
len(train)

8921483

In [9]:
# Row count of the feature's 'train' split.
len(avsig['train'])

6245038

In [10]:
# Row count of the feature's 'validate' split.
len(avsig['validate'])

2676445

In [11]:
# Row count of the stacked train + validate feature frame; compare with len(train).
len(avsig_combined)

8921483

In [12]:
# Attach AvSigVersion_float to every labelled row. MachineIdentifier is expected
# to be unique on both sides; validate='one_to_one' makes pandas raise
# MergeError on duplicate keys instead of silently multiplying rows.
combined = train.merge(
    avsig_combined,
    on='MachineIdentifier',
    how='inner',
    validate='one_to_one',
)

# An inner join silently drops unmatched rows, so confirm none were lost.
assert len(combined) == len(train), (
    'merge kept %d of %d labelled rows' % (len(combined), len(train))
)
len(combined)

8921483

In [13]:
# Order rows by AvSigVersion_float ahead of the positional train/validate cut.
# NOTE(review): sort_values defaults to a non-stable sort, so the relative order
# of rows sharing the same AvSigVersion_float is not guaranteed; rows tied at
# the cut point could land on either side. Confirm the resulting split matches
# the one stored with the AvSigVersion_float feature.
combined = combined.sort_values(by='AvSigVersion_float')

In [14]:
# Position of the train/validate cut: the first 70% of rows (truncated to an
# integer) go to train, the remainder to validate.
val_idx = int(.7 * len(combined))
val_idx

6245038

In [15]:
# Cut the version-ordered frame at val_idx: rows before it form the training
# part, rows from it onward form the validation part. The test part is just the
# identifier column of the test frame.
train_ = combined[['MachineIdentifier', 'Target']].iloc[:val_idx, :]
val_   = combined[['MachineIdentifier', 'Target']].iloc[val_idx:, :]
test_  = test[['MachineIdentifier']]

# The cut is positional, so it only agrees with the split stored alongside the
# AvSigVersion_float feature if the ordering (including ties at the boundary)
# came out the same. Fail loudly rather than publish labels whose
# train/validate membership disagrees with the existing feature.
assert len(train_) == len(avsig['train']), 'train split size differs from AvSigVersion_float'
assert len(val_) == len(avsig['validate']), 'validate split size differs from AvSigVersion_float'
assert train_['MachineIdentifier'].isin(avsig['train']['MachineIdentifier']).all(), (
    'train split contains machines outside the AvSigVersion_float train split'
)
assert val_['MachineIdentifier'].isin(avsig['validate']['MachineIdentifier']).all(), (
    'validate split contains machines outside the AvSigVersion_float validate split'
)

In [16]:
# Preview the first rows of the training part of the split.
train_.head()

Unnamed: 0,MachineIdentifier,Target
0,c3c4bc04dc5f1c7245a862e52634428e,0
37,60031444d3ec616c6e9084be521faa04,0
38,d938abff6012c1488b851247a3098160,0
39,910ddd20c6d334ca03a46d9f0008fe24,1
40,5e05d22ab9db72ccbc8e41d4bc632f64,0


In [17]:
# Preview the first rows of the validation part of the split.
val_.head()

Unnamed: 0,MachineIdentifier,Target
6253882,d5632e2ad42c13b8e63f356f20d7d8bf,0
6253881,22e6c7ead3ec93be353261c3112eb8e6,0
6253880,ed1588afa4a556104e476772e396ed5c,1
6253879,92f9f2cb919773e0758a7760e3f96114,0
6253878,7b4d9b37aff8cc4c50d25731cd65a47e,1


In [18]:
# Preview the first rows of the test identifiers that will be uploaded.
test_.head()

Unnamed: 0,MachineIdentifier
7252423,ec6910b4d9e0baae203e9819227659ec
6804872,ddd66992da9cbb12db76d9d874fedf8b
6882538,e05db268c5f1e48e5fa63de1f39f02d7
6856130,df81a38177efaac6b95df42ddef504e6
2544324,52eb832b198099b467d39481a77afcef


In [19]:
# Publish the three frames as the 'Target' feature; the call's displayed result
# is a mapping keyed 'train' / 'validate' / 'test' with one hex string per split.
# NOTE(review): overwrite=True presumably replaces an existing 'Target'
# feature — confirm against dankypipe before re-running.
pipe.upload_feature('Target', (train_, val_, test_), overwrite=True)

{'train': '35103ae560f64c488ba2d63b511f90ba',
 'validate': '3eac187b2263a958ce3f2a658353048e',
 'test': '9cf8908fcb17082b974dab478e14756f'}