In [1]:
import os
import re

import numpy as np
import pandas as pd
from tqdm import tqdm

from dankypipe import pipe
# import os
# import sys
# sys.path.insert(0, os.path.abspath('..'))
# from dankypipe import pipe as pipe

In [None]:
# Load both splits ordered by AvSigVersion; the label column of the training
# split is exposed as 'Target'.
# NOTE(review): if AvSigVersion is stored as plain strings this ordering is
# lexicographic rather than true version order -- confirm against the pickles.
train = (
    pd.read_pickle('train.pickle')
    .sort_values(by='AvSigVersion')
    .rename(columns={'HasDetections':'Target'})
)
test = pd.read_pickle('test.pickle').sort_values(by='AvSigVersion')

In [None]:
def transform(df, rng=None):
    """Merge ``Census_OSSkuName`` and ``Census_OSEdition`` into one cleaned edition column.

    The frame is modified in place and also returned. Processing steps:

    1. Strip every non-letter character from both columns (non-strings such
       as NaN are left untouched).
    2. Add ``OS_Reduced_Media``: True when either column holds one of the
       "...n" media-reduced names; the trailing ``n`` is then removed.
    3. Fold a known typo and two rare professional variants into their
       canonical names (both columns are overwritten when either matches).
    4. Add ``Census_OSEdSkuMatch``: True when any 4-character substring of the
       edition occurs in the SKU name.
    5. For the first occurrence of each distinct non-matching combination,
       replace the edition by the SKU name with probability
       ``share(sku) / (share(edition) + share(sku))``.
    6. Drop ``Census_OSSkuName``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Census_OSSkuName`` and ``Census_OSEdition``.
    rng : object, optional
        Anything exposing ``binomial(n, p)`` (for example a
        ``numpy.random.RandomState``). Defaults to NumPy's global random
        state, which is what this function always used before.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, without ``Census_OSSkuName`` and with the two
        boolean columns ``OS_Reduced_Media`` and ``Census_OSEdSkuMatch`` added.
    """
    if rng is None:
        rng = np.random

    sku_col, ed_col = 'Census_OSSkuName', 'Census_OSEdition'

    def _letters_only(value):
        # Keep ASCII letters only; pass non-strings (NaN) through unchanged.
        return re.sub(r'[^a-zA-Z]+', '', value) if isinstance(value, str) else value

    def _share(props, key):
        # Relative frequency of `key`, or 0 for missing / unseen values
        # (value_counts() does not report NaN, so a plain lookup would raise).
        if pd.isna(key) or key not in props.index:
            return 0.0
        return props[key]

    df[sku_col] = [_letters_only(s) for s in df[sku_col]]
    df[ed_col] = [_letters_only(s) for s in df[ed_col]]

    # extract the media reduced OS versions
    OS_Reduced_Media = [
        'professionaln',
        'coren',
        'enterprisesn',
        'enterprisen',
        'professionalworkstationn',
        'cloudn',
        'educationn',
        'professionaleducationn'
    ]

    # Check BOTH columns. The previous itertuples()-based version compared the
    # row index and the SKU name, so the edition column was never inspected.
    df['OS_Reduced_Media'] = (
        df[sku_col].isin(OS_Reduced_Media) | df[ed_col].isin(OS_Reduced_Media)
    ).values

    for c in OS_Reduced_Media:
        df.loc[df[sku_col] == c, sku_col] = c[:-1]
        df.loc[df[ed_col] == c, ed_col] = c[:-1]

    # Canonical spellings: the obvious 'enterprises' typo plus two rare
    # professional variants. When either column matches, both are overwritten.
    canonical = [
        ('enterprises', 'enterprise'),
        ('professionalsinglelanguage', 'professional'),
        ('professionalcountryspecific', 'professional'),
    ]
    for variant, replacement in canonical:
        either = ((df[ed_col] == variant) | (df[sku_col] == variant)).values
        df.loc[either, [ed_col, sku_col]] = replacement

    # look for substring matches
    step = 4
    subsets = {
        s: {s[i:i+step] for i in range(len(s)-step+1)}
        for s in df[ed_col].unique()
        if isinstance(s, str)
    }

    # A missing value on either side can never match (previously a NaN SKU
    # raised TypeError on the `in` test).
    df['Census_OSEdSkuMatch'] = [
        isinstance(ed, str) and isinstance(sku, str)
        and any(gram in sku for gram in subsets[ed])
        for ed, sku in zip(df[ed_col], df[sku_col])
    ]

    osed_props = df[ed_col].value_counts(normalize=True)
    ossku_props = df[sku_col].value_counts(normalize=True)

    key_cols = [ed_col, sku_col, 'Census_OSEdSkuMatch', 'OS_Reduced_Media']
    # First occurrence of every distinct combination whose columns disagree.
    # NOTE(review): only that first row is rewritten, later rows with the same
    # combination keep their edition -- confirm this is intended.
    first_mismatch = (~df[key_cols].duplicated() & ~df['Census_OSEdSkuMatch']).values

    # Boolean selection through .loc: the old code fed index *labels* to
    # .iloc, which addresses the wrong rows whenever the index is not 0..n-1.
    for ix, row in df.loc[first_mismatch, [ed_col, sku_col]].iterrows():
        a = _share(osed_props, row[ed_col])
        b = _share(ossku_props, row[sku_col])
        if a + b == 0:
            # Both values missing: nothing to choose between.
            continue

        # Pick the SKU name with probability proportional to its frequency;
        # otherwise the edition simply stays as it is.
        if rng.binomial(1, b/(a+b)) == 1:
            df.loc[ix, ed_col] = row[sku_col]

    df.drop(columns=[sku_col], inplace=True)

    return df

# Keep only the columns this feature needs (plus the label for train).
cols  = ['Census_OSSkuName', 'Census_OSEdition', 'MachineIdentifier']

# .copy() so that transform(), which assigns columns in place, works on an
# independent frame instead of a slice of the loaded data.
train = train[cols+['Target']].copy()
test  = test[cols].copy()

# transform() draws from NumPy's global random state when resolving
# mismatches; seed it so a re-run reproduces the same features.
np.random.seed(0)

train = transform(train)
test  = transform(test)

In [None]:
# Preview the first rows of the transformed training frame.
train.head()

In [None]:
# Preview the first rows of the transformed test frame.
test.head()

### Now upload these features

In [None]:
# Row position separating the training part (first 70% of rows) from the
# validation part (remaining rows) of `train`.
val_idx = int(.7 * len(train))
val_idx

In [None]:
# Publish the cleaned edition under a name of its own.
train = train.rename(columns={'Census_OSEdition':'Census_OSEdition_reduced'})
test  = test.rename(columns={'Census_OSEdition':'Census_OSEdition_reduced'})

# Everything except the identifier and the label is uploaded as a feature.
feature_cols = [c for c in train.columns if c not in ('MachineIdentifier', 'Target')]

# Uploads that raised ValueError, keyed by feature name, so that skipped
# features are visible instead of being silently ignored.
failed = {}

# The context manager closes the progress bar even if an upload fails hard.
with tqdm(total=len(feature_cols)) as pbar:
    for c in feature_cols:
        pbar.set_description(c)

        # Positional split of train into train / validation parts at val_idx.
        train_ = train[['MachineIdentifier', c]].iloc[:val_idx, :]
        val_   = train[['MachineIdentifier', c]].iloc[val_idx:, :]
        test_  =  test[['MachineIdentifier', c]]

        try:
            pipe.upload_feature(c, (train_, val_, test_))
        except ValueError as err:
            # Still best effort (the loop continues), but record and report it.
            # NOTE(review): presumably raised for an already-uploaded feature -- confirm.
            failed[c] = err
            tqdm.write('upload of {!r} skipped: {}'.format(c, err))

        pbar.update(1)

### Submit a training job

In [2]:
# Job description handed to pipe.Ec2Job: the features to use, the model and
# its starting parameters, the task, and the tuning search definition.
config = dict(
    job_name="a_small_demo",
    features=["Wdft_IsGamer", "AvSigVersion", "AppVersion", "Census_DeviceFamily"],
    model=dict(
        name="lgbm",
        parameters=dict(
            # Converted to the 'category' dtype by the model before training
            # (see the model source echoed in the job output).
            categorical_features=["AvSigVersion", "AppVersion", "Census_DeviceFamily"],
            # Passed as keyword arguments to lgb.train.
            kwargs=dict(num_boost_round=1400, verbose_eval=100),
            # Passed as the params mapping to lgb.train.
            params=dict(
                objective="binary",
                metric="auc",
                num_leaves=10,
                learning_rate=0.2,
            ),
        ),
    ),
    task="predict",
    tuning=dict(
        metric="auc",
        search_type="stage_wise",
        # Dotted keys address entries inside model.parameters.
        parameters={
            "kwargs.num_boost_round": [1000, 1500],
            "params.num_leaves": [8, 12],
            "params.learning_rate": [0.1, 0.2],
        },
    ),
)

In [3]:
# Location of the SSH private key passed to Ec2Job. It can be overridden
# through the DANKYPIPE_SSH_KEY environment variable so the notebook is not
# tied to one machine; the previous hard-coded path remains the default.
SSH_KEY_PATH = os.environ.get('DANKYPIPE_SSH_KEY', '/home/luke/.ssh/aws_virginia1.pem')

# Create the EC2 job from the config defined above.
job = pipe.Ec2Job(
    config=config,
    overwrite=True,
    ssh_key_path=SSH_KEY_PATH,
    instance_type='r5.xlarge'
)

Project=DankDefense


In [None]:
# IPython help lookup for Ec2Job (interactive aid only; produces no data).
pipe.Ec2Job?

In [4]:
# Run the job and keep whatever it returns; the captured output below shows
# the progress log printed while the job runs.
results = job.run()

initializing EC2 instance
establishing connection with ec2-3-94-125-239.compute-1.amazonaws.com


  m.add_string(self.Q_C.public_numbers().encode_point())
  self.curve, Q_S_bytes
  hm.add_string(self.Q_C.public_numbers().encode_point())


Thu Feb  7 03:10:20 UTC 2019:  installing python3-pip
Thu Feb  7 03:10:54 UTC 2019:  installing awcli
Thu Feb  7 03:10:58 UTC 2019:  installing the dank pipe
Thu Feb 07 03:11:24 UTC 2019:  building dataset
Thu Feb 07 03:14:57 UTC 2019:
------------
Model Source
import lightgbm as lgb

class Model:
    def __init__(self, parameters):
        self.parameters = parameters
        self.model = None
    def train(self, x, y):
        print(self.parameters)
        for c in self.parameters['categorical_features']:
            x.loc[:, c] = x[c].astype('category')
        lgb_train = lgb.Dataset(x, y)
        self.model = lgb.train(self.parameters['params'], lgb_train, **self.parameters['kwargs'])
    def predict(self, x):
        return self.model.predict(x)
------------end model
no secrets file found. resorting to environment variables
Fetching config...
Downloading features...
Feature set download complete
{'categorical_features': ['AvSigVersion', 'AppVersion', 'Census_DeviceFamily'], 'kwa

In [None]:
# Display the value returned by job.run().
results

In [8]:
# Build a feature set containing only 'AppVersion'. The result is used as a
# mapping whose values hold an 'x' entry and a (possibly None) 'y' entry.
fs = pipe.build_feature_set(['AppVersion'])

In [15]:
# Print, per split, the first rows of the features and of the labels
# (None when the split has no labels), followed by two blank lines.
for split_name, split in fs.items():
    labels = split['y']
    print(split_name)
    print(split['x'].head())
    if labels is None:
        print(None)
    else:
        print(labels.head())
    print()
    print()

train
             AppVersion
6822125  4.9.10586.1106
7285638   4.12.16299.15
5050150  4.9.10586.1106
165482   4.13.17134.228
735046   4.9.10586.1106
                        MachineIdentifier  Target
6822125  c3c4bc04dc5f1c7245a862e52634428e       0
7285638  d106fcb0c6482265956c05ffbaf60744       0
5050150  90eeb2d77a5f58c0afe71de24f29bb50       0
165482   04c1c463cbb6e2bfae34c4c66fd3242c       0
735046   151dd3600408f025207073d09cbc6d5d       0


test
               AppVersion
7255494   4.18.1807.18075
7985649  4.16.17656.18052
7255496   4.18.1807.18075
8171601   4.18.1807.18075
7812640   4.18.1807.18075
None


validate
              AppVersion
7252423     4.13.17134.1
6804872   4.13.17134.320
6882538   4.13.17134.228
6856130      4.18.1809.2
2544324  4.18.1807.18075
                        MachineIdentifier
7252423  ec6910b4d9e0baae203e9819227659ec
6804872  ddd66992da9cbb12db76d9d874fedf8b
6882538  e05db268c5f1e48e5fa63de1f39f02d7
6856130  df81a38177efaac6b95df42ddef504e6
2544324  52

In [6]:
# Download the stored 'Target' feature; the result is iterated with .items()
# in the following cell.
target = pipe.download_feature('Target')

In [7]:
# Show the first rows of every entry of the downloaded 'Target' feature.
for split_name in target:
    print(split_name)
    print(target[split_name].head())

train
                        MachineIdentifier  Target
6822125  c3c4bc04dc5f1c7245a862e52634428e       0
7285638  d106fcb0c6482265956c05ffbaf60744       0
5050150  90eeb2d77a5f58c0afe71de24f29bb50       0
165482   04c1c463cbb6e2bfae34c4c66fd3242c       0
735046   151dd3600408f025207073d09cbc6d5d       0
test
                        MachineIdentifier  Target
7255494  d02af699e3a7618914f8c538baefeb68       0
7985649  e5213bf0841076b663600920aa060c8f       1
7255496  d02af933ab7c06b5b7a74a6c5cd37094       0
8171601  ea787571d0e36842d9866ebd2cc878f5       0
7812640  e029fef2d6199e20e95c030d0f06ccd4       1
validate
                        MachineIdentifier
7252423  ec6910b4d9e0baae203e9819227659ec
6804872  ddd66992da9cbb12db76d9d874fedf8b
6882538  e05db268c5f1e48e5fa63de1f39f02d7
6856130  df81a38177efaac6b95df42ddef504e6
2544324  52eb832b198099b467d39481a77afcef


In [5]:
# Terminate the job's EC2 instance; the response is shown below.
job.terminate_instance()

{'TerminatingInstances': [{'CurrentState': {'Code': 32,
    'Name': 'shutting-down'},
   'InstanceId': 'i-06da7e73adf66213a',
   'PreviousState': {'Code': 16, 'Name': 'running'}}],
 'ResponseMetadata': {'RequestId': '553cb4cb-3f68-43f8-8739-1361a66835a7',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'content-type': 'text/xml;charset=UTF-8',
   'transfer-encoding': 'chunked',
   'vary': 'Accept-Encoding',
   'date': 'Thu, 07 Feb 2019 03:16:13 GMT',
   'server': 'AmazonEC2'},
  'RetryAttempts': 0}}

In [None]:
# Remove the job handle and its results from the notebook namespace.
del job, results