**Set env variables and create directory for module**

In [208]:
%env PROJECT_ID frauddetectionkaggle
%env BUCKET_ID frauddetectionkagglepkmatt
%env REGION europe-west1
%env TRAINER_PACKAGE_PATH ./fraud_detection_training
%env MAIN_TRAINER_MODULE fraud_detection_training.train
%env JOB_DIR gs://frauddetectionkagglepkm/xgb_job_dir
%env RUNTIME_VERSION 1.14
%env PYTHON_VERSION 3.5
! mkdir fraud_detection_training

env: PROJECT_ID=frauddetectionkaggle
env: BUCKET_ID=frauddetectionkagglepkmatt
env: REGION=europe-west1
env: TRAINER_PACKAGE_PATH=./fraud_detection_training
env: MAIN_TRAINER_MODULE=fraud_detection_training.train
env: JOB_DIR=gs://frauddetectionkagglepkm/xgb_job_dir
env: RUNTIME_VERSION=1.14
env: PYTHON_VERSION=3.5
mkdir: cannot create directory ‘fraud_detection_training’: File exists


**Create a bucket (NB: AI Platform won't work with a multi region bucket!!)**

In [209]:
! gsutil mb -l $REGION gs://$BUCKET_ID
#upload the CSVs to this bucket manually

Creating gs://frauddetectionkagglepkmatt/...
ServiceException: 409 Bucket frauddetectionkagglepkmatt already exists.


**Create training script that downloads, encodes, trains and saves the model**

In [252]:
#%%writefile ./fraud_detection_training/train.py
# [START setup]
import datetime
import pandas as pd
import numpy as np
import re
from sklearn import preprocessing
from sklearn.preprocessing import Imputer
from sklearn.externals import joblib
import xgboost as xgb
from google.cloud import storage
import os
BUCKET_ID = 'frauddetectionkagglepkmatt'


# ---------------------------------------
# 1. Add code to download the data from GCS 
# AI Platform will then be able to use the data when training your model.
# ---------------------------------------
# [START download-data]

identity = 'train_identity_small.csv'
transaction = 'train_transaction_small.csv'

#  bucket holding the data
bucket = storage.Client().bucket(BUCKET_ID)

# Path to the data inside the public bucket
data_dir = 'data/raw/'

if not os.path.exists(identity):
    # Download the data
    blob = bucket.blob(''.join([data_dir, identity]))
    blob.download_to_filename(identity)
    
if not os.path.exists(transaction):    
    blob = bucket.blob(''.join([data_dir, transaction]))
    blob.download_to_filename(transaction)


def load_and_merge_data(transaction_csv,identity_csv,isTrain,nrows=1000000):
    df_transaction = pd.read_csv(transaction_csv, index_col='TransactionID',nrows=nrows)
    df_identity = pd.read_csv(identity_csv, index_col='TransactionID',nrows=nrows)
    df = pd.merge(df_transaction, df_identity, on='TransactionID', how='left')
    del df_transaction
    del df_identity
    if isTrain:
        labels = df[['isFraud']]
        df.pop('isFraud')
    else:
        labels = []
    return df, labels

train,labels  = load_and_merge_data(transaction,identity,isTrain=True)
#train,labels  = load_and_merge_data('gs://frauddetectionkagglepkmatt/data/raw/train_transaction.csv','gs://frauddetectionkagglepkmatt/data/raw/train_identity.csv',isTrain=True,nrows=5000)
# #validate,vallabels  = load_and_merge_data('./data/raw/test_transaction.csv','./data/raw/test_identity.csv',isTrain=False,nrows=5000)

#print(train.shape)

def get_lists_of_numerical_categorical(df,regex):
    #Regex for categorical fields:
    categorical = []
    numerical = []

    #Create lists of categorical and numeircal fields:
    for i in df:
        if re.match(regex, i):
            categorical.append(i)
        else:
            numerical.append(i)
    return numerical,categorical

cat_columns_regex='ProductCD|card[1-6]|addr\d|\w_emaildomain|M[1-9]|time_|Device\w+|id_12|id_13|id_14|id_15|id_16|id_17|id_18|id_19|id_20|id_21|id_22|id_23|id_24|id_25|id_26|id_27|id_28|id_29|id_30|id_31|id_32|id_33|id_34|id_35|id_36|id_37|id_38'
numerical,categorical = get_lists_of_numerical_categorical(train,cat_columns_regex)

def numerically_encode_string_categoricals(df):
    for i in df.columns:
        if df[i].dtype == 'object':
            lbl = preprocessing.LabelEncoder()
            lbl.fit(list(df[i].values) + list(df[i].values))
            df[i] = lbl.transform(list(df[i].values))
    return df
train = numerically_encode_string_categoricals(train)
#validate = numerically_encode_string_categoricals(validate)

# From kernel https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
# WARNING! THIS CAN DAMAGE THE DATA 
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

train = reduce_mem_usage(train)

#Impute median for numerical and mode for categorical
def impute_cat_and_num(df,numerical,categorical):
    fill_NaN_numerical = Imputer(missing_values=np.nan, strategy='median',axis=1)
    fill_NaN_categorical = Imputer(missing_values=np.nan, strategy='most_frequent',axis=1)
    df[numerical] = fill_NaN_numerical.fit_transform(df[numerical])
    df[categorical] = fill_NaN_categorical.fit_transform(df[categorical])
    return df
train = impute_cat_and_num(train,numerical,categorical)

# [START load-into-dmatrix-and-train]
# load data into DMatrix object
dtrain = xgb.DMatrix(train, labels)

# train model

param_dict = {
    'base_score':0.5,
    'booster':'gbtree',
    'colsample_bylevel':1,
    'colsample_bynode':1,
    'colsample_bytree':0.9,
    'gamma':0,
    'learning_rate':0.05,
    'max_delta_step':0,
    'max_depth':9,
    'min_child_weight':1,
    'n_estimators':1000,#this has to be entered explicitly in the function
    'n_jobs':1,
    'nthread':7,
    'objective':'binary:logistic',
    'random_state':42,
    'reg_alpha':0,
    'reg_lambda':1,
    'scale_pos_weight':1,
    'seed':42,
    'subsample':0.9,
    'tree_method':'auto',
    'verbosity':1
}

clf = xgb.train(param_dict, dtrain, param_dict['n_estimators'])
model = 'model.joblib'
#clf.save_model(model)
# Export the model to a file
joblib.dump(clf, model)
# ---------------------------------------
# 2. Export and save the model to GCS
# ---------------------------------------
bucket = storage.Client().bucket(BUCKET_ID)
blob = bucket.blob('{}/{}'.format(
    datetime.datetime.now().strftime('fraud_detect_kaggle_%Y%m%d_%H%M%S'),
    model))
blob.upload_from_filename(model)

Memory usage of dataframe is 1.65 MB
Memory usage after optimization is: 0.40 MB
Decreased by 75.5%


In [31]:
%%writefile ./fraud_detection_training/__init__.py
# Note that __init__.py can be an empty file.

Writing ./fraud_detection_training/__init__.py


In [39]:
! gcloud config set project $PROJECT_ID

Updated property [core/project].


In [67]:
#%%writefile ./fraud_detection_training/setup.py
#from setuptools import find_packages
#from setuptools import setup
#
#REQUIRED_PACKAGES = ['xgboost']
#
#setup(
#    name='trainer',
#    version='0.1',
#    install_requires=REQUIRED_PACKAGES,
#    packages=find_packages(),
#    include_package_data=True,
#    description='My training application package.'
#)

Writing ./fraud_detection_training/setup.py


In [141]:
! gcloud ai-platform local train \
  --job-dir $JOB_DIR \
  --package-path $TRAINER_PACKAGE_PATH \
  --module-name $MAIN_TRAINER_MODULE

(499, 432)


In [148]:
! gcloud ai-platform jobs submit training fraud_detection_training_$(date +"%Y%m%d_%H%M%S") \
  --job-dir $JOB_DIR \
  --package-path $TRAINER_PACKAGE_PATH \
  --module-name $MAIN_TRAINER_MODULE \
  --region $REGION \
  --runtime-version=$RUNTIME_VERSION \
  --python-version=$PYTHON_VERSION \
  --scale-tier BASIC

Job [fraud_detection_training_20190916_164725] submitted successfully.
Your job is still active. You may view the status of your job with the command

  $ gcloud ai-platform jobs describe fraud_detection_training_20190916_164725

or continue streaming the logs with the command

  $ gcloud ai-platform jobs stream-logs fraud_detection_training_20190916_164725
jobId: fraud_detection_training_20190916_164725
state: QUEUED


In [211]:
model = 'model.bst'
#  bucket holding the data
bucket = storage.Client().bucket(BUCKET_ID)
data_dir = 'fraud_detect_kaggle_20190916_185055/'
#gs://frauddetectionkagglepkmatt/fraud_detect_kaggle_20190916_185055/model.bst
# Download the data
blob = bucket.blob(''.join([data_dir, model]))
blob.download_to_filename(model)

In [245]:
clf = load('./model.joblib')

In [264]:
validate,vallabels  = load_and_merge_data('./data/raw/test_transaction.csv','./data/raw/test_identity.csv',isTrain=False)

In [265]:
cat_columns_regex='ProductCD|card[1-6]|addr\d|\w_emaildomain|M[1-9]|time_|Device\w+|id_12|id_13|id_14|id_15|id_16|id_17|id_18|id_19|id_20|id_21|id_22|id_23|id_24|id_25|id_26|id_27|id_28|id_29|id_30|id_31|id_32|id_33|id_34|id_35|id_36|id_37|id_38'
numerical,categorical = get_lists_of_numerical_categorical(validate,cat_columns_regex)
validate = numerically_encode_string_categoricals(validate)
validate = reduce_mem_usage(validate)
validate = impute_cat_and_num(validate,numerical,categorical)
dvalidate = xgb.DMatrix(validate, labels)

Memory usage of dataframe is 1673.87 MB
Memory usage after optimization is: 460.02 MB
Decreased by 72.5%


MemoryError: 

In [258]:
val_pred=clf.predict(dvalidate)

In [239]:
#for API:
#validate = {"instances": validate.values.tolist()}
#for gcloud:
validate = validate.values.tolist()
with open("test.txt", "w") as output:
    output.write(str(validate))

In [240]:
! gcloud ai-platform local predict --model-dir gs://frauddetectionkagglepkmatt/fraud_detect_kaggle_20190917_090223/ \
  --text-instances test.txt \
  --framework xgboost

If the signature defined in the model is not serving_default then you must specify it via --signature-name flag, otherwise the command may fail.
[1;31mERROR:[0m (gcloud.ai-platform.local.predict) Cannot import xgboost. Please verify "python -c 'import xgboost'" works.

