In [11]:
import numpy as np
import pandas as pd
import copy
import datetime
import time
from sklearn import preprocessing
from sklearn.externals import joblib


# Helper function: Remove $ from dollar amount features
def remove_dollar(x):
    """Convert a dollar-amount value such as ``'$1,234.50'`` to a float.

    The value is converted to text, leading/trailing ``$`` characters are
    stripped and thousands separators (``,``) are removed before the text is
    handed to ``float``.

    Parameters
    ----------
    x : object
        Raw cell value; it is passed through ``str`` first, so numbers and
        missing values are accepted as well as strings.

    Returns
    -------
    float
        The parsed amount, or ``np.nan`` when the text is not a valid number.
        A NaN input stringifies to ``'nan'`` and therefore stays NaN.
    """
    try:
        return float(str(x).strip('$').replace(',', ''))
    except (TypeError, ValueError):
        # Only conversion failures mean "unparseable". Anything else
        # (KeyboardInterrupt, SystemExit, ...) must keep propagating instead
        # of being silently turned into a missing value.
        return np.nan

# Helper function: Convert the variable format from percentage to float
def per_float(x):
    """Convert a percentage value such as ``'12.5%'`` to a fraction (``0.125``).

    The value is converted to text, leading/trailing ``%`` characters are
    stripped, and the remaining number is divided by 100.

    Parameters
    ----------
    x : object
        Raw cell value; it is passed through ``str`` first, so numbers and
        missing values are accepted as well as strings.

    Returns
    -------
    float
        The percentage expressed as a fraction, or ``np.nan`` when the text
        is not a valid number.
    """
    try:
        return float(str(x).strip('%')) / 100
    except (TypeError, ValueError):
        # Only conversion failures mean "unparseable"; interrupts and other
        # non-conversion errors are deliberately left to propagate.
        return np.nan

# Helper function: Convert time from string format to float (Number of years since 1900-01-01)
def toYears(x):
    """Convert a ``Mon-YYYY`` or ``Mon-YY`` date string to years since 1900-01-01.

    The four-digit-year format (``%b-%Y``, e.g. ``'Jan-2015'``) is tried
    first, then the two-digit-year format (``%b-%y``, e.g. ``'Jan-15'``).
    A two-digit year that parses to a date later than 2017-12-31 is moved
    back one century (``'Jan-45'`` is read as January 1945).

    Parameters
    ----------
    x : str
        Date text. Non-string values (for example a float NaN) are treated
        as unparseable.

    Returns
    -------
    float
        Whole days since 1900-01-01 divided by 365.0, or ``np.nan`` when the
        value matches neither format.
    """
    epoch = datetime.datetime(1900, 1, 1)
    # Two-digit years that land after this date are assumed to belong to the
    # previous century.
    century_cutoff = datetime.datetime(2017, 12, 31)
    # Measuring a 20xx date from 2000-01-02 gives the same day count as
    # measuring the matching 19xx date from 1900-01-01: the extra day makes
    # up for 2000 being a leap year while 1900 is not.
    shifted_epoch = datetime.datetime(2000, 1, 2)

    try:
        parsed = datetime.datetime.strptime(x, "%b-%Y")
    except (TypeError, ValueError):
        # ValueError: text does not match the format; TypeError: not a string.
        try:
            parsed = datetime.datetime.strptime(x, "%b-%y")
        except (TypeError, ValueError):
            return np.nan
        if (parsed - century_cutoff).days > 0:
            return (parsed - shifted_epoch).days / 365.0

    return (parsed - epoch).days / 365.0

# Preprocess input file
def pre_process(raw):
    """Load a raw data set and turn it into the feature frame fed to the model.

    Parameters
    ----------
    raw : str, path-like, file-like or pandas.DataFrame
        The CSV file to load (anything ``pd.read_csv`` accepts), or an
        already loaded DataFrame with the same columns. A DataFrame passed in
        is not modified.

    Returns
    -------
    pandas.DataFrame
        The input with columns x2, x3, x8, x10, x16, x18 and x19 removed;
        x4, x5, x6, x12 (dollar amounts), x30 (percentage) and x15, x23
        (dates) converted to floats; derived columns x33, x34 and x35
        appended; NaN in float64 columns replaced by 1e20; and every
        object-typed column label-encoded to integer codes.

    Raises
    ------
    KeyError
        If one of the columns listed above is missing from the input.
    """
    # Sentinel written in place of missing numeric values; it is far outside
    # the data range so a tree model can split it off by itself.
    null_sentinel = 10**20

    # Honour the caller's argument instead of a fixed file name.
    frame = raw if isinstance(raw, pd.DataFrame) else pd.read_csv(raw)

    unused_cols = [
        'x2', 'x3', 'x19',    # not used for modeling
        'x10', 'x16', 'x18',  # would need natural language processing first
        'x8',                 # redundant feature
    ]
    # drop() returns a new frame, so a caller-supplied DataFrame stays intact.
    frame = frame.drop(columns=unused_cols)

    # Remove $ from dollar amount features
    for col in ('x4', 'x5', 'x6', 'x12'):
        frame[col] = frame[col].apply(remove_dollar)

    # Convert the variable format from percentage to float
    frame['x30'] = frame['x30'].apply(per_float)

    # Ratio features used in modeling
    frame['x33'] = frame['x5'] / frame['x4']
    frame['x34'] = frame['x6'] / frame['x5']

    # Convert time from string format to float (years since 1900-01-01)
    for col in ('x15', 'x23'):
        frame[col] = frame[col].apply(toYears)

    # Time difference between issue date and the date opened
    frame['x35'] = frame['x15'] - frame['x23']

    # Split input variables into numerical features and categorical features
    cat_cols = frame.dtypes[frame.dtypes == 'object'].index
    num_cols = frame.dtypes[frame.dtypes == 'float64'].index

    # Assign the filled column back explicitly: an in-place fillna on
    # ``frame[col]`` acts on a temporary object and is not guaranteed to
    # reach the frame (it is a no-op under pandas Copy-on-Write).
    for col in num_cols:
        frame[col] = frame[col].fillna(null_sentinel)

    # Label encoding for categorical features; missing values become '0'.
    # NOTE(review): the encoders are fitted on the data being scored, so the
    # integer codes match the training-time codes only if every column holds
    # the same set of categories as the training data - confirm, or persist
    # the fitted encoders alongside the model.
    for col in cat_cols:
        encoder = preprocessing.LabelEncoder()
        frame[col] = encoder.fit_transform(frame[col].fillna('0'))

    return frame




def ml_api(filename, model):
    """Score a data set with a persisted model and report the predictions.

    Parameters
    ----------
    filename : str
        Data set to score; forwarded unchanged to ``pre_process``.
    model : str
        Location of the serialized estimator, loaded with ``joblib.load``.

    Returns
    -------
    object
        Whatever the loaded estimator's ``predict`` returns for the
        preprocessed data. The same value is also printed.
    """
    # SECURITY: joblib.load unpickles the file, which can execute arbitrary
    # code - only load model files that come from a trusted source.
    estimator = joblib.load(model)

    # Read and preprocess the input data
    features = pre_process(filename)

    # Make predictions on the data set
    predictions = estimator.predict(features)
    print('Predictions: ', predictions)

    # Hand the predictions back so callers can use them programmatically
    # rather than only reading them from stdout.
    return predictions

In [13]:
# Score the data set in test.csv with the trained model stored in model.pkl; ml_api prints the predictions
ml_api(filename='test.csv', model='model.pkl')

Predictions:  [0.81666667 0.14444444 0.45       ... 0.06111111 0.06111111 0.18888889]
