In [1]:
import numpy as np
import pandas as pd
import copy
import matplotlib.pyplot as plt
import datetime
import time
%matplotlib inline

### Loading data

In [2]:
raw_train=pd.read_csv('exercise_06_train.csv')
raw_test=pd.read_csv('exercise_06_test.csv')

In [None]:
raw_train.head()

In [4]:
print(raw_train.shape)

(40000, 101)


In [5]:
### Drop NULL values in column x1
raw_train_drop = raw_train.dropna(axis=0,subset=['y'])

In [6]:
print(raw_train_drop.shape)

(40000, 101)


In [7]:
### Combine train and test data
list_all=[raw_train_drop,raw_test]
raw = pd.concat(list_all,ignore_index=True)

In [8]:
len_train = len(raw_train_drop)
len_test =len(raw_test)
print('The size of effective training and test dataset is', len_train, len_test)

The size of effective training and test dataset is 40000 10000


In [9]:
del(raw_train,raw_test,raw_train_drop)

In [10]:
### Get percentage of NULL values for each feature
Null_list = raw.isnull().sum().sort_values(ascending=False)/float(raw.shape[0])*100
print('the pencentage of NUll value in each features are:', Null_list[:10])

the pencentage of NUll value in each features are: y      20.000
x13     0.034
x55     0.034
x42     0.034
x18     0.032
x62     0.030
x99     0.030
x24     0.030
x96     0.028
x63     0.028
dtype: float64


### Feature Selection

In [11]:
#remove features not used for modeling
del raw['x2']
del raw['x3']
del raw['x19']

#These feature need Nature Language Processing before using, thus increasing the complexity of current model
del raw['x10']
del raw['x16']
del raw['x18']

#Remove redundant feature
del raw['x8']

### Feature Engineering

In [12]:
# Remove $ from dollar amount features
def remove_dollar(x):
    try:
        x = str(x)
        return float(x.strip('$').replace(',',''))
    except:
        return np.nan

In [13]:
raw['x4'] = raw['x4'].apply(remove_dollar)
raw['x5'] = raw['x5'].apply(remove_dollar)
raw['x6'] = raw['x6'].apply(remove_dollar)
raw['x12'] = raw['x12'].apply(remove_dollar)

In [14]:
# Convert the variable format from percentage to float
def per_float(x):
    try:
        x = str(x)
        return float(x.strip('%'))/100
    except:
        return np.nan

In [15]:
raw['x30'] = raw['x30'].apply(per_float)

In [16]:
# Create new features to be used in modeling
raw['x33'] = raw['x5']/raw['x4']
raw['x34'] = raw['x6']/raw['x5']

In [17]:
# Convert time from string format to float (Number of years since 1900-01-01)
def toYears(x):
    try:
        x = datetime.datetime.strptime(x, "%b-%Y")
        x = x-datetime.datetime(1900,1,1)
        return x.days/365.0
    except:
        try:
            x = datetime.datetime.strptime(x, "%b-%y")
            if (x - datetime.datetime(2017,12,31)).days> 0:
                x = x-datetime.datetime(2000,1,2)
                return x.days/365.0
            x = x-datetime.datetime(1900,1,1)
            return x.days/365.0        
        except:
            return np.nan

In [18]:
raw['x15'] = raw['x15'].apply(toYears)
raw['x23'] = raw['x23'].apply(toYears)

In [19]:
# Time difference between issue date and the date opened
raw['x35'] = raw['x15']  - raw['x23']

In [20]:
### Set target variable and remove it from input variable list
raw_y = raw['y']
del raw['y']

### Split input variables into numerical features and categorical features

In [21]:
cat_cols = raw.dtypes[raw.dtypes == 'object'].index
num_cols = raw.dtypes[raw.dtypes == 'float64'].index

### Make statistics analysis on target variables, numerical features and categorical features
### This cell is very slow to run analysis. Don't run unless you're interested in viewing individual features'

In [None]:
# generate histogram for all features
fig= plt.figure(figsize=(5,5))
ax1 = fig.add_subplot(1,1,1)
raw_y.plot(kind= 'hist',axes =ax1)
plt.title('Histgram for interest rate')
plt.show()

for i in cat_cols:
    fig= plt.figure(figsize=(15,5))
    ax1 = fig.add_subplot(1,2,1)
    raw[i].value_counts().plot(kind= 'bar',axes =ax1)
    plt.title('Histgram for feature: %s' %(i))
    ax2 = fig.add_subplot(1,2,2)
    raw[i].value_counts(normalize = 'True').plot(kind= 'bar', axes =ax2)
    plt.title('Histgram for feature: %s (in percentate)' %(i))
    plt.show()

for i in num_cols:
    a = raw[i]
    b = a[abs(a - a.mean()) <=3*a.std()]
    fig= plt.figure(figsize=(8,5))
    b.plot(kind= 'hist',bins = 10)
    plt.title('Histgram of %s' %(i))
    plt.show()

In [23]:
## Make a copy of raw input, will be used later as input variables in the linear regression model 
raw_bp_linear = copy.deepcopy(raw)

### Data preparation for tree model: 

In [24]:
### Replace the Null value with very large number (10**20), let tree model to interpret by itself
for i in num_cols:
    raw[i].fillna(10**20,inplace=True)

In [None]:
### Label encoding for categorical feature
from sklearn import preprocessing
LBL = preprocessing.LabelEncoder()
dict_list = []
for i in cat_cols:
    raw[i] = LBL.fit_transform(raw[i].fillna('0'))
    j = dict(zip(np.arange(len(LBL.classes_)),LBL.classes_))
    k = {i:j}
    dict_list.append(k)
    
# uncomment the following print statement if you want to see the dictionary
#print(dict_list)

In [26]:
### Split into training and test dataset
from sklearn.model_selection import train_test_split
x = raw[:len_train]
y = raw_y[:len_train]
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.33, random_state=42)
holdout_x = raw[len_train:]
holdout_y = raw_y[len_train:]

In [27]:
### Build random forest model

from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

In [28]:
rfr = RandomForestRegressor(n_jobs=-1) 

#### hyper-parameter search: n_estimators
start to build random forest model"
To reduce the code running time, the process of hypermeter grid search for (n_estimator)
which may take one hour or so. Here we only use the final search result
If interested in checking the search process, run the following cell

In [None]:
param_grid = { 
    'n_estimators': [60,120,180]

}

print('start the hypermeter grid search for n_estimator, it may take a few minutes')
CV_rfr = GridSearchCV(estimator=rfr, param_grid=param_grid, cv= 5)
CV_rfr.fit(train_x, train_y)

#best_estimators = CV_rfr.best_params_.values()[0]
best_estimators = CV_rfr.best_params_["n_estimators"]

print(' hypermeter grid search is over')
print('The best paramter for n_estimator is:', best_estimators)

In [29]:
### Develop a random forest model with 'n_estimators' = best_estimators

print('random forest model is developing, it may take 10 minutes')
best_estimators = 180
rfr_best = RandomForestRegressor(n_jobs=-1,n_estimators=best_estimators) 
rfr_best.fit(train_x, train_y)

random forest model is developing, it may take 10 minutes


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=180, n_jobs=-1, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [30]:
### Save the model in local disk

from sklearn.externals import joblib
joblib.dump(rfr_best, 'rforest.pkl')



['rforest.pkl']

In [31]:
### Evaluate the training AUC performance of the model
from sklearn.metrics import roc_auc_score

rfr_load = joblib.load('rforest.pkl')
print('random forest model is running')
train_y_pred = rfr_load.predict(train_x)
auc_train = roc_auc_score(train_y, train_y_pred)
print('Training AUC: ', auc_train)

random forest model is running
Training AUC:  1.0


In [32]:
### Make prediction on test data
test_y_pred = rfr_load.predict(test_x)
auc_test = roc_auc_score(test_y, test_y_pred)
print('Testing AUC: ', auc_test)

Testing AUC:  0.9366814302563101
