In [1]:
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import pandas as pd
%matplotlib inline
#matplotlib.rcParams['figure.figsize'] = (6, 4)

# ignore pandas warnings
import warnings
warnings.simplefilter('ignore')

import time
start = time.time()

In [2]:
# load data
data = pd.read_csv('training_ultrasound.csv')

# remove agedays > 0 ( we just only focus pre-birth measurements)
data = data[data['AGEDAYS']<0]

# drop rows with missing data in any of the 5 main columns
ultrasound = ['HCIRCM', 'ABCIRCM', 'BPDCM', 'FEMURCM']
target = 'BWT_40'
data.dropna(subset=ultrasound+[target], inplace=True)

# correct faulty data
data.loc[data['STUDYID']==2, 'PARITY'] = data.loc[data['STUDYID']==2, 'PARITY'] + 1

## Model

In [3]:
# select basic vars
df = data[ultrasound + ['GAGEDAYS', 'SEXN', 'PARITY', 'GRAVIDA'] + [target]]

In [4]:
df.isnull().sum()

HCIRCM        0
ABCIRCM       0
BPDCM         0
FEMURCM       0
GAGEDAYS      0
SEXN          0
PARITY      101
GRAVIDA     101
BWT_40        0
dtype: int64

In [5]:
# there is missing data for parity and gravida: this happens for first pregnancy --> fill with 1s
df.fillna(1, inplace=True)

# replace sex values to 0 and 1
df['SEXN'] = df['SEXN'].replace([1,2], [0,1])

### Feature engineering 

In [6]:
# aspect ratio: measure of the slenderness
df['femur/abd'] = df['FEMURCM']/df['ABCIRCM']

# excentricity of the cenithal snapshot of the head (similar to ellipse)
df['head'] = df['HCIRCM'] / df['BPDCM']

# proxy for head volume
df['vol'] = (df['BPDCM'])**3

# body as a cilinder of radius ABCIRCM and height FEMURCM
df['cilinder'] =(df['ABCIRCM']**2)*df['FEMURCM']

# full interaction term for the 4 measurements
df['four'] = df['HCIRCM']*df['BPDCM']*df['ABCIRCM']*df['FEMURCM']

# femur length scaled with time
df['femur_temp'] = (df['FEMURCM']/df['GAGEDAYS'])

# head-femur polynomial interaction term
df['head*femur'] = df['head']*df['FEMURCM']

# no of past pregancies
df['past_gest'] = df['PARITY'] - df['GRAVIDA']

In [7]:
# common models for sonographic fetal weight estimation use log of the weight
df['BWT_40'] = np.log(1 + df['BWT_40'])

In [8]:
print('Dataframe size: %s,%s' % (df.shape[0],df.shape[1]))

Dataframe size: 7928,17


In [9]:
# sklearn imports
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_absolute_error

from aux_fun import plot_learning_curve, plot_validation_curve

### Split train/test data

In [10]:
# df to np arrays
X = df.drop(target,axis=1).values

Y = df[target].values

# train-test split
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

### Define model pipeline

In [11]:
from xgboost import XGBRegressor
xgb = XGBRegressor()

#### CV strategy

In [12]:
from sklearn.model_selection import RandomizedSearchCV
from aux_fun import report

In [13]:
kf = KFold(n_splits=10,random_state=0)

In [14]:
params_grid = {
    'max_depth': [8],
    'subsample': np.arange(0.7,1.0,0.1),
    'learning_rate': np.arange(0.02,0.1,0.01),
    'n_estimators': np.arange(50,1000,200)
}

random_search = RandomizedSearchCV(xgb, param_distributions=params_grid, n_iter=50, 
                                   n_jobs=-1, scoring='mean_absolute_error', cv=kf)

In [15]:
random_search.fit(x_train,y_train)

RandomizedSearchCV(cv=KFold(n_splits=10, random_state=0, shuffle=False),
          error_score='raise',
          estimator=XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
          fit_params={}, iid=True, n_iter=50, n_jobs=-1,
          param_distributions={'subsample': array([ 0.7,  0.8,  0.9,  1. ]), 'n_estimators': array([ 50, 250, 450, 650, 850]), 'learning_rate': array([ 0.02,  0.03,  0.04,  0.05,  0.06,  0.07,  0.08,  0.09]), 'max_depth': [8]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True, scoring='mean_absolute_error',
          verbose=0)

In [16]:
best_params = random_search.cv_results_['params'][np.flatnonzero(random_search.cv_results_['rank_test_score'] == 1)[0]]
report(random_search.cv_results_)

Model with rank: 1
Mean validation score: -0.050975 (std: 0.001572)
Parameters: {'subsample': 0.89999999999999991, 'learning_rate': 0.069999999999999993, 'max_depth': 8, 'n_estimators': 850}

Model with rank: 2
Mean validation score: -0.051037 (std: 0.001564)
Parameters: {'subsample': 0.89999999999999991, 'learning_rate': 0.059999999999999998, 'max_depth': 8, 'n_estimators': 850}

Model with rank: 3
Mean validation score: -0.051279 (std: 0.001560)
Parameters: {'subsample': 0.89999999999999991, 'max_depth': 8, 'learning_rate': 0.049999999999999996, 'n_estimators': 650}



In [17]:
scores = list()
# evaluate model with best alpha given by CV
xgb.set_params(**best_params)
for train_k, test_k in kf.split(x_train):
    xgb.fit(x_train[train_k],y_train[train_k])
    w_true_k = np.exp(y_train[test_k]) - 1
    w_pred_k = np.exp(xgb.predict(x_train[test_k])) - 1
    scores.append(mean_absolute_error(w_true_k, w_pred_k))
print('Weight error: %0.4f +- %0.4f' % (np.mean(scores),2*np.std(scores)))

Weight error: 0.2172 +- 0.0140


#### Fit whole train with best hyperparameters

In [18]:
xgb.fit(x_train,y_train)

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.069999999999999993, max_delta_step=0, max_depth=8,
       min_child_weight=1, missing=None, n_estimators=850, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True,
       subsample=0.89999999999999991)

In [19]:
w_true = np.exp(y_test) - 1
w_pred = np.exp(xgb.predict(x_test)) - 1
abs_error = mean_absolute_error(w_true, w_pred)
pct_error = abs_error / w_true
print('Test mean abs error: ', abs_error)
print('Mean relative error: %0.4f' % pct_error.mean())

Test mean abs error:  0.210557249449
Mean relative error: 0.0651


In [20]:
time.time() - start

438.4367208480835