# Predicting CERFRANCE EBE Trajectories

In [None]:
import sys
import dataiku
import numpy as np
import pandas as pd
import sklearn as sk
import dataiku.core.pandasutils as pdu
from dataiku.doctor.preprocessing import PCA
from collections import defaultdict, Counter

Next, tune the pandas display options:

In [None]:
# Raise the pandas display limits: console width, then the maximum number of
# rows and of columns that are rendered.
for option_name, option_value in (
        ('display.width', 3000),
        ('display.max_rows', 200),
        ('display.max_columns', 200)):
    pd.set_option(option_name, option_value)

#### Importing base data

The first step is to get our machine learning dataset:

In [None]:
# We apply the preparation that you defined. You should not modify this.
# No preparation steps are configured here (empty list); the schema below lists
# the name and storage type of every column of the prepared dataset.
preparation_steps = []
preparation_output_schema = {u'userModified': False, u'columns': [{u'type': u'string', u'name': u'CERFRANCE'}, {u'type': u'double', u'name': u'charg123_avg'}, {u'type': u'double', u'name': u'charg123_stddev'}, {u'type': u'double', u'name': u'charg126_avg'}, {u'type': u'double', u'name': u'charg126_stddev'}, {u'type': u'double', u'name': u'marg469_avg'}, {u'type': u'double', u'name': u'marg469_stddev'}, {u'type': u'double', u'name': u'marg747_avg'}, {u'type': u'double', u'name': u'marg747_stddev'}, {u'type': u'double', u'name': u'marg750_avg'}, {u'type': u'double', u'name': u'marg750_stddev'}, {u'type': u'double', u'name': u'1_charg123'}, {u'type': u'double', u'name': u'1_charg126'}, {u'type': u'double', u'name': u'1_marg469'}, {u'type': u'double', u'name': u'1_marg747'}, {u'type': u'double', u'name': u'1_marg750'}, {u'type': u'bigint', u'name': u'traj_ebe_cice_new'}, {u'type': u'double', u'name': u'evolution_ddb_agri_n_n3'}, {u'type': u'double', u'name': u'evolution_n_n3_ecofi23'}, {u'type': u'bigint', u'name': u'notif_dev_new'}, {u'type': u'double', u'name': u'evolution_ddb_acs_n_n3'}]}

# Open the prepared dataset and attach the preparation settings to the handle.
ml_dataset_handle = dataiku.Dataset('RECUP_ECO_ALERT_NOTIFICATION_prepared')
ml_dataset_handle.set_preparation_steps(preparation_steps, preparation_output_schema)
# Read it into a pandas DataFrame with limit=100000; %time is the IPython
# magic that reports how long the call took.
%time ml_dataset = ml_dataset_handle.get_dataframe(limit = 100000)

print ('Base data has %i rows and %i columns' % (ml_dataset.shape[0], ml_dataset.shape[1]))
# First five records
ml_dataset.head(5)

#### Initial data management

In [None]:
## Variable selection
# Keep the fifteen numerical input columns plus the raw target column
# ('traj_ebe_cice_new'); every other column of the dataset is discarded.
ml_dataset = ml_dataset[[
    u'marg469_avg',
    u'1_marg469',
    u'charg126_stddev',
    u'1_charg126',
    u'marg469_stddev',
    u'charg126_avg',
    u'charg123_stddev',
    u'marg750_stddev',
    u'1_charg123',
    u'1_marg747',
    u'traj_ebe_cice_new',
    u'marg747_avg',
    u'charg123_avg',
    u'marg747_stddev',
    u'1_marg750',
    u'marg750_avg',
]]

Let's first coerce the categorical columns to unicode strings and the numerical features to floats.

In [None]:
# astype('unicode') does not work as expected

def coerce_to_unicode(x):
    """Return ``x`` as a text string on both Python 2 and Python 3.

    On Python 3 the result is ``str(x)``. On Python 2, byte strings are
    decoded as UTF-8 and every other value is passed to ``unicode()``.
    """
    if sys.version_info >= (3, 0):
        return str(x)
    # Python 2 only from here on.
    if isinstance(x, str):
        return unicode(x, 'utf-8')
    return unicode(x)


from dataiku.doctor.utils import datetime_to_epoch

# Feature roles. This model has only numerical inputs, so the categorical and
# text lists are empty.
categorical_features = []
numerical_features = [
    u'marg469_avg', u'1_marg469', u'charg126_stddev', u'1_charg126',
    u'marg469_stddev', u'charg126_avg', u'charg123_stddev', u'marg750_stddev',
    u'1_charg123', u'1_marg747', u'marg747_avg', u'charg123_avg',
    u'marg747_stddev', u'1_marg750', u'marg750_avg',
]
text_features = []

# Categorical and text columns get the same treatment: every value is turned
# into a unicode string.
for feature in categorical_features + text_features:
    ml_dataset[feature] = ml_dataset[feature].apply(coerce_to_unicode)

# Numerical columns: datetime64[ns] columns (directly, or through the dtype's
# `base`) go through datetime_to_epoch; everything else is cast to double.
datetime_dtype = np.dtype('M8[ns]')
for feature in numerical_features:
    column_dtype = ml_dataset[feature].dtype
    is_datetime = column_dtype == datetime_dtype or (
        hasattr(column_dtype, 'base') and column_dtype.base == datetime_dtype)
    if is_datetime:
        ml_dataset[feature] = datetime_to_epoch(ml_dataset[feature])
    else:
        ml_dataset[feature] = ml_dataset[feature].astype('double')

We are now going to handle the target variable and store it in a new column, `__target__`:

In [None]:
# Raw class label (as text) -> encoded binary target.
target_map = {u'1': 1, u'2': 0}


def _target_label(value):
    """Return the text form of a raw target value, as used by ``target_map`` keys.

    Integer-valued floats are rendered without the trailing ``.0``. pandas
    stores an integer column as float64 as soon as it contains a missing
    value; a plain ``str(1.0)`` would give ``'1.0'``, which matches no key of
    ``target_map`` and would cause every row to be dropped below.
    """
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


# Values that are not keys of target_map (including missing values, which
# become 'nan') map to NaN.
ml_dataset['__target__'] = ml_dataset['traj_ebe_cice_new'].map(_target_label).map(target_map)
del ml_dataset['traj_ebe_cice_new']


# Remove rows for which the target is unknown.
ml_dataset = ml_dataset[~ml_dataset['__target__'].isnull()]

#### Cross-validation strategy

In [None]:
# Split the dataset in two with the Dataiku pandas helper (prop=0.8), then
# report the size of each part.
train, test = pdu.split_train_valid(ml_dataset, prop=0.8)
print('Train data has %i rows and %i columns' % train.shape)
print('Test data has %i rows and %i columns' % test.shape)

#### Features preprocessing

The first thing to do at the feature level is to handle the missing values.
Let's reuse the settings defined in the model:

In [None]:
# Missing-value settings: no feature triggers a row drop, and every numerical
# feature is imputed with its mean.
drop_rows_when_missing = []
impute_when_missing = [
    {'impute_with': u'MEAN', 'feature': u'marg469_avg'},
    {'impute_with': u'MEAN', 'feature': u'1_marg469'},
    {'impute_with': u'MEAN', 'feature': u'charg126_stddev'},
    {'impute_with': u'MEAN', 'feature': u'1_charg126'},
    {'impute_with': u'MEAN', 'feature': u'marg469_stddev'},
    {'impute_with': u'MEAN', 'feature': u'charg126_avg'},
    {'impute_with': u'MEAN', 'feature': u'charg123_stddev'},
    {'impute_with': u'MEAN', 'feature': u'marg750_stddev'},
    {'impute_with': u'MEAN', 'feature': u'1_charg123'},
    {'impute_with': u'MEAN', 'feature': u'1_marg747'},
    {'impute_with': u'MEAN', 'feature': u'marg747_avg'},
    {'impute_with': u'MEAN', 'feature': u'charg123_avg'},
    {'impute_with': u'MEAN', 'feature': u'marg747_stddev'},
    {'impute_with': u'MEAN', 'feature': u'1_marg750'},
    {'impute_with': u'MEAN', 'feature': u'marg750_avg'},
]

# Features for which we drop rows with missing values
for feature in drop_rows_when_missing:
    train = train[train[feature].notnull()]
    test = test[test[feature].notnull()]
    print ('Dropped missing records in %s' % feature)

# Features for which we impute missing values.
# The fill value is always computed on the training set and then applied to
# both the training and the test set.
for feature in impute_when_missing:
    column = feature['feature']
    method = feature['impute_with']
    if method == 'MEAN':
        v = train[column].mean()
    elif method == 'MEDIAN':
        v = train[column].median()
    elif method == 'CREATE_CATEGORY':
        v = 'NULL_CATEGORY'
    elif method == 'MODE':
        # Most frequent value: value_counts() sorts by descending frequency.
        v = train[column].value_counts().index[0]
    elif method == 'CONSTANT':
        v = feature['value']
    else:
        # Fail loudly: without this branch an unknown method would silently
        # reuse the previous feature's fill value (or raise NameError on the
        # first iteration).
        raise ValueError('Unknown imputation method %r for feature %s' % (method, column))
    train[column] = train[column].fillna(v)
    test[column] = test[column].fillna(v)
    print ('Imputed missing values in feature %s with value %s' % (column, coerce_to_unicode(v)))

We can now handle the categorical features (still using the settings defined in Models). This model has no categorical features, so there is nothing to do for this step.

Let's rescale numerical features

In [None]:
# Rescaling method per feature; every feature here uses 'AVGSTD'.
rescale_features = {
    u'marg469_avg': u'AVGSTD',
    u'1_marg469': u'AVGSTD',
    u'charg126_stddev': u'AVGSTD',
    u'marg469_stddev': u'AVGSTD',
    u'charg123_avg': u'AVGSTD',
    u'charg126_avg': u'AVGSTD',
    u'charg123_stddev': u'AVGSTD',
    u'marg750_stddev': u'AVGSTD',
    u'1_charg123': u'AVGSTD',
    u'1_charg126': u'AVGSTD',
    u'marg747_avg': u'AVGSTD',
    u'1_marg747': u'AVGSTD',
    u'marg747_stddev': u'AVGSTD',
    u'1_marg750': u'AVGSTD',
    u'marg750_avg': u'AVGSTD',
}

for feature_name, rescale_method in rescale_features.items():
    # Shift and scale are computed on the training set only and then applied
    # to both sets.
    train_column = train[feature_name]
    if rescale_method == 'MINMAX':
        shift = train_column.min()
        scale = train_column.max() - shift
    else:
        shift, scale = train_column.mean(), train_column.std()

    # A zero scale would mean dividing by zero, so the feature is removed.
    if scale == 0.:
        del train[feature_name]
        del test[feature_name]
        print ('Feature %s was dropped because it has no variance' % feature_name)
        continue

    print ('Rescaled %s' % feature_name)
    train[feature_name] = (train[feature_name] - shift).astype(np.float64) / scale
    test[feature_name] = (test[feature_name] - shift).astype(np.float64) / scale

#### Modeling

Before actually creating our model, we need to split the datasets into their feature and label parts:

In [None]:
# Feature matrices: every column except the target.
train_X, test_X = (frame.drop('__target__', axis=1) for frame in (train, test))

# Label vectors, converted to NumPy arrays.
train_Y, test_Y = (np.array(frame['__target__']) for frame in (train, test))

Now we can finally create our model!

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Extra-trees classifier configured with the hyper-parameters below.
clf = ExtraTreesClassifier(
    n_estimators=30,      # number of trees in the ensemble
    max_depth=10,         # depth limit of each tree
    min_samples_leaf=3,   # minimum number of samples in a leaf
    random_state=1337,    # fixed seed
    verbose=2,            # verbosity level passed to scikit-learn
)

In [None]:
### Training my Model

# Fit the classifier on the training features and labels; %time is the IPython
# magic that reports how long the call took.
%time clf.fit(train_X, train_Y)

The model is now trained; we can apply it to our test set:

In [None]:
### Evaluation on the Test Set

%time _predictions = clf.predict(test_X)
%time _probas = clf.predict_proba(test_X)
predictions = pd.Series(data=_predictions, index=test_X.index, name='predicted_value')
cols = [
    u'probability_of_value_%s' % label
    for (_, label) in sorted([(int(target_map[label]), label) for label in target_map])
]
probabilities = pd.DataFrame(data=_probas, index=test_X.index, columns=cols)

# Build scored dataset
results_test = test_X.join(predictions, how='left')
results_test = results_test.join(probabilities, how='left')
results_test = results_test.join(test['__target__'], how='left')
results_test = results_test.rename(columns= {'__target__': 'traj_ebe_cice_new'})

#### Results

In [None]:
from dataiku.doctor.utils.metrics import mroc_auc_score

# The metric is called with the true test labels (wrapped in a Series) and the
# predicted class probabilities.
test_Y_ser = pd.Series(test_Y)
# Format with % like the other messages in this notebook: under Python 2,
# `print ('AUC value:', x)` would display a tuple instead of a plain line.
print ('AUC value: %s' % (mroc_auc_score(test_Y_ser, _probas),))

In [None]:
# Reverse target_map (encoded value -> original label) so that the
# predictions are displayed with the initial labels.
inv_map = dict((encoded, label) for label, encoded in target_map.items())
predictions.map(inv_map)