In [1]:
%matplotlib inline

import sys
sys.path.append('..')
import os
from time import time
import pandas as pd
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import mean_absolute_error, mean_squared_error

from utils.pipeline_classes import (DataFrameSelectColumns,
                                    DataFrameConvertNumeric,
                                    DataFrameConvertOneHotEncoding)

from utils.dataset_functions import (dump_df_to_csv,
                                     build_train_valid_set)

from random_forest_functions import (filter_train_valid_sets_on_homogeneous_categ_features,
                                     build_X_y_arrays,
                                     preprocess,
                                     train_model_regressor,
                                     predict,
                                     plot_true_vs_pred,
                                     results_report,
                                     plot_feature_importances)

####  Input

In [2]:
target_feature = 'nb_ami_avant_30'

categorical_feat_list = [
    #'dc_typeaffichage_id',
    #'dc_codepostallieutravail',
    #'be19',
    'departement',
    #'domaine_pro',
    #'dc_rome_id',
    'dc_appelationrome_id',
    'dc_typesalaire',
    #'dc_typexperienceprof_id',
    'dc_typecontrat_id',
    'dc_typeservicerecrutement_1_id',
    'dc_modepreselection_id',
    'dc_categorie_contrat',
    'dc_categorie_experience',
    #'dc_categorie_dureetravailhebdoheures',
    #'dc_specificite1',
    #'dc_specificite2',
    #'dc_specificite3',
    #'nivfor',
    #'cluster_labels',
    'dc_trancheeffectifetab',
    'dc_idcdifficulteeconomique_id',
    #'mois'
]

numerical_feat_list = [
    'salaire_annuel_normalise',
    'ict1',
    'ict3',
    'ict10',
    'prin1',
    'prin2',
    'prin3',
    #'q33',
    #'q66',

]

In [3]:
# Load datasets
path = '../data/'
filename = 'dataset.csv'
split_percent = 0.8
df = pd.read_csv(os.path.join(path, filename), sep='|', dtype='unicode', error_bad_lines=False)
df.nb_ami_avant_30 = df.nb_ami_avant_30.astype(int)
df.at[df.nb_ami_avant_30 > 30, 'nb_ami_avant_30'] = 30

In [None]:
# train / valid sets
train_df, valid_df = build_train_valid_set(df, split_percent)
train_df, valid_df = filter_train_valid_sets_on_homogeneous_categ_features(categorical_feat_list, train_df, valid_df)

X_train, y_train = build_X_y_arrays(train_df, target_feature, categorical_feat_list, numerical_feat_list)
X_valid, y_valid = build_X_y_arrays(valid_df, target_feature, categorical_feat_list, numerical_feat_list)

#### Preprocess features

In [5]:
num_pipeline = Pipeline([
    ('col_selector', DataFrameSelectColumns(numerical_feat_list)),
    ('num_encoder', DataFrameConvertNumeric()),
])
    
cat_pipeline = Pipeline([
    ('col_selector', DataFrameSelectColumns(categorical_feat_list)),
    ('ohe_encoder', DataFrameConvertOneHotEncoding())
])

full_pipeline = FeatureUnion(transformer_list=[
    ('num_pipeline', num_pipeline),
    ('cat_pipeline', cat_pipeline)
])

X_train, X_valid = preprocess(X_train, X_valid, full_pipeline)

#### Train models

In [6]:
start = time()
n_estimators = 10
n_jobs = None
regr_fit = train_model_regressor(X_train, y_train, n_estimators, n_jobs)
print(f'Training done in {round((time() - start) / 60)} min.')

Training done in 15 min.


#### Predict on valid set

In [7]:
y_pred = predict(regr_fit, X_valid)
y_pred = y_pred.astype(int)
best_mae = int(mean_absolute_error(y_valid, y_pred))
best_mse = int(mean_squared_error(y_valid, y_pred))
print(f'mae: {best_mae}, mse: {best_mse}')
#plot_true_vs_pred(y_valid, y_pred)

mae: 6, mse: 80


In [8]:
y_true = y_valid
import numpy as np

In [9]:
y_true = pd.DataFrame(y_true, columns=['true'])
y_pred = pd.DataFrame(y_pred, columns=['pred'])
result_df = y_true.join(y_pred)

exact_preds = result_df[result_df.pred == result_df.true].pred
exact_zeros = result_df[result_df.sum(axis=1) == 0]
true_zeros = result_df[result_df.true == 0]
print(f'Exact preds ratio: {round(len(exact_preds) / len(result_df) * 100)}% ({len(exact_preds)} / {len(result_df)})')
print(f'Exact preds distrib:  median: {int(exact_preds.median())} - Q75: {int(np.percentile(exact_preds, 75))} - Q90: {int(np.percentile(exact_preds, 90))}')
print(f'Exact zeros ratio: {round(len(exact_zeros) / len(true_zeros) * 100)}%')  
not_pred_zeros = result_df[(result_df.true == 0) & (result_df.pred != 0)].pred
diff_true_pred = result_df.true - result_df.pred
pos_diff_true_pred = diff_true_pred[diff_true_pred > 0]
neg_diff_true_pred = - diff_true_pred[diff_true_pred < 0]
print(f'True > Pred: count: {len(pos_diff_true_pred)} - median: {int(pos_diff_true_pred.median())} - Q75: {int(np.percentile(pos_diff_true_pred, 75))} - Q90: {int(np.percentile(pos_diff_true_pred, 90))}')
print(f'True < Pred: count: {len(neg_diff_true_pred)} -  median: {int(neg_diff_true_pred.median())} -  Q75: {int(np.percentile(neg_diff_true_pred, 75))} - Q90: {int(np.percentile(neg_diff_true_pred, 90))}')


Exact preds ratio: 12% (4788 / 39690)
Exact preds distrib:  median: 1 - Q75: 3 - Q90: 7
Exact zeros ratio: 40%
True > Pred: count: 24810 - median: 6 - Q75: 12 - Q90: 19
True < Pred: count: 10092 -  median: 3 -  Q75: 6 - Q90: 10
