In [90]:
# Import of relevant packages
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_auc_score, roc_curve, confusion_matrix, classification_report

from sklearn.linear_model import LogisticRegression

# Set random seed 
RSEED = 42
warnings.filterwarnings("ignore")

In [91]:
import pickle
with open('../data/data_test.pkl', 'rb') as f:
    data = pickle.load(f)

In [92]:
data.keys()

dict_keys(['X_train', 'X_test', 'y_train', 'y_test', 'df_combined', 'df_combined_encoded', 'X_train_resampled', 'y_train_resampled'])

In [93]:
X_train = data["X_train"]
X_test = data["X_test"]
y_train = data["y_train"]
y_test =  data["y_test"]
df_combined =  data["df_combined"]
df_combined_encoded = data["df_combined_encoded"]
X_train_resampled = data["X_train_resampled"]
y_train_resampled = data["y_train_resampled"]

In [94]:
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

X_train shape: (104496, 106)
X_test shape: (26124, 106)
y_train shape: (104496,)
y_test shape: (26124,)


In [95]:
X_train.head()

Unnamed: 0_level_0,creation_date,months_number,invoice_date,counter_coefficient,consommation_level_1,consommation_level_2,consommation_level_3,consommation_level_4,district_62,district_63,...,counter_code_506.0,counter_code_532.0,counter_code_565.0,counter_code_600.0,reading_remarque_7.0,reading_remarque_8.0,reading_remarque_9.0,reading_remarque_203.0,reading_remarque_413.0,counter_type_GAZ
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
train_Client_41801,1034553600000000000,4.0,1349831314285714176,1.0,165.970238,38.883929,5.860119,0.0,True,False,...,False,False,False,False,False,False,False,False,False,False
train_Client_59742,1164758400000000000,3.783784,1364842118918918912,1.0,235.464286,466.564286,37.6,87.128571,False,False,...,False,False,False,False,False,False,True,False,False,False
train_Client_59302,1281484800000000000,5.473684,1402012800000000000,1.0,83.514423,14.788462,0.0,0.0,False,False,...,False,False,False,False,False,False,True,False,False,False
train_Client_123486,836956800000000000,5.16129,1326345909677419264,1.0,145.09375,96.18125,4.0875,0.0,False,False,...,False,False,False,False,False,False,False,False,False,False
train_Client_43223,1241049600000000000,4.428571,1329566400000000000,1.0,75.443548,0.0,0.0,0.0,False,True,...,False,False,False,False,False,False,False,False,False,False


In [96]:
X_train.dtypes

creation_date               int64
months_number             float64
invoice_date                int64
counter_coefficient       float64
consommation_level_1      float64
                           ...   
reading_remarque_8.0         bool
reading_remarque_9.0         bool
reading_remarque_203.0       bool
reading_remarque_413.0       bool
counter_type_GAZ             bool
Length: 106, dtype: object

In [97]:
num_features = list(df_combined_encoded.columns[df_combined_encoded.dtypes == 'float64'])
num_features.remove('target')
num_features

['months_number',
 'counter_coefficient',
 'consommation_level_1',
 'consommation_level_2',
 'consommation_level_3',
 'consommation_level_4']

In [98]:
cat_features = list(df_combined_encoded.columns[df_combined_encoded.dtypes == 'bool'])
cat_features

['district_62',
 'district_63',
 'district_69',
 'client_catg_12',
 'client_catg_51',
 'region_103',
 'region_104',
 'region_105',
 'region_106',
 'region_107',
 'region_199',
 'region_206',
 'region_301',
 'region_302',
 'region_303',
 'region_304',
 'region_305',
 'region_306',
 'region_307',
 'region_308',
 'region_309',
 'region_310',
 'region_311',
 'region_312',
 'region_313',
 'region_371',
 'region_372',
 'region_379',
 'region_399',
 'tarif_type_10.0',
 'tarif_type_11.0',
 'tarif_type_12.0',
 'tarif_type_13.0',
 'tarif_type_14.0',
 'tarif_type_15.0',
 'tarif_type_21.0',
 'tarif_type_24.0',
 'tarif_type_29.0',
 'tarif_type_30.0',
 'tarif_type_40.0',
 'tarif_type_45.0',
 'counter_statue_1',
 'counter_statue_2',
 'counter_statue_3',
 'counter_statue_4',
 'counter_statue_5',
 'counter_statue_618',
 'counter_statue_269375',
 'counter_statue_0',
 'counter_statue_1',
 'counter_statue_4',
 'counter_statue_5',
 'counter_code_5.0',
 'counter_code_10.0',
 'counter_code_16.0',
 'counter_c

### Preprocessing Pipeline

In [99]:
#from sklearn.pipeline import Pipeline

# Pipline for standardization of features
num_pipeline = Pipeline([
    #('imputer_num', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler())
])

# Pipeline for categorical features 
#cat_pipeline = Pipeline([
    #('imputer_cat', SimpleImputer(strategy='constant', fill_value='missing')),
    #('1hot', OneHotEncoder(handle_unknown='ignore'))
#])

In [100]:
#from sklearn.compose import ColumnTransformer

# Complete pipeline for numerical and categorical features
# 'ColumnTransformer' applies transformers (num_pipeline/ cat_pipeline)
# to specific columns of an array or DataFrame (num_features/cat_features)
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_features),
    #('cat', cat_pipeline, cat_features),
])

In [101]:
# Building a full pipeline with our preprocessor and a LogisticRegression Classifier
pipe_logreg = Pipeline([
    ('preprocessor', preprocessor),
    ('logreg', LogisticRegression(max_iter=1000))
])

In [102]:
y_train_pred = cross_val_predict(pipe_logreg, X_train, y_train, cv=5)

In [105]:
# Calculating the accuracy for the LogisticRegression Classifier 
print('Cross validation scores y_train:')
print('-------------------------')
print("Accuracy: {:.2f}".format(accuracy_score(y_train, y_train_pred)))
print("Recall: {:.2f}".format(recall_score(y_train, y_train_pred)))
print("Precision: {:.2f}".format(precision_score(y_train, y_train_pred)))
print("AUC: {:.2f}".format(roc_auc_score(y_train, y_train_pred)))


Cross validation scores y_train:
-------------------------
Accuracy: 0.95
Recall: 0.00
Precision: 0.00
AUC: 0.50


In [106]:
y_train_resampled_pred = cross_val_predict(pipe_logreg, X_train_resampled, y_train_resampled, cv=5)

In [109]:
# Calculating the accuracy for the LogisticRegression Classifier 
print('Cross validation scores y_train_resampled:')
print('-------------------------')
print("Accuracy: {:.2f}".format(accuracy_score(y_train_resampled, y_train_resampled_pred)))
print("Recall: {:.2f}".format(recall_score(y_train_resampled, y_train_resampled_pred)))
print("Precision: {:.2f}".format(precision_score(y_train_resampled, y_train_resampled_pred)))
print("AUC: {:.2f}".format(roc_auc_score(y_train_resampled, y_train_resampled_pred)))

Cross validation scores y_train_resampled:
-------------------------
Accuracy: 0.60
Recall: 0.34
Precision: 0.71
AUC: 0.60
