### Codificación de las variables categóricas, escalado y modelo

In [1]:
import pandas as pd 
import numpy as np
import sklearn
from sklearn.pipeline import Pipeline
from sklearn import metrics
# conda install category_encoders
import category_encoders as ce
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import GradientBoostingClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, \
                            silhouette_score, recall_score, precision_score, make_scorer, \
                            roc_auc_score, f1_score, precision_recall_curve, accuracy_score, roc_auc_score, \
                            classification_report, confusion_matrix

from sklearn import metrics
from sklearn.metrics import plot_confusion_matrix

In [2]:
# Load the train and test splits from CSV.
# NOTE(review): the file names suggest missing values and outliers were already
# handled upstream — confirm against the notebook that wrote these files.
df_fraud_train = pd.read_csv("../data/train_pd_data_preprocessing_missing_outlier.csv")
df_fraud_test  = pd.read_csv("../data/test_pd_data_preprocessing_missing_outlier.csv")

In [3]:
# List every column present in the training frame.
df_fraud_train.columns


Index(['Unnamed: 0.1', 'Unnamed: 0', 'customer_age', 'payment_type',
       'zip_count_4w', 'bank_branch_count_8w',
       'date_of_birth_distinct_emails_4w', 'employment_status',
       'credit_risk_score', 'email_is_free', 'housing_status',
       'phone_home_valid', 'phone_mobile_valid', 'has_other_cards',
       'foreign_request', 'source', 'device_os', 'keep_alive_session',
       'device_fraud_count', 'month', 'fraud_bool',
       'proposed_credit_limit_input', 'prev_address_months_count_input',
       'session_length_in_minutes_input', 'velocity_24h_input',
       'bank_months_count_input', 'current_address_months_count_input',
       'income_input', 'velocity_6h_input', 'intended_balcon_amount_input',
       'velocity_4w_input', 'days_since_request_input',
       'device_distinct_emails_8w_input', 'name_email_similarity_input'],
      dtype='object')

In [4]:
# Inspect the dtype of each column; the `object` columns are the ones that will need encoding.
df_fraud_train.dtypes

Unnamed: 0.1                            int64
Unnamed: 0                              int64
customer_age                            int64
payment_type                           object
zip_count_4w                            int64
bank_branch_count_8w                    int64
date_of_birth_distinct_emails_4w        int64
employment_status                      object
credit_risk_score                       int64
email_is_free                           int64
housing_status                         object
phone_home_valid                        int64
phone_mobile_valid                      int64
has_other_cards                         int64
foreign_request                         int64
source                                 object
device_os                              object
keep_alive_session                      int64
device_fraud_count                      int64
month                                   int64
fraud_bool                              int64
proposed_credit_limit_input       

### Codificación de la variable objetivo

In [5]:
# Class distribution of the target column in the training split.
df_fraud_train['fraud_bool'].value_counts()

0    791177
1      8823
Name: fraud_bool, dtype: int64

Observamos que hay 791.177 instancias de 'Legit' y 8.823 instancias de 'Fraud' (aproximadamente el 1,1 % del total), por lo que las clases están muy desbalanceadas.

In [6]:
# Sanitize the target column:
#   1. replace NaN and +/-inf with 0,
#   2. cast the column to integer.
# The result is assigned back to the frame instead of calling
# fillna(..., inplace=True) on a column selection: that form is chained
# assignment, which emits a FutureWarning in pandas 2.x and does not modify
# the frame at all when Copy-on-Write is enabled.
df_fraud_train['fraud_bool'] = (
    df_fraud_train['fraud_bool']
    .fillna(0)
    .replace([np.inf, -np.inf], 0)
    .astype(int)
)

In [7]:
# Re-check the class counts after the target clean-up in the previous cell.
df_fraud_train['fraud_bool'].value_counts()


0    791177
1      8823
Name: fraud_bool, dtype: int64

In [8]:
# Split each frame into predictors (every column except the target) and the target itself.
# NOTE(review): 'Unnamed: 0.1' and 'Unnamed: 0' look like row indices written out
# by earlier to_csv calls and remain among the predictors — confirm that is intended.
X_train, y_train = df_fraud_train.drop(columns='fraud_bool'), df_fraud_train['fraud_bool']
X_test, y_test = df_fraud_test.drop(columns='fraud_bool'), df_fraud_test['fraud_bool']

### Codificación del resto de variables categóricas


Hemos codificado las variables categóricas con one-hot encoding. De esta forma, cada categoría queda representada por una columna binaria propia (por ejemplo `payment_type_1`, `payment_type_2`, …) que vale 1 si la instancia pertenece a esa categoría y 0 en caso contrario.

In [9]:
# Columns holding string/categorical data, i.e. the ones to be encoded.
# The dtypes are passed as a single `include` list: the second positional
# parameter of select_dtypes is `exclude`, so select_dtypes("object", "category")
# would exclude categorical columns instead of selecting them.
list_columns_cat = list(X_train.select_dtypes(include=["object", "category"]).columns)
# All remaining columns, kept in their original frame order (a set difference
# yields an arbitrary order that can change from one kernel run to the next).
list_other = [col for col in X_train.columns if col not in list_columns_cat]

In [10]:
# Build a one-hot encoder restricted to the categorical columns and fit it on
# the training split only. Note that `model` holds whatever fit() returns for
# the encoder — it is not a classifier.
ohe = ce.OneHotEncoder(cols=list_columns_cat)
model = ohe.fit(X=X_train, y=y_train)

In [11]:
# Display the object returned by the encoder's fit() call.
model

In [12]:
# Apply the encoder fitted on the training split to both splits.
# Only X is passed: category_encoders' OneHotEncoder.transform() has the
# signature (X, override_return_df=False), so a second positional argument is
# bound to override_return_df rather than being treated as a target.
X_train_t = model.transform(X_train)
X_test_t = model.transform(X_test)

In [13]:
# Number of columns in the encoded training frame.
X_train_t.shape[1]

54

In [14]:
# Map each column of the encoded training frame to its dtype.
{column: dtype for column, dtype in X_train_t.dtypes.items()}


{'Unnamed: 0.1': dtype('int64'),
 'Unnamed: 0': dtype('int64'),
 'customer_age': dtype('int64'),
 'payment_type_1': dtype('int64'),
 'payment_type_2': dtype('int64'),
 'payment_type_3': dtype('int64'),
 'payment_type_4': dtype('int64'),
 'payment_type_5': dtype('int64'),
 'zip_count_4w': dtype('int64'),
 'bank_branch_count_8w': dtype('int64'),
 'date_of_birth_distinct_emails_4w': dtype('int64'),
 'employment_status_1': dtype('int64'),
 'employment_status_2': dtype('int64'),
 'employment_status_3': dtype('int64'),
 'employment_status_4': dtype('int64'),
 'employment_status_5': dtype('int64'),
 'employment_status_6': dtype('int64'),
 'employment_status_7': dtype('int64'),
 'credit_risk_score': dtype('int64'),
 'email_is_free': dtype('int64'),
 'housing_status_1': dtype('int64'),
 'housing_status_2': dtype('int64'),
 'housing_status_3': dtype('int64'),
 'housing_status_4': dtype('int64'),
 'housing_status_5': dtype('int64'),
 'housing_status_6': dtype('int64'),
 'housing_status_7': dtype(