# Bootcamp Data Science y MLOps

<img src="https://i.ibb.co/5RM26Cw/LOGO-COLOR2.png" width="500px">

Creado en [escueladedatosvivos.ai](https://escueladedatosvivos.ai) 🚀.

¿Consultas? En la página tenés soporte por IA guiada, comunidad y el acceso a certificación.

<br>

---  

# 1) Cargamos los datos

In [1]:
import pickle

import funpymodeling
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Show every column when rendering wide DataFrames (no column truncation).
pd.options.display.max_columns = None

In [3]:
# Load the raw customer dataset; the file is comma-separated (pandas' default).
df_data = pd.read_csv('../data/customer_dataset.csv')

In [4]:
# First look at the data: dimensions, column names, then the first five rows.
for overview in (df_data.shape, df_data.columns):
    print(overview)
display(df_data.head())

(653, 13)
Index(['fraudulent', 'transactionAmount', 'transactionFailed', 'orderAmount',
       'orderState', 'paymentMethodRegistrationFailure', 'paymentMethodType',
       'paymentMethodProvider', 'paymentMethodIssuer', 'emailDomain',
       'emailProvider', 'customerIPAddressSimplified', 'sameCity'],
      dtype='object')


Unnamed: 0,fraudulent,transactionAmount,transactionFailed,orderAmount,orderState,paymentMethodRegistrationFailure,paymentMethodType,paymentMethodProvider,paymentMethodIssuer,emailDomain,emailProvider,customerIPAddressSimplified,sameCity
0,False,18.0,False,18.0,pending,True,card,JCB 16 digit,Citizens First Banks,com,yahoo,only_letters,yes
1,False,26.0,False,26.0,fulfilled,True,card,JCB 16 digit,Citizens First Banks,com,yahoo,only_letters,yes
2,True,45.0,False,45.0,fulfilled,False,bitcoin,American Express,Bastion Banks,com,yahoo,only_letters,no
3,True,23.0,False,23.0,fulfilled,False,bitcoin,American Express,Bastion Banks,com,yahoo,only_letters,yes
4,True,43.0,True,43.0,fulfilled,True,bitcoin,VISA 16 digit,Solace Banks,com,yahoo,only_letters,no


In [5]:
# Inspect the dtype pandas inferred for every column.
display(df_data.dtypes)

fraudulent                             bool
transactionAmount                   float64
transactionFailed                    object
orderAmount                         float64
orderState                           object
paymentMethodRegistrationFailure     object
paymentMethodType                    object
paymentMethodProvider                object
paymentMethodIssuer                  object
emailDomain                          object
emailProvider                        object
customerIPAddressSimplified          object
sameCity                             object
dtype: object

In [6]:
# Per-column health check: NaN count/share, zero count/share, number of
# unique values and dtype.
funpymodeling.status(df_data)

Unnamed: 0,variable,q_nan,p_nan,q_zeros,p_zeros,unique,type
0,fraudulent,0,0.0,386,0.591118,2,bool
1,transactionAmount,30,0.045942,0,0.0,67,float64
2,transactionFailed,30,0.045942,455,0.696784,2,object
3,orderAmount,30,0.045942,0,0.0,67,float64
4,orderState,30,0.045942,0,0.0,3,object
5,paymentMethodRegistrationFailure,12,0.018377,559,0.856049,2,object
6,paymentMethodType,12,0.018377,0,0.0,4,object
7,paymentMethodProvider,12,0.018377,0,0.0,10,object
8,paymentMethodIssuer,12,0.018377,0,0.0,20,object
9,emailDomain,0,0.0,0,0.0,6,object


In [51]:
# Frequency of each e-mail domain, counting NaN as its own category.
# NOTE(review): this cell carries execution count 51 and its recorded counts
# add up to 623 rows, not the 653 loaded above — it appears to have been run
# after the NaN rows were dropped. Re-run the notebook top to bottom to refresh.
df_data['emailDomain'].value_counts(dropna=False)

com      500
biz       48
org       32
weird     18
info      15
net       10
Name: emailDomain, dtype: int64

In [52]:
# Frequency of each e-mail provider, counting NaN as its own category.
# NOTE(review): executed out of order (count 52); the recorded counts sum to
# 623 rows rather than 653, so the output looks stale — confirm with a full re-run.
df_data['emailProvider'].value_counts(dropna=False)

other      278
gmail      121
yahoo      118
hotmail     88
weird       18
Name: emailProvider, dtype: int64

In [53]:
# Frequency of each simplified IP-address category, counting NaN as its own category.
# NOTE(review): executed out of order (count 53); the recorded counts sum to
# 623 rows rather than 653, so the output looks stale — confirm with a full re-run.
df_data['customerIPAddressSimplified'].value_counts(dropna=False)

only_letters          366
digits_and_letters    257
Name: customerIPAddressSimplified, dtype: int64

In [54]:
# Frequency of each sameCity value, counting NaN as its own category.
# NOTE(review): executed out of order (count 54); the recorded counts sum to
# 623 rows rather than 653, so the output looks stale — confirm with a full re-run.
df_data['sameCity'].value_counts(dropna=False)

yes        389
no         157
unknown     77
Name: sameCity, dtype: int64

In [7]:
# Summary statistics for every column (numeric and non-numeric), one row per column.
df_data.describe(include='all').transpose()

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
fraudulent,653.0,2.0,False,386.0,,,,,,,
transactionAmount,623.0,,,,34.598716,20.360247,10.0,21.0,34.0,45.0,353.0
transactionFailed,623.0,2.0,False,455.0,,,,,,,
orderAmount,623.0,,,,34.598716,20.360247,10.0,21.0,34.0,45.0,353.0
orderState,623.0,3.0,fulfilled,516.0,,,,,,,
paymentMethodRegistrationFailure,641.0,2.0,False,559.0,,,,,,,
paymentMethodType,641.0,4.0,card,489.0,,,,,,,
paymentMethodProvider,641.0,10.0,JCB 16 digit,119.0,,,,,,,
paymentMethodIssuer,641.0,20.0,Her Majesty Trust,69.0,,,,,,,
emailDomain,653.0,6.0,com,520.0,,,,,,,


Verificamos si las columnas `transactionAmount` y `orderAmount` contienen los mismos datos

In [10]:
# Series.equals compares element by element and treats NaNs in matching
# positions as equal, so the rows with missing amounts do not break the check.
df_data['transactionAmount'].equals(df_data['orderAmount'])  # Returns True

True

Como contienen los mismos valores, vamos a eliminar una de ellas: la columna `transactionAmount`.

In [11]:
# Remove the redundant amount column, printing the shape before and after.
print(df_data.shape)
df_data = df_data.drop(columns=["transactionAmount"])
print(df_data.shape)

(653, 13)
(653, 12)


# 2) Discretización

In [12]:
# Keep an independent copy with the numeric orderAmount; it is used further
# down to check that the saved bin edges reproduce the same discretization.
df_data_1 = df_data.copy()

## 2.1) Por igual frecuencia y por igual rango

Vamos a analizar la columna `orderAmount`

In [13]:
# Equal-frequency discretization: split orderAmount into (up to) 5 quantile
# bins and keep the bin edges so the same cut can be reapplied later.
# Note: the numeric column is overwritten with the binned one, so this cell
# is presumably not safe to re-run without reloading the data.
order_amount_binned, saved_bins_order_amount = pd.qcut(
    df_data['orderAmount'], q=5, retbins=True, duplicates='drop'
)
df_data['orderAmount'] = order_amount_binned

In [14]:
# Inspect the discretized column: every value is now an interval category.
df_data['orderAmount']

0      (9.999, 18.0]
1       (18.0, 30.0]
2       (38.0, 47.0]
3       (18.0, 30.0]
4       (38.0, 47.0]
           ...      
648     (18.0, 30.0]
649     (18.0, 30.0]
650     (18.0, 30.0]
651     (18.0, 30.0]
652     (18.0, 30.0]
Name: orderAmount, Length: 653, dtype: category
Categories (5, interval[float64, right]): [(9.999, 18.0] < (18.0, 30.0] < (30.0, 38.0] < (38.0, 47.0] < (47.0, 353.0]]

In [15]:
# Frequency table per bin (count, percentage and cumulative percentage).
funpymodeling.freq_tbl(df_data['orderAmount'])

Unnamed: 0,orderAmount,frequency,percentage,cumulative_perc
0,"(38.0, 47.0]",134,0.205207,0.215088
1,"(9.999, 18.0]",132,0.202144,0.426966
2,"(18.0, 30.0]",129,0.19755,0.634029
3,"(30.0, 38.0]",116,0.177642,0.820225
4,"(47.0, 353.0]",112,0.171516,1.0


Guardamos los puntos de corte

In [16]:
# Persist the bin edges so the same discretization can be reapplied elsewhere.
# `pickle` comes from the notebook's import cell instead of a mid-notebook import.
with open('../data/saved_bins_order_amount.pkl', 'wb') as handle:
    pickle.dump(saved_bins_order_amount, handle, protocol=pickle.HIGHEST_PROTOCOL)

Vamos a verificar que los puntos de corte (bins) generados se hayan guardado correctamente:

In [17]:
# Read the bin edges back from disk to confirm the round trip works.
# Only unpickle files you produced yourself: pickle.load can run arbitrary code.
with open('../data/saved_bins_order_amount.pkl', 'rb') as bins_file:
    new_saved_bins_order_amount = pickle.load(bins_file)

In [18]:
# Display the bin edges that were read back from the pickle file.
new_saved_bins_order_amount

array([ 10.,  18.,  30.,  38.,  47., 353.])

In [19]:
# The copy still holds the original numeric amounts (not yet discretized).
df_data_1['orderAmount']

0      18.0
1      26.0
2      45.0
3      23.0
4      43.0
       ... 
648    25.0
649    25.0
650    25.0
651    19.0
652    27.0
Name: orderAmount, Length: 653, dtype: float64

In [20]:
# Re-apply the discretization to the numeric copy using the saved bin edges.
df_data_1['orderAmount'] = pd.cut(
    df_data_1['orderAmount'],
    bins=new_saved_bins_order_amount, 
    include_lowest=True,  # make the first interval include its left edge so the minimum value is binned too
)

In [21]:
# Preview the first five rows of the copy after applying the saved bins.
df_data_1.head()

Unnamed: 0,fraudulent,transactionFailed,orderAmount,orderState,paymentMethodRegistrationFailure,paymentMethodType,paymentMethodProvider,paymentMethodIssuer,emailDomain,emailProvider,customerIPAddressSimplified,sameCity
0,False,False,"(9.999, 18.0]",pending,True,card,JCB 16 digit,Citizens First Banks,com,yahoo,only_letters,yes
1,False,False,"(18.0, 30.0]",fulfilled,True,card,JCB 16 digit,Citizens First Banks,com,yahoo,only_letters,yes
2,True,False,"(38.0, 47.0]",fulfilled,False,bitcoin,American Express,Bastion Banks,com,yahoo,only_letters,no
3,True,False,"(18.0, 30.0]",fulfilled,False,bitcoin,American Express,Bastion Banks,com,yahoo,only_letters,yes
4,True,True,"(38.0, 47.0]",fulfilled,True,bitcoin,VISA 16 digit,Solace Banks,com,yahoo,only_letters,no


# 4) Preparación de datos

## 4.1) Interpretar los valores

### paymentMethodIssuer

Contamos la frecuencia de los elementos únicos que aparecen en la columna `paymentMethodIssuer`

In [22]:
# Frequency of each issuer before cleaning (NaN is excluded by default).
df_data['paymentMethodIssuer'].value_counts()

Her Majesty Trust           69
His Majesty Bank Corp.      67
Vertex Bancorp              63
Fountain Financial Inc.     61
Rose Bancshares             60
Bulwark Trust Corp.         60
Bastion Banks               58
Grand Credit Corporation    57
Citizens First Banks        55
Solace Banks                43
B                           10
c                            8
e                            8
r                            7
x                            4
a                            4
                             2
n                            2
o                            2
p                            1
Name: paymentMethodIssuer, dtype: int64

In [23]:
# Group the single-character / blank issuer values into one 'weird' category.
# A list-based replace does this in one vectorized pass (exact matches only)
# instead of rewriting the whole column once per value.
weird_payment_method = ["B", "e", "c", "r", " ", "n", "x", "o", "a", "p"]
df_data['paymentMethodIssuer'] = df_data['paymentMethodIssuer'].replace(weird_payment_method, 'weird')

In [24]:
# Frequency of each issuer after grouping the odd values under 'weird'.
df_data['paymentMethodIssuer'].value_counts()

Her Majesty Trust           69
His Majesty Bank Corp.      67
Vertex Bancorp              63
Fountain Financial Inc.     61
Bulwark Trust Corp.         60
Rose Bancshares             60
Bastion Banks               58
Grand Credit Corporation    57
Citizens First Banks        55
weird                       48
Solace Banks                43
Name: paymentMethodIssuer, dtype: int64

### paymentMethodProvider

Contamos la frecuencia de los elementos únicos que aparecen en la columna `paymentMethodProvider`

In [25]:
# Frequency of each payment method provider (NaN is excluded by default).
df_data['paymentMethodProvider'].value_counts()

JCB 16 digit                   119
VISA 16 digit                   99
Maestro                         71
Voyager                         63
VISA 13 digit                   60
Diners Club / Carte Blanche     59
American Express                45
JCB 15 digit                    45
Discover                        44
Mastercard                      36
Name: paymentMethodProvider, dtype: int64

### paymentMethodType

Contamos la frecuencia de los elementos únicos que aparecen en la columna `paymentMethodType`

In [26]:
# Frequency of each payment method type (NaN is excluded by default).
df_data['paymentMethodType'].value_counts()

card         489
paypal        53
apple pay     50
bitcoin       49
Name: paymentMethodType, dtype: int64

### fraudulent

Contamos la frecuencia de los elementos únicos que aparecen en la columna `fraudulent`

In [27]:
# Class balance of the target variable.
df_data['fraudulent'].value_counts()

False    386
True     267
Name: fraudulent, dtype: int64

Vamos a hacer una especie de semáforo:
- False = Green le asignaremos el valor numérico 0
- True = Red le asignaremos el valor numérico 1

In [28]:
# Column status before converting the target: 'fraudulent' is still bool here.
funpymodeling.status(df_data)

Unnamed: 0,variable,q_nan,p_nan,q_zeros,p_zeros,unique,type
0,fraudulent,0,0.0,386,0.591118,2,bool
1,transactionFailed,30,0.045942,455,0.696784,2,object
2,orderAmount,30,0.045942,0,0.0,5,category
3,orderState,30,0.045942,0,0.0,3,object
4,paymentMethodRegistrationFailure,12,0.018377,559,0.856049,2,object
5,paymentMethodType,12,0.018377,0,0.0,4,object
6,paymentMethodProvider,12,0.018377,0,0.0,10,object
7,paymentMethodIssuer,12,0.018377,0,0.0,11,object
8,emailDomain,0,0.0,0,0.0,6,object
9,emailProvider,0,0.0,0,0.0,5,object


In [29]:
# Cast the boolean target to str so that True/False become the strings
# 'True'/'False', avoiding any str-vs-bool mismatch when mapping the values.

df_data['fraudulent'] = df_data['fraudulent'].astype(str)

In [30]:
# Column status after the cast: 'fraudulent' is now reported as object.
funpymodeling.status(df_data)

Unnamed: 0,variable,q_nan,p_nan,q_zeros,p_zeros,unique,type
0,fraudulent,0,0.0,0,0.0,2,object
1,transactionFailed,30,0.045942,455,0.696784,2,object
2,orderAmount,30,0.045942,0,0.0,5,category
3,orderState,30,0.045942,0,0.0,3,object
4,paymentMethodRegistrationFailure,12,0.018377,559,0.856049,2,object
5,paymentMethodType,12,0.018377,0,0.0,4,object
6,paymentMethodProvider,12,0.018377,0,0.0,10,object
7,paymentMethodIssuer,12,0.018377,0,0.0,11,object
8,emailDomain,0,0.0,0,0.0,6,object
9,emailProvider,0,0.0,0,0.0,5,object


In [31]:
# Preview the first five rows after casting the target to str.
df_data.head()

Unnamed: 0,fraudulent,transactionFailed,orderAmount,orderState,paymentMethodRegistrationFailure,paymentMethodType,paymentMethodProvider,paymentMethodIssuer,emailDomain,emailProvider,customerIPAddressSimplified,sameCity
0,False,False,"(9.999, 18.0]",pending,True,card,JCB 16 digit,Citizens First Banks,com,yahoo,only_letters,yes
1,False,False,"(18.0, 30.0]",fulfilled,True,card,JCB 16 digit,Citizens First Banks,com,yahoo,only_letters,yes
2,True,False,"(38.0, 47.0]",fulfilled,False,bitcoin,American Express,Bastion Banks,com,yahoo,only_letters,no
3,True,False,"(18.0, 30.0]",fulfilled,False,bitcoin,American Express,Bastion Banks,com,yahoo,only_letters,yes
4,True,True,"(38.0, 47.0]",fulfilled,True,bitcoin,VISA 16 digit,Solace Banks,com,yahoo,only_letters,no


In [32]:
# Encode the target as 0/1. The keys are strings because 'fraudulent' holds
# the strings 'False'/'True' at this point.
# Caution: Series.map turns any value missing from the dict into NaN, so
# re-running this cell once the column already holds 0/1 would wipe it out.
class_map = {'False': 0, 'True': 1}
df_data['fraudulent'] = df_data['fraudulent'].map(class_map)

In [33]:
# Preview the first five rows with the target encoded as 0/1.
df_data.head()

Unnamed: 0,fraudulent,transactionFailed,orderAmount,orderState,paymentMethodRegistrationFailure,paymentMethodType,paymentMethodProvider,paymentMethodIssuer,emailDomain,emailProvider,customerIPAddressSimplified,sameCity
0,0,False,"(9.999, 18.0]",pending,True,card,JCB 16 digit,Citizens First Banks,com,yahoo,only_letters,yes
1,0,False,"(18.0, 30.0]",fulfilled,True,card,JCB 16 digit,Citizens First Banks,com,yahoo,only_letters,yes
2,1,False,"(38.0, 47.0]",fulfilled,False,bitcoin,American Express,Bastion Banks,com,yahoo,only_letters,no
3,1,False,"(18.0, 30.0]",fulfilled,False,bitcoin,American Express,Bastion Banks,com,yahoo,only_letters,yes
4,1,True,"(38.0, 47.0]",fulfilled,True,bitcoin,VISA 16 digit,Solace Banks,com,yahoo,only_letters,no


## 4.2) Tratamientos de datos faltantes

Eliminamos las filas con NaN, ya que el impacto sobre la distribución de la variable `fraudulent` es mínimo.

In [35]:
# Target class balance before dropping NaN rows: absolute counts, then proportions.
(
    df_data['fraudulent'].value_counts(),
    df_data['fraudulent'].value_counts(normalize=True),
)

(0    386
 1    267
 Name: fraudulent, dtype: int64,
 0    0.591118
 1    0.408882
 Name: fraudulent, dtype: float64)

In [36]:
# Drop every row that has at least one NaN, printing the shape before and after.
# Reassigning (instead of inplace=True) keeps the step explicit and chainable.
print(df_data.shape)
df_data = df_data.dropna()
print(df_data.shape)

(653, 12)
(623, 12)


In [37]:
# Target class balance after dropping NaN rows: absolute counts, then proportions.
(
    df_data['fraudulent'].value_counts(),
    df_data['fraudulent'].value_counts(normalize=True),
)

(0    366
 1    257
 Name: fraudulent, dtype: int64,
 0    0.58748
 1    0.41252
 Name: fraudulent, dtype: float64)

In [38]:
# Column status after dropping rows with NaN: q_nan should be 0 everywhere.
funpymodeling.status(df_data)

Unnamed: 0,variable,q_nan,p_nan,q_zeros,p_zeros,unique,type
0,fraudulent,0,0.0,366,0.58748,2,int64
1,transactionFailed,0,0.0,455,0.730337,2,object
2,orderAmount,0,0.0,0,0.0,5,category
3,orderState,0,0.0,0,0.0,3,object
4,paymentMethodRegistrationFailure,0,0.0,541,0.868379,2,object
5,paymentMethodType,0,0.0,0,0.0,4,object
6,paymentMethodProvider,0,0.0,0,0.0,10,object
7,paymentMethodIssuer,0,0.0,0,0.0,11,object
8,emailDomain,0,0.0,0,0.0,6,object
9,emailProvider,0,0.0,0,0.0,5,object


In [39]:
# Absolute frequency of each sameCity value after dropping NaN rows.
df_data['sameCity'].value_counts()

yes        389
no         157
unknown     77
Name: sameCity, dtype: int64

Convertimos algunas columnas

In [40]:
# Cast both True/False flag columns to str so they can be mapped by string key.
for flag_column in ('transactionFailed', 'paymentMethodRegistrationFailure'):
    df_data[flag_column] = df_data[flag_column].astype(str)

In [41]:
# Encode both flag columns as 0/1 using the string keys 'False'/'True'.
class_map = {'False': 0, 'True': 1}
for flag_column in ('transactionFailed', 'paymentMethodRegistrationFailure'):
    df_data[flag_column] = df_data[flag_column].map(class_map)

In [42]:
# Display the prepared DataFrame (pandas truncates long frames when rendering).
df_data

Unnamed: 0,fraudulent,transactionFailed,orderAmount,orderState,paymentMethodRegistrationFailure,paymentMethodType,paymentMethodProvider,paymentMethodIssuer,emailDomain,emailProvider,customerIPAddressSimplified,sameCity
0,0,0,"(9.999, 18.0]",pending,1,card,JCB 16 digit,Citizens First Banks,com,yahoo,only_letters,yes
1,0,0,"(18.0, 30.0]",fulfilled,1,card,JCB 16 digit,Citizens First Banks,com,yahoo,only_letters,yes
2,1,0,"(38.0, 47.0]",fulfilled,0,bitcoin,American Express,Bastion Banks,com,yahoo,only_letters,no
3,1,0,"(18.0, 30.0]",fulfilled,0,bitcoin,American Express,Bastion Banks,com,yahoo,only_letters,yes
4,1,1,"(38.0, 47.0]",fulfilled,1,bitcoin,VISA 16 digit,Solace Banks,com,yahoo,only_letters,no
...,...,...,...,...,...,...,...,...,...,...,...,...
648,0,1,"(18.0, 30.0]",fulfilled,0,card,VISA 13 digit,Vertex Bancorp,com,other,only_letters,yes
649,0,1,"(18.0, 30.0]",fulfilled,0,paypal,JCB 15 digit,Bastion Banks,com,other,only_letters,yes
650,0,0,"(18.0, 30.0]",fulfilled,0,card,VISA 13 digit,Vertex Bancorp,com,other,only_letters,yes
651,0,0,"(18.0, 30.0]",fulfilled,0,paypal,JCB 15 digit,Bastion Banks,com,other,only_letters,yes


# 5) One hot encoding

In [43]:
# One-hot encode every object/category column; numeric columns pass through.
# dtype=int pins the dummies to 0/1: pandas >= 2.0 defaults to bool, which
# would write True/False into the exported CSV instead.
df_data_ohe = pd.get_dummies(df_data, dtype=int)

In [44]:
# Preview the first five rows of the one-hot encoded frame.
df_data_ohe.head()

Unnamed: 0,fraudulent,transactionFailed,paymentMethodRegistrationFailure,"orderAmount_(9.999, 18.0]","orderAmount_(18.0, 30.0]","orderAmount_(30.0, 38.0]","orderAmount_(38.0, 47.0]","orderAmount_(47.0, 353.0]",orderState_failed,orderState_fulfilled,orderState_pending,paymentMethodType_apple pay,paymentMethodType_bitcoin,paymentMethodType_card,paymentMethodType_paypal,paymentMethodProvider_American Express,paymentMethodProvider_Diners Club / Carte Blanche,paymentMethodProvider_Discover,paymentMethodProvider_JCB 15 digit,paymentMethodProvider_JCB 16 digit,paymentMethodProvider_Maestro,paymentMethodProvider_Mastercard,paymentMethodProvider_VISA 13 digit,paymentMethodProvider_VISA 16 digit,paymentMethodProvider_Voyager,paymentMethodIssuer_Bastion Banks,paymentMethodIssuer_Bulwark Trust Corp.,paymentMethodIssuer_Citizens First Banks,paymentMethodIssuer_Fountain Financial Inc.,paymentMethodIssuer_Grand Credit Corporation,paymentMethodIssuer_Her Majesty Trust,paymentMethodIssuer_His Majesty Bank Corp.,paymentMethodIssuer_Rose Bancshares,paymentMethodIssuer_Solace Banks,paymentMethodIssuer_Vertex Bancorp,paymentMethodIssuer_weird,emailDomain_biz,emailDomain_com,emailDomain_info,emailDomain_net,emailDomain_org,emailDomain_weird,emailProvider_gmail,emailProvider_hotmail,emailProvider_other,emailProvider_weird,emailProvider_yahoo,customerIPAddressSimplified_digits_and_letters,customerIPAddressSimplified_only_letters,sameCity_no,sameCity_unknown,sameCity_yes
0,0,0,1,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,1
1,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,1
2,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,1,0,0
3,1,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,1
4,1,1,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,1,0,0


In [45]:
# List the column names produced by the one-hot encoding.
df_data_ohe.columns

Index(['fraudulent', 'transactionFailed', 'paymentMethodRegistrationFailure',
       'orderAmount_(9.999, 18.0]', 'orderAmount_(18.0, 30.0]',
       'orderAmount_(30.0, 38.0]', 'orderAmount_(38.0, 47.0]',
       'orderAmount_(47.0, 353.0]', 'orderState_failed',
       'orderState_fulfilled', 'orderState_pending',
       'paymentMethodType_apple pay', 'paymentMethodType_bitcoin',
       'paymentMethodType_card', 'paymentMethodType_paypal',
       'paymentMethodProvider_American Express',
       'paymentMethodProvider_Diners Club / Carte Blanche',
       'paymentMethodProvider_Discover', 'paymentMethodProvider_JCB 15 digit',
       'paymentMethodProvider_JCB 16 digit', 'paymentMethodProvider_Maestro',
       'paymentMethodProvider_Mastercard',
       'paymentMethodProvider_VISA 13 digit',
       'paymentMethodProvider_VISA 16 digit', 'paymentMethodProvider_Voyager',
       'paymentMethodIssuer_Bastion Banks',
       'paymentMethodIssuer_Bulwark Trust Corp.',
       'paymentMethodIssuer_C

In [46]:
# Feature-only view of the encoded data: same frame without the target column.
df_data_ohe_without_fraudulent = df_data_ohe.drop(columns=["fraudulent"])

In [47]:
# Preview the first five rows of the feature-only frame.
df_data_ohe_without_fraudulent.head()

Unnamed: 0,transactionFailed,paymentMethodRegistrationFailure,"orderAmount_(9.999, 18.0]","orderAmount_(18.0, 30.0]","orderAmount_(30.0, 38.0]","orderAmount_(38.0, 47.0]","orderAmount_(47.0, 353.0]",orderState_failed,orderState_fulfilled,orderState_pending,paymentMethodType_apple pay,paymentMethodType_bitcoin,paymentMethodType_card,paymentMethodType_paypal,paymentMethodProvider_American Express,paymentMethodProvider_Diners Club / Carte Blanche,paymentMethodProvider_Discover,paymentMethodProvider_JCB 15 digit,paymentMethodProvider_JCB 16 digit,paymentMethodProvider_Maestro,paymentMethodProvider_Mastercard,paymentMethodProvider_VISA 13 digit,paymentMethodProvider_VISA 16 digit,paymentMethodProvider_Voyager,paymentMethodIssuer_Bastion Banks,paymentMethodIssuer_Bulwark Trust Corp.,paymentMethodIssuer_Citizens First Banks,paymentMethodIssuer_Fountain Financial Inc.,paymentMethodIssuer_Grand Credit Corporation,paymentMethodIssuer_Her Majesty Trust,paymentMethodIssuer_His Majesty Bank Corp.,paymentMethodIssuer_Rose Bancshares,paymentMethodIssuer_Solace Banks,paymentMethodIssuer_Vertex Bancorp,paymentMethodIssuer_weird,emailDomain_biz,emailDomain_com,emailDomain_info,emailDomain_net,emailDomain_org,emailDomain_weird,emailProvider_gmail,emailProvider_hotmail,emailProvider_other,emailProvider_weird,emailProvider_yahoo,customerIPAddressSimplified_digits_and_letters,customerIPAddressSimplified_only_letters,sameCity_no,sameCity_unknown,sameCity_yes
0,0,1,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,1
1,0,1,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,1
2,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,1,0,0
3,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,1
4,1,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,1,0,0


In [48]:
# Persist the feature column names (a pandas Index) so the same column layout
# can be rebuilt elsewhere. `pickle` comes from the notebook's import cell
# instead of being re-imported here.
with open('../data/categories_ohe_without_fraudulent.pkl', 'wb') as handle:
    pickle.dump(df_data_ohe_without_fraudulent.columns, handle, protocol=pickle.HIGHEST_PROTOCOL)

# 6) Guardamos el dataset

Guardamos todo el One Hot Encoding, incluyendo nuestra variable objetivo `fraudulent`

In [49]:
# Export the encoded dataset (target included) as comma-separated CSV, without the index.
df_data_ohe.to_csv("../data/ohe_customer_dataset.csv", index=False)