In [105]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from keras.regularizers import l1_l2

In [93]:
df_train = pd.read_csv('Datasets/train.csv') # using pandas to read the test set
df_train.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


In [94]:
df_train.drop(columns=['id','CustomerId','Surname'],inplace=True)
df_train.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


In [95]:
string_cols = ['Geography','Gender']

df_train = pd.get_dummies(df_train,columns=string_cols,dtype=float)
df_train.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,668,33.0,3,0.0,2,1.0,0.0,181449.97,0,1.0,0.0,0.0,0.0,1.0
1,627,33.0,1,0.0,2,1.0,1.0,49503.5,0,1.0,0.0,0.0,0.0,1.0
2,678,40.0,10,0.0,2,1.0,0.0,184866.69,0,1.0,0.0,0.0,0.0,1.0
3,581,34.0,2,148882.54,1,1.0,1.0,84560.88,0,1.0,0.0,0.0,0.0,1.0
4,716,33.0,5,0.0,2,1.0,1.0,15068.83,0,0.0,0.0,1.0,0.0,1.0


In [96]:
robust = ['CreditScore','Age','Balance','NumOfProducts']
standard = ['Tenure']
minmax = ['EstimatedSalary']

In [97]:
scalers = {}

for col in df_train.columns:
    if col in robust:
        scalers.update({col:RobustScaler()})
        df_train[col] = scalers[col].fit_transform(df_train[col].values.reshape(-1,1))
    elif col in standard:
        scalers.update({col:StandardScaler()})
        df_train[col] = scalers[col].fit_transform(df_train[col].values.reshape(-1,1))
    elif col in minmax:
        scalers.update({col:MinMaxScaler()})
        df_train[col] = scalers[col].fit_transform(df_train[col].values.reshape(-1,1))
    else:
        continue

In [98]:
df_train.drop(columns=['Gender_Male','Geography_Spain'],inplace=True)

In [99]:
df_train['Balance'] = df_train.Balance.apply(lambda x: 0 if x == 0 else 1)
df_train.head(5)

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Gender_Female
0,0.079646,-0.4,-0.719973,0,0.0,1.0,0.0,0.907279,0,1.0,0.0,0.0
1,-0.283186,-0.4,-1.432694,0,0.0,1.0,1.0,0.247483,0,1.0,0.0,0.0
2,0.168142,0.3,1.774548,0,0.0,1.0,0.0,0.924364,0,1.0,0.0,0.0
3,-0.690265,-0.3,-1.076334,1,-1.0,1.0,1.0,0.422787,0,1.0,0.0,0.0
4,0.504425,-0.4,-0.007253,0,0.0,1.0,1.0,0.075293,0,0.0,0.0,0.0


In [108]:
drops = ['Exited','Geography_France']

In [109]:
X = df_train.drop(columns=drops)
y = df_train.Exited

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

def create_model(input_dim):
    model = Sequential()
    model.add(Dense(128, input_dim=input_dim, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    optimizer = Adam(learning_rate=0.005)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['AUC'])
    return model

input_dim = X_train.shape[1]

model = create_model(input_dim)
model.fit(X_train, y_train, epochs=12, batch_size=20, validation_split=0.2, verbose=1)

y_pred = model.predict(X_test).flatten()
roc_auc = roc_auc_score(y_test, y_pred)

print(f'ROC AUC: {roc_auc}')


Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12
ROC AUC: 0.8849182874353065


In [53]:
def create_model(input_dim):
    model = Sequential()
    model.add(Dense(128, input_dim=input_dim, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    optimizer = Adam(learning_rate=0.001)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['AUC'])
    return model

# Obtener la dimensión de entrada
input_dim = X.shape[1]

# Crear y entrenar el modelo de red neuronal
model = create_model(input_dim)
model.fit(X, y, epochs=20, batch_size=32, validation_split=0.2, verbose=1)

# Realizar predicciones en el conjunto de prueba y calcular el AUC

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x23d9b4bea90>

In [54]:
df_test = pd.read_csv('Datasets/test.csv')

df_test

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,165034,15773898,Lucchese,586,France,Female,23.0,2,0.00,2,0.0,1.0,160976.75
1,165035,15782418,Nott,683,France,Female,46.0,2,0.00,1,1.0,0.0,72549.27
2,165036,15807120,K?,656,France,Female,34.0,7,0.00,2,1.0,0.0,138882.09
3,165037,15808905,O'Donnell,681,France,Male,36.0,8,0.00,1,1.0,0.0,113931.57
4,165038,15607314,Higgins,752,Germany,Male,38.0,10,121263.62,1,1.0,0.0,139431.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
110018,275052,15662091,P'eng,570,Spain,Male,29.0,7,116099.82,1,1.0,1.0,148087.62
110019,275053,15774133,Cox,575,France,Female,36.0,4,178032.53,1,1.0,1.0,42181.68
110020,275054,15728456,Ch'iu,712,France,Male,31.0,2,0.00,2,1.0,0.0,16287.38
110021,275055,15687541,Yegorova,709,France,Female,32.0,3,0.00,1,1.0,1.0,158816.58


In [55]:
df_test = pd.get_dummies(df_test,columns=string_cols,dtype=int)
df_test

Unnamed: 0,id,CustomerId,Surname,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,165034,15773898,Lucchese,586,23.0,2,0.00,2,0.0,1.0,160976.75,1,0,0,1,0
1,165035,15782418,Nott,683,46.0,2,0.00,1,1.0,0.0,72549.27,1,0,0,1,0
2,165036,15807120,K?,656,34.0,7,0.00,2,1.0,0.0,138882.09,1,0,0,1,0
3,165037,15808905,O'Donnell,681,36.0,8,0.00,1,1.0,0.0,113931.57,1,0,0,0,1
4,165038,15607314,Higgins,752,38.0,10,121263.62,1,1.0,0.0,139431.00,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110018,275052,15662091,P'eng,570,29.0,7,116099.82,1,1.0,1.0,148087.62,0,0,1,0,1
110019,275053,15774133,Cox,575,36.0,4,178032.53,1,1.0,1.0,42181.68,1,0,0,1,0
110020,275054,15728456,Ch'iu,712,31.0,2,0.00,2,1.0,0.0,16287.38,1,0,0,0,1
110021,275055,15687541,Yegorova,709,32.0,3,0.00,1,1.0,1.0,158816.58,1,0,0,1,0


In [56]:
df_test.drop(columns=['Gender_Male','Geography_Spain'],inplace=True)
df_test

Unnamed: 0,id,CustomerId,Surname,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Gender_Female
0,165034,15773898,Lucchese,586,23.0,2,0.00,2,0.0,1.0,160976.75,1,0,1
1,165035,15782418,Nott,683,46.0,2,0.00,1,1.0,0.0,72549.27,1,0,1
2,165036,15807120,K?,656,34.0,7,0.00,2,1.0,0.0,138882.09,1,0,1
3,165037,15808905,O'Donnell,681,36.0,8,0.00,1,1.0,0.0,113931.57,1,0,0
4,165038,15607314,Higgins,752,38.0,10,121263.62,1,1.0,0.0,139431.00,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110018,275052,15662091,P'eng,570,29.0,7,116099.82,1,1.0,1.0,148087.62,0,0,0
110019,275053,15774133,Cox,575,36.0,4,178032.53,1,1.0,1.0,42181.68,1,0,1
110020,275054,15728456,Ch'iu,712,31.0,2,0.00,2,1.0,0.0,16287.38,1,0,0
110021,275055,15687541,Yegorova,709,32.0,3,0.00,1,1.0,1.0,158816.58,1,0,1


In [57]:
for col in df_test.columns:
    if col in robust:
        df_test[col] = scalers[col].fit_transform(df_test[col].values.reshape(-1,1))
    elif col in standard:
        df_test[col] = scalers[col].fit_transform(df_test[col].values.reshape(-1,1))
    elif col in minmax:
        df_test[col] = scalers[col].fit_transform(df_test[col].values.reshape(-1,1))
    else:
        continue

df_test

Unnamed: 0,id,CustomerId,Surname,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Gender_Female
0,165034,15773898,Lucchese,-0.654867,-1.4,-1.067887,0.000000,0.0,0.0,1.0,0.804903,1,0,1
1,165035,15782418,Nott,0.203540,0.9,-1.067887,0.000000,-1.0,1.0,0.0,0.362723,1,0,1
2,165036,15807120,K?,-0.035398,-0.3,0.713922,0.000000,0.0,1.0,0.0,0.694419,1,0,1
3,165037,15808905,O'Donnell,0.185841,-0.1,1.070284,0.000000,-1.0,1.0,0.0,0.569654,1,0,0
4,165038,15607314,Higgins,0.814159,0.1,1.783008,1.009306,-1.0,1.0,0.0,0.697164,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110018,275052,15662091,P'eng,-0.796460,-0.8,0.713922,0.966326,-1.0,1.0,1.0,0.740451,0,0,0
110019,275053,15774133,Cox,-0.752212,-0.1,-0.355164,1.481806,-1.0,1.0,1.0,0.210871,1,0,1
110020,275054,15728456,Ch'iu,0.460177,-0.6,-1.067887,0.000000,0.0,1.0,0.0,0.081387,1,0,0
110021,275055,15687541,Yegorova,0.433628,-0.5,-0.711526,0.000000,-1.0,1.0,1.0,0.794101,1,0,1


In [58]:
df_test.drop(columns=['CustomerId','Surname'],inplace=True)

In [60]:
df_test.head()

Unnamed: 0,id,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Gender_Female
0,165034,-0.654867,-1.4,-1.067887,0.0,0.0,0.0,1.0,0.804903,1,0,1
1,165035,0.20354,0.9,-1.067887,0.0,-1.0,1.0,0.0,0.362723,1,0,1
2,165036,-0.035398,-0.3,0.713922,0.0,0.0,1.0,0.0,0.694419,1,0,1
3,165037,0.185841,-0.1,1.070284,0.0,-1.0,1.0,0.0,0.569654,1,0,0
4,165038,0.814159,0.1,1.783008,1.009306,-1.0,1.0,0.0,0.697164,0,1,0


In [None]:
drops.append('id')
drops

In [61]:

X_test_final = df_test.drop(columns='id')
X_test_final

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Gender_Female
0,-0.654867,-1.4,-1.067887,0.000000,0.0,0.0,1.0,0.804903,1,0,1
1,0.203540,0.9,-1.067887,0.000000,-1.0,1.0,0.0,0.362723,1,0,1
2,-0.035398,-0.3,0.713922,0.000000,0.0,1.0,0.0,0.694419,1,0,1
3,0.185841,-0.1,1.070284,0.000000,-1.0,1.0,0.0,0.569654,1,0,0
4,0.814159,0.1,1.783008,1.009306,-1.0,1.0,0.0,0.697164,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
110018,-0.796460,-0.8,0.713922,0.966326,-1.0,1.0,1.0,0.740451,0,0,0
110019,-0.752212,-0.1,-0.355164,1.481806,-1.0,1.0,1.0,0.210871,1,0,1
110020,0.460177,-0.6,-1.067887,0.000000,0.0,1.0,0.0,0.081387,1,0,0
110021,0.433628,-0.5,-0.711526,0.000000,-1.0,1.0,1.0,0.794101,1,0,1


In [62]:
y_pred = model.predict(X_test_final)
y_pred



array([[0.02400761],
       [0.7802103 ],
       [0.03674738],
       ...,
       [0.01805219],
       [0.1722654 ],
       [0.23129211]], dtype=float32)

In [63]:
y_pred = y_pred.flatten()
y_pred

array([0.02400761, 0.7802103 , 0.03674738, ..., 0.01805219, 0.1722654 ,
       0.23129211], dtype=float32)

In [64]:
columna_id = df_test['id']

In [66]:
dictionario = {'id':columna_id,'Exited':y_pred}

In [67]:
df_entrega = pd.DataFrame(dictionario)
df_entrega

Unnamed: 0,id,Exited
0,165034,0.024008
1,165035,0.780210
2,165036,0.036747
3,165037,0.242255
4,165038,0.394676
...,...,...
110018,275052,0.057175
110019,275053,0.098193
110020,275054,0.018052
110021,275055,0.172265


In [68]:
df_entrega.to_csv('Datasets/submissionDL.csv',index=False)

In [69]:
! kaggle competitions submit playground-series-s4e1 -f Datasets/submissionDL.csv -m "My deeplearning submission"

Successfully submitted to Binary Classification with a Bank Churn Dataset 



  0%|          | 0.00/2.01M [00:00<?, ?B/s]
  1%|          | 16.0k/2.01M [00:00<02:06, 16.6kB/s]
 19%|█▉        | 400k/2.01M [00:01<00:03, 511kB/s]  
 38%|███▊      | 784k/2.01M [00:01<00:01, 1.03MB/s]
 53%|█████▎    | 1.06M/2.01M [00:01<00:00, 1.10MB/s]
 64%|██████▎   | 1.28M/2.01M [00:01<00:00, 1.14MB/s]
 73%|███████▎  | 1.47M/2.01M [00:01<00:00, 1.27MB/s]
100%|██████████| 2.01M/2.01M [00:01<00:00, 2.02MB/s]
100%|██████████| 2.01M/2.01M [00:03<00:00, 703kB/s] 
