In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
# Show up to 200 columns when displaying wide frames.
# NOTE(review): the abbreviated key "chop" is pattern-matched by pandas to
# `display.chop_threshold`, which makes every float whose magnitude is below the
# threshold (200 here) print as 0 and so distorts head()/describe() output.
# Widening the column display is the assumed intent - confirm.
pd.set_option("display.max_columns", 200)

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
# Read the three competition files in one pass:
#   df1 -> sample submission, df2 -> test set, df3 -> training set.
df1, df2, df3 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/playground-series-s4e1/sample_submission.csv',
        '../input/playground-series-s4e1/test.csv',
        '../input/playground-series-s4e1/train.csv',
    )
)

# Display the first few rows of each dataframe

In [None]:
# Header line followed by the first five rows of the sample-submission frame.
print("First few rows of df1:", df1.head(), sep="\n")

In [None]:
# Header line followed by the first five rows of the test-set frame.
print("\nFirst few rows of df2:", df2.head(), sep="\n")

In [None]:
# Header line followed by the first five rows of the training-set frame.
print("\nFirst few rows of df3:", df3.head(), sep="\n")

In [None]:
# Build the working frame from the labelled training data (df3 = train.csv).
# NOTE(review): joining test.csv (df2) with sample_submission.csv (df1) on 'id'
# would take 'Exited' from the submission template - presumably placeholder
# values, not ground-truth labels - so every later plot and model would be fit
# on a fake target. Confirm that train.csv carries the real 'Exited' column.
# A copy is taken so later column assignments on merged_df leave df3 untouched.
merged_df = df3.copy()

In [None]:
# Preview the first five rows of merged_df.
merged_df.head(n=5)

In [None]:
# Call info() (with parentheses) so the column dtypes and non-null counts are
# printed; the bare attribute only shows the bound-method repr.
merged_df.info()

# EDA
* Explore your datasets (df1, df2, df3). 
* Understand the distribution of features, check for missing values, and visualize relationships between different features. 
* Pay special attention to the distribution of the target variable (Exited).

In [None]:
# Keep only the rows that have a value in every field listed below.
# (This removes incomplete rows; no columns are dropped.)
merged_df = merged_df.dropna(
    subset=["Geography", "Age", "HasCrCard", "IsActiveMember"],
    how="any",
)

In [None]:
# Per-column count of missing entries in merged_df.
print(
    "Missing values in the merged dataframe:",
    merged_df.isna().sum(),
    sep="\n",
)

In [None]:
# describe() output for merged_df, preceded by a header line.
print(
    "\nSummary statistics of numerical features:",
    merged_df.describe(),
    sep="\n",
)

In [None]:
# Bar chart of how many rows fall into each value of the 'Exited' target.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='Exited', data=merged_df, ax=ax)
ax.set_title('Distribution of Exited')
plt.show()

In [None]:
# Histogram every float64/int64 column of merged_df in one grid.
num_cols = merged_df.select_dtypes(include=['float64', 'int64']).columns
# DataFrame.hist opens its own figure (sized via `figsize`), so no separate
# plt.figure() call is made here - one would only render as an extra, empty figure.
merged_df[num_cols].hist(bins=20, figsize=(15, 10))
# The histogram grid is the current figure at this point, so the title lands on it.
plt.suptitle('Distribution of Numerical Features')
plt.show()

In [None]:
# Count plot for each object-dtype column of merged_df, two plots per row.
cat_cols = merged_df.select_dtypes(include=['object']).columns
n_grid_cols = 2
# Derive the row count from the number of columns so a fifth (or later) object
# column gets a subplot slot instead of raising; the floor of two rows keeps the
# 2x2 / 15x8 layout whenever there are four or fewer columns.
n_grid_rows = max(2, (len(cat_cols) + n_grid_cols - 1) // n_grid_cols)
plt.figure(figsize=(15, 4 * n_grid_rows))
for i, col in enumerate(cat_cols, 1):
    plt.subplot(n_grid_rows, n_grid_cols, i)
    sns.countplot(x=col, data=merged_df)
    plt.title(f'Distribution of {col}')
plt.tight_layout()
plt.show()

# Tenure Processing
'Tenure' represents the number of years a customer has been with the bank. If there are any negative values in 'Tenure', we'll assume they are errors and set them to zero. 

In [None]:
# Floor 'Tenure' at zero: negative values are treated as data-entry errors.
# clip() is the vectorized equivalent of applying max(x, 0) row by row and
# leaves missing values as they are.
merged_df['Tenure'] = merged_df['Tenure'].clip(lower=0)

# Data Preprocessing
* Handle missing values and outliers.
* Convert categorical variables (like 'Geography' and 'Gender') into numerical representations (one-hot encoding, label encoding).
* Standardize or normalize numerical features if needed.
* Check for imbalanced classes (if one class significantly outnumbers the other, you might need to balance it).

In [None]:
# Peek at the first five rows of merged_df before building features.
merged_df.head(n=5)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Build the feature matrix X and target y from merged_df, then split 80/20.

# Column groups: the first list is standardized, the second is one-hot encoded.
numeric_cols = ["CreditScore", "Age", "Tenure", "Balance", "EstimatedSalary"]
ohe_cols = [
    "Geography",
    "Gender",
    "NumOfProducts",
    "HasCrCard",
    "IsActiveMember",
]

# Separate numerical and categorical columns
numeric_data = merged_df[numeric_cols]
categorical_data = merged_df[ohe_cols]

# Standardize numerical features.
# fit_transform returns a bare array, so the original row labels are re-attached
# explicitly. Without `index=` the scaled frame would get a fresh 0..n-1 index
# while the one-hot frame and y keep merged_df's labels; once rows have been
# dropped from merged_df the concat below would then misalign and add NaN rows.
# NOTE: the scaler is fit on all rows before the split, so test-split statistics
# influence these scaled values.
scaler = StandardScaler()
numeric_data_scaled = pd.DataFrame(
    scaler.fit_transform(numeric_data),
    columns=numeric_cols,
    index=numeric_data.index,
)

# One-hot encode categorical features; drop_first removes one level per column.
categorical_data_encoded = pd.get_dummies(categorical_data, columns=ohe_cols, drop_first=True)

# Concatenate numerical and one-hot encoded categorical features (aligned on index)
X = pd.concat([numeric_data_scaled, categorical_data_encoded], axis=1)

# Define the target variable 'Exited'
y = merged_df['Exited']

# Features and target must describe exactly the same rows.
assert X.index.equals(y.index), "feature rows and target rows are misaligned"

# Split the data into training and testing sets (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ML Model

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score

# Train a small feed-forward network on X_train / y_train (defined by the
# preprocessing cell) and report classification metrics on X_test / y_test.

# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation and batch
# shuffling - and therefore the reported metrics - repeat from run to run.
tf.keras.utils.set_random_seed(42)

# Standardize every feature column using statistics from the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build a simple ANN model: two ReLU hidden layers and a single sigmoid output
model = models.Sequential([
    layers.Input(shape=(X_train_scaled.shape[1],)),
    layers.Dense(64, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model; validation_split holds out 20% of the training rows for validation
model.fit(X_train_scaled, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Convert y_test to integers so it is compared against integer class predictions
y_test = y_test.astype(int)

# Evaluate the model on the test set.
# predict() returns one column of probabilities; flatten it before thresholding
# so the metrics receive a 1-D label vector.
y_pred_proba = model.predict(X_test_scaled)
y_pred = (y_pred_proba.ravel() > 0.5).astype(int)

# Print evaluation metrics
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("Class distribution in y_test:", y_test.value_counts())

# THE END