In [174]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [175]:
# Load the raw churn dataset; the path is relative to the notebook's working directory.
csv_path = "Bank Customer Churn Prediction.csv"
df = pd.read_csv(csv_path)

# Data Exploration

In [None]:
# Preview the first rows (pandas shows 5 by default) to check how the columns were loaded.
df.head()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

In [None]:
# Column dtypes and non-null counts; text dtypes on numeric-looking columns hint at dirty values.
df.info()

In [None]:
# Number of missing values per column, showing only the columns that have any.
df.isna().sum().loc[lambda counts: counts > 0]

In [None]:
# Distinct raw gender labels, to spot spelling variants before cleaning.
df["gender"].unique()

In [None]:
# Distinct raw country labels, to spot spelling variants before cleaning.
df["country"].unique()

# Correcting typos

In [None]:
# Map every misspelled gender label onto its canonical spelling.
# The result is assigned back to the column instead of calling
# `replace(..., inplace=True)` on `df["gender"]`: the in-place form mutates an
# intermediate Series, which pandas warns about (chained assignment) and which
# no longer updates `df` once Copy-on-Write is enabled.
gender_fixes = {
    "Feme": "Female",
    "F": "Female",
    "Feale": "Female",
    "Femal": "Female",
    "Femmale": "Female",
    "Maale": "Male",
    "Maaaale": "Male",
    "Maa": "Male",
    "Mal e": "Male",
    "Malee": "Male",
}
df["gender"] = df["gender"].replace(gender_fixes)

# Show the remaining labels to confirm the cleanup.
df["gender"].unique()

In [None]:
# Raw country labels kept for reference while writing the mapping in the next cell
# (presumably pasted from the unique() output above; confirm). This cell has no
# effect on `df`.
['France', 'Spain', 'GERMANY', 'Paris', 'Germany', 'Gesrmany',
       'Franc', 'German', 'Fra_nce', 'GeMrmany', 'Fran', 'Sp']

In [None]:
# Map every misspelled or mis-cased country label onto its canonical spelling.
# The result is assigned back to the column instead of calling
# `replace(..., inplace=True)` on `df["country"]`: the in-place form mutates an
# intermediate Series, which pandas warns about (chained assignment) and which
# no longer updates `df` once Copy-on-Write is enabled.
country_fixes = {
    "Paris": "France",
    "Franc": "France",
    "Fra_nce": "France",
    "Fran": "France",
    "GERMANY": "Germany",
    "Gesrmany": "Germany",
    "German": "Germany",
    "GeMrmany": "Germany",
    "Sp": "Spain",
}
df["country"] = df["country"].replace(country_fixes)

# Show the remaining labels to confirm the cleanup.
df["country"].unique()

# Handling NaN Values

Handle Missing Values:

Identify missing values in numeric and categorical columns.
Use appropriate strategies:
- Numeric columns: Fill with mean, median, or another statistical method.
- Categorical columns: Fill with mode or "Unknown."

In [None]:
# Columns that still contain missing values, with their counts, before imputation.
df.isna().sum().loc[lambda counts: counts > 0]

In [186]:
# Impute missing values column by column.
#
# The statistics are computed over the whole column rather than per `churn`
# group: filling a feature with a value that depends on the label leaks the
# target into the features, and the label is not available when the model is
# used on new customers.

# "no salary" is a text placeholder that prevents the column from being numeric.
# Turn it into NaN explicitly (older pandas versions ignored an explicit
# `None` replacement value) and only then convert the column to floats.
df["estimated_salary"] = (
    df["estimated_salary"].replace("no salary", np.nan).astype(float)
)

# Continuous columns: fill with the median, which is robust to outliers.
median_filled = ["credit_score", "age", "estimated_salary"]
for column in median_filled:
    df[column] = df[column].fillna(df[column].median())

# Discrete / categorical columns: fill with the most frequent value.
mode_filled = ["products_number", "credit_card", "active_member"]
for column in mode_filled:
    df[column] = df[column].fillna(df[column].mode().iloc[0])

In [None]:
# Re-check after imputation: any column listed here still has missing values.
df.isna().sum().loc[lambda counts: counts > 0]

# Delete unnecessary columns

In [188]:
# Drop the customer_id column before modelling.
# Rebinding `df` with errors="ignore" keeps the cell idempotent: re-running it
# after the column is already gone no longer raises a KeyError.
df = df.drop(columns=["customer_id"], errors="ignore")

# Encoding

In [None]:
# Show a single row to see which columns still hold text before encoding.
df.head(1)

In [190]:
# One-hot encode the two text columns; dtype=int produces 0/1 integer indicator columns.
cat = ["gender", "country"]
df = pd.get_dummies(data=df, columns=cat, dtype=int)

# Handling outliers

In [None]:
# Continuous columns inspected for outliers in this section
# (column name fixed: "estimated_salary", not "estimated_salaray").
numerics = ["credit_score", "estimated_salary", "age", "balance"]

# Pass the Series as `x=` explicitly: a positional Series is interpreted
# differently across seaborn versions, while `x=` always draws a horizontal
# box that matches the "Age" x-axis label below.
ax = sns.boxplot(x=df["age"])
ax.set_title('Age Box Plot - Outliers Detection')
ax.set_xlabel('Age')
plt.show()

In [None]:
# Bar chart of how many customers have each age, with ages in ascending order.
age_counts = df["age"].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(10, 6))
age_counts.plot.bar(ax=ax)
# Tilt the tick labels so neighbouring ages do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
fig.tight_layout()
plt.show()

In [None]:
# Box plot of estimated salary for outlier inspection.
# `x=` is passed explicitly because a positional Series is interpreted
# differently across seaborn versions; the plot also gets a title so it can be
# read on its own.
ax = sns.boxplot(x=df["estimated_salary"])
ax.set_title("Estimated Salary Box Plot - Outliers Detection")
ax.set_xlabel("Estimated salary")
plt.show()

In [None]:
# Box plot of credit score for outlier inspection.
# `x=` is passed explicitly because a positional Series is interpreted
# differently across seaborn versions; the plot also gets a title so it can be
# read on its own.
ax = sns.boxplot(x=df["credit_score"])
ax.set_title("Credit Score Box Plot - Outliers Detection")
ax.set_xlabel("Credit score")
plt.show()

In [None]:
# Smallest balance in the data; a negative value here would be suspicious.
df["balance"].min()

In [None]:
# Box plot of account balance for outlier inspection.
# `x=` is passed explicitly because a positional Series is interpreted
# differently across seaborn versions; the plot also gets a title so it can be
# read on its own.
ax = sns.boxplot(x=df["balance"])
ax.set_title("Balance Box Plot - Outliers Detection")
ax.set_xlabel("Balance")
plt.show()

In [None]:
# Check the lower bound of age; the minimum was judged plausible ("seems okay"),
# so only the upper end is filtered in the next cell.
df["age"].min()

In [198]:
# Flag rows whose age is above 120, which is not a plausible customer age.
outliers_age = df["age"].gt(120)

# Keep only the rows that were not flagged.
df = df.loc[~outliers_age]

**- Credit score, balance, and estimated salary already look fine, so no further action is needed for them.**

# Train test split

In [199]:
# Separate the feature matrix from the target column.
X = df.drop(columns="churn")
y = df["churn"]

In [200]:
# Hold out 20% of the rows for evaluation.
# random_state fixes the shuffle so the split (and every metric derived from it)
# is the same on each run; stratify=y keeps the churn / non-churn ratio equal in
# both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scaling

In [None]:
# Distinct values of products_number, to spot entries that are not product counts.
df["products_number"].unique()

In [None]:
# Replace the "np" placeholder in products_number with "0".
# The train/test split has already been taken at this point, and X_train / X_test
# are separate objects from `df`, so fixing `df` alone would never reach the data
# the model is trained on; the same mapping is therefore applied to both splits.
# Results are assigned back rather than using `inplace=True` on a column
# selection, which pandas warns about as chained assignment.
products_fixes = {"np": "0"}
df["products_number"] = df["products_number"].replace(products_fixes)
X_train["products_number"] = X_train["products_number"].replace(products_fixes)
X_test["products_number"] = X_test["products_number"].replace(products_fixes)

In [208]:
# Continuous columns to rescale into the [0, 1] range.
numerics = ["credit_score", "estimated_salary", "age", "balance"]

# Min-max scaling was chosen because these columns hold no negative values.
# The scaler learns its min/max from the training split only, and the same
# fitted scaler is then applied to both splits.
scaler = MinMaxScaler()
scaler.fit(X_train[numerics])

X_train[numerics] = scaler.transform(X_train[numerics])
X_test[numerics] = scaler.transform(X_test[numerics])

In [209]:
# Integer-encode products_number.
# The encoder learns its label set from both splits: LabelEncoder.transform
# raises a ValueError for any label it did not see during fit, so a rare value
# that happened to land only in the test split would break the test-set
# transform. Only the set of labels is shared here, so the integer codes are
# unchanged whenever the training split already contains every label.
le = LabelEncoder()
le.fit(pd.concat([X_train["products_number"], X_test["products_number"]]))
X_train["products_number"] = le.transform(X_train["products_number"])

In [210]:
# Apply the already-fitted encoder to the test split so both splits share the same codes.
X_test = X_test.assign(
    products_number=le.transform(X_test["products_number"])
)

# Model Training

In [None]:
# Train a random forest on the training split and evaluate it on the held-out split.
# random_state fixes the bootstrap sampling and feature sub-sampling so the
# reported metrics are identical on every run.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
pred = rf.predict(X_test)

# Per-class precision, recall and F1 on the test split.
print(classification_report(y_test, pred))