Import

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from scipy import stats

Data sebelumnya sudah disiapkan dengan melakukan join semua tabel berdasarkan id-nya (category, education, marital, dan status) ke tabel `customer_data_history`, sehingga menghasilkan file CSV `CLEAN_customer_data_history.csv`.

In [None]:
# Load the pre-joined customer history export and preview the first rows.
df = pd.read_csv("CLEAN_customer_data_history.csv")
df.head()

In [None]:
# Show the column labels of the loaded frame.
df.columns

In [None]:
# Print the per-column dtype and non-null count summary.
df.info()

In [None]:
# Show descriptive statistics (count, mean, std, quartiles, ...) of the frame.
df.describe()

In [None]:
# Remove the client number column before analysis.
# errors="ignore" keeps this cell safe to re-run: without it, a second
# execution raises KeyError because the column is already gone.
df = df.drop(columns=["Clientnum"], errors="ignore")

In [None]:
# Names of every column stored with the generic "object" dtype.
object_columns = df.select_dtypes(include="object").columns

# For each of them, print a header, the distinct values, and a blank line.
for column in object_columns:
    print(f"Unique values in {column}:", df[column].unique(), "", sep="\n")

Feature Engineering

In [None]:
# Encode the status label as an integer: 1 = existing, 0 = attrited.
# Series.map is used instead of Series.replace because replacing strings with
# ints relies on pandas' implicit object -> int downcasting, which is
# deprecated; map builds a numeric column directly. A status value that is
# not in the mapping becomes NaN rather than being left behind as text.
status_codes = {"Existing Customer": 1, "Attrited Customer": 0}
df["Status_int"] = df["Status"].map(status_codes)

In [None]:
# (lowest age, highest age) per generation label, listed in ascending order.
# Only the lower bounds and the single largest upper bound feed the bin
# edges below, so the ranges are expected to be contiguous.
age_ranges = {
    "Gen Z": (0, 24),
    "Millennial": (25, 40),
    "Gen X": (41, 55),
    "Baby Boomer": (56, 75),
    "Silent Generation": (76, 100),
}

# pd.cut builds right-closed intervals (left, right], so every left edge is
# "lower bound - 1" to keep the lower bound itself inside its own bin. The
# final edge is one past the largest upper bound.
# The loop variables are named low/high rather than `range`, which would
# shadow the builtin.
lower_edges = [low - 1 for low, _ in age_ranges.values()]
closing_edge = max(high for _, high in age_ranges.values()) + 1

df["Generation"] = pd.cut(
    df["Customer Age"],
    bins=lower_edges + [closing_edge],
    labels=list(age_ranges),
)

EDA

In [None]:
# Slice labels for the pie chart, and the number of rows carrying each status.
labels = ["Attrited Customer", "Existing Customer"]
counts = [int((df["Status"] == label).sum()) for label in labels]
attrited_count, existing_count = counts

# Draw the share of each status as a pie with one-decimal percentages.
plt.figure(figsize=(6, 6))
plt.pie(
    counts,
    labels=labels,
    autopct="%1.1f%%",
    startangle=90,
)
plt.title("Distribution of Customer Status")
# An equal aspect ratio keeps the pie circular instead of elliptical.
plt.axis("equal")
plt.show()

In [None]:
# "Clientnum" is removed from df by the earlier drop cell, so attribute access
# (df.Clientnum) raises AttributeError and aborts a top-to-bottom run.
# Report whether the column is still present instead.
"Clientnum" in df.columns

In [None]:
# Numeric features to plot. They are listed explicitly; the dtype-based
# alternative is kept below for reference.
# numerical_columns = df.select_dtypes(include='number').columns
numerical_columns = [
    "Avg Open To Buy",
    "Avg Utilization Ratio",
    "Contacts Count 12 mon",
    "Credit Limit",
    "Customer Age",
    "Dependent count",
    "Months Inactive 12 mon",
    "Months on book",
    "Total Relationship Count",
    "Total Revolving Bal",
    "Total Trans Amt",
    "Total Trans Ct",
]

# Grid layout: three plots per row and as many rows as needed
# (ceiling division written with negated floor division).
num_cols = 3
num_rows = -(-len(numerical_columns) // num_cols)

# Build the grid and flatten it so subplots can be walked in order.
fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 10))
axes = axes.flatten()

# One histogram with a KDE overlay per feature, split by customer status.
for ax, col in zip(axes, numerical_columns):
    sns.histplot(data=df, x=col, hue="Status", kde=True, ax=ax)
    ax.set(
        title=f"Distribution of {col} by Status",
        xlabel=col,
        ylabel="Count",
    )

# Delete any grid cells that did not receive a plot.
for spare_ax in axes[len(numerical_columns):]:
    fig.delaxes(spare_ax)

plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlation of the numeric columns only.
# df still holds text and categorical columns (e.g. "Status", "Generation"),
# and on pandas >= 2.0 a bare df.corr() raises ValueError on them. Selecting
# the numeric columns first works on old and new pandas versions alike.
correlation_matrix = df.select_dtypes(include="number").corr()

# Annotated heatmap of the correlation matrix.
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
plt.title("Correlation Matrix")
plt.show()

In [None]:
# Non-numeric customer attributes to summarise.
selected_columns = [
    "Generation",
    "Card Category",
    "Education Level",
    "Gender",
    "Income Category",
    "Marital Status",
]

# include="all" makes describe() report on every selected column,
# whatever its dtype.
feature_summary = df.loc[:, selected_columns].describe(include="all")
feature_summary

In [None]:
# # Visualize the distribution of numeric features
# numeric_columns = ['Avg Open To Buy', 'Avg Utilization Ratio', 'Credit Limit', 'Customer Age', 'Dependent count',
#                    'Months Inactive 12 mon', 'Months on book', 'Total Relationship Count', 'Total Revolving Bal',
#                    'Total Trans Amt', 'Total Trans Ct']
# sns.set(style='ticks')
# sns.pairplot(df[numeric_columns])
# plt.show()

In [None]:
# Categorical features to plot, one count plot per feature split by status.
categorical_columns = [
    "Generation",
    "Card Category",
    "Education Level",
    "Gender",
    "Income Category",
    "Marital Status",
]
num_plots = len(categorical_columns)
num_cols = 3
# Derive the row count from the number of plots (ceiling division) instead of
# hard-coding 2, so adding a seventh column does not index past the grid.
# This matches how the numeric histogram grid is sized. max(1, ...) keeps
# plt.subplots valid even if the list is empty.
num_rows = max(1, (num_plots + num_cols - 1) // num_cols)

fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 10))

# Flatten the axes array to easily iterate over subplots
axes = axes.flatten()

for ax, col in zip(axes, categorical_columns):
    sns.countplot(x=col, data=df, hue="Status", ax=ax)
    ax.set_title(f"Distribution of {col}")
    ax.legend(title="Status")
    ax.set_xlabel(col)
    ax.set_ylabel("Count")
    # Category labels can be long; tilt them so they do not overlap.
    ax.tick_params(axis="x", rotation=45)

# Remove any unused subplots (the slice is empty when the grid is full).
for unused_ax in axes[num_plots:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

In [None]:
# # Visualize the distribution of categorical features
# categorical_columns = ['Generation', 'Card Category', 'Education Level', 'Gender', 'Income Category', 'Marital Status']
# for col in categorical_columns:
#     plt.figure(figsize=(8, 6))
#     sns.countplot(x=col, data=df, hue='Status')
#     plt.title(f'Distribution of {col}')
#     plt.show()

In [None]:
def compare_kdeplots(df, col, hue=None):
    """Overlay histograms (with KDE curves) of ``col`` for each customer status.

    A new 8x6 figure is created, one histogram is drawn for the rows whose
    "Status" is "Existing Customer" and one for "Attrited Customer", and the
    figure is shown.

    Args:
        df: DataFrame with a "Status" column and the column named ``col``
            (plus the ``hue`` column when one is given).
        col: Name of the column whose distribution is plotted.
        hue: Optional column name forwarded to ``sns.histplot`` to split each
            status group further.

    Returns:
        None. The plot is displayed via ``plt.show()``.
    """
    plt.figure(figsize=(8, 6))
    for status in ("Existing Customer", "Attrited Customer"):
        # Pass the filtered frame as long-form data (data= / x=). Handing
        # seaborn a bare Series makes it wide-form input, for which a `hue`
        # variable cannot be assigned, so any non-None hue would raise.
        sns.histplot(
            data=df[df["Status"] == status],
            x=col,
            label=status,
            kde=True,
            hue=hue,
        )
    plt.title(f"Distribution of {col} by Customer Status")
    plt.xlabel(col)
    plt.ylabel("Count")
    if hue is None:
        # Legend built from the per-status labels set above. With a hue,
        # seaborn draws its own legend for the hue levels and an explicit
        # call here would replace it.
        plt.legend()
    plt.show()

In [None]:
# Columns whose per-status distributions are compared, one figure each.
columns_to_compare = (
    "Months Inactive 12 mon",
    "Total Trans Ct",
    "Avg Utilization Ratio",
)
for col in columns_to_compare:
    compare_kdeplots(df, col)