# Research Questions:
## Which customers of this European bank are most likely to churn?
### Correlation between different variables and churn rates.

* How does the churn rate vary across different demographic groups (gender, age)?
* Which customers are most likely to churn based on their credit score?
* Which customers are most likely to churn based on their estimated salary?

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns
import scipy.stats as stats

## 1. Data Understanding

In [None]:
# Load the bank-churn CSV from the Kaggle input directory and preview the
# first three rows.
DATA_PATH = '/kaggle/input/bank-churn-modelling/Churn_Modelling.csv'
df = pd.read_csv(DATA_PATH)
df.head(n=3)

In [None]:
# Dimensions of the loaded dataset as (rows, columns).
df.shape

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the dataset's columns.
df.describe()

In [None]:
# Column labels of the loaded dataset.
df.columns

In [None]:
# Data type of each column.
df.dtypes

In [None]:
# Age distribution (50 bins) of the rows where Exited == 1.
churned_ages = df.loc[df['Exited'] == 1, 'Age']
churned_ages.sort_index().plot.hist(bins=50)


## 2. Data Prep
* Dropping irrelevant columns
* Reordering the remaining columns
* Checking for missing values
* Identifying duplicate rows (if any)

In [None]:
# Optional filter, currently disabled: keep only rows with IsActiveMember == 1.
# df = df[df['IsActiveMember'] == 1]
# With the filter commented out, this shows the shape of the unfiltered frame.
df.shape

In [None]:
# Remove the columns this notebook treats as irrelevant (row/customer
# identifiers, surname and tenure). The drop happens in place on df.
unused_columns = ['RowNumber', 'Surname', 'Tenure', 'CustomerId']
df.drop(columns=unused_columns, inplace=True)

In [None]:
# Check which columns remain in the frame.
df.columns

In [None]:
# Reorder the columns: Gender, Age and Geography first, the account-related
# columns next, and the Exited flag last.
column_order = [
    'Gender', 'Age', 'Geography',
    'CreditScore', 'Balance', 'EstimatedSalary',
    'NumOfProducts', 'HasCrCard', 'IsActiveMember',
    'Exited',
]
df = df[column_order]

In [None]:
# Preview the first three rows of the prepared frame.
df.head(3)

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Rows that repeat an earlier row in every column (duplicated() flags each
# occurrence after the first). Only the columns currently in df are compared.
df.loc[df.duplicated()]

## 3. Data Analysis
### GENDER:

In [None]:
# Churn rate per gender in percent: mean of the Exited flag, times 100.
churn_percentages_gender = df.groupby('Gender')['Exited'].mean().mul(100)

# Two-column frame (Gender, ChurnRate) used for plotting.
plot_data_gender = churn_percentages_gender.rename('ChurnRate').reset_index()

# Bar chart with one bar per gender.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(plot_data_gender['Gender'], plot_data_gender['ChurnRate'])
ax.set_title("Gender vs Churn Rate")
ax.set_xlabel("Gender")
ax.set_ylabel("Churn Rate (%)")
plt.show()

In [None]:
# Per-customer Exited values for each gender (one value per row, not an
# aggregated rate).
churn_rate_male = df.loc[df['Gender'] == 'Male', 'Exited']
churn_rate_female = df.loc[df['Gender'] == 'Female', 'Exited']

# Two-sample t-test without the equal-variance assumption (Welch's t-test).
# NOTE(review): if Exited is a 0/1 flag, a two-proportion z-test or a
# chi-square test would be the textbook choice -- confirm this is intended.
welch_result = stats.ttest_ind(churn_rate_male, churn_rate_female, equal_var=False)
t_statistic, p_value = welch_result

print("t-statistic:", t_statistic)
print("p-value:", p_value)

# Compare the p-value against the 5% significance level.
alpha = 0.05
verdict = (
    "Reject the null hypothesis: There is a significant difference in churn rates between men and women."
    if p_value < alpha
    else "Fail to reject the null hypothesis: There is no significant difference in churn rates between men and women."
)
print(verdict)


### AGE:

In [None]:
# Keep only ages that occur more than 10 times, so every per-age churn rate
# below is averaged over more than 10 customers.
age_counts = df['Age'].value_counts()  # computed once, reused for the mask
indices = age_counts[age_counts > 10].index
age_filtered_df = df[df['Age'].isin(indices)]

# Churn percentage per age: mean of the Exited flag, scaled to percent.
churn_percentages_age = age_filtered_df.groupby('Age')['Exited'].mean() * 100

# Two-column frame (Age, ChurnRate) used for plotting and for the regression.
plot_data_age = pd.DataFrame({
    'Age': churn_percentages_age.index,
    'ChurnRate': churn_percentages_age.values
})

x = plot_data_age['Age']
y = plot_data_age['ChurnRate']

# Scatterplot: one point per age.
plt.subplots(figsize=(10, 6))
plt.scatter(x, y, alpha=0.5)
plt.title("Age vs Churn Rate")
plt.xlabel("Age")
plt.ylabel("Churn Rate (%)")

# Degree-1 least-squares fit; c[0] is the slope and c[1] the intercept.
c = np.polyfit(x, y, 1)
y_fit = c[0] * x + c[1]
plt.plot(x, y_fit, '-r', label=f'Regression Line: {c[0]:.2f} * Age + {c[1]:.2f}')
plt.legend()

plt.show()

In [None]:
# Per-age churn-rate table: one row per age, churn rate in percent.
x = plot_data_age['Age']
y = plot_data_age['ChurnRate']

# Least-squares regression of churn rate on age. The reported p-value tests
# the null hypothesis that the slope is zero.
regression = stats.linregress(x, y)
slope, intercept, r_value, p_value, std_err = regression

# Report the fitted line and its p-value.
for label, value in (
    ("Slope (c[0]):", slope),
    ("Intercept (c[1]):", intercept),
    ("P-value:", p_value),
):
    print(label, value)

# Compare the p-value against the 5% significance level.
alpha = 0.05
verdict = (
    "Reject the null hypothesis: There is a significant linear relationship between age and churn rate."
    if p_value < alpha
    else "Fail to reject the null hypothesis: There is no significant linear relationship between age and churn rate."
)
print(verdict)

### CREDIT SCORE:
Correlation between credit score (cs) and churn rate

In [None]:
# Keep only credit scores that occur more than 10 times.
cs_counts = df['CreditScore'].value_counts()
indices = cs_counts.loc[cs_counts > 10].index
cs_filtered_df = df[df['CreditScore'].isin(indices)]

# Churn percentage per credit score: mean of the Exited flag, times 100.
churn_percentages_cs = cs_filtered_df.groupby('CreditScore')['Exited'].mean().mul(100)

# Two-column frame for plotting. Despite its name, 'RoundedCreditScore'
# holds the credit scores exactly as they appear in df (no rounding is applied).
plot_data_cs = (
    churn_percentages_cs
    .rename('ChurnRate')
    .rename_axis('RoundedCreditScore')
    .reset_index()
)

x = plot_data_cs['RoundedCreditScore']
y = plot_data_cs['ChurnRate']

# Scatterplot: one point per credit score.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(x, y, alpha=0.5)
ax.set_xlabel('Credit Score (CS)')
ax.set_ylabel('Churn Rate (%)')
ax.set_title('Credit Score vs Churn Rate')
ax.grid(True)

# Degree-1 least-squares fit; c[0] is the slope and c[1] the intercept.
c = np.polyfit(x, y, 1)
y_fit = c[0] * x + c[1]
ax.plot(x, y_fit, '-r', label=f'Regression Line: {c[0]:.4f} * CS + {c[1]:.2f}')
ax.legend()

plt.show()

### SALARY:
Correlation between estimated salary and churn rate

In [None]:
# Bucket salaries to the nearest thousand, then express each bucket in
# thousands. Both helper columns are stored on df.
df['RoundedSalary'] = df['EstimatedSalary'].round(decimals=-3)
df['SalaryInThousands'] = df['RoundedSalary'].div(1000)

# Churn percentage per salary bucket: mean of the Exited flag, times 100.
churn_percentages_salary = df.groupby('SalaryInThousands')['Exited'].mean().mul(100)

# Two-column frame for plotting. Its 'RoundedSalary' column carries the
# bucket value in thousands (the groupby key), not the raw rounded salary.
plot_data = (
    churn_percentages_salary
    .rename('ChurnRate')
    .rename_axis('RoundedSalary')
    .reset_index()
)

x = plot_data['RoundedSalary']
y = plot_data['ChurnRate']

# Scatterplot: one point per salary bucket.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(x, y, alpha=0.5)
ax.set_xlabel('Salary (in thousands)')
ax.set_ylabel('Churn Rate (%)')
ax.set_title('Salary vs Churn Rate')
ax.grid(True)

# Degree-1 least-squares fit; c[0] is the slope and c[1] the intercept.
c = np.polyfit(x, y, 1)
y_fit = c[0] * x + c[1]
ax.plot(x, y_fit, '-r', label=f'Churn Rate = {c[0]:.4f} * Salary + {c[1]:.2f}')
ax.legend()

plt.show()