# CUSTOMER CHURN PREDICTION

In [1]:
import os

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
# Load the dataset.
# The CSV location can be overridden with the CUSTOMER_CSV environment
# variable so the notebook is not tied to one machine's directory layout;
# when the variable is unset, the original path is used unchanged.
file_path = os.environ.get("CUSTOMER_CSV", "C:/Users/admin/Desktop/customer.csv")
df = pd.read_csv(file_path)

# Check column names
print(df.columns)

# Inspect the first few rows of the DataFrame
print(df.head())


Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')
   RowNumber  CustomerId   Surname  CreditScore Geography  Gender  Age  \
0          1    15634602  Hargrave          619    France  Female   42   
1          2    15647311      Hill          608     Spain  Female   41   
2          3    15619304      Onio          502    France  Female   42   
3          4    15701354      Boni          699    France  Female   39   
4          5    15737888  Mitchell          850     Spain  Female   43   

   Tenure    Balance  NumOfProducts  HasCrCard  IsActiveMember  \
0       2       0.00              1          1               1   
1       1   83807.86              1          0               1   
2       8  159660.80              3          1               0   
3       1       0.00              2          0               0   


In [2]:
# Load the dataset.
# The CSV location can be overridden with the CUSTOMER_CSV environment
# variable; when it is unset, the original path is used unchanged.
file_path = os.environ.get("CUSTOMER_CSV", "C:/Users/admin/Desktop/customer.csv")
df = pd.read_csv(file_path)

In [3]:
# Check column names: print the column labels of the loaded DataFrame
print(df.columns)


Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')


In [4]:
# Inspect the first few rows of the DataFrame (head() returns 5 rows by default)
print(df.head())


   RowNumber  CustomerId   Surname  CreditScore Geography  Gender  Age  \
0          1    15634602  Hargrave          619    France  Female   42   
1          2    15647311      Hill          608     Spain  Female   41   
2          3    15619304      Onio          502    France  Female   42   
3          4    15701354      Boni          699    France  Female   39   
4          5    15737888  Mitchell          850     Spain  Female   43   

   Tenure    Balance  NumOfProducts  HasCrCard  IsActiveMember  \
0       2       0.00              1          1               1   
1       1   83807.86              1          0               1   
2       8  159660.80              3          1               0   
3       1       0.00              2          0               0   
4       2  125510.82              1          1               1   

   EstimatedSalary  Exited  
0        101348.88       1  
1        112542.58       0  
2        113931.57       1  
3         93826.63       0  
4         790

In [5]:
# Separate the label from the remaining columns:
# y is the 'Exited' column, X is every other column of df.
y = df['Exited']
X = df.drop(columns=['Exited'])

In [6]:
# Column groups consumed by the preprocessing step.
# Text-valued columns, routed to the one-hot encoder:
categorical_cols = [
    'Geography',
    'Gender',
]
# Columns routed to the scaler. HasCrCard and IsActiveMember look like 0/1
# flags in the data preview but are handled as numeric here.
numerical_cols = [
    'CreditScore',
    'Age',
    'Tenure',
    'Balance',
    'NumOfProducts',
    'HasCrCard',
    'IsActiveMember',
    'EstimatedSalary',
]


In [7]:
# Preprocessing for numerical data (you can add more steps if needed):
# StandardScaler standardises each column it is fitted on.
numerical_transformer = StandardScaler()


In [8]:
# Categorical preprocessing: a one-step pipeline that one-hot encodes values.
# handle_unknown='ignore' keeps transform from failing on a category that was
# not present when the encoder was fitted.
categorical_transformer = Pipeline(
    steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))],
)


In [9]:
# Route each column group to its transformer: numerical columns go to the
# scaler, categorical columns to the one-hot pipeline. Columns not listed in
# either group are dropped, since `remainder` is left at its default.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

In [10]:
# Define the model: a random forest classifier with a fixed random_state so
# repeated runs are seeded identically
model = RandomForestClassifier(random_state=42)


In [11]:
# Chain the preprocessor and the classifier into a single estimator so that
# fit/predict always run both stages in order
clf = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ]
)

In [12]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [13]:
# Fit the whole pipeline on the training split: the preprocessor is fitted and
# applied to X_train, then the model is trained on the transformed features
clf.fit(X_train, y_train)


In [14]:
# Run the fitted pipeline on the held-out test split to get predicted labels
y_pred = clf.predict(X_test)

In [15]:
# Score the test-set predictions: overall accuracy first, then the
# per-class precision / recall / F1 table
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

Accuracy: 0.8635
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.96      0.92      1607
           1       0.75      0.46      0.57       393

    accuracy                           0.86      2000
   macro avg       0.81      0.71      0.75      2000
weighted avg       0.85      0.86      0.85      2000



In [16]:
# Hypothetical customer to score: a single-row frame with the feature columns
# named in categorical_cols and numerical_cols
new_customer_data = pd.DataFrame([
    {
        'CreditScore': 750,
        'Geography': 'France',
        'Gender': 'Male',
        'Age': 40,
        'Tenure': 5,
        'Balance': 50000,
        'NumOfProducts': 2,
        'HasCrCard': 1,
        'IsActiveMember': 1,
        'EstimatedSalary': 60000,
    }
])


In [17]:
# Use the trained model to predict if the new customer will stay or leave
# (the input has one row, so the result holds a single label)
new_customer_prediction = clf.predict(new_customer_data)

In [18]:
# Turn the predicted label into a message: 1 reads as "leave", anything else as "stay"
print(
    "The new customer is predicted to leave the bank."
    if new_customer_prediction[0] == 1
    else "The new customer is predicted to stay with the bank."
)

The new customer is predicted to stay with the bank.


In [19]:
# Second hypothetical customer, described by the same feature columns
new_customer_data = pd.DataFrame([
    {
        'CreditScore': 376,
        'Geography': 'Germany',
        'Gender': 'Male',
        'Age': 29,
        'Tenure': 4,
        'Balance': 115046.74,
        'NumOfProducts': 2,
        'HasCrCard': 1,
        'IsActiveMember': 0,
        'EstimatedSalary': 119346.88,
    }
])

# Ask the fitted pipeline for this customer's predicted label
new_customer_prediction = clf.predict(new_customer_data)

# Turn the predicted label into a message: 1 reads as "leave", anything else as "stay"
print(
    "The new customer is predicted to leave the bank."
    if new_customer_prediction[0] == 1
    else "The new customer is predicted to stay with the bank."
)



The new customer is predicted to stay with the bank.


In [20]:
# Use the trained model to predict if the new customer will stay or leave.
# NOTE(review): new_customer_data is not reassigned after the previous cell,
# so this repeats the prediction already made there.
new_customer_prediction = clf.predict(new_customer_data)

In [21]:
# Turn the predicted label into a message: 1 reads as "leave", anything else as "stay"
print(
    "The new customer is predicted to leave the bank."
    if new_customer_prediction[0] == 1
    else "The new customer is predicted to stay with the bank."
)



The new customer is predicted to stay with the bank.
