# 1. Imports

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib

# 2. Data Reading

In [5]:
# Load the training CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where this exact location exists.
data = pd.read_csv(
    filepath_or_buffer=r'C:\Users\Abhishek\Documents\churn-rate-project\Data\train.csv',
)
data  # bare last expression so the notebook renders the frame

Unnamed: 0,surname,creditscore,geography,gender,age,tenure,balance,numofproducts,hascrcard,isactivemember,estimatedsalary,exited
0,Howard,727,Germany,Male,34,2,146407.11,1,1,1,72073.72,0
1,Chukwunonso,773,Spain,Female,39,4,0.00,2,0,1,182081.45,0
2,Chia,759,Spain,Male,31,8,0.00,2,1,1,99086.74,0
3,Kent,626,Germany,Male,35,4,88109.81,1,1,1,32825.50,0
4,Mazzi,686,France,Female,34,3,123971.51,2,1,0,147794.63,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1915,T'ang,632,France,Female,27,3,107375.82,1,1,1,62703.38,0
1916,Maclean,631,France,Male,33,7,0.00,1,1,1,58043.02,1
1917,Sun,651,Germany,Male,41,1,90218.11,1,1,0,174337.68,0
1918,Tucker,644,Germany,Female,44,8,106022.73,2,0,0,148727.42,0


# 3. Data Cleaning

In [9]:
# Drop the 'surname' column, which is not used as a model feature.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it no longer raises KeyError once the column is gone.
data = data.drop(columns=['surname'], errors='ignore')

In [11]:
# Separate the 'exited' target column from the remaining predictor columns.
y = data['exited']
X = data.drop(columns=['exited'])

In [13]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 4. Feature Encoding & Scaling

In [16]:
# Feature names grouped by the kind of preprocessing applied to them later:
# the first list is scaled, the second is one-hot encoded.
numerical_cols = [
    'creditscore',
    'age',
    'tenure',
    'balance',
    'numofproducts',
    'estimatedsalary',
]
categorical_cols = [
    'geography',
    'gender',
    'hascrcard',
    'isactivemember',
]

In [18]:
# Preprocessing steps, one per column group.
# drop='first' leaves out the first category of every encoded feature.
# NOTE(review): no handle_unknown is set, so the encoder's default handling
# applies to categories not seen during fit; confirm that is what serving needs.
categorical_transformer = OneHotEncoder(drop='first')
# Scaler with default settings for the numeric columns.
numerical_transformer = StandardScaler()

In [20]:
# Route each column group to its transformer. Each entry is
# (step name, transformer, columns it applies to).
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

In [22]:
# Chain preprocessing and the classifier into a single estimator so the same
# transformations are applied at fit time and at predict time.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),  # seeded for repeatable fits
])

In [24]:
# Fit the preprocessing steps and the classifier on the training split.
# Kept as the cell's last expression so the fitted pipeline is displayed.
model.fit(X_train, y=y_train)

In [26]:
# Predict on the held-out split and report accuracy to two decimals.
y_pred = model.predict(X=X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")

Accuracy: 0.85


In [28]:
# Persist the fitted pipeline (preprocessing + classifier) to 'model.pkl'
# in the current working directory.
joblib.dump(value=model, filename='model.pkl')

['model.pkl']

In [36]:
# Reload the pipeline from disk, replacing the in-memory `model`.
# NOTE(review): joblib.load unpickles the file, so only load artifacts from a trusted source.
model = joblib.load(filename='model.pkl')

In [50]:
# Feature names, in the same order as the values in each row of test_data.
columns = [
    'creditscore', 'geography', 'gender', 'age', 'tenure',
    'balance', 'numofproducts', 'hascrcard', 'isactivemember', 'estimatedsalary',
]

# One hand-written sample row (replace with actual test data or real-time input).
test_data = [
    [600, 'France', 'Male', 40, 3, 60000, 2, 1, 1, 50000],
]
test_df = pd.DataFrame(data=test_data, columns=columns)

In [54]:
# Load the test CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where this exact location exists.
test_ds = pd.read_csv(
    filepath_or_buffer=r'C:\Users\Abhishek\Documents\churn-rate-project\Data\test.csv',
)

In [None]:
# Drop the 'surname' and 'exited' columns, mirroring what is removed from the
# training frame before fitting.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it no longer raises KeyError once the columns are gone.
test_ds = test_ds.drop(columns=['surname', 'exited'], errors='ignore')

In [78]:
# Single-row frame used as a smoke-test input for the model. Built from a dict
# so every value sits next to its column name; insertion order fixes column order.
temp = pd.DataFrame([{
    'creditscore': 600,
    'geography': 'France',
    'gender': 'Male',
    'age': 40,
    'tenure': 3,
    'balance': 60000,
    'numofproducts': 2,
    'hascrcard': 1,
    'isactivemember': 1,
    'estimatedsalary': 50000,
}])

In [80]:
# Display the single-row sample frame to check its columns and values.
temp

Unnamed: 0,creditscore,geography,gender,age,tenure,balance,numofproducts,hascrcard,isactivemember,estimatedsalary
0,600,France,Male,40,3,60000,2,1,1,50000


In [74]:
# Display the test frame (the notebook truncates the rendering for long frames).
test_ds

Unnamed: 0,creditscore,geography,gender,age,tenure,balance,numofproducts,hascrcard,isactivemember,estimatedsalary
0,453,Germany,Female,28,3,139986.65,1,1,0,136846.75
1,691,France,Female,29,9,0.00,2,0,0,199635.93
2,631,France,Male,32,10,0.00,2,0,1,196342.66
3,694,France,Male,47,4,0.00,2,1,0,197528.62
4,494,Spain,Male,67,5,0.00,2,1,1,85890.16
...,...,...,...,...,...,...,...,...,...,...
595,711,France,Male,64,4,0.00,2,1,1,3185.67
596,645,France,Male,32,4,0.00,2,0,1,97628.08
597,510,France,Female,26,6,136214.08,1,0,0,159742.33
598,620,Spain,Male,38,0,0.00,2,1,1,38015.34


In [82]:
# Run the reloaded pipeline on the single-row sample; the result holds one
# predicted label per input row.
prediction = model.predict(X=temp)
print(f"Prediction: {prediction}")

Prediction: [0]


In [86]:
# First (and, for the one-row sample, only) predicted label.
prediction[0]

0