In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [5]:
url = 'https://raw.githubusercontent.com/vanshjaiswal/Machine-Learning-Projects/main/Project_Dataset/Churn_Modelling.csv'
data = pd.read_csv(url)
data.shape

(10002, 14)

In [3]:
data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42.0,2,0.0,1,1.0,1.0,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41.0,1,83807.86,1,0.0,1.0,112542.58,0
2,3,15619304,Onio,502,France,Female,42.0,8,159660.8,3,1.0,0.0,113931.57,1
3,4,15701354,Boni,699,France,Female,39.0,1,0.0,2,0.0,0.0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43.0,2,125510.82,1,,1.0,79084.1,0


In [4]:
data.isnull().sum()

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          1
Gender             0
Age                1
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          1
IsActiveMember     1
EstimatedSalary    0
Exited             0
dtype: int64

In [9]:
data=data.dropna()

In [8]:
data.isna().sum()

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          1
Gender             0
Age                1
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          1
IsActiveMember     1
EstimatedSalary    0
Exited             0
dtype: int64

In [10]:
data.isnull().sum()

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [12]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9998 entries, 0 to 10001
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        9998 non-null   int64  
 1   CustomerId       9998 non-null   int64  
 2   Surname          9998 non-null   object 
 3   CreditScore      9998 non-null   int64  
 4   Geography        9998 non-null   object 
 5   Gender           9998 non-null   object 
 6   Age              9998 non-null   float64
 7   Tenure           9998 non-null   int64  
 8   Balance          9998 non-null   float64
 9   NumOfProducts    9998 non-null   int64  
 10  HasCrCard        9998 non-null   float64
 11  IsActiveMember   9998 non-null   float64
 12  EstimatedSalary  9998 non-null   float64
 13  Exited           9998 non-null   int64  
dtypes: float64(5), int64(6), object(3)
memory usage: 1.1+ MB


In [13]:
data = data.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1)

In [16]:
le = LabelEncoder()
data['Gender'] = le.fit_transform(data['Gender'])
data['Geography'] = le.fit_transform(data['Geography'])

In [17]:
X = data.drop('Exited', axis=1)
y = data['Exited']

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=29)

In [19]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [20]:
# Logistic Regression
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

In [21]:
# Random Forest
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

In [22]:
# Support Vector Machine (SVM)
svm_model = SVC()
svm_model.fit(X_train, y_train)

In [25]:
# Logistic Regression Performance
y_pred_lr = lr_model.predict(X_test)
print("Logistic Regression Report:\n", classification_report(y_test, y_pred_lr))

# Random Forest Performance
y_pred_rf = rf_model.predict(X_test)
print("Random Forest Report:\n", classification_report(y_test, y_pred_rf))

# SVM Performance
y_pred_svm = svm_model.predict(X_test)
print("SVM Report:\n", classification_report(y_test, y_pred_svm))

Logistic Regression Report:
               precision    recall  f1-score   support

           0       0.83      0.97      0.90      1620
           1       0.56      0.16      0.25       380

    accuracy                           0.82      2000
   macro avg       0.70      0.57      0.57      2000
weighted avg       0.78      0.82      0.77      2000

Random Forest Report:
               precision    recall  f1-score   support

           0       0.88      0.97      0.93      1620
           1       0.78      0.46      0.58       380

    accuracy                           0.87      2000
   macro avg       0.83      0.72      0.75      2000
weighted avg       0.87      0.87      0.86      2000

SVM Report:
               precision    recall  f1-score   support

           0       0.87      0.98      0.92      1620
           1       0.83      0.37      0.51       380

    accuracy                           0.87      2000
   macro avg       0.85      0.68      0.72      2000
weighted 

In [26]:
import pickle
with open('best_model.pkl', 'wb') as f:
    pickle.dump(rf_model, f)