In [1]:
import pandas as pd

# Load the churn dataset from the notebook's working directory and preview
# the first five rows.
data = pd.read_csv(filepath_or_buffer="Churn_Modelling.csv")
data.head(n=5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [2]:
# Count missing values per column (isna is the same check as isnull).
data.isna().sum()

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [3]:
# Show every row that is an exact repeat of an earlier row; an empty frame
# means there are no duplicates.
data.loc[data.duplicated(keep="first")]

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited


In [4]:
# Convert categorical values to numerical values.
# Gender is replaced by integer labels from a LabelEncoder; Geography is
# one-hot encoded with its first category dropped.
# NOTE: this cell rebinds `data`, so it is meant to be run once per kernel
# session (a second run no longer finds the "Geography" column).
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
data["Gender"] = label_encoder.fit(data["Gender"]).transform(data["Gender"])
data = pd.get_dummies(data=data, columns=["Geography"], drop_first=True)

In [6]:
# Feature selection: the columns passed to the models as predictors.
features = [
    "CreditScore",
    "Gender",
    "Age",
    "Tenure",
    "Balance",
    "NumOfProducts",
    "HasCrCard",
    "IsActiveMember",
    "EstimatedSalary",
    "Geography_Germany",
    "Geography_Spain",
]

In [7]:
# Predictor matrix (the selected feature columns) and the target column.
X = data.loc[:, features]
y = data.loc[:, "Exited"]

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=50,
)

In [9]:
# Feature scaling: learn the scaling statistics on the training split only,
# then apply the same fitted scaler to both splits.
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
scale.fit(X_train)
X_train = scale.transform(X_train)
X_test = scale.transform(X_test)

In [10]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 100 trees and a fixed seed, trained on the scaled
# training split.
model = RandomForestClassifier(
    n_estimators=100,
    random_state=50,
)
model.fit(X_train, y_train)

In [11]:
# Predict class labels for the held-out test split with the random forest.
pred = model.predict(X_test)

In [12]:
# Score the random forest predictions against the true test labels:
# confusion matrix, per-class precision/recall/F1 report, and overall accuracy.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
matrix = confusion_matrix(y_true=y_test, y_pred=pred)
report = classification_report(y_true=y_test, y_pred=pred)
accuracy = accuracy_score(y_true=y_test, y_pred=pred)

In [13]:
# Print the three random forest metrics, one after another.
print(matrix, report, accuracy, sep="\n")

[[1528   70]
 [ 215  187]]
              precision    recall  f1-score   support

           0       0.88      0.96      0.91      1598
           1       0.73      0.47      0.57       402

    accuracy                           0.86      2000
   macro avg       0.80      0.71      0.74      2000
weighted avg       0.85      0.86      0.84      2000

0.8575


In [14]:
from sklearn.linear_model import LogisticRegression

# Logistic regression baseline, trained on the scaled training split and
# scored on the held-out test split.
# Every name in this cell carries the `r` prefix so the fitted model and its
# predictions are not confused with the SVC cell's `s` / `s_pred`.
r = LogisticRegression(random_state=50)
r.fit(X_train,y_train)

r_pred = r.predict(X_test)
r_matrix = confusion_matrix(y_test,r_pred)
r_report = classification_report(y_test,r_pred)
r_accuracy = accuracy_score(y_test,r_pred)

print(r_matrix)
print(r_report)
print(r_accuracy)

[[1533   65]
 [ 310   92]]
              precision    recall  f1-score   support

           0       0.83      0.96      0.89      1598
           1       0.59      0.23      0.33       402

    accuracy                           0.81      2000
   macro avg       0.71      0.59      0.61      2000
weighted avg       0.78      0.81      0.78      2000

0.8125


In [15]:
from sklearn.svm import SVC

# Linear-kernel support vector classifier, trained on the scaled training
# split and scored on the held-out test split.
s = SVC(kernel="linear",random_state=50)
s.fit(X_train,y_train)

s_pred = s.predict(X_test)
s_matrix = confusion_matrix(y_test,s_pred)
# NOTE(review): the recorded run shows this model predicting only class 0, so
# precision for class 1 is undefined. zero_division=0 reports it as 0.0 (the
# same value the default produces) without emitting UndefinedMetricWarning.
# TODO: consider class_weight="balanced" or a non-linear kernel if this model
# is meant to be a serious candidate.
s_report = classification_report(y_test,s_pred,zero_division=0)
s_accuracy = accuracy_score(y_test,s_pred)

print(s_matrix)
print(s_report)
print(s_accuracy)

[[1598    0]
 [ 402    0]]
              precision    recall  f1-score   support

           0       0.80      1.00      0.89      1598
           1       0.00      0.00      0.00       402

    accuracy                           0.80      2000
   macro avg       0.40      0.50      0.44      2000
weighted avg       0.64      0.80      0.71      2000

0.799


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with 100 boosting stages and a fixed seed, trained on the
# scaled training split.
g = GradientBoostingClassifier(
    n_estimators=100,
    random_state=50,
)
g.fit(X_train, y_train)

# Score the predictions on the held-out test split.
g_pred = g.predict(X_test)
g_matrix = confusion_matrix(y_true=y_test, y_pred=g_pred)
g_report = classification_report(y_true=y_test, y_pred=g_pred)
g_accuracy = accuracy_score(y_true=y_test, y_pred=g_pred)

print(g_matrix, g_report, g_accuracy, sep="\n")

[[1535   63]
 [ 210  192]]
              precision    recall  f1-score   support

           0       0.88      0.96      0.92      1598
           1       0.75      0.48      0.58       402

    accuracy                           0.86      2000
   macro avg       0.82      0.72      0.75      2000
weighted avg       0.85      0.86      0.85      2000

0.8635
