In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import pipeline as pip
from sklearn import preprocessing as pre
from sklearn import impute as imp
from sklearn import compose as com
from sklearn import model_selection as mod
from sklearn import ensemble as ens
from sklearn import metrics as met

In [2]:
# Load the training data from 'train.csv' (path is relative to the working directory).
df = pd.read_csv('train.csv')

In [18]:
# (rows, columns) of the loaded frame.
df.shape

(165034, 14)

In [3]:
# Preview the first rows to see the available columns and sample values.
df.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


In [4]:
# Relative frequency of each value in the 'Exited' column (used as the target later on).
df['Exited'].value_counts(normalize=True)

Exited
0    0.788401
1    0.211599
Name: proportion, dtype: float64

In [5]:
# Print the number of distinct (non-null) values of every object-dtype column.
object_columns = df.select_dtypes(include='object').columns
for col in object_columns:
    distinct_values = df[col].nunique()
    print(col, distinct_values)

Surname 2797
Geography 3
Gender 2


In [6]:
# Number of distinct surnames that occur at most 500 times.
df['Surname'].value_counts().le(500).sum()

2720

In [7]:
# Show floats without decimals. This is a global pandas display option, so it
# also affects every float rendered by later cells.
pd.set_option('display.float_format', '{:.0f}'.format)

# Summary statistics, one row per numeric column.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,165034,82516,47641,0,41258,82516,123775,165033
CustomerId,165034,15692005,71398,15565701,15633141,15690169,15756824,15815690
CreditScore,165034,656,80,350,597,659,710,850
Age,165034,38,9,18,32,37,42,92
Tenure,165034,5,3,0,3,5,7,10
Balance,165034,55478,62818,0,0,0,119940,250898
NumOfProducts,165034,2,1,1,1,2,2,4
HasCrCard,165034,1,0,0,1,1,1,1
IsActiveMember,165034,0,0,0,0,0,1,1
EstimatedSalary,165034,112575,50293,12,74638,117948,155152,199992


In [8]:
# Working frame for modelling: a new frame without the 'id' column, so the
# loaded `df` stays untouched.
# NOTE(review): 'CustomerId' is kept and is treated as a numeric feature by the
# dtype-based column selection further down - confirm that this is intended.
df_raw = df.drop(columns=['id'])

In [9]:
# Count how often each value of the 'Surname' column occurs.
surname_counts = df_raw['Surname'].value_counts()

# Surnames that occur 10 times or more.
frequent_surnames = surname_counts[surname_counts >= 10].index

# Keep only the rows whose surname is one of the frequent ones.
# The explicit .copy() turns the filtered result into an independent frame, so
# column assignments made on df_raw afterwards do not trigger pandas'
# SettingWithCopyWarning.
# NOTE(review): this removes rows based on counts taken over the whole data set,
# before any train/test split - confirm that dropping them is intended.
df_raw = df_raw[df_raw['Surname'].isin(frequent_surnames)].copy()

In [10]:

# Re-count the surnames on the current frame.
surname_counts = df_raw['Surname'].value_counts()

# Surnames seen fewer than 100 times.
less_than_100 = surname_counts.loc[surname_counts.lt(100)].index

# Replace every such surname with the placeholder label 'missing';
# all other values are kept as they are.
is_below_100 = df_raw['Surname'].isin(less_than_100)
df_raw['Surname'] = df_raw['Surname'].mask(is_below_100, 'missing')


In [11]:

# Re-count the surnames on the current frame.
surname_counts = df_raw['Surname'].value_counts()

# Labels seen fewer than 500 times.
less_than_500 = surname_counts.loc[surname_counts.lt(500)].index

# Keep values that are not in that set; replace the rest with 'missing2'.
is_below_500 = df_raw['Surname'].isin(less_than_500)
df_raw['Surname'] = df_raw['Surname'].where(~is_below_500, 'missing2')


In [12]:
# Separate the features (every column except 'Exited') from the target column.
X = df_raw.drop(columns=['Exited'])
y = df_raw['Exited']

In [13]:
# Hold out 33% of the rows for testing. The 'Exited' classes are imbalanced
# (roughly 79% / 21% in the loaded data), so the split is stratified on y to
# keep the same class proportions in both parts. random_state fixes the shuffle.
X_train, X_test, y_train, y_test = mod.train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
    stratify=y,
)

In [22]:
# Column groups derived from the dtypes of the training frame:
# object columns are handled as categorical, everything else as numeric.
numeric_columns = X_train.select_dtypes(exclude='object').columns
categorical_columns = X_train.select_dtypes(include='object').columns

# Numeric branch: standardise the values.
num_pip = pip.Pipeline(steps=[('scaler', pre.StandardScaler())])

# Categorical branch: one-hot encode; handle_unknown='ignore' makes categories
# not seen during fit encode as all zeros instead of raising.
cat_pip = pip.Pipeline(
    steps=[('encoder', pre.OneHotEncoder(handle_unknown='ignore'))]
)

# Route each column group through its own branch.
ct = com.ColumnTransformer(
    transformers=[
        ('num', num_pip, numeric_columns),
        ('cat', cat_pip, categorical_columns),
    ]
)

# Full model pipeline: preprocessing followed by a random forest.
pip_RF = pip.Pipeline(
    steps=[
        ('ct', ct),
        ('model', ens.RandomForestClassifier(random_state=42)),
    ]
)

# Hyper-parameter grid; the 'model__' prefix addresses the 'model' pipeline step.
params = {'model__n_estimators': [100, 500]}

# 3-fold grid search that selects the candidate with the best precision.
grid_RF = mod.GridSearchCV(
    estimator=pip_RF,
    param_grid=params,
    scoring='precision',
    cv=3,
)

In [None]:
# Default size for figures that do not request their own size.
plt.rcParams['figure.figsize'] = [6, 3]

# Fit every grid search and report its quality on the held-out test set.
# All plots use matplotlib and sklearn.metrics only, i.e. the libraries this
# notebook imports; the previous version called `sns` without importing it,
# which raises NameError on a fresh kernel.
for est in [grid_RF]:

    # Run the grid search; the best pipeline is available afterwards.
    est.fit(X_train, y_train)
    best_pipeline = est.best_estimator_

    # Class name of the final estimator, used in all messages and titles.
    model_name = best_pipeline['model'].__class__.__name__
    print(model_name + ' is running...')

    # NOTE: best_score_ is the mean cross-validated score of the best
    # candidate (scored on the validation folds), not a score on the data
    # the model was fitted on.
    training_score = est.best_score_
    print('Training precision score for '+ model_name + ' is ' + str(training_score))

    # Test-set scores.
    y_pred = best_pipeline.predict(X_test)
    test_prec_score = met.precision_score(y_test, y_pred)
    test_acc_score = met.accuracy_score(y_test, y_pred)
    print('Test precision score for '+ model_name + ' is ' + str(test_prec_score))
    print('Test accuracy score for '+ model_name + ' is ' + str(test_acc_score))

    # Confusion matrix, drawn as an annotated heat map.
    cm = met.confusion_matrix(y_test, y_pred)
    fig, ax = plt.subplots()
    cm_display = met.ConfusionMatrixDisplay(confusion_matrix=cm)
    cm_display.plot(ax=ax, cmap='Blues', values_format='d', colorbar=False)
    ax.set_xticklabels(['Predicted Negative', 'Predicted Positive'])
    ax.set_yticklabels(['Actual Negative', 'Actual Positive'])
    ax.set_xlabel('Predicted Label')
    ax.set_ylabel('True Label')
    ax.set_title('Confusion Matrix of '+ model_name )
    plt.show()

    # Predicted probabilities; column 1 is used as the positive-class score.
    proba_predictions = best_pipeline.predict_proba(X_test)
    y_scores = proba_predictions[:, 1]

    # Distribution of the positive-class probabilities (plain histogram,
    # without a KDE overlay).
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.hist(y_scores, bins=30, color='blue', label='Positive Class Probability')
    ax.set_title('Probability Distribution for Positive Class '+ model_name  )
    ax.set_xlabel('Probability')
    ax.set_ylabel('Frequency')
    ax.legend()
    plt.show()

    # Precision and recall as a function of the decision threshold.
    # precision_recall_curve returns one more precision/recall value than
    # thresholds: the final point (precision 1, recall 0) has no threshold.
    # Dropping that LAST element aligns the arrays with the thresholds;
    # dropping the first one would shift every value by one threshold.
    precision, recall, pr_thresholds = met.precision_recall_curve(y_test, y_scores)
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(pr_thresholds, precision[:-1], label="precision")
    ax.plot(pr_thresholds, recall[:-1], label="recall")
    ax.grid(True)
    ax.set_title('Precision-Recall Curve ' + model_name )
    ax.set_xlabel('Thresholds')
    ax.set_ylabel('Precision/Recall')
    ax.legend()
    plt.show()

    # ROC curve for the test set.
    fpr, tpr, _ = met.roc_curve(y_test, y_scores)
    fig, ax = plt.subplots()
    ax.plot(fpr, tpr)
    ax.set_xlabel("fpr")
    ax.set_ylabel("tpr")
    ax.set_title("ROC Curve " + model_name)
    plt.show()

    # Area under the ROC curve for the test set.
    auc_score = met.roc_auc_score(y_test, y_scores)
    print('Auc score of ' + model_name + ' is ' + str(auc_score))
    print('------------------------------------------------------')
    
