In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Load the raw churn dataset from the notebook's working directory.
# `df` is kept untouched downstream; later cells work on copies of it.
df = pd.read_csv(filepath_or_buffer='Churn_Modelling.csv')

In [2]:
# Build the feature-engineered frame from a copy of `df`, so the raw frame is
# never mutated and this cell can be re-run safely.
# Three ratio features are appended (in this order) before the identifier
# columns are dropped.
df_fe = (
    df.copy()
    .assign(
        BalanceSalaryRatio=lambda frame: frame['Balance'] / frame['EstimatedSalary'],
        TenureByAge=lambda frame: frame['Tenure'] / frame['Age'],
        CreditScoreGivenAge=lambda frame: frame['CreditScore'] / frame['Age'],
    )
    # Row/customer identifiers and surnames are removed from the feature set.
    .drop(columns=['RowNumber', 'CustomerId', 'Surname'])
)

print("Yeni feature'lar eklendi.")
print(df_fe.head())

Yeni feature'lar eklendi.
   CreditScore Geography  Gender  Age  Tenure    Balance  NumOfProducts  \
0          619    France  Female   42       2       0.00              1   
1          608     Spain  Female   41       1   83807.86              1   
2          502    France  Female   42       8  159660.80              3   
3          699    France  Female   39       1       0.00              2   
4          850     Spain  Female   43       2  125510.82              1   

   HasCrCard  IsActiveMember  EstimatedSalary  Exited  BalanceSalaryRatio  \
0          1               1        101348.88       1            0.000000   
1          0               1        112542.58       0            0.744677   
2          1               0        113931.57       1            1.401375   
3          0               0         93826.63       0            0.000000   
4          1               1         79084.10       0            1.587055   

   TenureByAge  CreditScoreGivenAge  
0     0.047619            14.738095  
1     0.024390            14.829268  
2     0.190476            11.952381  
3     0.025641            17.923077  
4     0.046512            19.767442  

In [3]:
# One-hot encode the two categorical columns; drop_first=True removes one
# dummy level per column.
df_encoded = pd.get_dummies(
    df_fe,
    columns=['Geography', 'Gender'],
    drop_first=True,
)

# Separate the target ('Exited') from the feature matrix.
y = df_encoded['Exited']
X = df_encoded.drop(columns=['Exited'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Candidate hyperparameter values the randomized search samples from.
param_dist = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, 30, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    class_weight=['balanced', None],
)

# Base estimator; seeded so each candidate forest is reproducible.
rf = RandomForestClassifier(random_state=42)

# 10 sampled configurations x 3 CV folds = 30 fits, run on all available cores.
# No `scoring` is passed, so the estimator's default scorer ranks the candidates.
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=10,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
random_search.fit(X_train, y_train)

print("En iyi parametreler:", random_search.best_params_)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
En iyi parametreler: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 20, 'class_weight': None}


In [5]:
# Take the best estimator found by the search and predict on the held-out test split.
best_model = random_search.best_estimator_
y_pred_tuned = best_model.predict(X_test)

# Compute both metrics first, then report them.
tuned_accuracy = accuracy_score(y_test, y_pred_tuned)
tuned_report = classification_report(y_test, y_pred_tuned)

print("Optimize Edilmiş Accuracy Score:", tuned_accuracy)
print("\nClassification Report (Optimized):\n", tuned_report)

Optimize Edilmiş Accuracy Score: 0.869

Classification Report (Optimized):
               precision    recall  f1-score   support

           0       0.88      0.97      0.92      1607
           1       0.77      0.47      0.59       393

    accuracy                           0.87      2000
   macro avg       0.83      0.72      0.75      2000
weighted avg       0.86      0.87      0.86      2000



In [6]:
import joblib

# Serialize the tuned model to the current working directory first, so the
# artifact exists regardless of which environment the notebook runs in.
joblib.dump(best_model, 'churn_model.pkl')

# The browser download helper exists only inside Google Colab. Guard the import
# so that a local Jupyter kernel does not fail with ImportError on this cell
# after the model has already been trained and saved.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    # In Colab: trigger a browser download of the saved model file.
    files.download('churn_model.pkl')
else:
    print("google.colab is not available; model saved locally as 'churn_model.pkl'.")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>