In [11]:
!pip install xgboost
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns



In [12]:
# Veri setini yükleme
df = pd.read_csv('Churn_Modelling.csv')

In [13]:
df

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


In [14]:
# Gereksiz sütunları kaldırma
df = df.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1)

In [15]:
df

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...
9995,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


In [16]:
# Kategorik değişkenleri dönüştürme
df = pd.get_dummies(df, columns=['Geography', 'Gender'], drop_first=True)

In [17]:
# Bağımlı ve bağımsız değişkenleri ayırma
X = df.drop('Exited', axis=1)
y = df['Exited']

In [18]:
# Veriyi eğitim ve test setlerine ayırma
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [19]:
# Özellik ölçeklendirme
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [20]:
# Modelleri oluşturma
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42),
    'XGBoost': xgb.XGBClassifier(random_state=42)
}


In [21]:
# Her model için eğitim ve değerlendirme
results = {}
for name, model in models.items():
    # Model eğitimi
    model.fit(X_train_scaled, y_train)
    
    # Tahminler
    y_pred = model.predict(X_test_scaled)
    
    # Doğruluk hesaplama
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy
    
    print(f"\n{name} Model Performans Raporu:")
    print(classification_report(y_test, y_pred))
    
    # Confusion Matrix
    plt.figure(figsize=(8, 6))
    sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d', cmap='Blues')
    plt.title(f'{name} Confusion Matrix')
    plt.ylabel('Gerçek Değerler')
    plt.xlabel('Tahmin Edilen Değerler')
    plt.tight_layout()
    plt.savefig(f'{name.lower().replace(" ", "_")}_confusion_matrix.png')
    plt.close()


Random Forest Model Performans Raporu:
              precision    recall  f1-score   support

           0       0.88      0.96      0.92      1607
           1       0.76      0.47      0.58       393

    accuracy                           0.87      2000
   macro avg       0.82      0.72      0.75      2000
weighted avg       0.86      0.87      0.85      2000


Logistic Regression Model Performans Raporu:
              precision    recall  f1-score   support

           0       0.83      0.96      0.89      1607
           1       0.55      0.20      0.29       393

    accuracy                           0.81      2000
   macro avg       0.69      0.58      0.59      2000
weighted avg       0.78      0.81      0.77      2000


XGBoost Model Performans Raporu:
              precision    recall  f1-score   support

           0       0.90      0.95      0.92      1607
           1       0.72      0.55      0.63       393

    accuracy                           0.87      2000
   macro

In [22]:
# Modelleri doğruluklarına göre sıralama
sorted_results = dict(sorted(results.items(), key=lambda x: x[1], reverse=True))
print("\nModellerin Doğruluk Sıralaması:")
for model, accuracy in sorted_results.items():
    print(f"{model}: {accuracy:.4f}")


Modellerin Doğruluk Sıralaması:
XGBoost: 0.8695
Random Forest: 0.8665
Logistic Regression: 0.8110


In [23]:
# Her model için özellik önem sıralaması
for name, model in models.items():
    print(f"\n{name} Özellik Önem Sıralaması:")
    
    if name == 'Random Forest':
        importance = model.feature_importances_
    elif name == 'Logistic Regression':
        # Logistic Regression için katsayıların mutlak değerlerini kullan
        importance = np.abs(model.coef_[0])
    else:  # XGBoost
        importance = model.feature_importances_
    
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': importance
    })
    feature_importance = feature_importance.sort_values('importance', ascending=False)
    print(feature_importance)


Random Forest Özellik Önem Sıralaması:
              feature  importance
1                 Age    0.236922
7     EstimatedSalary    0.147558
0         CreditScore    0.143338
3             Balance    0.141612
4       NumOfProducts    0.131486
2              Tenure    0.082080
6      IsActiveMember    0.040725
8   Geography_Germany    0.026190
5           HasCrCard    0.018454
10        Gender_Male    0.018421
9     Geography_Spain    0.013214

Logistic Regression Özellik Önem Sıralaması:
              feature  importance
1                 Age    0.754217
6      IsActiveMember    0.533432
8   Geography_Germany    0.337190
10        Gender_Male    0.264718
3             Balance    0.161139
0         CreditScore    0.067683
4       NumOfProducts    0.060278
2              Tenure    0.042637
9     Geography_Spain    0.038960
7     EstimatedSalary    0.015744
5           HasCrCard    0.010236

XGBoost Özellik Önem Sıralaması:
              feature  importance
4       NumOfProducts    0.283

In [24]:
 # Özellik önem grafiği
plt.figure(figsize=(10, 6))
sns.barplot(x='importance', y='feature', data=feature_importance)
plt.title(f'{name} Özellik Önem Sıralaması')
plt.tight_layout()
plt.savefig(f'{name.lower().replace(" ", "_")}_feature_importance.png')
plt.close()