In [1]:

# Mount Google Drive so that files under MyDrive can be read through the
# /content/drive path (Colab-only: google.colab is not available elsewhere).
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
import xgboost as xgb
import joblib
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from collections import defaultdict
import xgboost as xgb
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score

# 1. Load the training data and discard the account identifier column,
#    which is dropped before any features are derived.
df = pd.read_csv('/content/drive/MyDrive/churn_train.csv').drop(columns=['user_account_id'])

# 2. Custom preprocessing transformers
class AutoOutlierFlagger(BaseEstimator, TransformerMixin):
    """Append 0/1 ``<col>_outlier`` indicators for numeric columns with many outliers.

    ``fit`` inspects every numeric column of ``X`` and computes the fences
    ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``.  A column is remembered when the
    percentage of its values lying outside those fences is strictly greater
    than ``threshold``.  ``transform`` adds one integer indicator column per
    remembered column and leaves the existing columns unchanged.

    Parameters
    ----------
    threshold : float, default=10.0
        Percentage (0-100) of out-of-fence rows a column must exceed to be
        flagged.

    Attributes
    ----------
    columns_to_flag : list
        Column names selected by the most recent call to ``fit``.
    bounds : dict
        Maps each selected column name to its ``(lower, upper)`` fence pair.
    """

    def __init__(self, threshold=10.0):
        self.threshold = threshold
        # Learned state. Initialised empty so transform() before fit() is a
        # no-op copy, exactly as before.
        self.columns_to_flag = []
        self.bounds = {}

    def fit(self, X, y=None):
        """Select the columns to flag and record their fences.

        Parameters
        ----------
        X : pandas.DataFrame
            Training data; only numeric columns are examined.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        # Build the learned state in fresh containers. Appending to the
        # attributes created in __init__ made a second fit() on the same
        # instance keep stale entries and duplicate column names.
        columns_to_flag = []
        bounds = {}
        n_rows = len(X)

        # With no rows there is nothing to measure (and the percentage below
        # would divide by zero), so no column is flagged.
        numeric_cols = X.select_dtypes(include=[np.number]).columns if n_rows else []
        for col in numeric_cols:
            Q1 = X[col].quantile(0.25)
            Q3 = X[col].quantile(0.75)
            IQR = Q3 - Q1
            lower = Q1 - 1.5 * IQR
            upper = Q3 + 1.5 * IQR
            outliers = ((X[col] < lower) | (X[col] > upper)).sum()
            percentage = (outliers / n_rows) * 100
            if percentage > self.threshold:
                columns_to_flag.append(col)
                bounds[col] = (lower, upper)

        self.columns_to_flag = columns_to_flag
        self.bounds = bounds
        return self

    def transform(self, X):
        """Return a copy of ``X`` with one ``<col>_outlier`` column per flagged column.

        Each indicator is 1 where the value lies outside the fences recorded
        during ``fit`` and 0 otherwise. The input frame is not modified.
        """
        X_ = X.copy()
        for col in self.columns_to_flag:
            lower, upper = self.bounds[col]
            X_[f"{col}_outlier"] = ((X_[col] < lower) | (X_[col] > upper)).astype(int)
        return X_

class DropLowVariance(BaseEstimator, TransformerMixin):
    """Remove columns whose variance falls below ``threshold``.

    ``fit`` computes ``X.var()`` and stores, in ``columns_to_drop``, the names
    of the columns whose variance is strictly less than ``threshold``.
    ``transform`` drops those columns; names that are absent from the frame
    being transformed are silently ignored.
    """

    def __init__(self, threshold=0.01):
        self.columns_to_drop = []
        self.threshold = threshold

    def fit(self, X, y=None):
        """Record which columns of ``X`` have variance below the threshold."""
        per_column_variance = X.var()
        self.columns_to_drop = [
            name
            for name, value in per_column_variance.items()
            if value < self.threshold
        ]
        return self

    def transform(self, X):
        """Return ``X`` without the columns recorded during ``fit``."""
        return X.drop(self.columns_to_drop, axis=1, errors='ignore')

class DropHighlyCorrelated(BaseEstimator, TransformerMixin):
    """Remove columns that are strongly correlated with an earlier column.

    ``fit`` takes the absolute correlation matrix of ``X`` and keeps only the
    entries strictly above the diagonal, so each column is compared solely
    with the columns that precede it. A column is stored in ``cols_to_drop``
    when any of those entries exceeds ``threshold``. ``transform`` drops the
    stored columns, ignoring names that are not present.
    """

    def __init__(self, threshold=0.9):
        self.cols_to_drop = []
        self.threshold = threshold

    def fit(self, X, y=None):
        """Record the columns whose correlation with an earlier column is too high."""
        abs_corr = X.corr().abs()
        # True only above the main diagonal: hides self-correlations and the
        # mirrored lower half so every pair is looked at once.
        above_diagonal = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
        upper = abs_corr.where(above_diagonal)

        redundant = []
        for column in upper.columns:
            if any(upper[column] > self.threshold):
                redundant.append(column)
        self.cols_to_drop = redundant
        return self

    def transform(self, X):
        """Return ``X`` without the columns recorded during ``fit``."""
        return X.drop(self.cols_to_drop, axis=1, errors='ignore')

# 3. Separate the target from the features and hold out a stratified 30% test set
RANDOM_STATE = 42  # single seed shared by the split and every classifier

X, y = df.drop(columns='churn'), df['churn']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=RANDOM_STATE,
    stratify=y,
)

# Candidate classifiers to compare, keyed by the name used in reports and file names
models = {
    "RandomForest": RandomForestClassifier(random_state=RANDOM_STATE),
    "GradientBoosting": GradientBoostingClassifier(random_state=RANDOM_STATE),
    "AdaBoost": AdaBoostClassifier(random_state=RANDOM_STATE),
    "XGBoost": xgb.XGBClassifier(eval_metric='logloss', random_state=RANDOM_STATE),
    "LightGBM": LGBMClassifier(random_state=RANDOM_STATE),
}

# Per-model metrics: [accuracy, ROC AUC, classification report]
results = defaultdict(list)

# Preprocessing steps shared by every model, followed by the model itself
def create_pipeline(model):
    """Wrap ``model`` in a Pipeline that applies the shared preprocessing first.

    The steps run in this order: outlier flagging (columns with more than 10%
    outliers), low-variance filter, high-correlation filter, standard scaling,
    and finally ``model`` under the step name ``'classifier'``.
    """
    preprocessing_steps = [
        ('outlier_flags', AutoOutlierFlagger(threshold=10.0)),
        ('low_variance_filter', DropLowVariance()),
        ('correlation_filter', DropHighlyCorrelated()),
        ('scaling', StandardScaler()),
    ]
    return Pipeline(steps=preprocessing_steps + [('classifier', model)])

# Train and evaluate each model in turn
for model_name, model in models.items():
    print(f"Training {model_name}...")

    # Build the full preprocessing + classifier pipeline and fit it on the training split
    pipeline = create_pipeline(model)
    pipeline.fit(X_train, y_train)

    # Hard predictions for accuracy/report, positive-class probabilities for ROC AUC
    y_pred = pipeline.predict(X_test)
    y_proba = pipeline.predict_proba(X_test)[:, 1]

    # Stored positionally: index 0 = accuracy, 1 = ROC AUC, 2 = text report
    model_scores = results[model_name]
    model_scores.append(accuracy_score(y_test, y_pred))
    model_scores.append(roc_auc_score(y_test, y_proba))
    model_scores.append(classification_report(y_test, y_pred))

    # Persist the fitted pipeline next to the notebook
    joblib.dump(pipeline, f"{model_name}_model.joblib")

# Print the collected metrics for every model
for model_name, model_scores in results.items():
    accuracy, roc_auc, report = model_scores
    print(f"\n{model_name} Model Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"ROC AUC Score: {roc_auc:.4f}")
    print(f"Classification Report:\n{report}")



Training RandomForest...
Training GradientBoosting...
Training AdaBoost...
Training XGBoost...
Training LightGBM...




[LightGBM] [Info] Number of positive: 8784, number of negative: 33216
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.025888 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8340
[LightGBM] [Info] Number of data points in the train set: 42000, number of used features: 68
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.209143 -> initscore=-1.330100
[LightGBM] [Info] Start training from score -1.330100





RandomForest Model Results:
Accuracy: 0.8704
ROC AUC Score: 0.8953
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.94      0.92     14235
           1       0.72      0.62      0.67      3765

    accuracy                           0.87     18000
   macro avg       0.81      0.78      0.79     18000
weighted avg       0.87      0.87      0.87     18000


GradientBoosting Model Results:
Accuracy: 0.8734
ROC AUC Score: 0.9040
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.92      0.92     14235
           1       0.70      0.68      0.69      3765

    accuracy                           0.87     18000
   macro avg       0.81      0.80      0.81     18000
weighted avg       0.87      0.87      0.87     18000


AdaBoost Model Results:
Accuracy: 0.8653
ROC AUC Score: 0.8968
Classification Report:
              precision    recall  f1-score   support

           0      

In [4]:
# Reload one of the saved pipelines (the RandomForest one in this example).
# NOTE: joblib files are pickles; only load files produced by a trusted source.
loaded_model = joblib.load("RandomForest_model.joblib")

# Predict on the held-out split with the reloaded pipeline
y_pred_loaded = loaded_model.predict(X_test)
y_proba_loaded = loaded_model.predict_proba(X_test)[:, 1]

# The metrics should match those reported right after training
loaded_accuracy = accuracy_score(y_test, y_pred_loaded)
print("Loaded Model - Accuracy: ", loaded_accuracy)
loaded_roc_auc = roc_auc_score(y_test, y_proba_loaded)
print("Loaded Model - ROC AUC Score: ", loaded_roc_auc)
loaded_report = classification_report(y_test, y_pred_loaded)
print("Loaded Model - Classification Report:\n", loaded_report)


Loaded Model - Accuracy:  0.8704444444444445
Loaded Model - ROC AUC Score:  0.8953059230120847
Loaded Model - Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.94      0.92     14235
           1       0.72      0.62      0.67      3765

    accuracy                           0.87     18000
   macro avg       0.81      0.78      0.79     18000
weighted avg       0.87      0.87      0.87     18000

