In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pandas as pd

def preprocess_data(df):
    """Fit a dtype-driven preprocessing ``ColumnTransformer`` on ``df`` and apply it.

    Columns of dtype ``float64``/``int64`` are mean-imputed and then
    standard-scaled; columns of dtype ``object``/``category`` are imputed with
    their most frequent value and then one-hot encoded (categories unseen at
    fit time are ignored at transform time).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to fit the preprocessing on and to transform.

    Returns
    -------
    tuple
        ``(transformed, preprocessor)``: the output of ``fit_transform`` and
        the fitted ``ColumnTransformer``, so the same fitted steps can be
        reused on other data.
    """
    # Split the columns by dtype; each group gets its own sub-pipeline.
    num_features = df.select_dtypes(include=['float64', 'int64']).columns
    cat_features = df.select_dtypes(include=['object', 'category']).columns

    # Numeric branch: fill gaps with the column mean, then standardise.
    impute_then_scale = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ])

    # Categorical branch: fill gaps with the mode, then one-hot encode.
    impute_then_encode = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])

    # Route each column group to its branch.
    column_routes = [
        ('num', impute_then_scale, num_features),
        ('cat', impute_then_encode, cat_features),
    ]
    preprocessor = ColumnTransformer(transformers=column_routes)

    # Fit and transform in one pass; hand back the fitted transformer as well.
    transformed = preprocessor.fit_transform(df)
    return transformed, preprocessor


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

def eda(df):
    """Print summary tables for ``df`` and show two exploratory plots.

    Prints ``df.describe()`` and the per-column count of missing values, then
    draws a count plot of the ``'churn'`` column and an annotated correlation
    heatmap.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to explore. Must contain a ``'churn'`` column for the count plot.

    Returns
    -------
    None
        The function only prints and plots.
    """
    print("Dataset Summary:")
    print(df.describe())

    print("\nMissing Values:")
    print(df.isnull().sum())

    # Distribution of Churn
    plt.figure(figsize=(6, 4))
    sns.countplot(x='churn', data=df)
    plt.title('Churn Distribution')
    plt.show()

    # Correlation Heatmap
    # Correlations are only defined for numeric data. Since pandas 2.0,
    # DataFrame.corr() raises on string/categorical columns instead of
    # silently dropping them, so select the numeric (and boolean) columns
    # explicitly. This works the same way on older and newer pandas.
    numeric_df = df.select_dtypes(include=['number', 'bool'])
    plt.figure(figsize=(10, 8))
    sns.heatmap(numeric_df.corr(), annot=True, cmap="coolwarm")
    plt.title('Correlation Heatmap')
    plt.show()


In [None]:
import numpy as np

def feature_engineering(df, exclude=()):
    """Return a copy of ``df`` with heavily skewed numeric columns log-transformed.

    A ``float64``/``int64`` column is replaced by ``np.log1p`` of itself when
    the absolute value of its skewness exceeds 1 and every value lies in the
    domain of ``log1p`` (strictly greater than -1). Columns that are skewed
    but contain values <= -1 are left unchanged, because ``log1p`` would turn
    those values into NaN or -inf.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is not modified; the work is done on a copy, so calling
        this function repeatedly on the same input gives the same result.
    exclude : str or iterable of str, optional
        Column name(s) that must never be transformed, e.g. a numeric target
        column. An imbalanced 0/1 label is itself "skewed" and would otherwise
        be log-transformed. Defaults to no exclusions.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns as ``df``.
    """
    # A bare string would otherwise be split into single characters by set().
    if isinstance(exclude, str):
        exclude = [exclude]
    skipped = set(exclude)

    engineered = df.copy()
    for col in engineered.select_dtypes(include=['float64', 'int64']).columns:
        if col in skipped:
            continue
        values = engineered[col]
        # NaN skewness (too few values) compares False and is left alone.
        is_skewed = np.abs(values.skew()) > 1
        # min() ignores NaN; an all-NaN column is already filtered by is_skewed.
        in_log1p_domain = values.min() > -1
        if is_skewed and in_log1p_domain:
            engineered[col] = np.log1p(values)
    return engineered


In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

def train_models(X, y):
    """Train several candidate classifiers and return the one with the best ROC AUC.

    The data is split once into 80% train / 20% test (``random_state=42``).
    A random forest, a logistic regression and an XGBoost classifier are each
    fitted on the training part and scored on the test part with ROC AUC,
    computed from the predicted probability of the positive class.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Binary target; column 1 of ``predict_proba`` is used as the score.

    Returns
    -------
    estimator
        The fitted model with the highest test ROC AUC.
    """
    models = {
        # Seeded so that re-running the cell selects the same winner.
        'RandomForest': RandomForestClassifier(random_state=42),
        'LogisticRegression': LogisticRegression(),
        'XGBoost': XGBClassifier()
    }

    # The split does not depend on the model, so do it once rather than
    # recomputing the identical split on every iteration.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    best_model = None
    # Start below any possible AUC so a model is always selected, even if
    # every candidate scores 0.
    best_score = float('-inf')

    for model in models.values():
        model.fit(X_train, y_train)
        # ROC AUC measures ranking quality, so it needs continuous scores.
        # Hard 0/1 labels from predict() collapse the curve to a single
        # threshold and distort the comparison between models.
        y_score = model.predict_proba(X_test)[:, 1]
        auc_score = roc_auc_score(y_test, y_score)

        if auc_score > best_score:
            best_score = auc_score
            best_model = model

    return best_model


In [None]:
from sklearn.metrics import precision_score, recall_score, roc_auc_score

def evaluate_model(model, X_test, y_test):
    """Print accuracy, precision, recall and ROC AUC of ``model`` on a test set.

    Accuracy, precision and recall are computed from ``model.predict``.
    ROC AUC is computed from continuous scores when the model provides them
    (``predict_proba`` first, then ``decision_function``), and from the
    predicted labels only as a last resort.

    Parameters
    ----------
    model : fitted estimator
        Must implement ``predict``.
    X_test : array-like
        Test features.
    y_test : array-like
        True labels; column 1 of ``predict_proba`` is used as the score, so a
        binary target is assumed.

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    y_pred = model.predict(X_test)

    # ROC AUC is a ranking metric: feeding it hard 0/1 predictions evaluates
    # a single threshold only. Prefer probabilities or decision scores.
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        y_score = model.decision_function(X_test)
    else:
        y_score = y_pred

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    auc_roc = roc_auc_score(y_test, y_score)

    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"AUC-ROC: {auc_roc}")
