# In [None]:
# -*- coding: utf-8 -*-
"""
Toy Example: Basic End-to-End ML Workflow (Classification)

Demonstrates the key steps for a timed ML technical interview scenario:
1. Load & Inspect Data
1b. Univariate Visualization (NEW)
2. Basic EDA & Preprocessing Strategy
3. Train/Test Split
4. Preprocessing (Imputation, Encoding, Scaling) using Pipelines
5. Train Baseline Model with Cross-Validation (UPDATED)
6. Evaluate Model on Test Set
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Set plot style
# Apply seaborn's "whitegrid" style once, before any figure is created below.
sns.set(style="whitegrid")

# --- 1. Create & Load Toy Dataset ---
# Ten hand-written rows with one missing value each in Age, Salary and
# Department, so the imputation steps later in the script have work to do.
data = {
    'Age': [25, 45, 35, 50, 23, np.nan, 60, 70, 31, 41],
    'Salary': [50000, 80000, 60000, 120000, 45000, 55000, np.nan, 150000, 52000, 75000],
    'Department': ['HR', 'IT', 'Marketing', 'IT', 'HR', 'Marketing', 'Finance', 'IT', np.nan, 'Marketing'],
    'ExperienceLevel': ['Entry', 'Senior', 'Mid', 'Senior', 'Entry', 'Mid', 'Senior', 'Senior', 'Mid', 'Mid'],
    # Target variable: 1 if likely to churn, 0 otherwise
    'Churn': [0, 1, 0, 1, 0, 0, 1, 1, 0, 0]
}
df = pd.DataFrame(data)

print("--- Initial Data ---")
print(df.head())
print("\n--- Data Info ---")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.info()
print("\n--- Basic Statistics (Numerical) ---")
print(df.describe())
print("\n--- Basic Statistics (Categorical) ---")
# Select both object columns and pandas' dedicated string dtype, so the summary
# still covers the text columns if they are not stored as plain object dtype.
print(df.describe(include=['object', 'string']))
print("\n--- Missing Values ---")
print(df.isnull().sum())
print("\n--- Target Variable Distribution ---")
print(df['Churn'].value_counts(normalize=True))

# --- 1b. Univariate Visualization (NEW SECTION) ---
print("\n--- Generating Univariate Visualizations ---")

# Columns grouped by the kind of plot they get
numerical_features_viz = ['Age', 'Salary']
categorical_features_viz = ['Department', 'ExperienceLevel', 'Churn']  # target is plotted as well

# Numerical columns: one histogram with a KDE overlay per column
print("Plotting numerical feature distributions...")
for feature in numerical_features_viz:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.histplot(df[feature], kde=True, ax=ax)
    ax.set_title(f'Distribution of {feature}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')
    plt.show()

# Categorical columns: one bar chart per column, most frequent category first
print("Plotting categorical feature distributions...")
for feature in categorical_features_viz:
    fig, ax = plt.subplots(figsize=(8, 4))
    bar_order = df[feature].value_counts().index
    sns.countplot(data=df, x=feature, order=bar_order, ax=ax)
    ax.set_title(f'Distribution of {feature}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Count')
    plt.xticks(rotation=45, ha='right')  # slanted labels avoid overlap
    fig.tight_layout()  # keep the rotated labels inside the figure
    plt.show()

# --- 2. Define Features and Target (for modeling) ---
# Everything except the 'Churn' column is a feature; 'Churn' is the label.
y = df['Churn']
X = df.drop(columns='Churn')

# Column groups consumed by the ColumnTransformer
numerical_features = ['Age', 'Salary']
nominal_features = ['Department']
ordinal_features = ['ExperienceLevel']
# Explicit low-to-high ranking used when encoding ExperienceLevel
experience_order = ['Entry', 'Mid', 'Senior']

# --- 3. Train/Test Split ---
# 70/30 split, stratified on the target, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)
print("\n--- Data Split ---")
print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

# --- 4. Preprocessing Pipelines ---
# Numerical columns: fill gaps with the median, then standardise.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Nominal columns: fill gaps with the most frequent value, then one-hot encode.
# Categories unseen during fit are ignored instead of raising.
nominal_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Ordinal columns: fill gaps with the most frequent value, then map each level
# to its position in experience_order.
ordinal_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(categories=[experience_order])),
])

# Route each column group to its own transformer; any column not listed in a
# group is passed through unchanged.
column_transformers = [
    ('num', numerical_transformer, numerical_features),
    ('nom', nominal_transformer, nominal_features),
    ('ord', ordinal_transformer, ordinal_features),
]
preprocessor = ColumnTransformer(column_transformers, remainder='passthrough')

# --- 5. Create Full Pipeline & Train with Cross-Validation (UPDATED SECTION) ---
model = LogisticRegression(random_state=42)

# Preprocessing and classifier are chained into one estimator, so every fit
# (each CV fold and the final fit) learns the preprocessing from its own rows.
full_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', model)
])

print("\n--- Starting Cross-Validation on Training Data ---")
# With an integer `cv` and a classifier, cross_val_score uses stratified folds,
# which raise ValueError when the number of folds exceeds the size of every
# class. The toy training split is tiny, so cap the usual 5 folds at the
# smallest class count (but never go below the minimum of 2 folds).
max_folds = 5
smallest_class_count = int(y_train.value_counts().min())
n_folds = max(2, min(max_folds, smallest_class_count))
print(f"Using {n_folds}-fold cross-validation")
# Scoring can be 'accuracy', 'f1', 'roc_auc', etc. depending on the problem
cv_scores = cross_val_score(full_pipeline, X_train, y_train, cv=n_folds, scoring='accuracy')

print(f"CV Scores for each fold: {cv_scores}")
print(f"Mean CV Accuracy: {np.mean(cv_scores):.4f}")
print(f"Standard Deviation of CV Accuracy: {np.std(cv_scores):.4f}")
print("--- Cross-Validation Complete ---")

# --- Fit the final model on the entire training set ---
# cross_val_score fits clones of the pipeline, so full_pipeline itself is still
# unfitted here; fit it on all training rows before predicting on the test set.
print("\n--- Fitting Final Model on Entire Training Set ---")
full_pipeline.fit(X_train, y_train)
print("--- Model Training Complete ---")


# --- 6. Evaluate the Model on Test Set ---
print("\n--- Evaluating Model Performance on Test Set ---")
# Hard class predictions, plus the second column of predict_proba for each row
y_pred = full_pipeline.predict(X_test)
y_pred_proba = full_pipeline.predict_proba(X_test)[:, 1]

# Headline metric
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on Test Set: {accuracy:.4f}")

# Per-class precision/recall/F1; zero_division=0 reports 0 for undefined
# metrics instead of warning
report = classification_report(y_test, y_pred, zero_division=0)
print("\nClassification Report:", report, sep="\n")

# Confusion matrix of true labels against predicted labels
matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:", matrix, sep="\n")

# --- 7. Interpretation & Next Steps (Simulated) ---
# Recap of the two headline numbers, followed by a fixed list of follow-ups.
mean_cv_accuracy = np.mean(cv_scores)
print("\n--- Summary & Next Steps ---")
print("Successfully trained and evaluated a baseline Logistic Regression model.")
print(f"Cross-validation on the training set yielded a mean accuracy of {mean_cv_accuracy:.4f}.")
print(f"Achieved an accuracy of {accuracy:.4f} on the unseen test data.")
print("Next steps if more time allowed:")
next_steps = (
    "- More detailed EDA (bivariate analysis, correlations).",
    "- Experiment with different imputation strategies.",
    "- Feature engineering.",
    "- Try more complex models & hyperparameter tuning (guided by CV results).",
    "- Deeper error analysis.",
)
for step in next_steps:
    print(step)
