In [None]:
# -*- coding: utf-8 -*-
"""
Toy Example: Basic End-to-End ML Workflow (Classification)

Demonstrates the key steps for a timed ML technical interview scenario:
1. Create/Load & Inspect Data (basic EDA)
2. Define Features and Target (preprocessing strategy per feature type)
3. Train/Test Split
4. Preprocessing (Imputation, Encoding, Scaling) using Pipelines
5. Build Full Pipeline with a Baseline Model
6. Train the Model
7. Evaluate the Model
8. Interpretation & Next Steps
"""

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# --- 1. Create & Load Toy Dataset ---
# Hand-written stand-in for what would normally come from pd.read_csv() or a
# database query. Ten rows, with one missing value each in Age, Salary and
# Department so the imputation steps further down have something to do.
data = dict(
    Age=[25, 45, 35, 50, 23, np.nan, 60, 70, 31, 41],
    Salary=[50000, 80000, 60000, 120000, 45000, 55000, np.nan, 150000, 52000, 75000],
    Department=[
        'HR', 'IT', 'Marketing', 'IT', 'HR',
        'Marketing', 'Finance', 'IT', np.nan, 'Marketing',
    ],
    ExperienceLevel=[
        'Entry', 'Senior', 'Mid', 'Senior', 'Entry',
        'Mid', 'Senior', 'Senior', 'Mid', 'Mid',
    ],
    # Target variable: 1 if likely to churn, 0 otherwise
    Churn=[0, 1, 0, 1, 0, 0, 1, 1, 0, 0],
)
df = pd.DataFrame(data=data)


In [None]:
# Quick look at the raw frame: first rows, dtypes/non-null counts, summary
# statistics, missing-value counts and the class balance of the target.
print("--- Initial Data ---")
print(df.head())
print("\n--- Data Info ---")
# DataFrame.info() writes its report straight to stdout and returns None, so
# wrapping it in print() would append a stray "None" line after the report.
df.info()
print("\n--- Basic Statistics (Numerical) ---")
print(df.describe())
print("\n--- Basic Statistics (Categorical) ---")
print(df.describe(include='object'))
print("\n--- Missing Values ---")
print(df.isnull().sum())
print("\n--- Target Variable Distribution ---")
print(df['Churn'].value_counts(normalize=True)) # Use normalize=True for proportions

In [None]:


# --- 2. Define Features and Target ---
# The label is the 'Churn' column; every remaining column is a predictor.
y = df['Churn']
X = df.drop(columns='Churn')

# Column groups consumed by the ColumnTransformer further down.
# With many columns these would normally be derived programmatically
# rather than being listed by hand.

# Numeric columns
numerical_features = ['Age', 'Salary']

# 'Department' has no inherent order, so it is one-hot encoded
nominal_features = ['Department']

# 'ExperienceLevel' has a clear order, so it is ordinal encoded using the
# explicit low-to-high ranking below
ordinal_features = ['ExperienceLevel']
experience_order = ['Entry', 'Mid', 'Senior']


In [None]:
# --- 3. Train/Test Split ---
# The split happens *before* any preprocessing that learns from the data
# (imputation, scaling), so those steps are only ever fitted on training rows.
# stratify=y is good practice for classification.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)
print("\n--- Data Split ---")
print("Training set shape: {}".format(X_train.shape))
print("Test set shape: {}".format(X_test.shape))


In [None]:

# --- 4. Preprocessing Pipelines ---
# One small pipeline per feature type; the ColumnTransformer below routes each
# column group to the matching pipeline.

# Numerical features: fill gaps with the median, then standardise.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Nominal categorical features: fill gaps with the most frequent value, then
# one-hot encode. handle_unknown='ignore' is safer for values that only show
# up in the test set.
nominal_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Ordinal categorical features: fill gaps with the most frequent value, then
# encode using the explicit category order defined earlier.
ordinal_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(categories=[experience_order])),
])

# Apply each transformer to its own column group.
preprocessor = ColumnTransformer(
    [
        ('num', numerical_transformer, numerical_features),
        ('nom', nominal_transformer, nominal_features),
        ('ord', ordinal_transformer, ordinal_features),
    ],
    remainder='passthrough',  # keep columns not listed above; 'drop' is also common
)


In [None]:



# --- 5. Create Full Pipeline with Model ---
# Baseline estimator: plain logistic regression.
model = LogisticRegression(random_state=42)

# Preprocessing and classifier are chained into one estimator, so a single
# fit()/predict() call runs the whole workflow.
full_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', model),
])


In [None]:
# --- 6. Train the Model ---
print("\n--- Starting Model Training ---")

# Fitting the pipeline fits the preprocessing steps and then the classifier,
# using the training rows only.
full_pipeline.fit(X=X_train, y=y_train)

print("--- Model Training Complete ---")



In [None]:

# --- 7. Evaluate the Model ---
print("\n--- Evaluating Model Performance ---")

# Hard predictions plus the second predict_proba column, kept around in case
# AUC is needed.
y_pred = full_pipeline.predict(X_test)
y_pred_proba = full_pipeline.predict_proba(X_test)[:, 1]

# Headline metric
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy on Test Set: {:.4f}".format(accuracy))

# Note: with a dataset this small the report may look sparse; zero_division=0
# is passed explicitly for the undefined-metric case.
print(
    "\nClassification Report:",
    classification_report(y_test, y_pred, zero_division=0),
    sep="\n",
)

print(
    "\nConfusion Matrix:",
    confusion_matrix(y_test, y_pred),
    sep="\n",
)


In [None]:

# --- 8. Interpretation & Next Steps (Simulated) ---
# Short wrap-up: what was achieved, followed by a list of follow-up ideas.
print("\n--- Summary & Next Steps ---")
print("Successfully trained a baseline Logistic Regression model.")
print("Achieved an accuracy of {:.4f} on the unseen test data.".format(accuracy))
print("Next steps if more time allowed:")
print(
    "- More detailed EDA (visualizations, outlier detection/handling).",
    "- Experiment with different imputation strategies (e.g., KNNImputer).",
    "- Feature engineering (e.g., interaction terms like Age*Salary).",
    "- Try more complex models (e.g., RandomForest, GradientBoosting).",
    "- Hyperparameter tuning using GridSearchCV or RandomizedSearchCV with Cross-Validation.",
    "- Deeper error analysis (examining misclassified examples).",
    sep="\n",
)