<a href="https://colab.research.google.com/github/neelsoumya/python_machine_learning/blob/main/business_cases_feature_engneering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
"""
Module 5: Advanced Techniques and Business Applications
Level 7 Postgraduate Course

This module covers advanced feature engineering techniques including time series features,
text features, domain-specific features, and comprehensive business case studies.
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from datetime import datetime, timedelta
import re

# Set random seed for reproducibility
np.random.seed(42)

print("=" * 60)
print("MODULE 5: ADVANCED TECHNIQUES AND BUSINESS APPLICATIONS")
print("=" * 60)

# 1. Time Series Feature Engineering
print("\n1. TIME SERIES FEATURE ENGINEERING")
print("-" * 40)

# Create time series data
dates = pd.date_range(start='2020-01-01', end='2023-12-31', freq='D')
n_samples = len(dates)

# Generate time series data
np.random.seed(42)
base_value = 100
trend = np.linspace(0, 50, n_samples)
seasonal = 10 * np.sin(2 * np.pi * np.arange(n_samples) / 365.25)  # Annual seasonality
weekly = 5 * np.sin(2 * np.pi * np.arange(n_samples) / 7)  # Weekly seasonality
noise = np.random.normal(0, 5, n_samples)

sales = base_value + trend + seasonal + weekly + noise

# Create DataFrame
ts_df = pd.DataFrame({
    'date': dates,
    'sales': sales
})

print("Time Series Dataset Shape:", ts_df.shape)
print("\nFirst 10 rows:")
print(ts_df.head(10))

# Extract time-based features
ts_df['year'] = ts_df['date'].dt.year
ts_df['month'] = ts_df['date'].dt.month
ts_df['day'] = ts_df['date'].dt.day
ts_df['day_of_week'] = ts_df['date'].dt.dayofweek
ts_df['quarter'] = ts_df['date'].dt.quarter
ts_df['is_weekend'] = ts_df['day_of_week'].isin([5, 6]).astype(int)
ts_df['is_month_start'] = ts_df['date'].dt.is_month_start.astype(int)
ts_df['is_month_end'] = ts_df['date'].dt.is_month_end.astype(int)

# Cyclical encoding for periodic features
ts_df['month_sin'] = np.sin(2 * np.pi * ts_df['month'] / 12)
ts_df['month_cos'] = np.cos(2 * np.pi * ts_df['month'] / 12)
ts_df['day_of_week_sin'] = np.sin(2 * np.pi * ts_df['day_of_week'] / 7)
ts_df['day_of_week_cos'] = np.cos(2 * np.pi * ts_df['day_of_week'] / 7)

# Lag features
ts_df['sales_lag_1'] = ts_df['sales'].shift(1)
ts_df['sales_lag_7'] = ts_df['sales'].shift(7)
ts_df['sales_lag_30'] = ts_df['sales'].shift(30)

# Rolling statistics
ts_df['sales_rolling_mean_7'] = ts_df['sales'].rolling(window=7, min_periods=1).mean()
ts_df['sales_rolling_std_7'] = ts_df['sales'].rolling(window=7, min_periods=1).std()
ts_df['sales_rolling_mean_30'] = ts_df['sales'].rolling(window=30, min_periods=1).mean()

# Difference features
ts_df['sales_diff_1'] = ts_df['sales'].diff(1)
ts_df['sales_diff_7'] = ts_df['sales'].diff(7)

print("\nTime Series Features Created:")
print("- Basic time features: year, month, day, day_of_week, quarter")
print("- Binary features: is_weekend, is_month_start, is_month_end")
print("- Cyclical features: month_sin/cos, day_of_week_sin/cos")
print("- Lag features: sales_lag_1, sales_lag_7, sales_lag_30")
print("- Rolling features: rolling_mean_7, rolling_std_7, rolling_mean_30")
print("- Difference features: sales_diff_1, sales_diff_7")

# 2. Text Feature Engineering
print("\n2. TEXT FEATURE ENGINEERING")
print("-" * 40)

# Create sample text data
texts = [
    "I love this product! It's amazing and works perfectly.",
    "This is the worst purchase I've ever made. Terrible quality.",
    "Good product, fast delivery, would recommend.",
    "Not bad, but could be better. Average experience.",
    "Excellent service and high-quality product. Highly recommended!",
    "Disappointed with the product. Poor customer service.",
    "Great value for money. Very satisfied with purchase.",
    "Product arrived damaged. Very unhappy with service.",
    "Amazing experience! Best purchase ever made.",
    "Okay product, nothing special. Expected better."
]

# Create sentiment labels (1: positive, 0: negative)
sentiments = [1, 0, 1, 0, 1, 0, 1, 0, 1, 0]

# Create DataFrame
text_df = pd.DataFrame({
    'text': texts,
    'sentiment': sentiments
})

print("Text Dataset:")
print(text_df)

# Basic text features
text_df['text_length'] = text_df['text'].str.len()
text_df['word_count'] = text_df['text'].str.split().str.len()
text_df['avg_word_length'] = text_df['text'].str.split().apply(
    lambda x: np.mean([len(word) for word in x]) if x else 0
)

# Sentiment indicators
positive_words = ['love', 'amazing', 'perfect', 'good', 'excellent', 'great', 'best', 'satisfied']
negative_words = ['worst', 'terrible', 'bad', 'disappointed', 'poor', 'damaged', 'unhappy']

text_df['positive_word_count'] = text_df['text'].str.lower().apply(
    lambda x: sum(1 for word in positive_words if word in x)
)
text_df['negative_word_count'] = text_df['text'].str.lower().apply(
    lambda x: sum(1 for word in negative_words if word in x)
)
text_df['sentiment_score'] = text_df['positive_word_count'] - text_df['negative_word_count']

# TF-IDF features
tfidf = TfidfVectorizer(max_features=10, stop_words='english')
tfidf_features = tfidf.fit_transform(text_df['text'])
tfidf_df = pd.DataFrame(tfidf_features.toarray(), columns=tfidf.get_feature_names_out())

# Combine with original features
text_df = pd.concat([text_df, tfidf_df], axis=1)

print("\nText Features Created:")
print("- Basic features: text_length, word_count, avg_word_length")
print("- Sentiment features: positive_word_count, negative_word_count, sentiment_score")
print("- TF-IDF features: 10 most important words")

print("\nText Features (first 5 rows):")
print(text_df.head())

# 3. Domain-Specific Feature Engineering
print("\n3. DOMAIN-SPECIFIC FEATURE ENGINEERING")
print("-" * 40)

# 3.1 Financial Domain Features
print("\n3.1 FINANCIAL DOMAIN FEATURES")
print("-" * 30)

# Create financial data
n_samples = 1000
np.random.seed(42)

financial_data = {
    'income': np.random.normal(50000, 20000, n_samples),
    'debt': np.random.normal(20000, 10000, n_samples),
    'credit_score': np.random.normal(700, 100, n_samples),
    'age': np.random.normal(35, 10, n_samples),
    'employment_years': np.random.normal(5, 3, n_samples)
}

financial_df = pd.DataFrame(financial_data)

# Financial ratios and features
financial_df['debt_to_income_ratio'] = financial_df['debt'] / (financial_df['income'] + 1e-8)
financial_df['income_per_age'] = financial_df['income'] / (financial_df['age'] + 1e-8)
financial_df['credit_utilization'] = financial_df['debt'] / (financial_df['income'] * 0.3 + 1e-8)
financial_df['employment_stability'] = financial_df['employment_years'] / (financial_df['age'] - 18 + 1e-8)

# Risk categories
financial_df['income_category'] = pd.cut(financial_df['income'],
                                        bins=[0, 30000, 60000, 100000, np.inf],
                                        labels=['Low', 'Medium', 'High', 'Very High'])
financial_df['credit_risk'] = pd.cut(financial_df['credit_score'],
                                    bins=[0, 580, 670, 740, 800, 850],
                                    labels=['Poor', 'Fair', 'Good', 'Very Good', 'Excellent'])

print("Financial Features Created:")
print("- Ratios: debt_to_income_ratio, income_per_age, credit_utilization, employment_stability")
print("- Categories: income_category, credit_risk")

# 3.2 E-commerce Domain Features
print("\n3.2 E-COMMERCE DOMAIN FEATURES")
print("-" * 30)

# Create e-commerce data
ecommerce_data = {
    'purchase_amount': np.random.exponential(50, n_samples),
    'items_in_cart': np.random.poisson(3, n_samples),
    'days_since_last_purchase': np.random.exponential(30, n_samples),
    'total_purchases': np.random.poisson(10, n_samples),
    'avg_order_value': np.random.normal(75, 25, n_samples),
    'customer_age': np.random.normal(35, 10, n_samples)
}

ecommerce_df = pd.DataFrame(ecommerce_data)

# E-commerce specific features
ecommerce_df['basket_size'] = ecommerce_df['purchase_amount'] / (ecommerce_df['items_in_cart'] + 1e-8)
ecommerce_df['purchase_frequency'] = 365 / (ecommerce_df['days_since_last_purchase'] + 1e-8)
ecommerce_df['customer_lifetime_value'] = ecommerce_df['total_purchases'] * ecommerce_df['avg_order_value']
ecommerce_df['churn_risk'] = 1 / (ecommerce_df['days_since_last_purchase'] + 1e-8)

# Customer segments
ecommerce_df['customer_segment'] = pd.cut(ecommerce_df['customer_lifetime_value'],
                                         bins=[0, 500, 1000, 2000, np.inf],
                                         labels=['Bronze', 'Silver', 'Gold', 'Platinum'])

print("E-commerce Features Created:")
print("- Business metrics: basket_size, purchase_frequency, customer_lifetime_value, churn_risk")
print("- Segmentation: customer_segment")

# 4. Advanced Feature Interactions
print("\n4. ADVANCED FEATURE INTERACTIONS")
print("-" * 40)

# Combine datasets for interaction features
combined_df = pd.concat([financial_df, ecommerce_df], axis=1)

# Polynomial features
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)
poly_features = poly.fit_transform(combined_df[['income', 'age', 'credit_score']])
poly_feature_names = poly.get_feature_names_out(['income', 'age', 'credit_score'])

poly_df = pd.DataFrame(poly_features, columns=poly_feature_names)
combined_df = pd.concat([combined_df, poly_df], axis=1)

# Custom interaction features
combined_df['income_age_interaction'] = combined_df['income'] * combined_df['age']
combined_df['credit_income_ratio'] = combined_df['credit_score'] / (combined_df['income'] / 1000 + 1e-8)
combined_df['purchase_power_index'] = (combined_df['income'] * combined_df['credit_score']) / (combined_df['age'] + 1e-8)

print("Advanced Interaction Features Created:")
print("- Polynomial features: income*age, income*credit_score, age*credit_score")
print("- Custom interactions: income_age_interaction, credit_income_ratio, purchase_power_index")

# 5. Feature Engineering Pipeline
print("\n5. FEATURE ENGINEERING PIPELINE")
print("-" * 40)

class FeatureEngineeringPipeline:
    """Simple feature engineering pipeline"""

    def __init__(self):
        self.scaler = StandardScaler()
        self.feature_names = []

    def create_basic_features(self, df):
        """Create basic numerical features"""
        df_processed = df.copy()

        # Basic transformations
        for col in df.select_dtypes(include=[np.number]).columns:
            if col not in ['target', 'sentiment']:  # Skip target variables
                df_processed[f'{col}_squared'] = df[col] ** 2
                df_processed[f'{col}_log'] = np.log1p(np.abs(df[col]))

        return df_processed

    def create_ratio_features(self, df, feature_pairs):
        """Create ratio features between pairs of features"""
        df_processed = df.copy()

        for feature1, feature2 in feature_pairs:
            if feature1 in df.columns and feature2 in df.columns:
                df_processed[f'{feature1}_{feature2}_ratio'] = df[feature1] / (df[feature2] + 1e-8)

        return df_processed

    def create_statistical_features(self, df, window_sizes=[7, 30]):
        """Create rolling statistical features"""
        df_processed = df.copy()

        for col in df.select_dtypes(include=[np.number]).columns:
            if col not in ['target', 'sentiment']:
                for window in window_sizes:
                    df_processed[f'{col}_rolling_mean_{window}'] = df[col].rolling(window=window, min_periods=1).mean()
                    df_processed[f'{col}_rolling_std_{window}'] = df[col].rolling(window=window, min_periods=1).std()

        return df_processed

    def fit_transform(self, df, target_col=None):
        """Apply all feature engineering steps"""
        df_processed = df.copy()

        # Apply feature engineering steps
        df_processed = self.create_basic_features(df_processed)
        df_processed = self.create_ratio_features(df_processed, [('income', 'age'), ('debt', 'income')])
        df_processed = self.create_statistical_features(df_processed)

        # Store feature names
        self.feature_names = [col for col in df_processed.columns if col not in ['target', 'sentiment']]

        return df_processed

# Apply pipeline
pipeline = FeatureEngineeringPipeline()
combined_df_engineered = pipeline.fit_transform(combined_df)

print(f"Original features: {len(combined_df.columns)}")
print(f"Engineered features: {len(combined_df_engineered.columns)}")
print(f"New features created: {len(combined_df_engineered.columns) - len(combined_df.columns)}")

# 6. Model Performance with Advanced Features
print("\n6. MODEL PERFORMANCE WITH ADVANCED FEATURES")
print("-" * 40)

# Create target variable for demonstration
combined_df_engineered['target'] = (
    combined_df_engineered['income'] * 0.3 +
    combined_df_engineered['credit_score'] * 0.2 +
    combined_df_engineered['customer_lifetime_value'] * 0.1 +
    np.random.normal(0, 10, len(combined_df_engineered))
)

# Prepare data
X = combined_df_engineered.drop(['target'], axis=1)
y = combined_df_engineered['target']

# Remove non-numeric columns for simplicity
X = X.select_dtypes(include=[np.number])

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Compare different feature sets
feature_sets = {
    'Original': ['income', 'debt', 'credit_score', 'age', 'employment_years'],
    'Basic_Engineered': ['income', 'debt', 'credit_score', 'age', 'employment_years',
                        'income_squared', 'credit_score_squared', 'debt_to_income_ratio'],
    'Advanced_Engineered': X_train.columns.tolist()
}

results = {}

for name, features in feature_sets.items():
    if name == 'Advanced_Engineered':
        X_train_subset = X_train
        X_test_subset = X_test
    else:
        X_train_subset = X_train[features]
        X_test_subset = X_test[features]

    # Handle missing values
    X_train_subset = X_train_subset.fillna(X_train_subset.mean())
    X_test_subset = X_test_subset.fillna(X_test_subset.mean())

    # Train model
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train_subset, y_train)

    # Predictions
    y_pred = model.predict(X_test_subset)

    # Evaluate
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Cross-validation
    cv_scores = cross_val_score(model, X_train_subset, y_train, cv=5, scoring='r2')

    results[name] = {
        'n_features': len(features) if name != 'Advanced_Engineered' else X_train_subset.shape[1],
        'MSE': mse,
        'R²': r2,
        'CV_R²_Mean': cv_scores.mean(),
        'CV_R²_Std': cv_scores.std()
    }

print("Model Performance Comparison:")
print(f"{'Feature Set':<20} {'Features':<10} {'MSE':<10} {'R²':<10} {'CV_R²':<15}")
print("-" * 70)
for name, metrics in results.items():
    print(f"{name:<20} {metrics['n_features']:<10} {metrics['MSE']:<10.2f} "
          f"{metrics['R²']:<10.4f} {metrics['CV_R²_Mean']:<10.4f} (+/- {metrics['CV_R²_Std']*2:.4f})")

# 7. Business Case Study: Customer Churn Prediction
print("\n7. BUSINESS CASE STUDY: CUSTOMER CHURN PREDICTION")
print("-" * 40)

# Create customer churn dataset
np.random.seed(42)
n_customers = 1000

churn_data = {
    'customer_id': range(1, n_customers + 1),
    'tenure_months': np.random.exponential(24, n_customers),
    'monthly_charges': np.random.normal(65, 20, n_customers),
    'total_charges': np.random.normal(1500, 800, n_customers),
    'contract_type': np.random.choice(['Month-to-month', 'One year', 'Two year'], n_customers),
    'payment_method': np.random.choice(['Electronic check', 'Mailed check', 'Bank transfer', 'Credit card'], n_customers),
    'internet_service': np.random.choice(['DSL', 'Fiber optic', 'No'], n_customers),
    'online_security': np.random.choice(['Yes', 'No', 'No internet service'], n_customers),
    'tech_support': np.random.choice(['Yes', 'No', 'No internet service'], n_customers)
}

churn_df = pd.DataFrame(churn_data)

# Create churn target based on business logic
churn_probability = (
    (churn_df['tenure_months'] < 12) * 0.3 +
    (churn_df['monthly_charges'] > 80) * 0.2 +
    (churn_df['contract_type'] == 'Month-to-month') * 0.4 +
    (churn_df['payment_method'] == 'Electronic check') * 0.1 +
    (churn_df['online_security'] == 'No') * 0.1 +
    np.random.normal(0, 0.1, n_customers)
)

churn_df['churn'] = (churn_probability > 0.5).astype(int)

# Feature engineering for churn prediction
churn_df['avg_monthly_charge'] = churn_df['total_charges'] / (churn_df['tenure_months'] + 1e-8)
churn_df['tenure_years'] = churn_df['tenure_months'] / 12
churn_df['contract_commitment'] = churn_df['contract_type'].map({
    'Month-to-month': 0, 'One year': 1, 'Two year': 2
})
churn_df['has_internet'] = (churn_df['internet_service'] != 'No').astype(int)
churn_df['has_security'] = (churn_df['online_security'] == 'Yes').astype(int)
churn_df['has_support'] = (churn_df['tech_support'] == 'Yes').astype(int)

# Risk score
churn_df['churn_risk_score'] = (
    (churn_df['tenure_months'] < 12) * 30 +
    (churn_df['monthly_charges'] > 80) * 20 +
    (churn_df['contract_type'] == 'Month-to-month') * 40 +
    (churn_df['payment_method'] == 'Electronic check') * 10 +
    (churn_df['online_security'] == 'No') * 10
)

print("Customer Churn Features Created:")
print("- Business metrics: avg_monthly_charge, tenure_years, contract_commitment")
print("- Service features: has_internet, has_security, has_support")
print("- Risk assessment: churn_risk_score")

print(f"\nChurn Rate: {churn_df['churn'].mean():.2%}")
print(f"Average Churn Risk Score: {churn_df['churn_risk_score'].mean():.1f}")

# 8. Visualization
print("\n8. VISUALIZATION")
print("-" * 40)

plt.figure(figsize=(20, 12))

# Time series features
plt.subplot(3, 4, 1)
ts_df[['sales', 'sales_rolling_mean_7', 'sales_rolling_mean_30']].plot()
plt.title('Time Series with Rolling Averages')
plt.legend()

# Text features
plt.subplot(3, 4, 2)
text_df[['text_length', 'word_count', 'sentiment_score']].boxplot()
plt.title('Text Feature Distributions')

# Financial features
plt.subplot(3, 4, 3)
financial_df[['debt_to_income_ratio', 'credit_utilization']].hist(bins=20, alpha=0.7)
plt.title('Financial Ratios Distribution')

# E-commerce features
plt.subplot(3, 4, 4)
ecommerce_df['customer_segment'].value_counts().plot(kind='bar')
plt.title('Customer Segments')
plt.xticks(rotation=45)

# Model performance comparison
plt.subplot(3, 4, 5)
methods = list(results.keys())
r2_scores = [results[method]['R²'] for method in methods]
plt.bar(methods, r2_scores, color=['skyblue', 'lightgreen', 'lightcoral'])
plt.title('Model Performance (R² Score)')
plt.ylabel('R² Score')
plt.xticks(rotation=45)

# Churn analysis
plt.subplot(3, 4, 6)
churn_df.groupby('contract_type')['churn'].mean().plot(kind='bar')
plt.title('Churn Rate by Contract Type')
plt.ylabel('Churn Rate')
plt.xticks(rotation=45)

# Feature importance
plt.subplot(3, 4, 7)
best_features = feature_sets['Advanced_Engineered']
X_best = X_train[best_features].fillna(X_train[best_features].mean())
y_best = y_train

model_best = RandomForestRegressor(n_estimators=100, random_state=42)
model_best.fit(X_best, y_best)

feature_importance = pd.DataFrame({
    'feature': best_features,
    'importance': model_best.feature_importances_
}).sort_values('importance', ascending=True).tail(10)

plt.barh(range(len(feature_importance)), feature_importance['importance'])
plt.yticks(range(len(feature_importance)), feature_importance['feature'])
plt.title('Top 10 Feature Importance')

# Churn risk distribution
plt.subplot(3, 4, 8)
churn_df['churn_risk_score'].hist(bins=20, alpha=0.7)
plt.title('Churn Risk Score Distribution')
plt.xlabel('Risk Score')

plt.tight_layout()
plt.show()

# 9. Business Applications Summary
print("\n9. BUSINESS APPLICATIONS SUMMARY")
print("-" * 40)

print("""
Comprehensive Business Applications of Advanced Feature Engineering:

1. **Financial Services**:
   - Credit Risk Assessment: debt_to_income_ratio, credit_utilization, employment_stability
   - Fraud Detection: transaction patterns, behavioral biometrics, temporal features
   - Investment Portfolio: market indicators, volatility measures, correlation features

2. **E-commerce & Retail**:
   - Customer Segmentation: RFM analysis, purchase patterns, lifetime value
   - Demand Forecasting: seasonal patterns, promotional effects, external factors
   - Inventory Optimization: lead time features, demand variability, supplier performance

3. **Healthcare**:
   - Disease Prediction: biomarker ratios, temporal patterns, genetic interactions
   - Treatment Effectiveness: patient history, medication interactions, compliance metrics
   - Resource Planning: patient flow patterns, seasonal variations, capacity utilization

4. **Manufacturing**:
   - Quality Control: sensor fusion, process parameters, environmental factors
   - Predictive Maintenance: equipment health indicators, failure patterns, usage metrics
   - Supply Chain: lead times, demand variability, supplier performance

5. **Marketing & Advertising**:
   - Customer Lifetime Value: purchase patterns, engagement metrics, churn indicators
   - Campaign Optimization: response patterns, channel effectiveness, timing features
   - Personalization: behavioral patterns, preference indicators, contextual features

6. **Telecommunications**:
   - Customer Churn: usage patterns, service quality, billing issues
   - Network Optimization: traffic patterns, capacity utilization, performance metrics
   - Fraud Detection: usage anomalies, location patterns, device fingerprinting
""")

print("\n" + "=" * 60)
print("END OF MODULE 5")
print("=" * 60)


MODULE 5: ADVANCED TECHNIQUES AND BUSINESS APPLICATIONS

1. TIME SERIES FEATURE ENGINEERING
----------------------------------------
Time Series Dataset Shape: (1461, 2)

First 10 rows:
        date       sales
0 2020-01-01  102.483571
1 2020-01-02  103.424098
2 2020-01-03  108.525556
3 2020-01-04  110.403151
4 2020-01-05   97.484355
5 2020-01-06   94.984970
6 2020-01-07  105.222700
7 2020-01-08  105.278161
8 2020-01-09  103.207612
9 2020-01-10  109.437699

Time Series Features Created:
- Basic time features: year, month, day, day_of_week, quarter
- Binary features: is_weekend, is_month_start, is_month_end
- Cyclical features: month_sin/cos, day_of_week_sin/cos
- Lag features: sales_lag_1, sales_lag_7, sales_lag_30
- Rolling features: rolling_mean_7, rolling_std_7, rolling_mean_30
- Difference features: sales_diff_1, sales_diff_7

2. TEXT FEATURE ENGINEERING
----------------------------------------
Text Dataset:
                                                text  sentiment
0  I love 

ValueError: Cannot set a DataFrame with multiple columns to the single column income_age_interaction