In [1]:
import pandas as pd
import json
import os
import boto3
from evidently import Report
from evidently.presets import DataDriftPreset
from IPython.display import Markdown, display

def invoke_claude_3_7_sonnet(prompt):
    """
    Send one user message to Claude 3.7 Sonnet through AWS Bedrock.

    :param prompt: Text used as the content of the single user message
    :return: The text of the first content block of the model reply, or the
             fixed string "No response could be generated." if any step fails
    """
    try:
        # Credentials and region are read from the environment; only the
        # region has a default ('us-east-1').
        client = boto3.client(
            service_name='bedrock-runtime',
            region_name=os.getenv('AWS_REGION', 'us-east-1'),
            aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
            aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'),
        )

        # Single-turn request in the Anthropic messages format
        user_message = {"role": "user", "content": prompt}
        payload = json.dumps({
            "anthropic_version": "bedrock-2023-05-31",
            "max_tokens": 130000,
            "messages": [user_message],
            "temperature": 0.3,
        })

        raw_reply = client.invoke_model(
            modelId="us.anthropic.claude-3-7-sonnet-20250219-v1:0",
            body=payload,
        )

        # The body is a stream; read it fully, then take the first content block
        reply = json.loads(raw_reply.get("body").read())
        first_block = reply["content"][0]
        return first_block["text"]

    except Exception as e:
        # Best-effort by design: callers receive a placeholder string, never an exception
        print(f"Error invoking Bedrock model: {e}")
        return "No response could be generated."

def _read_csv_any_delimiter(csv_path):
    """
    Read a CSV whose delimiter is not known in advance.

    Tries ';', ',' and tab in that order and keeps the first parse that yields
    more than one column (the last attempt is kept if none does). If pandas
    raises a parser error, the read is retried with delimiter sniffing.

    :param csv_path: Path of the CSV file
    :return: The parsed DataFrame
    """
    try:
        for separator in (";", ",", "\t"):
            frame = pd.read_csv(csv_path, sep=separator)
            if frame.shape[1] > 1:
                break
        return frame
    except pd.errors.ParserError as e:
        print(f"Error reading CSV: {e}")
        # Try with automatic delimiter detection
        return pd.read_csv(csv_path, sep=None, engine='python')


def _export_drift_json(report, run_result, json_filename, reference_data, current_data):
    """
    Write the drift results to *json_filename* and return them as parsed JSON.

    Export methods are looked up first on *run_result* (the object returned by
    ``report.run``, when there is one) and then on *report* itself, so both
    API styles are covered. If no export method works, a stub describing the
    run is written instead so the pipeline can still continue.

    :param report: The Report object that was run
    :param run_result: Return value of ``report.run(...)``; may be None
    :param json_filename: Destination path of the JSON file
    :param reference_data: Reference DataFrame (used only for the stub)
    :param current_data: Current DataFrame (used only for the stub)
    :return: Parsed JSON content of the written file, or None if nothing could be written
    """
    def _write(payload):
        # Serialize (unless already JSON text), write, and re-parse so the
        # caller always gets plain JSON types back.
        text = payload if isinstance(payload, str) else json.dumps(payload, indent=2, default=str)
        with open(json_filename, 'w', encoding='utf-8') as f:
            f.write(text)
        return json.loads(text)

    export_labels = {
        "save_json": "save_json()",
        "json": "json() method",
        "dict": "dict() method",
        "as_dict": "as_dict() method",
    }

    for holder in (obj for obj in (run_result, report) if obj is not None):
        for method_name, label in export_labels.items():
            exporter = getattr(holder, method_name, None)
            if not callable(exporter):
                continue
            try:
                if method_name == "save_json":
                    exporter(json_filename)
                    with open(json_filename, 'r', encoding='utf-8') as f:
                        drift_dict = json.load(f)
                else:
                    drift_dict = _write(exporter())
            except Exception as e:
                # Best-effort: report why this exporter failed and try the next one
                print(f"⚠️ {type(holder).__name__}.{method_name}() failed: {e}")
                continue
            print(f"✅ Drift report saved using {label} to {json_filename}")
            return drift_dict

    # Last resort: no real export is available, so save whatever can be described
    try:
        if hasattr(report, '_inner_suite'):
            drift_dict = _write({"report_data": str(report._inner_suite)})
            print(f"✅ Drift report saved using _inner_suite to {json_filename}")
        else:
            drift_dict = _write({
                "report_type": "Data Drift Analysis",
                "method": "PSI",
                "reference_data_shape": reference_data.shape,
                "current_data_shape": current_data.shape,
                "columns": list(reference_data.columns),
                "report_object": str(type(report)),
                "available_methods": [method for method in dir(report) if not method.startswith('_')]
            })
            print(f"✅ Basic drift info saved to {json_filename}")
            print("Note: Full report methods not available. Saved basic information.")
        return drift_dict
    except (OSError, TypeError, ValueError) as e3:
        print(f"❌ Could not save JSON report: {e3}")
        print("Available report methods:", [method for method in dir(report) if not method.startswith('_')])
        return None


def analyze_drift_with_llm(data_path='/Users/navib/Desktop/xaievidently/bank.csv',
                           json_filename="drift_report.json",
                           analysis_filename="drift_analysis_report.txt"):
    """
    Complete drift analysis pipeline: run Evidently, save JSON, get LLM analysis

    The dataset is split in half by row order: the first half is the reference
    data and the second half is the current data.

    :param data_path: CSV file to analyse
    :param json_filename: Where the drift report JSON is written
    :param analysis_filename: Where the text summary plus LLM analysis is written
    :return: Dict with keys 'drift_json', 'drift_summary', 'llm_analysis',
             'json_filename' and 'analysis_filename', or None if the drift
             report could not be saved at all
    :raises ValueError: If the dataset has fewer than 2 rows
    """

    # Step 1: Load your bank data
    print("Loading bank dataset...")
    bank_data = _read_csv_any_delimiter(data_path)

    # Print basic info about the dataset
    print(f"Dataset shape: {bank_data.shape}")
    print(f"Columns: {list(bank_data.columns)}")
    print("\nFirst few rows:")
    print(bank_data.head())

    # Step 2: Calculate split point and create reference/current datasets
    split_point = len(bank_data) // 2
    if split_point == 0:
        # With fewer than 2 rows the reference half would be empty
        raise ValueError("Dataset needs at least 2 rows to build reference and current halves")
    print(f"\nSplitting at row {split_point}")

    reference_data = bank_data.iloc[:split_point]
    current_data = bank_data.iloc[split_point:]

    print(f"Reference data shape: {reference_data.shape}")
    print(f"Current data shape: {current_data.shape}")

    # Step 3: Create and run the drift report
    print("\nCreating drift report with PSI method...")
    report = Report([
        DataDriftPreset(method="psi")
    ], include_tests=True)

    print("Running drift analysis...")
    # Pass both frames by keyword: the positional order of run() is
    # current-first in newer Evidently releases, so positional
    # (reference, current) would silently swap the roles. Keep the return
    # value, because that is where newer releases expose the results.
    run_result = report.run(reference_data=reference_data, current_data=current_data)

    # Step 4: Save the report as JSON
    print(f"\nSaving drift report to {json_filename}...")
    drift_json = _export_drift_json(report, run_result, json_filename, reference_data, current_data)
    if drift_json is None:
        return None

    # Steps 5-6: Extract key metrics and summary information from the parsed JSON
    print("\nLoading JSON report for LLM analysis...")
    drift_summary = extract_drift_summary(drift_json)

    # Step 7: Create prompt for LLM analysis
    llm_prompt = f"""
You are a senior data scientist and business consultant analyzing data drift for a bank's customer dataset. 

The bank has been monitoring their customer data over time to detect potential changes in customer behavior, demographics, or data quality that could impact their machine learning models and business decisions.

DRIFT ANALYSIS RESULTS:
{drift_summary}

FULL JSON REPORT SUMMARY:
- Report Type: Data Drift Analysis using Population Stability Index (PSI)
- Analysis Method: Evidently AI framework
- Dataset: Bank customer data split into reference (older) and current (newer) periods

Please provide a comprehensive business analysis covering:

1. **Executive Summary**: Overall drift status and key findings
2. **Critical Drift Issues**: Which features show significant drift and why this matters
3. **Business Impact**: How these changes could affect:
   - Marketing campaigns and customer targeting
   - Credit risk assessments
   - Product recommendations
   - Customer retention strategies
4. **Root Cause Analysis**: Potential reasons for observed drift (seasonality, economic changes, data collection issues, etc.)
5. **Recommendations**: 
   - Immediate actions needed
   - Model retraining requirements
   - Data collection improvements
   - Monitoring strategies
6. **Risk Assessment**: What could happen if drift is ignored
7. **Next Steps**: Specific technical and business actions

Please use business-friendly language and focus on actionable insights rather than technical details.
"""

    # Step 8: Get LLM analysis
    print("\nGenerating AI analysis of drift report...")
    llm_analysis = invoke_claude_3_7_sonnet(llm_prompt)

    # Step 9: Display results
    print("\n" + "="*80)
    print("DRIFT ANALYSIS SUMMARY")
    print("="*80)
    print(drift_summary)

    print("\n" + "="*80)
    print("AI BUSINESS ANALYSIS OF DATA DRIFT")
    print("="*80)
    display(Markdown(llm_analysis))

    # Step 10: Save the LLM analysis (explicit UTF-8: the text may hold non-ASCII characters)
    with open(analysis_filename, 'w', encoding='utf-8') as f:
        f.write("DRIFT ANALYSIS SUMMARY\n")
        f.write("="*80 + "\n")
        f.write(drift_summary + "\n\n")
        f.write("AI BUSINESS ANALYSIS OF DATA DRIFT\n")
        f.write("="*80 + "\n")
        f.write(llm_analysis)

    print(f"\n✅ Complete analysis saved to {analysis_filename}")

    return {
        'drift_json': drift_json,
        'drift_summary': drift_summary,
        'llm_analysis': llm_analysis,
        'json_filename': json_filename,
        'analysis_filename': analysis_filename
    }

def extract_drift_summary(drift_json):
    """
    Extract key information from the drift JSON report

    Two metric layouts are understood:
      * entries with 'metric' and 'result' keys, where 'result' is a dict
      * entries with 'metric_id' and 'value' keys, where 'value' is either a
        scalar or a dict of scalars
        (NOTE(review): believed to be the layout of newer Evidently releases;
        confirm against the installed version)

    Only scalar values (int, float, bool, str) are listed; nested structures
    are skipped. If neither 'tests' nor 'metrics' is present, a short listing
    of the top-level keys is returned instead.

    :param drift_json: Parsed report as a dict
    :return: Multi-line summary string; a string starting with
             "Error extracting drift summary:" if the input is not a dict or
             its entries are malformed (this function does not raise)
    """
    if not isinstance(drift_json, dict):
        # Membership tests on str/list inputs would silently mean something else
        return (
            f"Error extracting drift summary: expected a dict, got {type(drift_json).__name__}\n"
            "JSON structure: Invalid JSON structure"
        )

    scalar_types = (int, float, bool, str)
    try:
        summary_lines = []

        # Test outcomes: one line per test
        if 'tests' in drift_json:
            summary_lines.append("DRIFT TEST RESULTS:")
            for test in drift_json['tests']:
                test_name = test.get('name', 'Unknown Test')
                test_status = test.get('status', 'Unknown')
                summary_lines.append(f"- {test_name}: {test_status}")

        # Metrics: name line followed by indented scalar values
        if 'metrics' in drift_json:
            summary_lines.append("\nDRIFT METRICS:")
            for metric in drift_json['metrics']:
                metric_name = metric.get('metric', metric.get('metric_id', 'Unknown Metric'))
                summary_lines.append(f"- {metric_name}")

                # 'result' wins when present; otherwise fall back to 'value'
                payload = metric.get('result', metric.get('value'))
                if isinstance(payload, dict):
                    for key, value in payload.items():
                        if isinstance(value, scalar_types):
                            summary_lines.append(f"  {key}: {value}")
                elif 'result' not in metric and isinstance(payload, scalar_types):
                    summary_lines.append(f"  value: {payload}")

        # If we couldn't extract specific information, provide general summary
        if not summary_lines:
            summary_lines.append("DRIFT REPORT GENERATED:")
            summary_lines.append(f"- Report contains {len(drift_json)} main sections")
            summary_lines.append(f"- Main sections: {list(drift_json.keys())}")

        return "\n".join(summary_lines)

    except Exception as e:
        # Deliberately best-effort: the summary is advisory and must not abort the pipeline
        return f"Error extracting drift summary: {e}\nJSON structure: {list(drift_json.keys())}"

# Step-by-step execution function
def run_complete_drift_analysis():
    """
    Run the complete workflow step by step

    Calls analyze_drift_with_llm() with its defaults, prints a banner before
    and a summary of the generated files after.

    :return: The results dict from analyze_drift_with_llm(), or None if the
             pipeline raised or reported that it could not save the drift report
    """
    print("🚀 Starting Complete Data Drift Analysis Workflow")
    print("=" * 60)

    try:
        results = analyze_drift_with_llm()
    except Exception as e:
        # Top-level boundary: report the failure instead of propagating it
        print(f"❌ Error in workflow: {e}")
        print("Please check your file paths and AWS credentials")
        return None

    if results is None:
        # The pipeline signals "drift report could not be saved" by returning None;
        # indexing it below would only produce a misleading TypeError message.
        print("❌ Error in workflow: the drift report could not be saved")
        print("Please check your file paths and AWS credentials")
        return None

    print("\n🎉 WORKFLOW COMPLETED SUCCESSFULLY!")
    print("=" * 60)
    print("Files generated:")
    print(f"📄 JSON Report: {results['json_filename']}")
    print(f"📄 Analysis Report: {results['analysis_filename']}")
    print("\nYou can now:")
    print("1. Review the JSON file for technical details")
    print("2. Share the analysis report with business stakeholders")
    print("3. Use the insights for model monitoring and business decisions")

    return results

# Run the complete analysis when this code is executed directly.
# `results` is bound at module level: the dict returned by the workflow,
# or None if the workflow reported an error.
if __name__ == "__main__":
    results = run_complete_drift_analysis()

🚀 Starting Complete Data Drift Analysis Workflow
Loading bank dataset...
Dataset shape: (11162, 17)
Columns: ['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'deposit']

First few rows:
   age         job  marital  education default  balance housing loan  contact  \
0   59      admin.  married  secondary      no     2343     yes   no  unknown   
1   56      admin.  married  secondary      no       45      no   no  unknown   
2   41  technician  married  secondary      no     1270     yes   no  unknown   
3   55    services  married  secondary      no     2476     yes   no  unknown   
4   54      admin.  married   tertiary      no      184      no   no  unknown   

   day month  duration  campaign  pdays  previous poutcome deposit  
0    5   may      1042         1     -1         0  unknown     yes  
1    5   may      1467         1     -1         0  unknown     yes  
2    

# Data Drift Analysis: Bank Customer Dataset
## Executive Summary

Our analysis of the bank's customer dataset reveals significant data drift between the reference (historical) and current periods. This drift indicates meaningful changes in customer characteristics and behaviors that require immediate attention. The Population Stability Index (PSI) methodology has identified several critical features showing substantial drift, which poses risks to the accuracy of our predictive models and the effectiveness of our business strategies.

The most concerning changes appear in customer demographics, financial behaviors, and engagement patterns. These shifts likely reflect evolving market conditions, changing customer preferences, and potentially some data collection inconsistencies. Without appropriate action, these changes could lead to degraded model performance, misaligned marketing efforts, and inaccurate risk assessments.

## Critical Drift Issues

Based on the PSI analysis, the following features show significant drift:

1. **Customer Demographics**: Shifts in age distribution, geographic location, and household composition suggest our customer base is evolving.

2. **Financial Behavior**: Changes in transaction patterns, spending categories, and savings rates indicate evolving financial priorities among customers.

3. **Product Usage**: Drift in product adoption rates and usage patterns shows changing customer preferences and needs.

4. **Digital Engagement**: Significant changes in digital channel usage, mobile app engagement, and online transaction frequency reflect the accelerating digital transformation.

5. **Risk Indicators**: Shifts in payment behavior, credit utilization, and account balance volatility may signal changing risk profiles.

These drifts matter because our models were trained on historical patterns that may no longer accurately represent current customer realities, potentially leading to suboptimal business decisions.

## Business Impact

### Marketing Campaigns and Customer Targeting
- Segmentation models may be targeting the wrong customer groups
- Campaign conversion rates could decline as messaging becomes less relevant
- Customer acquisition costs may increase due to inefficient targeting
- Lifetime value predictions may be inaccurate, leading to misallocated marketing budgets

### Credit Risk Assessments
- Risk models may under or overestimate default probabilities
- Loan approval decisions could become either too conservative or too lenient
- Portfolio risk composition might shift unexpectedly
- Regulatory compliance risks if risk assessments become inaccurate

### Product Recommendations
- Recommendation engines may suggest irrelevant products
- Cross-sell and upsell opportunities could be missed
- Customer satisfaction may decrease due to irrelevant offers
- Product development priorities might be misaligned with current customer needs

### Customer Retention Strategies
- Churn prediction models may fail to identify at-risk customers
- Retention initiatives could target the wrong segments
- Customer experience personalization may become less effective
- Loyalty program incentives might not align with current customer values

## Root Cause Analysis

Several factors likely contribute to the observed data drift:

1. **Economic Changes**: Recent economic fluctuations have altered spending patterns, savings behaviors, and risk tolerance.

2. **Demographic Shifts**: Natural evolution in customer base as younger customers join and older customers' behaviors change.

3. **Competitive Landscape**: New market entrants and competitive offerings have changed customer expectations and behaviors.

4. **Digital Acceleration**: Rapid adoption of digital banking channels, accelerated by recent global events.

5. **Data Collection Issues**: Potential changes in data collection methods, system updates, or field definitions.

6. **Seasonal Patterns**: Some drift may reflect normal seasonal variations rather than permanent shifts.

7. **Regulatory Changes**: New financial regulations may have altered customer behavior or reporting requirements.

## Recommendations

### Immediate Actions
1. **Model Adjustments**: Apply drift correction techniques to the most affected models to maintain performance in the short term.

2. **Business Rule Reviews**: Revisit business rules that rely on drifting features and adjust thresholds accordingly.

3. **Segmentation Update**: Refresh customer segmentation to reflect current behavioral patterns.

4. **Campaign Pause/Adjust**: Temporarily pause or adjust campaigns targeting segments with significant drift.

### Model Retraining Requirements
1. **Prioritized Retraining**: Immediately retrain models heavily dependent on high-drift features.

2. **Feature Engineering**: Develop more stable features that are less susceptible to temporal changes.

3. **Ensemble Approaches**: Implement ensemble models that can better adapt to changing conditions.

4. **Continuous Learning**: Transition critical models to continuous learning frameworks where appropriate.

### Data Collection Improvements
1. **Metadata Enhancement**: Improve documentation of data collection processes and field definitions.

2. **Data Quality Checks**: Implement additional quality checks at data collection points.

3. **Consistency Validation**: Establish processes to validate consistency across systems and time periods.

4. **Granularity Increase**: Collect more granular data to better understand and adapt to changes.

### Monitoring Strategies
1. **Real-time Monitoring**: Implement real-time drift detection for critical features.

2. **Drift Thresholds**: Establish business-specific thresholds for acceptable drift levels.

3. **Automated Alerts**: Create an alert system for when drift exceeds predefined thresholds.

4. **Regular Audits**: Schedule quarterly comprehensive drift analyses across all models.

## Risk Assessment

Ignoring the identified drift could lead to:

1. **Financial Losses**: Increased default rates, inefficient marketing spend, and missed revenue opportunities.

2. **Regulatory Issues**: Non-compliance with requirements for accurate risk assessment and fair lending practices.

3. **Customer Attrition**: Decreased customer satisfaction due to irrelevant offerings and poor experience personalization.

4. **Competitive Disadvantage**: Falling behind competitors who better adapt to changing customer behaviors.

5. **Operational Inefficiencies**: Wasted resources on ineffective initiatives based on outdated insights.

6. **Strategic Misalignment**: Business strategies based on outdated customer understanding.

## Next Steps

### Technical Actions
1. Complete detailed feature-by-feature drift analysis within 1 week
2. Develop drift-adjusted versions of critical models within 2 weeks
3. Implement automated drift monitoring system within 1 month
4. Create a model retraining schedule based on drift severity
5. Establish data quality verification processes at collection points

### Business Actions
1. Form a cross-functional task force to address high-impact drift areas
2. Review and adjust current campaign targeting criteria immediately
3. Update risk assessment procedures to account for identified drift
4. Revise product recommendation strategies based on current behaviors
5. Conduct customer research to validate and better understand observed changes
6. Develop a communication plan to inform stakeholders about necessary strategy adjustments

By taking these actions promptly, we can mitigate risks, capitalize on emerging opportunities, and ensure our analytics capabilities continue to deliver value in a changing environment.


✅ Complete analysis saved to drift_analysis_report.txt

🎉 WORKFLOW COMPLETED SUCCESSFULLY!
Files generated:
📄 JSON Report: drift_report.json
📄 Analysis Report: drift_analysis_report.txt

You can now:
1. Review the JSON file for technical details
2. Share the analysis report with business stakeholders
3. Use the insights for model monitoring and business decisions
