In [None]:
# MLOps Feedback Intent Training Pipeline

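This notebook implements an end-to-end MLOps pipeline that trains an intent classifier on user feedback exported from the search platform's backend. Intent labels are derived inside the notebook with simple keyword rules, so the trained model approximates those rules rather than human-annotated intents.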

## Features:
- Data export from backend APIs
- Feature engineering and preprocessing
- Model training with scikit-learn
- MLflow experiment tracking
- Model evaluation and metrics
- Automated retraining capabilities


In [None]:
# Import required libraries
import json
import os
import tempfile
from datetime import datetime, timedelta
from typing import Dict, List, Tuple, Optional

import joblib
import numpy as np
import pandas as pd
import requests

# ML libraries
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder

# MLflow for experiment tracking
import mlflow
import mlflow.sklearn
from mlflow.tracking import MlflowClient

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

print("Libraries imported successfully!")


In [None]:
# Configuration
# Single place for everything a reader might tune: service endpoints, the
# MLflow experiment / registered-model names, and the split and
# cross-validation settings read by the training cells below.
CONFIG = dict(
    backend_url='http://localhost:8000',
    mlflow_tracking_uri='http://localhost:5000',  # MLflow server
    experiment_name='intent_classification',
    model_name='feedback_intent_classifier',
    test_size=0.2,
    random_state=42,
    cv_folds=5,
)

# Point the MLflow client at the tracking server before any run is started
mlflow.set_tracking_uri(CONFIG['mlflow_tracking_uri'])

# Echo the effective settings so they are visible in the notebook output
print("Configuration loaded:")
for setting_name, setting_value in CONFIG.items():
    print(f"  {setting_name}: {setting_value}")


In [None]:
def fetch_training_data(backend_url: str, days_back: int = 30, timeout: float = 30.0) -> pd.DataFrame:
    """
    Fetch feedback records from the backend export endpoint.

    Issues a GET to ``{backend_url}/api/feedback/export`` asking for JSON
    covering the window from ``days_back`` days ago until now, and loads the
    decoded payload into a DataFrame.

    Args:
        backend_url: Base URL of the backend (no trailing slash).
        days_back: Size of the look-back window in days.
        timeout: Seconds to wait for the backend before giving up. Without a
            timeout ``requests`` waits indefinitely, which would hang the
            notebook when the backend is unreachable.

    Returns:
        DataFrame built from the response payload. An empty DataFrame is
        returned when the request fails, the server answers with an error
        status, or the payload cannot be decoded / tabulated; the error is
        printed rather than raised so callers can fall back to other data.
    """
    # Calculate date range
    # NOTE(review): datetime.now() is naive local time - confirm the backend
    # does not expect UTC timestamps.
    end_date = datetime.now()
    start_date = end_date - timedelta(days=days_back)

    # Fetch feedback data with message content
    feedback_url = f"{backend_url}/api/feedback/export"
    params = {
        'format': 'json',
        'start_date': start_date.isoformat(),
        'end_date': end_date.isoformat()
    }

    print(f"Fetching feedback data from {feedback_url}")
    try:
        response = requests.get(feedback_url, params=params, timeout=timeout)
        response.raise_for_status()
        df = pd.DataFrame(response.json())
    except (requests.RequestException, ValueError) as e:
        # RequestException: connection, timeout and HTTP status errors.
        # ValueError: undecodable JSON or a payload pandas cannot tabulate.
        print(f"Error fetching data: {e}")
        return pd.DataFrame()

    print(f"Fetched {len(df)} feedback records")
    return df

# Fetch the data
# An empty frame here means the backend could not be reached (or returned
# nothing); the preprocessing cell then falls back to sample data.
training_data = fetch_training_data(backend_url=CONFIG['backend_url'])
print(f"Training data shape: {training_data.shape}")
if not training_data.empty:
    column_names = training_data.columns.tolist()
    preview_rows = training_data.head()
    print(f"Columns: {column_names}")
    print("Sample data:")
    print(preview_rows)


In [None]:
def preprocess_data(df: pd.DataFrame) -> Tuple[pd.DataFrame, Dict]:
    """
    Preprocess the training data for intent classification.

    Works on a copy of ``df``; the input frame is not modified.

    Steps:
        1. Coerce ``rating`` to a number and drop rows whose
           ``message_content`` or ``rating`` is missing (ratings that cannot
           be parsed as numbers count as missing).
        2. Normalise ``message_content`` to ``str`` so the text features below
           behave the same for every row.
        3. Derive ``intent`` from keyword rules (first matching rule wins).
        4. Add ``message_length``, ``word_count`` and ``has_question``.
        5. Derive ``satisfaction`` from the rating
           (>= 4 positive, <= 2 negative, otherwise neutral).

    Args:
        df: Frame with at least ``message_content`` and ``rating`` columns.

    Returns:
        ``(df_clean, stats)`` where ``stats`` holds the sample count, the
        intent and satisfaction distributions and the mean message length /
        word count. An empty input is returned unchanged with ``{}``.
    """
    if df.empty:
        return df, {}

    # Data cleaning
    df_clean = df.copy()

    # Ratings may arrive as strings (e.g. "5"); comparing those with ints
    # below would raise, so coerce first and treat unparseable values as null.
    df_clean['rating'] = pd.to_numeric(df_clean['rating'], errors='coerce')

    # Remove null values
    df_clean = df_clean.dropna(subset=['message_content', 'rating'])

    # Non-string messages would make the .str accessors yield NaN and break
    # the integer cast of has_question, so normalise to text once.
    df_clean['message_content'] = df_clean['message_content'].astype(str)

    # Create intent labels based on feedback patterns
    # This is a simplified approach - in practice, you'd have more sophisticated labeling
    # Rules are checked in order; keywords are matched as plain substrings.
    intent_rules = (
        ('billing', ('bill', 'payment', 'charge')),
        ('technical_support', ('internet', 'wifi', 'connection')),
        ('tv_support', ('channel', 'tv', 'cable')),
        ('account_management', ('account', 'profile', 'login')),
        ('service_inquiry', ('service', 'plan', 'upgrade')),
    )

    def create_intent_label(message: str) -> str:
        content = message.lower()
        for intent, keywords in intent_rules:
            if any(keyword in content for keyword in keywords):
                return intent
        return 'general_inquiry'

    # Column-wise map instead of a row-wise apply: only the message text is
    # needed, and it keeps working when every row was dropped above.
    df_clean['intent'] = df_clean['message_content'].map(create_intent_label)

    # Feature engineering
    df_clean['message_length'] = df_clean['message_content'].str.len()
    df_clean['word_count'] = df_clean['message_content'].str.split().str.len()
    df_clean['has_question'] = (
        df_clean['message_content'].str.contains('?', regex=False).astype(int)
    )

    # Create satisfaction labels
    df_clean['satisfaction'] = df_clean['rating'].apply(
        lambda x: 'positive' if x >= 4 else 'negative' if x <= 2 else 'neutral'
    )

    # Statistics
    stats = {
        'total_samples': len(df_clean),
        'intent_distribution': df_clean['intent'].value_counts().to_dict(),
        'satisfaction_distribution': df_clean['satisfaction'].value_counts().to_dict(),
        'avg_message_length': df_clean['message_length'].mean(),
        'avg_word_count': df_clean['word_count'].mean()
    }

    print("Data preprocessing completed!")
    print(f"Clean samples: {len(df_clean)}")
    print(f"Intent distribution: {stats['intent_distribution']}")
    print(f"Satisfaction distribution: {stats['satisfaction_distribution']}")

    return df_clean, stats

# Preprocess the data
# Real feedback is used when the fetch succeeded; otherwise a small synthetic
# set (six template messages repeated ten times) keeps the notebook runnable.
if training_data.empty:
    print("No training data available - using sample data for demonstration")
    # Create sample data for demonstration
    demo_messages = [
        'My internet is not working properly',
        'I need help with my bill payment',
        'TV channels are not loading',
        'How to change my account password',
        'Want to upgrade my service plan',
        'General question about services'
    ]
    demo_ratings = [1, 5, 2, 4, 5, 3]
    demo_repeats = 10  # Repeat for more samples
    demo_row_count = len(demo_messages) * demo_repeats
    sample_data = pd.DataFrame({
        'message_content': demo_messages * demo_repeats,
        'rating': demo_ratings * demo_repeats,
        'created_at': [datetime.now() - timedelta(days=i) for i in range(demo_row_count)]
    })
    processed_data, data_stats = preprocess_data(sample_data)
else:
    processed_data, data_stats = preprocess_data(training_data)


In [None]:
# Visualize the data distribution
# 2x2 overview: intent counts, satisfaction share, message lengths, and a
# rating-by-intent count heatmap.
if not processed_data.empty:
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    (intent_ax, satisfaction_ax), (length_ax, heatmap_ax) = axes

    # Intent distribution
    intent_counts = processed_data['intent'].value_counts()
    intent_ax.bar(intent_counts.index, intent_counts.values)
    intent_ax.set_title('Intent Distribution')
    intent_ax.tick_params(axis='x', rotation=45)

    # Satisfaction distribution
    satisfaction_counts = processed_data['satisfaction'].value_counts()
    satisfaction_ax.pie(
        satisfaction_counts.values,
        labels=satisfaction_counts.index,
        autopct='%1.1f%%',
    )
    satisfaction_ax.set_title('Satisfaction Distribution')

    # Message length distribution
    length_ax.hist(processed_data['message_length'], bins=20, alpha=0.7)
    length_ax.set_title('Message Length Distribution')
    length_ax.set_xlabel('Message Length')
    length_ax.set_ylabel('Frequency')

    # Rating vs Intent: rows are intents, columns are ratings, cells are counts
    pair_counts = processed_data.groupby(['intent', 'rating']).size()
    rating_intent = pair_counts.unstack(fill_value=0)
    sns.heatmap(rating_intent, annot=True, fmt='d', ax=heatmap_ax)
    heatmap_ax.set_title('Rating vs Intent Heatmap')

    plt.tight_layout()
    plt.show()

    print("Data visualization completed!")


In [None]:
def create_ml_pipeline() -> Pipeline:
    """
    Build the (unfitted) intent classification pipeline.

    The pipeline has two named steps:
        ``tfidf``       TF-IDF over unigrams and bigrams, lowercased, with
                        ASCII accent stripping, English stop words removed
                        and at most 10000 features.
        ``classifier``  Logistic regression with balanced class weights,
                        up to 1000 iterations, seeded from
                        ``CONFIG['random_state']``.

    Returns:
        A new scikit-learn ``Pipeline`` instance.
    """
    vectorizer = TfidfVectorizer(
        lowercase=True,
        strip_accents='ascii',
        stop_words='english',
        ngram_range=(1, 2),
        max_features=10000,
    )
    classifier = LogisticRegression(
        class_weight='balanced',
        max_iter=1000,
        random_state=CONFIG['random_state'],
    )
    steps = [('tfidf', vectorizer), ('classifier', classifier)]
    return Pipeline(steps)

def train_model(df: pd.DataFrame) -> Tuple[Pipeline, Dict]:
    """
    Train the intent classification model and compute evaluation metrics.

    The data is split into train / test sets using ``CONFIG['test_size']`` and
    ``CONFIG['random_state']``. The pipeline from ``create_ml_pipeline`` is
    fitted on the training part, scored on the held-out part and additionally
    cross-validated on the training part with ``CONFIG['cv_folds']`` folds.

    Args:
        df: Preprocessed frame with ``message_content`` (text) and ``intent``
            (label) columns.

    Returns:
        ``(pipeline, metrics)``. ``metrics`` contains ``accuracy``,
        ``cv_mean``, ``cv_std``, ``classification_report`` (dict form),
        ``confusion_matrix`` (nested lists whose rows / columns follow
        ``pipeline.classes_``) and ``feature_count``.

    Raises:
        ValueError: If ``df`` is empty.
    """
    if df.empty:
        raise ValueError("No data available for training")

    # Prepare features and target
    X = df['message_content']
    y = df['intent']

    # A stratified split needs at least two samples of every class; with a
    # singleton class train_test_split raises, so fall back to a plain split.
    can_stratify = y.value_counts().min() >= 2
    if not can_stratify:
        print("Warning: an intent has fewer than 2 samples - using a non-stratified split")

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=CONFIG['test_size'],
        random_state=CONFIG['random_state'],
        stratify=y if can_stratify else None
    )

    print(f"Training set size: {len(X_train)}")
    print(f"Test set size: {len(X_test)}")

    # Create and train the model
    pipeline = create_ml_pipeline()
    pipeline.fit(X_train, y_train)

    # Make predictions
    y_pred = pipeline.predict(X_test)

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)

    # Cross-validation
    cv_scores = cross_val_score(
        pipeline, X_train, y_train,
        cv=CONFIG['cv_folds'],
        scoring='accuracy'
    )

    metrics = {
        'accuracy': accuracy,
        'cv_mean': cv_scores.mean(),
        'cv_std': cv_scores.std(),
        'classification_report': classification_report(y_test, y_pred, output_dict=True),
        # Pin the label order to the fitted classes so the matrix keeps a fixed
        # shape even when a class is absent from the test split; consumers
        # label its rows / columns with model.classes_.
        'confusion_matrix': confusion_matrix(
            y_test, y_pred, labels=pipeline.classes_
        ).tolist(),
        'feature_count': pipeline.named_steps['tfidf'].get_feature_names_out().shape[0]
    }

    print("Model trained successfully!")
    print(f"Test Accuracy: {accuracy:.4f}")
    print(f"CV Mean Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

    return pipeline, metrics

# Train the model
if not processed_data.empty:
    model, training_metrics = train_model(processed_data)
    # NOTE: this report scores the model on the full dataset, including the
    # rows it was fitted on, so it is more optimistic than the held-out test
    # accuracy printed by train_model.
    print("\nClassification Report:")
    print(classification_report(
        processed_data['intent'],
        model.predict(processed_data['message_content'])
    ))


In [None]:
# MLflow Experiment Tracking
def log_experiment(model: Pipeline, metrics: Dict, data_stats: Dict):
    """
    Log a training run to MLflow.

    Creates the experiment named ``CONFIG['experiment_name']`` if it does not
    exist, then records in a single run: the training configuration and the
    TF-IDF / classifier hyper-parameters, overall and per-class metrics, the
    intent counts, the fitted model (also registered under
    ``CONFIG['model_name']``), and two CSV artifacts (``confusion_matrix.csv``
    and ``top_features.csv``).

    Logging is best-effort: any failure is printed and swallowed so that a
    missing or unreachable MLflow server does not stop the notebook.

    Args:
        model: Fitted pipeline with ``tfidf`` and ``classifier`` steps.
        metrics: Metrics dict as returned by ``train_model``.
        data_stats: Stats dict as returned by ``preprocess_data``; must
            contain ``total_samples`` and ``intent_distribution``.
    """
    try:
        # Set or create experiment
        experiment = mlflow.get_experiment_by_name(CONFIG['experiment_name'])
        if experiment is None:
            experiment_id = mlflow.create_experiment(CONFIG['experiment_name'])
        else:
            experiment_id = experiment.experiment_id

        vectorizer = model.named_steps['tfidf']
        classifier = model.named_steps['classifier']

        with mlflow.start_run(experiment_id=experiment_id) as run:
            # Collect all parameters and send them in one batch instead of
            # one tracking-server call per parameter.
            run_params = {
                "model_type": "logistic_regression_tfidf",
                "test_size": CONFIG['test_size'],
                "cv_folds": CONFIG['cv_folds'],
                "random_state": CONFIG['random_state'],
                "total_samples": data_stats['total_samples'],
                "feature_count": metrics['feature_count'],
            }
            for param, value in vectorizer.get_params().items():
                run_params[f"tfidf_{param}"] = value
            for param, value in classifier.get_params().items():
                run_params[f"classifier_{param}"] = value
            mlflow.log_params(run_params)

            # Overall metrics
            run_metrics = {
                "accuracy": metrics['accuracy'],
                "cv_mean_accuracy": metrics['cv_mean'],
                "cv_std_accuracy": metrics['cv_std'],
            }

            # Per-class metrics: only dict entries of the report carry
            # per-label scores (the scalar 'accuracy' entry is skipped).
            for intent, class_metrics in metrics['classification_report'].items():
                if isinstance(class_metrics, dict):
                    for metric_name, value in class_metrics.items():
                        if isinstance(value, (int, float)):
                            run_metrics[f"{intent}_{metric_name}"] = value

            # Data distribution
            for intent, count in data_stats['intent_distribution'].items():
                run_metrics[f"intent_count_{intent}"] = count
            mlflow.log_metrics(run_metrics)

            # Log model
            mlflow.sklearn.log_model(
                model,
                CONFIG['model_name'],
                registered_model_name=CONFIG['model_name']
            )

            # Write artifacts into a throw-away directory so repeated runs do
            # not leave CSV files behind in the notebook's working directory.
            with tempfile.TemporaryDirectory() as artifact_dir:
                # Log confusion matrix as artifact
                class_labels = list(model.classes_)
                cm_values = np.asarray(metrics['confusion_matrix'])
                if cm_values.shape == (len(class_labels), len(class_labels)):
                    cm_df = pd.DataFrame(
                        cm_values, index=class_labels, columns=class_labels
                    )
                else:
                    # The matrix was built over a different label set (e.g. a
                    # class missing from the test split); log it unlabeled
                    # rather than abort the whole run.
                    cm_df = pd.DataFrame(cm_values)
                cm_path = os.path.join(artifact_dir, "confusion_matrix.csv")
                cm_df.to_csv(cm_path)
                mlflow.log_artifact(cm_path)

                # Log feature importance (top TF-IDF features): mean absolute
                # coefficient across the classifier's coefficient rows.
                feature_names = vectorizer.get_feature_names_out()
                feature_importance = np.abs(classifier.coef_).mean(axis=0)
                top_features = pd.DataFrame({
                    'feature': feature_names,
                    'importance': feature_importance
                }).nlargest(50, 'importance')
                features_path = os.path.join(artifact_dir, "top_features.csv")
                top_features.to_csv(features_path, index=False)
                mlflow.log_artifact(features_path)

            run_id = run.info.run_id
            print(f"Experiment logged successfully! Run ID: {run_id}")

    except Exception as e:
        # Deliberately broad: experiment tracking must never break training.
        print(f"Error logging to MLflow: {e}")
        print("Continuing without MLflow logging...")

# Log the experiment
# Skipped when there is no data or when the training cell did not define `model`.
if not (processed_data.empty or 'model' not in locals()):
    log_experiment(model=model, metrics=training_metrics, data_stats=data_stats)


In [None]:
# Model Evaluation and Visualization
def evaluate_model(model: Pipeline, df: pd.DataFrame):
    """
    Comprehensive model evaluation with visualizations.

    Predicts on every row of ``df`` and draws a 2x2 figure (confusion matrix,
    top features by mean absolute coefficient, prediction-confidence
    histogram, per-class precision / recall / F1), then prints summary
    metrics and the classification report.

    The scores are computed on whatever frame is passed in; if that frame
    includes the rows the model was trained on, they will be optimistic.

    Args:
        model: Fitted pipeline with ``tfidf`` and ``classifier`` steps.
        df: Frame with ``message_content`` and ``intent`` columns. Nothing is
            done when it is empty.
    """
    if df.empty:
        return

    X = df['message_content']
    y = df['intent']

    # Predictions
    y_pred = model.predict(X)
    y_pred_proba = model.predict_proba(X)

    # Create evaluation plots
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))

    # Confusion Matrix
    # Pin the label order to model.classes_ so the matrix always matches the
    # tick labels, even if df lacks one of the trained classes.
    class_labels = model.classes_
    cm = confusion_matrix(y, y_pred, labels=class_labels)
    sns.heatmap(cm, annot=True, fmt='d',
                xticklabels=class_labels,
                yticklabels=class_labels,
                ax=axes[0, 0])
    axes[0, 0].set_title('Confusion Matrix')
    axes[0, 0].set_xlabel('Predicted')
    axes[0, 0].set_ylabel('Actual')

    # Feature Importance (Top 20, or fewer when the vocabulary is smaller)
    feature_names = model.named_steps['tfidf'].get_feature_names_out()
    feature_importance = np.abs(model.named_steps['classifier'].coef_).mean(axis=0)
    n_top = min(20, len(feature_importance))
    top_indices = np.argsort(feature_importance)[-n_top:]
    bar_positions = range(n_top)

    axes[0, 1].barh(bar_positions, feature_importance[top_indices])
    axes[0, 1].set_yticks(bar_positions)
    axes[0, 1].set_yticklabels([feature_names[i] for i in top_indices])
    axes[0, 1].set_title(f'Top {n_top} Features')
    axes[0, 1].set_xlabel('Importance')

    # Prediction Confidence Distribution
    max_probas = np.max(y_pred_proba, axis=1)
    mean_confidence = np.mean(max_probas)
    axes[1, 0].hist(max_probas, bins=20, alpha=0.7, edgecolor='black')
    axes[1, 0].set_title('Prediction Confidence Distribution')
    axes[1, 0].set_xlabel('Max Probability')
    axes[1, 0].set_ylabel('Frequency')
    axes[1, 0].axvline(mean_confidence, color='red', linestyle='--',
                       label=f'Mean: {mean_confidence:.3f}')
    axes[1, 0].legend()

    # Per-class Performance
    report = classification_report(y, y_pred, output_dict=True)
    # Keep only the per-label rows; drop the aggregate entries.
    classes = [c for c in report.keys() if c not in ['accuracy', 'macro avg', 'weighted avg']]
    precision_scores = [report[c]['precision'] for c in classes]
    recall_scores = [report[c]['recall'] for c in classes]
    f1_scores = [report[c]['f1-score'] for c in classes]

    x = np.arange(len(classes))
    width = 0.25

    axes[1, 1].bar(x - width, precision_scores, width, label='Precision', alpha=0.8)
    axes[1, 1].bar(x, recall_scores, width, label='Recall', alpha=0.8)
    axes[1, 1].bar(x + width, f1_scores, width, label='F1-Score', alpha=0.8)

    axes[1, 1].set_xlabel('Intent Classes')
    axes[1, 1].set_ylabel('Score')
    axes[1, 1].set_title('Per-Class Performance Metrics')
    axes[1, 1].set_xticks(x)
    axes[1, 1].set_xticklabels(classes, rotation=45)
    axes[1, 1].legend()
    axes[1, 1].set_ylim(0, 1)

    plt.tight_layout()
    plt.show()

    # Print detailed metrics
    low_confidence_count = np.sum(max_probas < 0.7)
    print("\n" + "="*50)
    print("DETAILED MODEL EVALUATION")
    print("="*50)
    print(f"Overall Accuracy: {accuracy_score(y, y_pred):.4f}")
    print(f"Average Confidence: {mean_confidence:.4f}")
    print(f"Low Confidence Predictions (<0.7): {low_confidence_count} ({low_confidence_count/len(max_probas)*100:.1f}%)")

    print("\nClassification Report:")
    print(classification_report(y, y_pred))

# Evaluate the model
# Skipped when there is no data or when the training cell did not define `model`.
if not (processed_data.empty or 'model' not in locals()):
    evaluate_model(model=model, df=processed_data)


In [None]:
# Model Inference and Testing
def predict_intent(model: Pipeline, message: str) -> Dict:
    """
    Classify one message and report the model's confidence.

    Args:
        model: Fitted pipeline exposing ``predict``, ``predict_proba`` and
            ``classes_``.
        message: Raw message text.

    Returns:
        Dict with the original ``message``, the ``predicted_intent``, the
        ``confidence`` (highest class probability) and ``all_probabilities``
        mapping every class in ``model.classes_`` to its probability.
    """
    single_item_batch = [message]
    predicted_label = model.predict(single_item_batch)[0]
    class_probabilities = model.predict_proba(single_item_batch)[0]

    # Pair each class name with its probability, as plain Python floats
    probability_by_intent = {}
    for intent_name, probability in zip(model.classes_, class_probabilities):
        probability_by_intent[intent_name] = float(probability)

    top_probability = float(max(class_probabilities))

    return dict(
        message=message,
        predicted_intent=predicted_label,
        confidence=top_probability,
        all_probabilities=probability_by_intent,
    )

# Test the model with sample messages
if 'model' in locals():
    test_messages = [
        "My internet is very slow today",
        "I can't pay my bill online",
        "The TV channels are not working",
        "How do I reset my password?",
        "I want to upgrade my service",
        "What are your business hours?"
    ]

    print("\n" + "="*60)
    print("MODEL INFERENCE TESTING")
    print("="*60)

    for message in test_messages:
        result = predict_intent(model, message)
        print(f"\nMessage: '{result['message']}'")
        print(f"Predicted Intent: {result['predicted_intent']}")
        print(f"Confidence: {result['confidence']:.4f}")
        print("Top 3 Probabilities:")
        # Highest-probability intents first
        sorted_probs = sorted(result['all_probabilities'].items(),
                            key=lambda x: x[1], reverse=True)[:3]
        for intent, prob in sorted_probs:
            print(f"  {intent}: {prob:.4f}")
        print("-" * 40)


In [None]:
## Automated Retraining Pipeline

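The following section defines a retraining check and an end-to-end training function that an orchestration tool such as Airflow can call on a schedule. The orchestration setup itself (for example, an Airflow DAG) is not part of this notebook.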


In [None]:
# Automated Retraining Functions
def should_retrain(backend_url: str, threshold_days: int = 7,
                   min_new_samples: int = 10, timeout: float = 30.0) -> bool:
    """
    Determine if model should be retrained based on new data availability.

    Queries ``{backend_url}/api/feedback/export`` for the last
    ``threshold_days`` days and compares the number of returned items with
    ``min_new_samples``.

    Args:
        backend_url: Base URL of the backend (no trailing slash).
        threshold_days: Size of the look-back window in days.
        min_new_samples: Retraining is requested only when strictly more than
            this many items are returned.
        timeout: Seconds to wait for the backend before giving up, so an
            unreachable backend cannot block the caller indefinitely.

    Returns:
        True when enough new feedback is available; False otherwise,
        including on a non-200 response or any request / decoding error
        (the error is printed).
    """
    try:
        # Check for new feedback data
        end_date = datetime.now()
        start_date = end_date - timedelta(days=threshold_days)

        response = requests.get(
            f"{backend_url}/api/feedback/export",
            params={
                'format': 'json',
                'start_date': start_date.isoformat(),
                'end_date': end_date.isoformat()
            },
            timeout=timeout
        )

        if response.status_code == 200:
            new_data = response.json()
            # NOTE(review): assumes the payload is a JSON array of records;
            # for an object this would count its keys - confirm with backend.
            return len(new_data) > min_new_samples

        return False
    except Exception as e:
        print(f"Error checking for new data: {e}")
        return False

def automated_training_pipeline() -> Optional[Dict]:
    """
    Complete automated training pipeline.

    Runs, in order: the retraining check, a fetch of the last 60 days of
    feedback, preprocessing, model training, MLflow logging, and a local
    ``joblib`` dump of the fitted model to a timestamped ``.pkl`` file in the
    current working directory.

    Returns:
        ``None`` when the run is skipped (retraining not needed, or no data
        was fetched). Otherwise a dict: on success
        ``{'success': True, 'model_file', 'metrics', 'data_stats'}``; when a
        step raises, ``{'success': False, 'error': <message>}``.
    """
    print("Starting automated training pipeline...")

    # Check if retraining is needed
    if not should_retrain(CONFIG['backend_url']):
        print("No retraining needed - insufficient new data")
        return None

    try:
        # Fetch fresh data
        print("Fetching fresh training data...")
        fresh_data = fetch_training_data(CONFIG['backend_url'], days_back=60)

        if fresh_data.empty:
            print("No data available for training")
            return None

        # Preprocess data
        # Distinct local names avoid shadowing the notebook-level
        # `processed_data` / `model` variables used by other cells.
        print("Preprocessing data...")
        fresh_processed, stats = preprocess_data(fresh_data)

        # Train model
        print("Training model...")
        fresh_model, metrics = train_model(fresh_processed)

        # Log to MLflow
        print("Logging to MLflow...")
        log_experiment(fresh_model, metrics, stats)

        # Save model locally
        model_filename = f"intent_classifier_{datetime.now().strftime('%Y%m%d_%H%M%S')}.pkl"
        joblib.dump(fresh_model, model_filename)
        print(f"Model saved as {model_filename}")

        print("Automated training pipeline completed successfully!")

        return {
            'success': True,
            'model_file': model_filename,
            'metrics': metrics,
            'data_stats': stats
        }

    except Exception as e:
        print(f"Error in automated training pipeline: {e}")
        return {'success': False, 'error': str(e)}

# Example of running automated pipeline
print("\n" + "="*60)
print("AUTOMATED RETRAINING PIPELINE DEMO")
print("="*60)

# Simulate automated retraining check (this performs a real backend request)
retrain_needed = should_retrain(CONFIG['backend_url'])
print(f"Retraining needed: {retrain_needed}")

if retrain_needed:
    print("Running automated training pipeline...")
    # The actual run is intentionally left disabled in this demo cell:
    # result = automated_training_pipeline()
    print("(Pipeline would run here in production)")


In [None]:
## Summary and Next Steps

This notebook provides a complete MLOps pipeline for intent classification:

### What we've built:
1. **Data Pipeline**: Automated data fetching from backend APIs
2. **Feature Engineering**: Text preprocessing and intent labeling
3. **Model Training**: Scikit-learn pipeline with TF-IDF and Logistic Regression
4. **Experiment Tracking**: MLflow integration for reproducibility
5. **Model Evaluation**: Comprehensive metrics and visualizations
6. **Automated Retraining**: Pipeline for continuous model improvement

### Production Deployment:
- Set up MLflow server for experiment tracking
- Create Airflow DAGs for scheduled retraining
- Implement model serving endpoints
- Add model monitoring and drift detection
- Set up automated alerts for model performance degradation

### Model Improvements:
- Experiment with different algorithms (Random Forest, XGBoost, Neural Networks)
- Implement advanced NLP techniques (BERT, sentence transformers)
- Add active learning for better label quality
- Implement ensemble methods for better performance
