# Linear Regression Modeling Template

Focused template for linear regression analysis using the Data Analysis and Prediction Platform (DAPP).

## Objectives
- Build and evaluate linear regression models
- Understand feature relationships
- Validate model assumptions
- Export model for production use

## Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
import scipy.stats as stats
import requests
import joblib
from pathlib import Path

# Plot appearance: select the matplotlib style, then the seaborn colour palette
plt.style.use('default')
sns.set_palette("viridis")

# Platform API configuration: base URL referenced by the commented-out API
# examples in the data-loading and model-registration cells
API_BASE_URL = "http://localhost:8000"
print("🚀 Linear Regression Template Ready")

## Data Loading

In [None]:
# Load your dataset here
# Replace with your actual data loading code

# Example: Load via API
# file_id = "your-file-id"
# response = requests.get(f"{API_BASE_URL}/data/{file_id}")
# df = pd.read_csv(response.json()['file_path'])

# Example: Generate sample data
np.random.seed(42)
n = 200
X_sample = np.random.randn(n, 3)
y_sample = 2 * X_sample[:, 0] + 3 * X_sample[:, 1] - 1.5 * X_sample[:, 2] + np.random.randn(n) * 0.5

df = pd.DataFrame({
    'feature_1': X_sample[:, 0],
    'feature_2': X_sample[:, 1], 
    'feature_3': X_sample[:, 2],
    'target': y_sample
})

print(f"Dataset loaded: {df.shape}")
df.head()

## Feature Selection and Preparation

In [None]:
# Pick the target column; every other column of df is treated as a feature
target_col = 'target'  # Adjust as needed
feature_cols = [name for name in df.columns if name != target_col]

# Feature matrix and target vector
X, y = df[feature_cols], df[target_col]

print(
    f"Features: {feature_cols}",
    f"Target: {target_col}",
    f"X shape: {X.shape}, y shape: {y.shape}",
    sep="\n",
)

In [None]:
# Check for multicollinearity via pairwise feature correlations
correlation_matrix = X.corr()

# Mask the diagonal and upper triangle so each feature pair is drawn once
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, mask=mask, annot=True, cmap='coolwarm', center=0)
plt.title('Feature Correlation Matrix')
plt.show()

# Highlight high correlations (|r| > 0.8). Only positions above the diagonal
# are kept, which skips self-correlations and mirrored duplicates.
high_corr = np.where(np.abs(correlation_matrix) > 0.8)
high_corr_pairs = [
    (
        correlation_matrix.index[row_pos],
        correlation_matrix.columns[col_pos],
        correlation_matrix.iloc[row_pos, col_pos],
    )
    for row_pos, col_pos in zip(*high_corr)
    if row_pos < col_pos
]

if not high_corr_pairs:
    print("✅ No concerning multicollinearity detected")
else:
    print("⚠️ High correlations detected:")
    for feat1, feat2, corr in high_corr_pairs:
        print(f"  {feat1} - {feat2}: {corr:.3f}")

## Data Splitting and Scaling

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(
    f"Training set: {X_train.shape}",
    f"Test set: {X_test.shape}",
    sep="\n",
)

# Optional: Scale features (useful for regularized models).
# The scaler is fitted on the training split only and then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("✅ Data scaled and ready for modeling")

## Model Training and Comparison

In [None]:
# Candidate estimators, keyed by display name
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=1.0),
    'Lasso Regression': Lasso(alpha=0.1),
}

# Per-model fitted estimator, metrics and test predictions
model_results = {}

for name, model in models.items():
    # Regularized models train on the standardized features; plain linear
    # regression uses the original ones
    is_regularized = 'Ridge' in name or 'Lasso' in name
    train_inputs = X_train_scaled if is_regularized else X_train
    test_inputs = X_test_scaled if is_regularized else X_test

    model.fit(train_inputs, y_train)
    y_pred_train = model.predict(train_inputs)
    y_pred_test = model.predict(test_inputs)

    # Calculate metrics
    train_r2 = r2_score(y_train, y_pred_train)
    test_r2 = r2_score(y_test, y_pred_test)
    train_mse = mean_squared_error(y_train, y_pred_train)
    test_mse = mean_squared_error(y_test, y_pred_test)
    test_mae = mean_absolute_error(y_test, y_pred_test)

    model_results[name] = dict(
        model=model,
        train_r2=train_r2,
        test_r2=test_r2,
        train_mse=train_mse,
        test_mse=test_mse,
        test_mae=test_mae,
        y_pred_test=y_pred_test,
    )

    print(
        f"\n{name}:",
        f"  Train R²: {train_r2:.4f}",
        f"  Test R²: {test_r2:.4f}",
        f"  Test MSE: {test_mse:.4f}",
        f"  Test MAE: {test_mae:.4f}",
        sep="\n",
    )

print("\n✅ All models trained")

In [None]:
# Model comparison table: one row per trained model.
# Maps each table column to the metric key stored in model_results.
metric_columns = {
    'Train_R2': 'train_r2',
    'Test_R2': 'test_r2',
    'Test_MSE': 'test_mse',
    'Test_MAE': 'test_mae',
}

comparison_df = pd.DataFrame({
    'Model': list(model_results),
    **{
        column: [entry[metric_key] for entry in model_results.values()]
        for column, metric_key in metric_columns.items()
    },
})

# Gap between train and test R²; a large positive gap means the model fits
# the training data noticeably better than the test data
comparison_df['Overfitting'] = comparison_df['Train_R2'] - comparison_df['Test_R2']

print("📊 Model Comparison:")
comparison_df.round(4)

## Model Evaluation and Diagnostics

In [None]:
# Select best model based on test R².
# NOTE: the winner is picked using the test split itself, so the test score
# reported for it is not an independent estimate.
best_row_label = comparison_df['Test_R2'].idxmax()
best_model_name = comparison_df.loc[best_row_label, 'Model']

best_entry = model_results[best_model_name]
best_model = best_entry['model']
best_predictions = best_entry['y_pred_test']

print(f"🏆 Best Model: {best_model_name}")
print(f"Test R²: {best_entry['test_r2']:.4f}")

In [None]:
# Residual analysis for the selected model on the test split
residuals = y_test - best_predictions

fig, axes = plt.subplots(2, 2, figsize=(15, 12))
(ax_fit, ax_spread), (ax_hist, ax_qq) = axes

# 1. Predicted vs Actual (the dashed red diagonal marks perfect predictions)
actual_span = [y_test.min(), y_test.max()]
ax_fit.scatter(y_test, best_predictions, alpha=0.6)
ax_fit.plot(actual_span, actual_span, 'r--', lw=2)
ax_fit.set(xlabel='Actual Values', ylabel='Predicted Values', title='Predicted vs Actual')

# 2. Residuals vs Predicted (dashed red line at zero residual)
ax_spread.scatter(best_predictions, residuals, alpha=0.6)
ax_spread.axhline(y=0, color='r', linestyle='--')
ax_spread.set(xlabel='Predicted Values', ylabel='Residuals', title='Residuals vs Predicted')

# 3. Histogram of residuals
ax_hist.hist(residuals, bins=20, edgecolor='black', alpha=0.7)
ax_hist.set(xlabel='Residuals', ylabel='Frequency', title='Distribution of Residuals')

# 4. Q-Q plot for normality
stats.probplot(residuals, dist="norm", plot=ax_qq)
ax_qq.set_title('Q-Q Plot (Normality Check)')

# Same light grid on all four panels
for panel in axes.flat:
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Statistical tests: Shapiro test on the residuals, judged at the 0.05 level
_, p_value_normality = stats.shapiro(residuals)
print("\n📊 Residual Analysis:")
print(f"Mean residual: {residuals.mean():.6f}")
print(f"Std residual: {residuals.std():.4f}")
print(f"Normality test p-value: {p_value_normality:.4f}")

normality_verdict = (
    "✅ Residuals appear normally distributed"
    if p_value_normality > 0.05
    else "⚠️ Residuals may not be normally distributed"
)
print(normality_verdict)

In [None]:
# Feature coefficients of the selected model.
# LinearRegression, Ridge and Lasso all expose `coef_` / `intercept_`, so the
# report is produced for whichever of them was selected rather than being
# skipped silently when a regularized model wins.
if hasattr(best_model, 'coef_'):
    # In this notebook Ridge/Lasso are trained on standardized features, so
    # their coefficients are per standard deviation, not per original unit.
    on_scaled_features = 'Ridge' in best_model_name or 'Lasso' in best_model_name

    # One row per feature, strongest effect (by absolute coefficient) first
    feature_importance = pd.DataFrame({
        'Feature': feature_cols,
        'Coefficient': best_model.coef_,
        'Abs_Coefficient': np.abs(best_model.coef_)
    }).sort_values('Abs_Coefficient', ascending=False)

    plt.figure(figsize=(10, 6))
    colors = ['red' if coef < 0 else 'blue' for coef in feature_importance['Coefficient']]
    plt.barh(feature_importance['Feature'], feature_importance['Coefficient'], color=colors, alpha=0.7)
    plt.xlabel('Coefficient Value')
    plt.title('Feature Coefficients (Red=Negative, Blue=Positive)')
    plt.axvline(x=0, color='black', linestyle='-', alpha=0.3)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

    print("📊 Feature Coefficients:")
    if on_scaled_features:
        print("(coefficients refer to standardized features)")
    print(feature_importance)
    print(f"\nIntercept: {best_model.intercept_:.4f}")

## Cross-Validation

In [None]:
# Perform cross-validation (5-fold, R² scoring) on the training split
from sklearn.pipeline import make_pipeline

cv_scores = {}

for name, model in models.items():
    if 'Ridge' in name or 'Lasso' in name:
        # Scale inside a pipeline so the scaler is re-fitted on each fold's
        # training part only. Cross-validating on data that was scaled with
        # statistics from the whole training set would leak information from
        # the validation folds into training.
        estimator = make_pipeline(StandardScaler(), model)
    else:
        estimator = model

    # cross_val_score works on clones, so the already-fitted models are untouched
    scores = cross_val_score(estimator, X_train, y_train, cv=5, scoring='r2')

    cv_scores[name] = scores
    print(f"{name}:")
    print(f"  CV R² scores: {scores}")
    print(f"  Mean CV R²: {scores.mean():.4f} (+/- {scores.std() * 2:.4f})")
    print()

# Plot CV scores
plt.figure(figsize=(10, 6))
model_names = list(cv_scores)
plt.boxplot([cv_scores[label] for label in model_names])
plt.ylabel('R² Score')
plt.title('Cross-Validation Performance Comparison')
# Boxes sit at positions 1..N by default. Labelling them through xticks avoids
# boxplot's `labels` keyword, which newer Matplotlib renamed to `tick_labels`.
plt.xticks(range(1, len(model_names) + 1), model_names, rotation=45)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## Model Export and API Integration

In [None]:
# Save the best model
# json and uuid are imported here because this cell is their only user and the
# setup cell does not import them.
import json
import uuid
from datetime import datetime

# Take one timestamp so the filename and the metadata agree
export_time = datetime.now()
timestamp = export_time.strftime("%Y%m%d_%H%M%S")
model_filename = f"linear_regression_model_{timestamp}.joblib"
model_path = Path("../data/processed/models") / model_filename
model_path.parent.mkdir(parents=True, exist_ok=True)

# Save model and scaler.
# The scaler is stored only for the models that were trained on scaled
# features (Ridge/Lasso); None tells the prediction code to skip scaling.
model_package = {
    'model': best_model,
    'scaler': scaler if 'Ridge' in best_model_name or 'Lasso' in best_model_name else None,
    'feature_columns': feature_cols,
    'model_name': best_model_name,
    'metrics': model_results[best_model_name]
}

joblib.dump(model_package, model_path)
print(f"✅ Model saved to: {model_path}")

# Create model metadata (plain JSON-serializable values only)
model_metadata = {
    'model_id': str(uuid.uuid4()),
    'model_type': 'linear_regression',
    'model_name': best_model_name,
    'file_path': str(model_path),
    'created_at': export_time.isoformat(),
    'features': feature_cols,
    'target': target_col,
    'performance': {
        'test_r2': float(model_results[best_model_name]['test_r2']),
        'test_mse': float(model_results[best_model_name]['test_mse']),
        'test_mae': float(model_results[best_model_name]['test_mae'])
    },
    'training_samples': len(X_train),
    'test_samples': len(X_test)
}

print("📊 Model Metadata:")
print(json.dumps(model_metadata, indent=2))

In [None]:
# Example: Register model with platform API (uncomment when API is available)
# NOTE(review): the /models/register endpoint and the "200 means success"
# check below are template assumptions — confirm against the platform API
# before enabling.
# try:
#     response = requests.post(
#         f"{API_BASE_URL}/models/register",
#         json=model_metadata,
#         headers={"Content-Type": "application/json"}
#     )
#     if response.status_code == 200:
#         print(f"✅ Model registered with API: {response.json()}")
#     else:
#         print(f"❌ Failed to register model: {response.status_code}")
# except Exception as e:
#     print(f"⚠️ Could not connect to API: {e}")

# Printed unconditionally; it does not depend on the registration example above
print("📝 Model ready for production deployment")

## Prediction Function

In [None]:
def make_prediction(new_data, model_package):
    """Make predictions using the trained model.

    Args:
        new_data: Feature values to score. Either a DataFrame (one row per
            sample) or a single sample given as a dict or pandas Series keyed
            by feature name. Extra keys/columns are ignored.
        model_package: Dict holding the fitted ``'model'``, the ordered
            ``'feature_columns'`` list and, optionally, a fitted ``'scaler'``.
            A missing or ``None`` scaler means the features are used as-is.

    Returns:
        The result of ``model.predict`` on the prepared rows.

    Raises:
        KeyError: If ``new_data`` lacks any of the required feature columns.
    """
    model = model_package['model']
    scaler = model_package.get('scaler')
    feature_cols = list(model_package['feature_columns'])

    # A single sample (dict or Series) becomes a one-row DataFrame
    if isinstance(new_data, (dict, pd.Series)):
        new_data = pd.DataFrame([new_data])

    missing = [col for col in feature_cols if col not in new_data.columns]
    if missing:
        raise KeyError(f"Missing feature column(s): {missing}")

    # Ensure correct feature order (this also drops any extra columns)
    new_data = new_data[feature_cols]

    # Scale if necessary
    if scaler is not None:
        new_data = scaler.transform(new_data)

    return model.predict(new_data)

# Example prediction: score the first test row and compare it with its known target
first_test_row = X_test.iloc[0]
sample_input = {col: first_test_row[col] for col in feature_cols}
prediction = make_prediction(sample_input, model_package)

predicted_value = prediction[0]
actual_value = y_test.iloc[0]

print("📊 Sample Prediction:")
print(f"Input: {sample_input}")
print(f"Predicted: {predicted_value:.4f}")
print(f"Actual: {actual_value:.4f}")
print(f"Error: {abs(predicted_value - actual_value):.4f}")

## Summary

### Model Performance
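*Note: markdown cells do not evaluate the `{...}` expressions in this summary — replace each one with the value printed by the cells above.*

- **Best Model**: {best_model_name}
- **Test R²**: {model_results[best_model_name]['test_r2']:.4f}
- **Test MSE**: {model_results[best_model_name]['test_mse']:.4f}
- **Test MAE**: {model_results[best_model_name]['test_mae']:.4f}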

### Key Findings
- Dataset size: {len(df)} samples, {len(feature_cols)} features
- Model explains {model_results[best_model_name]['test_r2']*100:.1f}% of variance in test data
- Residual analysis: [Add your interpretation]

### Next Steps
1. Deploy model using platform API
2. Monitor model performance in production
3. Consider feature engineering for improvement
4. Collect new data for model retraining

---
*Linear Regression Analysis completed using DAPP*