In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
import joblib
import os
import matplotlib.pyplot as plt
import seaborn as sns

# Make sure the output folders exist before anything is written to them:
# 'models' receives the serialized estimators, 'plots' receives the figures.
for output_dir in ('models', 'plots'):
    os.makedirs(output_dir, exist_ok=True)

# Load YOUR dataset
# The CSV location can be overridden with the MEDICAL_PREMIUM_CSV environment
# variable so the script is not tied to one machine. When the variable is
# unset, the original hard-coded path is used and behaviour is unchanged.
DATA_PATH = os.environ.get(
    "MEDICAL_PREMIUM_CSV",
    r"C:\Users\Mr. Louis Obadiah\Desktop\new_project\Medicalpremium.csv",
)
print("Loading Medicalpremium.csv...")
df = pd.read_csv(DATA_PATH)
print(f"Dataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print("\nFirst 5 rows:")
print(df.head())

# Check for missing values (count of nulls per column)
print("\nMissing values:")
print(df.isnull().sum())

# Basic statistics
print("\nDataset statistics:")
print(df.describe())

# Check target distribution; the target column is read once and reused
premium = df['PremiumPrice']
print("\nTarget 'PremiumPrice' statistics:")
print(f"Min: {premium.min()}")
print(f"Max: {premium.max()}")
print(f"Mean: {premium.mean():.2f}")
print(f"Std: {premium.std():.2f}")

# Visualize premium distribution: histogram of the target column,
# written to plots/premium_distribution.png
plt.figure(figsize=(10, 6))
plt.hist(df['PremiumPrice'], bins=30, edgecolor='black', alpha=0.7)
plt.xlabel('Premium Price')
plt.ylabel('Frequency')
plt.title('Distribution of Premium Prices')
plt.grid(True, alpha=0.3)
plt.savefig('plots/premium_distribution.png')
plt.close()

# Check correlation: the pairwise correlation matrix is computed once and
# reused for both the heatmap and the per-feature ranking printed below.
plt.figure(figsize=(12, 8))
correlation_matrix = df.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.savefig('plots/correlation_matrix.png')
plt.close()

# Correlation of every column with the target, strongest positive first
print("\n📊 Correlation with PremiumPrice:")
correlation_with_target = correlation_matrix['PremiumPrice'].sort_values(ascending=False)
print(correlation_with_target)

# Prepare features and target: every column except 'PremiumPrice' is a feature
X = df.drop('PremiumPrice', axis=1)
y = df['PremiumPrice']

# Split data: 20% is held out for testing; the fixed seed keeps the split
# identical between runs
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("\n📈 Data Split:")
print(f"Training set: {X_train.shape}")
print(f"Testing set: {X_test.shape}")

# Scale features: the scaler is fitted on the training split only and then
# applied to the test split, so no test-set statistics leak into training
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Save scaler together with the feature column order it was fitted on
joblib.dump(scaler, 'models/scaler.pkl')
joblib.dump(X.columns.tolist(), 'models/feature_columns.pkl')

# Initialize results dictionary (model name -> dict of metrics)
results = {}

# ==================== MODEL 1: LINEAR REGRESSION ====================
print("\n" + "="*50)
print("MODEL 1: LINEAR REGRESSION")
print("="*50)

# Fit on the scaled training features
lr_model = LinearRegression()
lr_model.fit(X_train_scaled, y_train)

# Predictions on the held-out test split
y_pred_lr = lr_model.predict(X_test_scaled)

# Calculate metrics; MSE is computed once and reused to derive RMSE
lr_mse = mean_squared_error(y_test, y_pred_lr)
lr_metrics = {
    'R2': r2_score(y_test, y_pred_lr),
    'MAE': mean_absolute_error(y_test, y_pred_lr),
    'MSE': lr_mse,
    'RMSE': np.sqrt(lr_mse)
}

results['Linear Regression'] = lr_metrics

print(f"R² Score: {lr_metrics['R2']:.4f}")
print(f"MAE: {lr_metrics['MAE']:.2f}")
print(f"MSE: {lr_metrics['MSE']:.2f}")
print(f"RMSE: {lr_metrics['RMSE']:.2f}")

# Save model
joblib.dump(lr_model, 'models/linear_regression_model.pkl')
print("✅ Linear Regression model saved!")

# ==================== MODEL 2: RANDOM FOREST ====================
print("\n" + "="*50)
print("MODEL 2: RANDOM FOREST REGRESSOR")
print("="*50)

# 100 trees capped at depth 10; fixed seed for reproducibility,
# n_jobs=-1 lets scikit-learn use all available processors
rf_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    random_state=42,
    n_jobs=-1
)

rf_model.fit(X_train_scaled, y_train)

# Predictions on the held-out test split
y_pred_rf = rf_model.predict(X_test_scaled)

# Calculate metrics; MSE is computed once and reused to derive RMSE
rf_mse = mean_squared_error(y_test, y_pred_rf)
rf_metrics = {
    'R2': r2_score(y_test, y_pred_rf),
    'MAE': mean_absolute_error(y_test, y_pred_rf),
    'MSE': rf_mse,
    'RMSE': np.sqrt(rf_mse)
}

results['Random Forest'] = rf_metrics

print(f"R² Score: {rf_metrics['R2']:.4f}")
print(f"MAE: {rf_metrics['MAE']:.2f}")
print(f"MSE: {rf_metrics['MSE']:.2f}")
print(f"RMSE: {rf_metrics['RMSE']:.2f}")

# Feature importance, most important feature first
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)

print("\n🔍 Feature Importance (Random Forest):")
print(feature_importance)

# Plot feature importance (at most the ten highest-ranked features)
top_features = feature_importance.head(10)
plt.figure(figsize=(10, 6))
plt.barh(top_features['feature'], top_features['importance'])
plt.xlabel('Importance')
plt.title('Top 10 Feature Importance - Random Forest')
# barh draws the first row at the bottom; invert so the top feature is on top
plt.gca().invert_yaxis()
plt.tight_layout()
plt.savefig('plots/feature_importance.png')
plt.close()

# Save model
joblib.dump(rf_model, 'models/random_forest_model.pkl')
print("✅ Random Forest model saved!")

# ==================== MODEL 3: GRADIENT BOOSTING ====================
print("\n" + "="*50)
print("MODEL 3: GRADIENT BOOSTING REGRESSOR")
print("="*50)

# Imported next to its only use in this script
from sklearn.ensemble import GradientBoostingRegressor

# 100 boosting stages of depth-5 trees; fixed seed for reproducibility
gb_model = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42
)

gb_model.fit(X_train_scaled, y_train)

# Predictions on the held-out test split
y_pred_gb = gb_model.predict(X_test_scaled)

# Calculate metrics; MSE is computed once and reused to derive RMSE
gb_mse = mean_squared_error(y_test, y_pred_gb)
gb_metrics = {
    'R2': r2_score(y_test, y_pred_gb),
    'MAE': mean_absolute_error(y_test, y_pred_gb),
    'MSE': gb_mse,
    'RMSE': np.sqrt(gb_mse)
}

results['Gradient Boosting'] = gb_metrics

print(f"R² Score: {gb_metrics['R2']:.4f}")
print(f"MAE: {gb_metrics['MAE']:.2f}")
print(f"MSE: {gb_metrics['MSE']:.2f}")
print(f"RMSE: {gb_metrics['RMSE']:.2f}")

# Save model
joblib.dump(gb_model, 'models/gradient_boosting_model.pkl')
print("✅ Gradient Boosting model saved!")

# ==================== MODEL 4: ARTIFICIAL NEURAL NETWORK (ANN) ====================

# Imported next to their only use in this script
from tensorflow.keras.layers import Input
from tensorflow.keras.utils import set_random_seed

# ANN model section
print("\n" + "="*50)
print("MODEL 4: ARTIFICIAL NEURAL NETWORK")
print("="*50)

# Seed the Python, NumPy and TensorFlow random generators so weight
# initialisation, dropout and batch shuffling are repeatable, in line with
# random_state=42 used for the other models.
set_random_seed(42)

# Fully connected regressor: 64 -> 32 -> 16 -> 1 with dropout after the first
# two hidden layers. The input shape is declared with an explicit Input layer
# rather than `input_shape=` on the first Dense layer, which Keras warns about.
ann_model = Sequential([
    Input(shape=(X_train_scaled.shape[1],)),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(16, activation='relu'),
    Dense(1)
])

ann_model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

# Train; 20% of the training split is set aside by Keras for validation
# NOTE(review): the target is passed unscaled, so the MSE loss operates on raw
# premium values. If this model scores poorly, scaling y may help - TODO evaluate.
history = ann_model.fit(
    X_train_scaled, y_train,
    epochs=100,
    batch_size=32,
    validation_split=0.2,
    verbose=0
)

# Predict and evaluate; flatten turns the (n, 1) output into a 1-D array
y_pred_ann = ann_model.predict(X_test_scaled).flatten()

# MSE is computed once and reused to derive RMSE
ann_mse = mean_squared_error(y_test, y_pred_ann)
ann_metrics = {
    'R2': r2_score(y_test, y_pred_ann),
    'MAE': mean_absolute_error(y_test, y_pred_ann),
    'MSE': ann_mse,
    'RMSE': np.sqrt(ann_mse)
}

results['Neural Network'] = ann_metrics

# Report metrics in the same format as the other models
print(f"R² Score: {ann_metrics['R2']:.4f}")
print(f"MAE: {ann_metrics['MAE']:.2f}")
print(f"MSE: {ann_metrics['MSE']:.2f}")
print(f"RMSE: {ann_metrics['RMSE']:.2f}")

# Save model (file name kept as-is so existing consumers still find it)
ann_model.save('models/ann_model.h5')
print("✅ Neural Network model saved!")

# ==================== MODEL COMPARISON ====================
print("\n" + "="*60)
print("MODEL COMPARISON SUMMARY")
print("="*60)

# One row per model, one column per metric
comparison_df = pd.DataFrame(results).T
print("\n📊 Performance Comparison:")
print(comparison_df.round(4))

# Visual comparison
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# One colour per model; the palette is cycled if more models are ever added,
# so the colour list always matches the number of bars.
palette = ['blue', 'green', 'orange', 'purple']
bar_colors = [palette[i % len(palette)] for i in range(len(comparison_df))]

# (axes, metric column, display label, vertical label offset, number format)
bar_panels = [
    (axes[0, 0], 'R2', 'R² Score', 0.02, '.3f'),
    (axes[0, 1], 'RMSE', 'RMSE', 50, '.0f'),
    (axes[1, 0], 'MAE', 'MAE', 50, '.0f'),
]

for ax, metric, label, offset, fmt in bar_panels:
    ax.bar(comparison_df.index, comparison_df[metric], color=bar_colors)
    ax.set_title(f'{label} Comparison')
    ax.set_ylabel(label)
    # Slight rotation keeps the model names from overlapping
    ax.tick_params(axis='x', labelrotation=15)
    # Write each bar's value just above it
    for i, v in enumerate(comparison_df[metric]):
        ax.text(i, v + offset, f'{v:{fmt}}', ha='center')

# R² normally lies in [0, 1] but can be negative for a poorly fitting model;
# extend the lower limit in that case so the bar is not clipped.
axes[0, 0].set_ylim(min(0, comparison_df['R2'].min() - 0.1), 1)

# Actual vs Predicted for the best model (highest R² on the test split)
predictions_by_model = {
    'Linear Regression': y_pred_lr,
    'Random Forest': y_pred_rf,
    'Gradient Boosting': y_pred_gb,
    'Neural Network': y_pred_ann,
}
best_model_name = comparison_df['R2'].idxmax()
axes[1, 1].scatter(y_test, predictions_by_model[best_model_name], alpha=0.5)
# Dashed diagonal marks where predicted == actual
axes[1, 1].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
axes[1, 1].set_xlabel('Actual Premium')
axes[1, 1].set_ylabel('Predicted Premium')
axes[1, 1].set_title(f'Actual vs Predicted ({best_model_name})')

plt.tight_layout()
plt.savefig('plots/model_comparison.png')
plt.close()

# ==================== SAMPLE PREDICTIONS ====================
print("\n" + "="*60)
print("SAMPLE PREDICTIONS")
print("="*60)

# Create some test samples; values are listed in the same order as X.columns
sample_data = [
    # Young healthy person
    [25, 0, 0, 0, 0, 175, 70, 0, 0, 0],
    # Middle-aged with health issues
    [45, 1, 1, 0, 1, 170, 85, 1, 0, 2],
    # Older person with transplants
    [60, 0, 1, 1, 0, 165, 75, 0, 1, 1]
]

sample_df = pd.DataFrame(sample_data, columns=X.columns)

# Scale the samples with the scaler fitted on the training data
sample_scaled = scaler.transform(sample_df)

# Make predictions with all models: one call per model covers every sample.
# verbose=0 keeps the Keras progress bar out of the report.
preds_lr = lr_model.predict(sample_scaled)
preds_rf = rf_model.predict(sample_scaled)
preds_gb = gb_model.predict(sample_scaled)
preds_ann = ann_model.predict(sample_scaled, verbose=0).flatten()

for i, sample in enumerate(sample_data):
    print(f"\nSample {i+1}:")
    print(f"  Age: {sample[0]}, Diabetes: {sample[1]}, BP Problems: {sample[2]}")
    print(f"  Transplant: {sample[3]}, Chronic Disease: {sample[4]}")
    print(f"  Height: {sample[5]}cm, Weight: {sample[6]}kg")
    print(f"  Allergies: {sample[7]}, Cancer History: {sample[8]}, Surgeries: {sample[9]}")

    pred_lr = preds_lr[i]
    pred_rf = preds_rf[i]
    pred_gb = preds_gb[i]
    pred_ann = preds_ann[i]

    print(f"  LR Prediction: ₹{pred_lr:,.2f}")
    print(f"  RF Prediction: ₹{pred_rf:,.2f}")
    print(f"  GB Prediction: ₹{pred_gb:,.2f}")
    print(f"  ANN Prediction: ₹{pred_ann:,.2f}")

# Final report: list every artifact this script writes to the 'models' folder,
# including the neural-network file.
created_files = [
    "models/linear_regression_model.pkl",
    "models/random_forest_model.pkl",
    "models/gradient_boosting_model.pkl",
    "models/ann_model.h5",
    "models/scaler.pkl",
    "models/feature_columns.pkl",
]

print("\n" + "="*60)
print("✅ TRAINING COMPLETED SUCCESSFULLY!")
print("="*60)
print("\nCreated files:")
for number, path in enumerate(created_files, start=1):
    print(f"{number}. {path}")
print("\nVisualizations saved in 'plots/' folder")
print("\nNow you can run: python predict.py")




Loading Medicalpremium.csv...
Dataset shape: (986, 11)
Columns: ['Age', 'Diabetes', 'BloodPressureProblems', 'AnyTransplants', 'AnyChronicDiseases', 'Height', 'Weight', 'KnownAllergies', 'HistoryOfCancerInFamily', 'NumberOfMajorSurgeries', 'PremiumPrice']

First 5 rows:
   Age  Diabetes  BloodPressureProblems  AnyTransplants  AnyChronicDiseases  \
0   45         0                      0               0                   0   
1   60         1                      0               0                   0   
2   36         1                      1               0                   0   
3   52         1                      1               0                   1   
4   38         0                      0               0                   1   

   Height  Weight  KnownAllergies  HistoryOfCancerInFamily  \
0     155      57               0                        0   
1     180      73               0                        0   
2     158      59               0                        0   
3     

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m7/7[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m1s[0m 58ms/step


