In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# For data preprocessing and modelling
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [None]:
# Regression Algorithms
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# Performance Metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')
print("Libraries loaded successfully!")

Libraries loaded successfully!


In [None]:
# Load the dataset
# The CSV is read from the local filesystem (not downloaded). The original
# Colab upload location is tried first so existing setups behave as before;
# the current working directory is the fallback for runs outside Colab.
dataset_candidates = ("/content/insurance.csv", "insurance.csv")

df = None
for dataset_url in dataset_candidates:
    try:
        df = pd.read_csv(dataset_url)
    except FileNotFoundError:
        continue  # not at this location; try the next candidate
    break  # dataset_url now holds the path that was actually loaded

if df is None:
    # Fail with an actionable message instead of a bare path error
    raise FileNotFoundError(
        "insurance.csv was not found. Looked in: "
        + ", ".join(dataset_candidates)
        + ". Upload the file or add its location to dataset_candidates."
    )

print("Dataset loaded successfully!")
print("\nFirst 5 rows of the dataset:")
print(df.head())

FileNotFoundError: [Errno 2] No such file or directory: '/content/insurance.csv'

In [None]:
# Quick structural overview: dtypes / non-null counts, descriptive
# statistics of the numeric columns, and the number of missing cells per column.
print("\nDataset Info:")
df.info()

summary_stats = df.describe()
print("\nStatistical Summary of Numerical Columns:")
print(summary_stats)

missing_counts = df.isnull().sum()
print("\nMissing values per column:")
print(missing_counts)

In [None]:
# Pairwise correlations are only defined for numeric columns, so restrict
# the frame before computing the correlation matrix.
numerical_df = df.select_dtypes(include=np.number)
corr_matrix = numerical_df.corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Heatmap of Numerical Features')
plt.show()

In [None]:
# Histogram (30 bins) of the target with a KDE overlay
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['charges'], kde=True, bins=30, ax=ax)
ax.set_title('Distribution of Medical Charges')
ax.set_xlabel('Charges')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Box plot of the target split by the categorical smoker flag
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=df, x='smoker', y='charges', ax=ax)
ax.set_title('Charges by Smoker Status')
plt.show()

In [None]:
# Scatter of charges against BMI; points are coloured by smoker status
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='bmi', y='charges', hue='smoker', alpha=0.7, ax=ax)
ax.set_title('Charges vs. BMI (colored by Smoker)')
ax.set_xlabel('BMI')
ax.set_ylabel('Charges')
plt.show()

In [None]:
# Scatter of charges against age; points are coloured by smoker status
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='age', y='charges', hue='smoker', alpha=0.7, ax=ax)
ax.set_title('Charges vs. Age (colored by Smoker)')
ax.set_xlabel('Age')
ax.set_ylabel('Charges')
plt.show()

In [None]:
# The target is the 'charges' column; every other column is a predictor
target_column = 'charges'
y = df[target_column]
X = df.drop(columns=[target_column])

In [None]:
# Partition the predictor columns by dtype: object-typed columns are treated
# as categorical, numeric-typed columns as numerical.
categorical_features = X.select_dtypes(include=['object']).columns
numerical_features = X.select_dtypes(include=[np.number]).columns

print(f"Numerical features: {list(numerical_features)}")
print(f"Categorical features: {list(categorical_features)}")

In [None]:
# Preprocessing is applied per column group:
#   - 'num': numerical columns are standardised with StandardScaler
#   - 'cat': categorical columns are one-hot encoded; categories not seen
#            during fit are ignored at transform time instead of raising
numeric_step = ('num', StandardScaler(), numerical_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print("\nData split into training and testing sets.")
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

In [None]:
# Candidate regressors, keyed by the display name used in reports and plots.
# Insertion order is the order in which they are trained.
model_catalogue = [
    ('Linear Regression', LinearRegression()),
    ('Random Forest Regressor', RandomForestRegressor(random_state=42)),
    ('Gradient Boosting Regressor', GradientBoostingRegressor(random_state=42)),
    ('Support Vector Regressor', SVR()),
    ('KNeighbors Regressor', KNeighborsRegressor()),
]
models = dict(model_catalogue)

# Filled by the training loop: display name -> dict of metric values
results = dict()

In [None]:
# Banner printed before the model training/evaluation loop starts
print("Training and evaluating models...\n")

In [None]:
for name, model in models.items():
    print(f"--- Training {name} ---")

    # Chain preprocessing and the regressor so both are fitted on the
    # training split only, then score the fitted chain on the held-out split.
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', model),
    ])
    y_pred = pipeline.fit(X_train, y_train).predict(X_test)

    # Error metrics on the test split; RMSE is derived from MSE
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    metric_names = ('MAE', 'MSE', 'RMSE', 'R2 Score')
    results[name] = dict(zip(metric_names, (mae, mse, rmse, r2)))

In [None]:
# Report the test-set metrics of every evaluated model.
# This cell runs after the training loop has finished, so reading the loop
# variables (mae, mse, ...) would only show the last model trained; the
# per-model values are read back from `results` instead.
for model_name, model_metrics in results.items():
    print(f"--- {model_name} ---")
    print(f" MAE: {model_metrics['MAE']:.2f}")
    print(f" MSE: {model_metrics['MSE']:.2f}")
    print(f" RMSE: {model_metrics['RMSE']:.2f}")
    print(f" R-squared (R2): {model_metrics['R2 Score']:.2f}")
    print("-" * 30 + "\n")
print("All models trained and evaluated.")

In [None]:
# Collect the per-model metric dicts into one table: transposing puts models
# on the rows and metrics on the columns; rows are ranked best-R2-first.
results_df = (
    pd.DataFrame(results)
    .T
    .sort_values(by='R2 Score', ascending=False)
)

print("\n--- Model Performance Comparison ---")
print(results_df)

In [None]:
# Visualize R2 Score
plt.figure(figsize=(12, 6))
sns.barplot(x=results_df.index, y='R2 Score', data=results_df, palette='viridis')
plt.title('R-squared Score Comparison Across Models')
plt.xlabel('Model')
plt.ylabel('R-squared Score')
# R2 is capped at 1 but has no lower bound: a model that does worse than
# predicting the mean scores below 0. Extend the axis downwards (with a 10%
# margin) when that happens so such bars are not clipped out of view.
r2_lower_limit = min(0, results_df['R2 Score'].min() * 1.1)
plt.ylim(r2_lower_limit, 1)
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Bar chart of RMSE per model, in the row order of results_df
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=results_df.index, y='RMSE', data=results_df, palette='magma', ax=ax)
ax.set_title('RMSE Comparison Across Models')
ax.set_xlabel('Model')
ax.set_ylabel('RMSE')
plt.xticks(rotation=45, ha='right')  # slanted labels so long model names fit
fig.tight_layout()
plt.show()

In [None]:
# results_df is sorted by R2 in descending order, so its first row is the
# top-ranked model; look the estimator up by that row's label.
top_ranked_row = results_df.iloc[0]
best_model_name = top_ranked_row.name
best_model = models[best_model_name]
print(f"\n--- Selected Best Model: {best_model_name} ---")

In [None]:
# Retrain the best model on the entire dataset (X, y) for production use.
# Pipeline.fit fits the step objects it is given in place, and both
# `preprocessor` and `best_model` are the same instances used by the
# evaluation pipelines. Cloning gives the final pipeline fresh, unfitted
# copies with identical hyper-parameters, so the full-data fit cannot
# silently change the state of the objects used for evaluation.
final_pipeline = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('regressor', clone(best_model)),
])
final_pipeline.fit(X, y)  # Fit on the full dataset
print(f"{best_model_name} retrained on the full dataset.")

In [None]:
# Example: score one hypothetical customer.
# The pipeline expects a DataFrame with the same feature column names it was
# fitted on, so the single record is wrapped in a one-row frame.
new_customer_data = pd.DataFrame({
    'age': [30],
    'sex': ['male'],
    'bmi': [28.5],
    'children': [1],
    'region': ['northwest'],
    'smoker': ['no'],
})
predicted_charges = final_pipeline.predict(new_customer_data)
print(f"\nPredicted charges for the new customer: ${predicted_charges[0]:.2f}")

In [None]:
# Example 2: a smoker with a higher BMI, again passed as a one-row frame
new_customer_data_smoker = pd.DataFrame({
    'age': [45],
    'sex': ['female'],
    'bmi': [35.0],
    'children': [2],
    'region': ['southeast'],
    'smoker': ['yes'],
})
predicted_charges_smoker = final_pipeline.predict(new_customer_data_smoker)
print(f"Predicted charges for a new smoker customer: ${predicted_charges_smoker[0]:.2f}")