# In [None]:
# -------------------------------
# Step 1: Import Libraries
# -------------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Apply seaborn's "whitegrid" theme to every plot drawn below.
sns.set_theme(style="whitegrid")
# -------------------------------
# Step 2: Load Dataset
# -------------------------------
import io

try:
    from google.colab import files
except ImportError:
    # Outside Colab there is no upload widget; fall back to a local copy
    # of the CSV instead of crashing on the import.
    uploaded = {}
else:
    uploaded = files.upload()  # Upload your insurance.csv file

if uploaded:
    # Parse the bytes that were just uploaded rather than re-opening a
    # hard-coded path: the uploaded file may carry a different name (for
    # example a renamed duplicate), in which case "insurance.csv" would be
    # missing or stale. If several files were uploaded, the first is used.
    uploaded_name = next(iter(uploaded))
    df = pd.read_csv(io.BytesIO(uploaded[uploaded_name]))
else:
    # Nothing uploaded (not in Colab, or the dialog was cancelled):
    # read the CSV from the working directory.
    df = pd.read_csv("insurance.csv")
print("Dataset Loaded Successfully!\n")
display(df.head())
# -------------------------------
# Step 3: Inspect Dataset
# -------------------------------
# Dimensions and column labels
print(f"Shape of dataset: {df.shape}")
print(f"\nColumns: {df.columns}")

# Per-column dtype / non-null summary (info() prints directly)
print("\nDataset Info:")
df.info()

# Count of missing entries in each column
missing_per_column = df.isna().sum()
print("\nMissing values per column:")
print(missing_per_column)
# -------------------------------
# Step 4: Exploratory Data Analysis (EDA)
# -------------------------------
plot_size = (6, 4)  # shared figure size for all three EDA plots

# 1) Distribution of the target: 30-bin histogram with a KDE overlay
hist_fig, hist_ax = plt.subplots(figsize=plot_size)
sns.histplot(df['charges'], bins=30, kde=True, ax=hist_ax)
hist_ax.set_title("Distribution of Insurance Charges")
plt.show()

# 2) Charges against age, with points colored by the 'smoker' column
scatter_fig, scatter_ax = plt.subplots(figsize=plot_size)
sns.scatterplot(data=df, x='age', y='charges', hue='smoker', ax=scatter_ax)
scatter_ax.set_title("Age vs Charges (Colored by Smoker)")
plt.show()

# 3) Charges grouped by the 'smoker' column, as box plots
box_fig, box_ax = plt.subplots(figsize=plot_size)
sns.boxplot(data=df, x='smoker', y='charges', ax=box_ax)
box_ax.set_title("Smoker vs Charges")
plt.show()
# -------------------------------
# Step 5: Encode Categorical Features
# -------------------------------
# Convert text features to numbers.
# 'sex' and 'smoker' get integer codes from LabelEncoder (classes are
# numbered in sorted order). NOTE(review): this is only a sound numeric
# representation for two-valued columns -- presumably both are binary;
# confirm against the data.
le = LabelEncoder()
df['sex'] = le.fit_transform(df['sex'])
df['smoker'] = le.fit_transform(df['smoker'])

# 'region' is one-hot encoded instead of label-encoded: integer codes would
# impose an arbitrary alphabetical order on the regions, which a linear model
# reads as a numeric magnitude. drop_first=True drops one indicator column so
# the remaining ones are not perfectly collinear with the model's intercept;
# dtype=int keeps the indicators numeric (0/1) rather than boolean.
df = pd.get_dummies(df, columns=['region'], drop_first=True, dtype=int)

print("\nDataset after encoding:")
display(df.head())
# -------------------------------
# Step 6: Define Features & Target
# -------------------------------
target_column = 'charges'

# Feature matrix: every column of df other than the target
X = df.drop(columns=[target_column])

# Target vector: the value the model is trained to predict
y = df[target_column]
# -------------------------------
# Step 7: Split Dataset into Train & Test
# -------------------------------
# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts
# -------------------------------
# Step 8: Train Linear Regression Model
# -------------------------------
# fit() returns the fitted estimator, so construction and training chain.
model = LinearRegression().fit(X_train, y_train)
# -------------------------------
# Step 9: Make Predictions
# -------------------------------
# Predicted charges for the held-out test rows
y_pred = model.predict(X_test)
# -------------------------------
# Step 10: Evaluate Model
# -------------------------------
# Mean Absolute Error: average absolute gap between actual and predicted
mae = mean_absolute_error(y_test, y_pred)
# Root Mean Squared Error, computed as the square root of the MSE.
# The `squared=False` keyword of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6, so taking the root explicitly keeps
# this working on both old and new versions.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("Mean Absolute Error (MAE):", round(mae,2))
print("Root Mean Squared Error (RMSE):", round(rmse,2))
# -------------------------------
# Step 11: Visualize Predictions vs Actual
# -------------------------------
# One point per test row: actual charge on x, predicted charge on y
pred_fig, pred_ax = plt.subplots(figsize=(6, 4))
pred_ax.scatter(y_test, y_pred)
pred_ax.set_xlabel("Actual Charges")
pred_ax.set_ylabel("Predicted Charges")
pred_ax.set_title("Actual vs Predicted Insurance Charges")
plt.show()
# -------------------------------
# Step 12: Conclusion
# -------------------------------
# Static summary text; the empty first and last entries reproduce the
# blank lines that surround the message when it is printed.
conclusion_lines = (
    "",
    "Conclusion:",
    "- We trained a Linear Regression model to predict insurance charges.",
    "- Features like age, BMI, smoking status, and gender influence the charges.",
    "- MAE and RMSE show how close predictions are to actual charges.",
    "- Visualization confirms that the model predicts well for most customers,",
    "  though high charges may have slightly higher errors.",
    "",
)
print("\n".join(conclusion_lines))