In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.regression.quantile_regression import QuantReg
import matplotlib.pyplot as plt

In [None]:
# Load the insurance records; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv(filepath_or_buffer='healthinsurance.csv')

In [None]:
# Preprocessing the data
# - Convert categorical variables to dummy variables for regression
# - NOTE(review): no missing-value handling is done here. statsmodels performs no
#   NaN checking by default (missing='none'), so this relies on the CSV being
#   complete — confirm against the data.

# Convert categorical columns ('smoker', 'city', 'job_title') to dummy variables.
# dtype=float is required: pandas >= 2.0 emits boolean dummies by default, and a
# frame mixing bool with numeric columns is cast to an object array, which
# statsmodels rejects ("Pandas data cast to numpy dtype of object").
df = pd.get_dummies(
    df,
    columns=['smoker', 'city', 'job_title'],
    drop_first=True,
    dtype=float,
)

# Split into features (X) and target (y).
# The feature list is explicit rather than "everything except the target".
# NOTE(review): the dummy names below presume 'smoker' has a 'yes' level and that
# 'city' / 'job_title' carry the codes 2..3 / 2..4 after drop_first — verify
# against the CSV, otherwise the column selection raises a KeyError.
features = ['age', 'sex', 'weight', 'bmi', 'hereditary_diseases', 'no_of_dependents', 'bloodpressure', 'diabetes', 'regular_ex',
            'smoker_yes', 'city_2', 'city_3', 'job_title_2', 'job_title_3', 'job_title_4']
target = 'claim'

X = df[features]  # Features
X = sm.add_constant(X)  # Adding a constant term (intercept) to the model
y = df[target]  # Target variable (claim)

In [None]:
# Quantile regressions of y on X at two levels: q = 0.50 and q = 0.90.

# Median fit (q = 0.50)
model_50th = QuantReg(endog=y, exog=X)
result_50th = model_50th.fit(q=0.5)
print("50th Quantile Regression Summary:", result_50th.summary(), sep="\n")

# Upper-tail fit (q = 0.90)
model_90th = QuantReg(endog=y, exog=X)
result_90th = model_90th.fit(q=0.9)
print("90th Quantile Regression Summary:", result_90th.summary(), sep="\n")

In [None]:
# In-sample predictions: both fitted models are evaluated on the same X.
y_pred_50th, y_pred_90th = (
    fitted_result.predict(X) for fitted_result in (result_50th, result_90th)
)

In [None]:
# A row counts as an outlier when its actual claim is above the predicted quantile.
above_50th = y > y_pred_50th
above_90th = y > y_pred_90th

# Take the subsets first, i.e. before the flag columns below are added to df.
outliers_50th = df.loc[above_50th]
outliers_90th = df.loc[above_90th]

# Store the same comparison on df as boolean flag columns.
for flag_column, predicted in (('outlier_50th', y_pred_50th), ('outlier_90th', y_pred_90th)):
    df[flag_column] = df[target] > predicted

In [None]:
# Actual vs predicted claims, one panel per quantile model (visual comparison).
fig, (ax_50th, ax_90th) = plt.subplots(1, 2, figsize=(12, 6))

# Left panel: 50th quantile predictions against the actual claims
ax_50th.scatter(y, y_pred_50th, label='Actual vs Predicted (50th Quantile)', color='blue')
ax_50th.set_xlabel('Actual Claims')
ax_50th.set_ylabel('Predicted Claims (50th Quantile)')
ax_50th.set_title('50th Quantile Prediction')
ax_50th.grid(True)

# Right panel: 90th quantile predictions against the actual claims
ax_90th.scatter(y, y_pred_90th, label='Actual vs Predicted (90th Quantile)', color='red')
ax_90th.set_xlabel('Actual Claims')
ax_90th.set_ylabel('Predicted Claims (90th Quantile)')
ax_90th.set_title('90th Quantile Prediction')
ax_90th.grid(True)

fig.tight_layout()
plt.show()


In [None]:
# Print each flagged subset under its heading, 50th quantile first.
outlier_reports = (
    ("Outliers flagged based on 50th Quantile predictions:", outliers_50th),
    ("Outliers flagged based on 90th Quantile predictions:", outliers_90th),
)
for heading, flagged_rows in outlier_reports:
    print(heading)
    print(flagged_rows)
