In [1]:
# Import libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score

# Step 1: Load the dataset
dataset = pd.read_csv('insurance.csv')

# Step 2: Create a new binary column for high charges
# 1 = High charge (strictly above the median), 0 = Low charge (at or below the median)
dataset['high_charge'] = (dataset['charges'] > dataset['charges'].median()).astype(int)

# Step 3: Drop the 'charges' column (we now have 'high_charge' instead)
# Reassigning (rather than mutating in place) keeps each step an explicit
# "new frame from old frame" transformation.
dataset = dataset.drop(columns='charges')

# Step 4: Convert categorical variables into numeric using one-hot encoding
# dtype=int is applied to the generated dummy columns only, so the existing
# numeric columns keep their original dtype. Casting the whole frame to int
# would silently truncate any fractional feature values (e.g. a non-integer
# bmi) before they reach the scaler and the model.
dataset = pd.get_dummies(dataset, drop_first=True, dtype=int)

# Step 5: Separate the predictor columns (X) from the label column (y)
feature_frame = dataset.drop(columns='high_charge')
X = feature_frame.values
y = dataset['high_charge'].values

# Preview the first rows of the feature matrix and the first labels
print("Independent Variables (X):")
print(X[:5])
print("\nTarget Variable (y):")
print(y[:10])

# Step 6: Encode the target variable
# Learn the set of classes from y, then map y onto integer codes.
le = LabelEncoder()
y = le.fit(y).transform(y)

# Step 7: Hold out 25% of the rows for testing; the remaining 75% are used for training
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

# Step 8: Standardize the feature values
# The scaler is fitted on the training rows only; the same fitted scaler is
# then applied to both the training rows and the test rows.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# Step 9: Train the Logistic Regression model on the standardized training data
classifier = LogisticRegression(random_state=0).fit(X_train, y_train)

# Step 10: Predict the class of every row in the (standardized) test set
y_pred = classifier.predict(X_test)

# Step 11: Evaluate the model
# Compute both metrics first, then report them.
cm = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Heading and matrix are printed on separate lines.
print("\nConfusion Matrix:", cm, sep="\n")
print(f"\nAccuracy: {accuracy:.3f}")

# Step 12: Predict for a new example
# Values are listed in the feature order the model was given.
example = [[
    19,  # age
    33,  # bmi
    0,   # children
    1,   # sex_male
    0,   # smoker_yes
    1,   # region_northwest
    0,   # region_southeast
    0,   # region_southwest
]]
# Apply the already-fitted scaler before asking the classifier for a prediction.
example_scaled = sc.transform(example)
prediction = classifier.predict(example_scaled)
print("\nPredicted Class for Example (1 = High Charge, 0 = Low Charge):", prediction)

Processed Independent Variables (X):
   age  bmi  children  sex_male  smoker_yes  region_northwest  \
0   19   27         0         0           1                 0   
1   18   33         1         1           0                 0   
2   28   33         3         1           0                 0   
3   33   22         0         1           0                 1   
4   32   28         0         1           0                 1   

   region_southeast  region_southwest  
0                 0                 1  
1                 1                 0  
2                 1                 0  
3                 0                 0  
4                 0                 0  

Dependent Variable (y):
[16884.924    1725.5523   4449.462   21984.47061  3866.8552   3756.6216
  8240.5896   7281.5056   6406.4107  28923.13692]

Predicted vs Actual Charges:
[[11261.66  9724.53]
 [ 9515.29  8547.69]
 [38176.07 45702.02]
 [16303.78 12950.07]
 [ 7073.28  9644.25]
 [ 4003.92  4500.34]
 [ 1637.4   2198.19]
 [14434.