In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from scipy import stats

# Load the raw dataset; the path is resolved relative to the working directory.
file_path = "insurance.csv"
data = pd.read_csv(file_path)

# Display basic info.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would emit a stray "None" line.
print("\n📌 Dataset Information:")
data.info()

# Summary statistics
print("\n📊 Summary Statistics:")
print(data.describe())

# Count the missing cells in every column and report only the columns that
# actually contain gaps.
missing_values = data.isna().sum()
if not missing_values.gt(0).any():
    print("\n✅ No Missing Values Found.")
else:
    print("\n⚠️ Missing Values Found:")
    print(missing_values.loc[missing_values.gt(0)])

# Handling missing values (numeric -> column median, categorical -> column mode).
# "number" selects every numeric dtype (int32, float32, ...), not only
# float64/int64, so narrower numeric columns are imputed as well.
for col in data.select_dtypes(include=["number"]).columns:
    data[col] = data[col].fillna(data[col].median())  # Numeric: fill with median

for col in data.select_dtypes(include=["object"]).columns:
    modes = data[col].mode()
    # mode() comes back empty when the entire column is NaN; there is no value
    # to impute with, so leave the column untouched instead of raising on [0].
    if not modes.empty:
        data[col] = data[col].fillna(modes.iloc[0])  # Categorical: fill with mode

print("\n✅ Missing values handled.")

# Replace every object-typed column with integer codes. One fitted encoder is
# kept per column in `label_encoders` so the codes can be mapped back to the
# original labels later.
categorical_cols = data.select_dtypes(include=["object"]).columns
label_encoders = {col: LabelEncoder() for col in categorical_cols}
for col, le in label_encoders.items():
    data[col] = le.fit_transform(data[col])

print("\n🔠 Categorical variables encoded.")

# Per-column skewness of every numeric feature. `numeric_cols` is reused by
# the correlation, outlier and scaling steps that follow.
numeric_cols = data.select_dtypes(include="number").columns
skewness = data.loc[:, numeric_cols].skew()
print("\n📈 Skewness of Features:", skewness, sep="\n")

# Correlation Analysis
correlation_matrix = data[numeric_cols].corr()
print("\n🔗 Top Correlated Features:")
# The matrix is symmetric with a diagonal of self-correlations. Unstacking it
# whole lists every pair twice -- (A, B) and (B, A) -- so a "top 10" would
# only show 5 distinct pairs. Keeping just the strict upper triangle yields
# each unordered pair exactly once and removes the diagonal structurally,
# without hiding distinct features that genuinely correlate at exactly 1.0.
upper_triangle = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)
top_corr = (
    correlation_matrix.where(upper_triangle)
    .unstack()
    .dropna()  # drops the masked cells and undefined (constant-column) correlations
    .sort_values(ascending=False)
)
print(top_corr.head(10))

# Handle outliers using the Z-score method (drop rows with any |z| >= 3).
# Only genuinely continuous columns are screened:
#   * label-encoded columns carry arbitrary integer codes, so their distance
#     from the mean has no meaning;
#   * for a two-valued flag with minority share p, the minority class sits at
#     |z| = sqrt((1 - p) / p), which exceeds 3 once p < 10% -- screening such
#     a column would discard every row of the rare class;
#   * constant columns have zero variance (z = 0/0 = NaN) and cannot contain
#     outliers.
continuous_cols = [
    col
    for col in numeric_cols
    if col not in categorical_cols and data[col].nunique() > 2
]
z_scores = np.abs(stats.zscore(data[continuous_cols]))
# Expressed as "any score >= 3" so that a NaN score (comparison is False) can
# never mark a row as an outlier; `(z < 3).all()` would drop every row instead.
is_outlier = (z_scores >= 3).any(axis=1)
data_cleaned = data[~is_outlier].copy()  # .copy() so later writes do not touch `data`
outliers_removed = len(data) - len(data_cleaned)
print(f"\n🧹 Removed {outliers_removed} outliers based on Z-score filtering.")

# Standardize the numeric columns. The scaler is fitted on the outlier-filtered
# rows only and kept in `scaler` so the transformation can be reused.
scaler = StandardScaler()
scaler.fit(data_cleaned[numeric_cols])
data_cleaned[numeric_cols] = scaler.transform(data_cleaned[numeric_cols])
print("\n📏 Numerical features standardized.")

# Write the processed frame to disk without the row index.
data_cleaned.to_csv(path_or_buf="insurance_cleaned.csv", index=False)
print("\n✅ Data processing complete. Cleaned dataset saved as 'insurance_cleaned.csv'.")



📌 Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 34 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Name                       10000 non-null  object 
 1   Age                        10000 non-null  int64  
 2   Gender                     10000 non-null  object 
 3   BMI                        10000 non-null  float64
 4   Smoking Status             10000 non-null  object 
 5   Region                     10000 non-null  object 
 6   Diabetes                   10000 non-null  int64  
 7   Hypertension               10000 non-null  int64  
 8   Heart Disease              10000 non-null  int64  
 9   Cancer History             10000 non-null  int64  
 10  Stroke                     10000 non-null  int64  
 11  Liver Disease              10000 non-null  int64  
 12  Kidney Disease             10000 non-null  int64  
 13  COPD                   