In [None]:
import pandas as pd

In [3]:
# Load the insurance dataset, then print the first rows followed by the
# summary statistics that `describe()` reports.
df = pd.read_csv('data/insurance.csv')
print(df.head(), df.describe(), sep='\n')

   age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520
               age          bmi     children       charges
count  1338.000000  1338.000000  1338.000000   1338.000000
mean     39.207025    30.663397     1.094918  13270.422265
std      14.049960     6.098187     1.205493  12110.011237
min      18.000000    15.960000     0.000000   1121.873900
25%      27.000000    26.296250     0.000000   4740.287150
50%      39.000000    30.400000     1.000000   9382.033000
75%      51.000000    34.693750     2.000000  16639.912515
max      64.000000    53.130000     5.000000  63770.428010


In [6]:
def detect_outliers_iqr(df: pd.DataFrame, column: str, factor: float = 1.5) -> pd.DataFrame:
    """Return the rows of ``df`` whose ``column`` value lies outside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1 and
    Q3 are the 25th and 75th percentiles of ``df[column]`` and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.
    column : str
        Name of the column whose values are tested.
    factor : float, default 1.5
        Multiplier applied to the IQR when building the fences. The default
        reproduces the previously hard-coded value.

    Returns
    -------
    pandas.DataFrame
        The rows strictly below the lower fence or strictly above the upper
        fence, with their original index labels. Rows whose value is NaN are
        never selected because both comparisons evaluate to False for NaN.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr
    is_outlier = (values < lower_bound) | (values > upper_bound)
    return df[is_outlier]

# Flag the rows whose charges fall outside the IQR fences, report how many
# there are, and show the first 20 for inspection.
outliers_charges = detect_outliers_iqr(df, 'charges')
print(
    f"Outliers in 'charges': {len(outliers_charges)}",
    outliers_charges.head(20),
    sep='\n',
)


Outliers in 'charges': 139
     age     sex     bmi  children smoker     region      charges
14    27    male  42.130         0    yes  southeast  39611.75770
19    30    male  35.300         0    yes  southwest  36837.46700
23    34  female  31.920         1    yes  northeast  37701.87680
29    31    male  36.300         2    yes  southwest  38711.00000
30    22    male  35.600         0    yes  southwest  35585.57600
34    28    male  36.400         1    yes  southwest  51194.55914
38    35    male  36.670         1    yes  northeast  39774.27630
39    60    male  39.900         0    yes  southwest  48173.36100
49    36    male  35.200         1    yes  southeast  38709.17600
53    36    male  34.430         0    yes  southeast  37742.57570
55    58    male  36.955         2    yes  northwest  47496.49445
82    22    male  37.620         1    yes  southeast  37165.16380
84    37  female  34.800         2    yes  southwest  39836.51900
86    57  female  31.160         0    yes  northw

All of them are smokers, mostly male, and have a high BMI, which increases the risk of chronic illnesses.

Outliers can inflate the mean, leading to misleading averages and trends.

We can cap the outliers because we only want to investigate the dataset in general, not specifically how smoking affects cost.

In [8]:
# Count the missing values in each column.
missing_counts = df.isna().sum()
missing_counts

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [9]:
def cap_outliers(df: pd.DataFrame, column: str, factor: float = 1.5) -> pd.DataFrame:
    """Clip the values of ``column`` to its IQR fences and return ``df``.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1 and
    Q3 are the 25th and 75th percentiles of ``df[column]`` and ``IQR = Q3 - Q1``.
    Values below the lower fence are replaced by it, and values above the upper
    fence are replaced by the upper fence.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. NOTE: ``df[column]`` is overwritten in place, so pass
        ``df.copy()`` if the original values must be kept.
    column : str
        Name of the column to cap.
    factor : float, default 1.5
        Multiplier applied to the IQR when building the fences. The default
        reproduces the previously hard-coded value.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, with ``column`` capped.
    """
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr
    df[column] = df[column].clip(lower_bound, upper_bound)
    return df

# cap_outliers overwrites the column on the frame it receives, so hand it a
# copy to keep the original `df` untouched.
df_capped = cap_outliers(df.copy(deep=True), column='charges')
