# Medical_insurance.csv

### Read data

In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the raw insurance records from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='Medical_insurance.csv')
# Preview the top five records
df.head(5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


### Preprocess data

In [30]:
# Check for missing values.
# Only the last expression of a cell is displayed automatically, so the
# per-column counts are printed explicitly; otherwise this check is
# silently discarded.
print(df.isna().sum())

# Delete the rows with missing values
df = df.dropna()

# Turn the categorical variables into 0/1 indicator columns.
# drop_first=True drops one level per variable (the remaining columns are
# named like 'sex_male', 'smoker_yes', 'region_northwest', ...).
# dtype=int makes the indicator columns integer 0/1 directly, replacing the
# per-column astype(int) calls that hard-coded every dummy column name.
df = pd.get_dummies(df, drop_first=True, dtype=int)

# Keep an untouched copy of the encoded data; its column maxima are needed
# later to undo the normalization.
df_og = df.copy()

# Move the charges column to the end so the target is the last column
charges = df.pop('charges')
df['charges'] = charges
# Print the first 5 rows
df.head()



Unnamed: 0,age,bmi,children,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest,charges
0,19,27.9,0,0,1,0,0,1,16884.924
1,18,33.77,1,1,0,0,1,0,1725.5523
2,28,33.0,3,1,0,0,1,0,4449.462
3,33,22.705,0,1,0,1,0,0,21984.47061
4,32,28.88,0,1,0,1,0,0,3866.8552


### Normalize data

In [31]:
# Scale every feature column (everything except the target 'charges') by its
# column maximum, so the largest value in each column becomes 1.
# The values are read from df_og, the untouched copy taken during
# preprocessing, so re-running this cell always gives the same result
# instead of rescaling numbers that were already scaled.
feature_columns = df.columns.drop('charges')
for column in feature_columns:
    df[column] = df_og[column] / df_og[column].max()

# Map charges linearly so that the maximum charge becomes 0.9 and a charge
# of 0 would become 0.1 (again computed from the unscaled df_og values).
df['charges'] = df_og['charges'] / df_og['charges'].max() * 0.8 + 0.1

# Print the first 5 rows
df.head()

Unnamed: 0,age,bmi,children,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest,charges
0,0.296875,0.525127,0.0,0.0,1.0,0.0,0.0,1.0,0.311821
1,0.28125,0.635611,0.2,1.0,0.0,0.0,1.0,0.0,0.121647
2,0.4375,0.621118,0.6,1.0,0.0,0.0,1.0,0.0,0.155818
3,0.515625,0.427348,0.0,1.0,0.0,1.0,0.0,0.0,0.375795
4,0.5,0.543572,0.0,1.0,0.0,1.0,0.0,0.0,0.14851


In [32]:
# Save the normalized frame as CSV, leaving the row index out of the file
df.to_csv(path_or_buf='Medical-insurance-normalized.csv', index=False)

In [33]:
# Undo the charges scaling: remove the 0.1 offset, undo the 0.8 factor,
# then multiply back by the maximum charge stored in the df_og copy
charges_max = df_og['charges'].max()
df['charges'] = df['charges'].sub(0.1).div(0.8).mul(charges_max)

# Undo the feature normalization: multiply each of the first 8 columns by
# the maximum of the same column in df_og
for column in df.columns[:8]:
    df[column] = df[column].mul(df_og[column].max())

# Show the first 5 rows of the restored data
df.head()

Unnamed: 0,age,bmi,children,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest,charges
0,19.0,27.9,0.0,0.0,1.0,0.0,0.0,1.0,16884.924
1,18.0,33.77,1.0,1.0,0.0,0.0,1.0,0.0,1725.5523
2,28.0,33.0,3.0,1.0,0.0,0.0,1.0,0.0,4449.462
3,33.0,22.705,0.0,1.0,0.0,1.0,0.0,0.0,21984.47061
4,32.0,28.88,0.0,1.0,0.0,1.0,0.0,0.0,3866.8552
