In [36]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [37]:
### Clean Insurance Data ###
# Reads the raw CSV, drops incomplete / duplicate / implausible rows, replaces
# the three text columns with integer codes, and writes the cleaned CSV.
insurance = pd.read_csv("sets/insurance_raw.csv")

# Allowed labels for each text column. A label's position in its list is the
# integer code it is replaced with further down.
sexes = ["female", "male"]
smoker = ["no", "yes"]
regions = ["northeast", "southeast", "southwest", "northwest"]

# Remove rows with missing values, then remove duplicate rows
insurance = insurance.dropna().drop_duplicates()

# Remove rows with bad or impossible data.
# Every mask is computed on the same frame, so they share an index and can be
# AND-ed together; the surviving rows are the same as filtering one at a time.
ageFilter = insurance["age"].between(0, 130)
sexFilter = insurance["sex"].isin(sexes)
bmiFilter = insurance["bmi"].between(7, 200)
smokerFilter = insurance["smoker"].isin(smoker)
regionFilter = insurance["region"].isin(regions)
costFilter = insurance["charges"].gt(0)

insurance = insurance[
    ageFilter & sexFilter & bmiFilter & smokerFilter & regionFilter & costFilter
]

# Integer-code the text columns (this is label encoding, not one-hot encoding).
# .assign builds a new frame instead of calling replace(..., inplace=True) on a
# column selected from a filtered frame: that chained in-place form does not
# update the parent frame when pandas Copy-on-Write is active.
# The isin filters above guarantee every remaining label has a code.
insurance = insurance.assign(
    sex = insurance["sex"].map({label: code for code, label in enumerate(sexes)}),
    smoker = insurance["smoker"].map({label: code for code, label in enumerate(smoker)}),
    region = insurance["region"].map({label: code for code, label in enumerate(regions)}),
)

insurance.to_csv("sets/insurance_clean.csv", index = False)

In [38]:
### Fully encode data (previous encoding was useful for data exploration) ###
# Reads the cleaned CSV, expands `region` into one 0/1 indicator column per
# region, and writes the result with `charges` as the last column.

cleanInsurance = pd.read_csv("sets/insurance_clean.csv")

# Turn the integer region codes (positions in `regions`, defined in the
# cleaning cell) back into names. A non-in-place replace with a dict leaves
# values that are not codes untouched, and assigning the result back avoids
# the chained inplace form that is a no-op under pandas Copy-on-Write.
cleanInsurance = cleanInsurance.assign(
    region = cleanInsurance["region"].replace(dict(enumerate(regions)))
)

# A fixed category list means all four indicator columns are always produced,
# in alphabetical order, even if some region has no rows.
regionCategories = cleanInsurance["region"].astype(pd.CategoricalDtype(sorted(regions)))

# Encode only the region column and select it by name rather than by position.
# dtype=int writes 0/1 to the CSV regardless of the pandas version's default.
splitRegions = pd.get_dummies(regionCategories, prefix = "region", dtype = int)

# Column order: original features (minus region), region indicators, charges.
encodedInsurance = pd.concat(
    [
        cleanInsurance.drop(columns = ["region", "charges"]),
        splitRegions,
        cleanInsurance["charges"],
    ],
    axis = 1,
)

encodedInsurance.to_csv("sets/insurance_encoded.csv", index = False)

In [39]:
### Scale encoded features for use in unsupervised learning ###
# Standardizes the numeric columns, leaves the 0/1 columns untouched, and
# writes the combined feature table (without `charges`) to CSV.

encodedInsurance = pd.read_csv("sets/insurance_encoded.csv")

# Column groups: the first is standardized, the second is passed through.
numerical = ["age", "bmi", "children"]
categorical = [
    "sex",
    "smoker",
    "region_northeast",
    "region_northwest",
    "region_southeast",
    "region_southwest",
]

numericalData = encodedInsurance[numerical]
categoricalData = encodedInsurance[categorical]

# fit_transform returns a plain array, so it is wrapped back into a frame with
# the original column names. The new frame gets a default 0..n-1 index, which
# matches the index of the frame just read from CSV.
scaler = StandardScaler()
scaledNumerical = scaler.fit_transform(numericalData)
numericalData = pd.DataFrame(scaledNumerical, columns = numerical)

scaledFeatures = pd.concat([numericalData, categoricalData], axis = 1)
scaledFeatures.to_csv("sets/insurance_features_scaled.csv", index = False)

In [40]:
### Split encoded data into train and test sets ###
# Reads the encoded CSV, holds out 20% of the rows as a test set, and writes
# the train and test tables (features followed by `charges`) to CSV.

encodedInsurance = pd.read_csv("sets/insurance_encoded.csv")

# Every column except the last one is treated as a feature.
featureNames = list(encodedInsurance.columns[:-1])

features = encodedInsurance[featureNames]
targets = encodedInsurance["charges"]

# random_state is fixed so the same rows land in each set on every run.
trainX, testX, trainY, testY = train_test_split(
    features,
    targets,
    test_size = 0.2,
    random_state = 0,
)

# Re-attach the target column to each split before saving.
trainData = pd.concat([trainX, trainY], axis = 1)
testData = pd.concat([testX, testY], axis = 1)

trainData.to_csv("sets/insurance_train.csv", index = False)
testData.to_csv("sets/insurance_test.csv", index = False)