In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [20]:
### Clean Insurance Data ###

# Load the raw data set; every cleaning step below derives from this frame.
insurance = pd.read_csv("sets/insurance_raw.csv")

# Allowed labels for each categorical column. A label's position in its list
# is the integer code it is encoded to further down
# (e.g. "female" -> 0, "male" -> 1).
sexes = ["female", "male"]
smoker = ["no", "yes"]
regions = ["northeast", "southeast", "southwest", "northwest"]

# Remove rows with missing values
insurance = insurance.dropna()

# Remove duplicate rows
insurance = insurance.drop_duplicates()

# Remove rows with bad or impossible data.
# Series.between() is inclusive on both bounds by default.
ageFilter = insurance["age"].between(0, 130)
insurance = insurance[ageFilter]

sexFilter = insurance["sex"].isin(sexes)
insurance = insurance[sexFilter]

bmiFilter = insurance["bmi"].between(7, 200)
insurance = insurance[bmiFilter]

smokerFilter = insurance["smoker"].isin(smoker)
insurance = insurance[smokerFilter]

regionFilter = insurance["region"].isin(regions)
insurance = insurance[regionFilter]

# Charges must be strictly positive.
costFilter = insurance["charges"].gt(0)
insurance = insurance[costFilter]

# Encode the categorical features as integer codes. This is label encoding,
# not one-hot encoding: each feature stays a single column.
#
# The columns are rebuilt with assign() + map() instead of calling
# insurance[col].replace(..., inplace=True). That form is a chained in-place
# update on a column selected from a filtered frame: pandas 2.x warns about it,
# and with Copy-on-Write it only modifies a temporary, leaving `insurance`
# (and the CSV written below) with the original string labels.
#
# The isin() filters above guarantee every remaining value has a code, so
# map() cannot introduce missing values here. assign() replaces existing
# columns in place, so the column order is unchanged.
insurance = insurance.assign(
    sex=insurance["sex"].map({label: code for code, label in enumerate(sexes)}),
    smoker=insurance["smoker"].map({label: code for code, label in enumerate(smoker)}),
    region=insurance["region"].map({label: code for code, label in enumerate(regions)}),
)

insurance.to_csv("sets/insurance_clean.csv", index = False)

In [21]:
### Split clean data into train and test sets ###

cleanInsurance = pd.read_csv("sets/insurance_clean.csv")

# Every column except the target is a feature. The target is excluded by name
# rather than by dropping whichever column happens to be last, so the feature
# list always agrees with the `targets` selection below and "charges" can never
# leak into the features if the column order of the CSV changes.
featureNames = [name for name in cleanInsurance.columns if name != "charges"]

features = cleanInsurance.loc[:, featureNames]
targets = cleanInsurance["charges"]

# Hold out 20% of the rows as the test set. The fixed random_state makes the
# split reproducible across runs.
trainX, testX, trainY, testY = train_test_split(features, targets, test_size = 0.2, random_state = 0)

# Re-attach the target to its features so each output file is self-contained,
# with "charges" as the last column.
trainData = pd.concat([trainX, trainY], axis = 1)
testData = pd.concat([testX, testY], axis = 1)

trainData.to_csv("sets/insurance_train.csv", index = False)
testData.to_csv("sets/insurance_test.csv", index = False)