In [12]:
# this is a tutorial from codecademy

In [17]:
# import dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer

In [6]:
# Read the insurance records from 'insurance.csv' (path is relative to the
# working directory) and preview the first two rows.
dataset = pd.read_csv(filepath_or_buffer='insurance.csv')
dataset.head(n=2)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523


In [7]:
# Features are every column except the last one, which is the prediction
# target. For the 7-column insurance data this selects the 6 columns
# age, sex, bmi, children, smoker and region.
features = dataset.iloc[:, :-1]
features.head(2)

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,female,27.9,0,yes,southwest
1,18,male,33.77,1,no,southeast


In [8]:
# The value to predict is the right-most column of the dataset (charges).
labels = dataset.iloc[:, -1]
labels.head(n=2)

0    16884.9240
1     1725.5523
Name: charges, dtype: float64

In [9]:
# Report how many feature columns the frame has.
print("Number of features: ", len(features.columns))

Number of features:  6


In [10]:
# Report the number of samples, i.e. the row count of the feature frame.
print("Number of samples: ", len(features))

Number of samples:  1338


In [11]:
# Print summary statistics of the features. As the output below shows, only the
# numeric columns (age, bmi, children) are summarised; the categorical columns
# (sex, smoker, region) are left out.
print(features.describe()) 

               age          bmi     children
count  1338.000000  1338.000000  1338.000000
mean     39.207025    30.663397     1.094918
std      14.049960     6.098187     1.205493
min      18.000000    15.960000     0.000000
25%      27.000000    26.296250     0.000000
50%      39.000000    30.400000     1.000000
75%      51.000000    34.693750     2.000000
max      64.000000    53.130000     5.000000


In [13]:
# Sanity check: the label series should hold one entry per sample.
print("number of samples: ", len(labels))

# Summary statistics (count, mean, std, quartiles, extremes) of the charges.
print(labels.describe())

number of samples:  1338
count     1338.000000
mean     13270.422265
std      12110.011237
min       1121.873900
25%       4740.287150
50%       9382.033000
75%      16639.912515
max      63770.428010
Name: charges, dtype: float64


In [15]:
# One-hot encode the categorical columns (sex, smoker, region); the numeric
# columns are kept as they are. `features` is rebound to the encoded frame.
features = pd.get_dummies(data=features)
features.head(n=2)

Unnamed: 0,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27.9,0,1,0,0,1,0,0,0,1
1,18,33.77,1,0,1,1,0,0,0,1,0


In [18]:
# Hold out 33% of the rows as a test set. The fixed random_state makes the
# shuffle, and therefore the split, reproducible across runs.
(features_train,
 features_test,
 labels_train,
 labels_test) = train_test_split(
    features,
    labels,
    test_size=0.33,
    random_state=42,
)

In [23]:
# Apply sklearn's Normalizer to the three numeric columns and pass the one-hot
# columns through untouched.
# NOTE(review): Normalizer rescales each sample (row) to unit norm, not each
# column -- confirm this is the intended kind of normalisation.
ct = ColumnTransformer(
    transformers=[('normalize', Normalizer(), ['age', 'bmi', 'children'])],
    remainder='passthrough',
)


In [25]:
# Fit the transformer on the training data only, then apply the same fitted
# transformer to the test data. Both results are converted to DataFrames in a
# later step.
features_train_norm = ct.fit_transform(features_train) 
features_test_norm = ct.transform(features_test) 


In [26]:
# ColumnTransformer returns numpy arrays, so wrap them in DataFrames again.
# Reusing the original column labels relies on the transformed columns
# (age, bmi, children) being the leading columns of features_train, because
# ColumnTransformer emits transformed columns first and passthrough ones last.
# The new frames get a fresh 0..n-1 index rather than the split's row labels.
features_train_norm = pd.DataFrame(data=features_train_norm, columns=features_train.columns)
features_test_norm = pd.DataFrame(data=features_test_norm, columns=features_test.columns)

In [29]:
# Alternative preprocessing: standardise the numeric columns with
# StandardScaler (fitted on the training data only) and pass the one-hot
# columns through unchanged.
my_ct = ColumnTransformer(
    transformers=[('scale', StandardScaler(), ['age', 'bmi', 'children'])],
    remainder='passthrough',
)
features_train_scale = my_ct.fit_transform(features_train)
features_test_scale = my_ct.transform(features_test)

# Wrap the resulting numpy arrays in DataFrames, reusing the original column labels.
features_train_scale = pd.DataFrame(data=features_train_scale, columns=features_train.columns)
features_test_scale = pd.DataFrame(data=features_test_scale, columns=features_test.columns)

# Optional sanity check of the scaled features:
#print(features_train_scale.describe())
#print(features_test_scale.describe())
