In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector
from sklearn.impute import SimpleImputer

In [17]:
# Mount Google Drive so files stored there can be read from this runtime under
# /content/drive. The google.colab import is kept in this cell because it only
# exists on Colab; the cell is expected to fail elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [18]:
# Path of the insurance dataset on the mounted Google Drive.
# Adjacent string literals are joined by Python into one single path string;
# the split is purely for readability.
filename = (
    '/content/drive/Othercomputers/My Laptop/'
    'data-science-assignments/05 - Intro to Machine Learning/'
    'insurance.csv'
)

In [None]:
# Load the dataset into a DataFrame; header=0 takes the column names from the
# first row of the CSV.
df = pd.read_csv(filename,header=0)
# Preview the first rows to confirm the file was parsed as expected.
df.head()

In [None]:
# Summarize dtypes and non-null counts per column to spot missing data or
# columns that were read with an unexpected type.
df.info()

In [None]:
# Total number of missing cells in the raw frame: flatten the boolean NaN mask
# to an array and count the True entries.
print(df.isna().to_numpy().sum(), 'missing values')

In [88]:
# Separate the feature matrix from the prediction target ('charges').
X = df.drop(columns=['charges'])
y = df['charges']

In [89]:
# Split into train and test sets before any preprocessing is fitted, so that
# statistics learned later come from the training rows only.
# random_state=42 makes the shuffle (and therefore the split) reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Selectors that pick column names by dtype: numeric vs. object (string) columns.
num_selector = make_column_selector(dtype_include='number')
cat_selector = make_column_selector(dtype_include='object')

# Resolve both selectors against the training features to get concrete name lists.
num_columns, cat_columns = num_selector(X_train), cat_selector(X_train)

print('numeric columns are', num_columns)
print('categorical columns are', cat_columns)

In [None]:
# Numeric subset of the full dataset.
df_num = df[num_columns]
# Display only the numeric columns that contain at least one missing value
# (an empty frame means there is nothing to impute).
df_num.loc[:, df_num.isnull().any(axis=0)]

In [None]:
# Work on independent copies: the frames returned by train_test_split are derived
# from X, and writing into them through .loc can raise SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Instantiate the imputer object from the SimpleImputer class with strategy 'median'.
median_imputer = SimpleImputer(strategy='median')
# Fit on the numeric training columns only: .fit() computes the per-column medians
# of the training set, so no information from the test set leaks in.
median_imputer.fit(X_train[num_columns])
# Use the medians learned from the training data to fill the missing values in
# the numeric columns of both the training and testing sets with .transform().
X_train.loc[:, num_columns] = median_imputer.transform(X_train[num_columns])
X_test.loc[:, num_columns] = median_imputer.transform(X_test[num_columns])

In [None]:
# Imputer that fills missing categorical values with each column's most frequent
# value. .fit() returns the fitted imputer, so creation and fitting are chained;
# only the training rows are used to learn the fill values.
freq_imputer = SimpleImputer(strategy='most_frequent').fit(X_train[cat_columns])

# Apply the training-set fill values to the categorical columns of both splits.
X_train.loc[:, cat_columns] = freq_imputer.transform(X_train[cat_columns])
X_test.loc[:, cat_columns] = freq_imputer.transform(X_test[cat_columns])

In [None]:
# Verify the imputation on the frames that were actually imputed. The imputers
# wrote into X_train / X_test, not into df, so checking df here would only
# repeat the pre-imputation count.
print(X_train.isna().sum().sum(), 'missing values in X_train')
print(X_test.isna().sum().sum(), 'missing values in X_test')
X_train.info()

In [None]:
# Report how each feature is treated in the encoding steps that follow:
# numeric columns come from the dtype selector; the categorical ones are listed by hand.
# NOTE(review): 'smoker' is mapped to just two values (no/yes) further down, so it is
# effectively a binary feature; confirm that "ordinal" is the intended label.
print('The numerical features are', num_columns,".")
print('The ordinal features is smoker.')
print('The nominal features are sex and region.')

In [None]:
# Not required, but encode the two-valued 'smoker' feature as numeric 0/1.
smoking = {'no': 0, 'yes': 1}
# Apply the same mapping to BOTH splits; encoding only the training set would
# leave 'yes'/'no' strings in X_test, so the two splits would no longer match.
X_train['smoker'] = X_train['smoker'].replace(smoking)
X_test['smoker'] = X_test['smoker'].replace(smoking)
# Show the distribution of the encoded training column as the cell output
# (a value_counts() call that is not the last expression is silently discarded).
X_train['smoker'].value_counts()

In [None]:
# One-hot encode the nominal features 'region' and 'sex'.
# Each level becomes a 0/1 indicator column named '<level>_region' / '<level>_sex',
# created in the same order as before. The encoding is applied to BOTH splits so
# X_test ends up with the same columns as X_train.
region_levels = ['northeast', 'northwest', 'southeast', 'southwest']
sex_levels = ['female', 'male']

for features in (X_train, X_test):
    for level in region_levels:
        # Boolean comparison cast to int gives 1 for the matching level, else 0.
        features[level + '_region'] = (features['region'] == level).astype(int)
    for level in sex_levels:
        features[level + '_sex'] = (features['sex'] == level).astype(int)

# The original 'region' and 'sex' string columns are intentionally left in place
# here; drop them before fitting a model that requires numeric input.