In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

In [2]:
#Mounting Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
filename = '/content/drive/Othercomputers/My Laptop/data-science-assignments/05 - Intro to Machine Learning/insurance.csv'

In [4]:
df = pd.read_csv(filename,header=0)
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [6]:
print(df.isna().sum().sum(), 'missing values')

0 missing values


In [7]:
y = df['charges']
X = df.drop(columns='charges')

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
#In this version I have left the random_state = 42,but this causes the
#test score to higher than the train score
#no sure this right

In [9]:
#Making column selectors
num_selector = make_column_selector(dtype_include='number')
cat_selector = make_column_selector(dtype_include='object')

num_columns = num_selector(X_train)
cat_columns = cat_selector(X_train)

print('numeric columns are', num_columns)
print('categorical columns are', cat_columns)

numeric columns are ['age', 'bmi', 'children']
categorical columns are ['sex', 'smoker', 'region']


In [10]:
#Instantiating transformers
#median_imputer = SimpleImputer(strategy = 'median',copy=False)
#freq_imputer = SimpleImputer(strategy = 'most_frequent',copy=False)
#one_hot_encoder = OneHotEncoder(sparse = False, handle_unknown='error',drop='first')

median_imputer = SimpleImputer(strategy = 'median')

one_hot_encoder = OneHotEncoder(sparse = False, handle_unknown='ignore')

In [11]:
#Instantiate the scaler
scaler = StandardScaler()

In [12]:
#Matching transformers with columns
#impute median of numeric columns
median_tuple = (median_imputer, num_selector)

#impute scaler tuple
scaler_tuple = (scaler, num_selector)

#impute most frequent of categorical columns
#freq_tuple = (freq_imputer, cat_selector)

#one-hot encode categorical columns
ohe_tuple = (one_hot_encoder, cat_selector)

In [13]:
#instantiate column transformer
#give it the steps in the order to apply them
column_transformer = make_column_transformer(median_tuple, scaler_tuple, ohe_tuple)

In [14]:
column_transformer.fit(X_train)

ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('simpleimputer',
                                 SimpleImputer(add_indicator=False, copy=True,
                                               fill_value=None,
                                               missing_values=nan,
                                               strategy='median', verbose=0),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7f0edbed94d0>),
                                ('standardscaler',
                                 StandardScaler(copy=True, with_mean=True,
                                                with_std=True),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7f0edbed94d0>),
                                ('onehotencoder',
                                 OneHotEncoder(categories='auto', dro

In [15]:
X_train_processed = column_transformer.transform(X_train)
X_test_processed = column_transformer.transform(X_test)

In [16]:
reg = LinearRegression()

In [18]:
reg.fit(X_train_processed,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [19]:
train_score = reg.score(X_train_processed, y_train)
print(train_score)

0.7449555328228536


In [20]:
test_score = reg.score(X_test_processed, y_test)
print(test_score)

0.7672642952734355
