# First Model (Practice)
- **Student:** Michael McCann
- **Date:** 28 FEB 2022

## Setup - Mount Drive, Import Libraries, and Load Data

In [3]:
## Mount Google Drive into the Colab filesystem so files stored there can be read by path
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [40]:
## Import Libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn import set_config
set_config(display='diagram')

In [8]:
## Read the insurance CSV from the mounted Drive into a DataFrame
insurance_filepath = '/content/drive/MyDrive/Data/insurance.csv'
ins_df = pd.read_csv(filepath_or_buffer = insurance_filepath)

# Preview the first five rows to sanity-check the columns and values
ins_df.head(n = 5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [18]:
## Count missing values per column (the result shows there are none, so no imputation is needed)

ins_df.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

## Identify Features
Target: 
- Charges

<br>Features:
- Age: Numeric - Scale
- Sex: Object/Categorical
- BMI: Numeric - Scale
- Children: Numeric - Scale
- Smoker: Object/Categorical
- Region: Object/Categorical


## Define Features and Train Test Split

In [16]:
# Features are every column except the target; the target is 'charges'
X = ins_df.drop(columns = 'charges')
y = ins_df['charges']

# Hold out a test set; the fixed seed keeps the split reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    random_state = 42,
)

## Create the Preprocessor

In [19]:
# Selectors that pick column names by dtype when called on a DataFrame
num_sel = make_column_selector(dtype_include = 'number')   # age, bmi, children
cat_sel = make_column_selector(dtype_include = 'object')   # sex, smoker, region

In [27]:
# Pair each transformer with the selector for the columns it applies to.
#
# Categorical columns are one-hot encoded into a dense array; categories not
# seen during fit are encoded as all zeros rather than raising an error.
try:
    # scikit-learn >= 1.2 names the dense-output switch `sparse_output`
    one_hot = OneHotEncoder(handle_unknown = 'ignore', sparse_output = False)
except TypeError:
    # Older releases only accept the original `sparse` keyword
    one_hot = OneHotEncoder(handle_unknown = 'ignore', sparse = False)

cat_tuple = (one_hot, cat_sel)

# Numeric columns are standardized
num_tuple = (StandardScaler() , num_sel)

In [28]:
# Combine both transformers into one ColumnTransformer. Numeric output comes
# first, then the one-hot columns; this order determines the column order of
# the transformed array. Columns matched by neither selector are passed
# through unchanged.
preprocessor = make_column_transformer(num_tuple, cat_tuple, remainder = 'passthrough')

## Fit the Preprocessor on the Training Data

In [29]:
# Fit on the training data only, so scaling statistics and the known
# categories are learned without seeing the test set
preprocessor.fit(X_train)

## Transform the Data

In [30]:
# Apply the already-fitted preprocessor to both splits
X_train_processed, X_test_processed = (
    preprocessor.transform(split) for split in (X_train, X_test)
)

## Pull out column names

In [36]:
# Take the fitted encoder out of the column transformer, then ask it for the
# names of the one-hot columns it produced from the categorical features
fitted_encoder = preprocessor.named_transformers_['onehotencoder']
categorical_cols = cat_sel(X_train)
cat_feat_names = fitted_encoder.get_feature_names_out(categorical_cols)

In [37]:
# Numeric names first, one-hot names second, matching the order in which the
# transformers were given to the column transformer
final_cols = [*num_sel(X_train), *cat_feat_names]
final_cols

['age',
 'bmi',
 'children',
 'sex_female',
 'sex_male',
 'smoker_no',
 'smoker_yes',
 'region_northeast',
 'region_northwest',
 'region_southeast',
 'region_southwest']

## Convert Back into DataFrames

In [38]:
# Wrap the processed arrays in DataFrames labelled with the recovered column names.
# Note: the rows get a fresh 0..n-1 index, not the original index of X_train / X_test.
X_train_output = pd.DataFrame(data = X_train_processed, columns = final_cols)
X_test_output = pd.DataFrame(data = X_test_processed, columns = final_cols)

In [39]:
# Show the first rows of each processed split. display() is needed for the
# training frame because only a cell's last expression is rendered automatically.
print('Training Set DataFrame:')
display(X_train_output.head())
print('\n\n\nTesting Set DataFrame:')
X_test_output.head()

Training Set DataFrame:


Unnamed: 0,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,-1.087167,-1.140875,-0.9175,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
1,-0.802106,-0.665842,0.743605,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.836992,1.528794,-0.086947,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
3,0.551932,0.926476,-0.086947,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
4,0.480667,-0.268178,0.743605,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0





Testing Set DataFrame:


Unnamed: 0,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,0.409402,-0.887967,0.743605,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
1,-0.231985,-0.081825,-0.9175,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,1.763439,-0.603447,-0.9175,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,0.480667,-0.793127,1.574158,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
4,-1.443492,0.234309,-0.9175,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0


## Regression Model

In [41]:
# Instantiate Linear Regression as reg
# Instantiate an ordinary least squares linear regression model as `reg`
reg = LinearRegression()

In [44]:
# Fit the linear regression model on the processed training features and the training target
reg.fit(X_train_output, y_train)

In [45]:
# Get the score...
train_score = reg.score(X_train_output, y_train)
print(f"training set R^2 value: {round(train_score,4)}")

test_score = reg.score(X_test_output, y_test)
print(f"test set R^2 value: {round(test_score, 4)}")

training set R^2 value: 0.745
test set R^2 value: 0.7673
