<a href="https://colab.research.google.com/github/mvince33/Coding-Dojo/blob/main/week06/first_model_practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [48]:
# Imports
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

In [35]:
# Read the published Google Sheets CSV into a DataFrame and preview it.
# The URL is split across adjacent string literals purely for readability;
# Python concatenates them into the same single string.
url = (
    'https://docs.google.com/spreadsheets/d/e/'
    '2PACX-1vSS6a6R8L2_w9VF1c7KC47LEHwc0rjLSP9AiSsj0OsfW74_mBortNik4AokUkQceUblTiiQyMFmukSo'
    '/pub?output=csv'
)
df = pd.read_csv(url)
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [22]:
# Check for duplicates and missing values.
# A notebook cell only auto-displays its LAST expression, so the counts are
# printed explicitly; as bare expressions mid-cell they were computed and
# then silently discarded.
print('Duplicate rows:', df.duplicated().sum())

# Remove the duplicate rows. Reassigning instead of using inplace=True makes
# the data flow explicit (df now names the de-duplicated frame).
df = df.drop_duplicates()

# Total number of missing values across every column
print('Missing values:', df.isna().sum().sum())

# List the columns, their non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1337 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1337 non-null   int64  
 1   sex       1337 non-null   object 
 2   bmi       1337 non-null   float64
 3   children  1337 non-null   int64  
 4   smoker    1337 non-null   object 
 5   region    1337 non-null   object 
 6   charges   1337 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 83.6+ KB


In [23]:
# Separate the feature matrix from the target column ('charges')
X = df.drop(columns='charges')
y = df['charges']

# Hold out a test set; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [24]:
# Selectors that pick columns by dtype: object columns are treated as
# categorical, numeric columns as numeric
cat_selector = make_column_selector(dtype_include='object')
num_selector = make_column_selector(dtype_include='number')

In [28]:
# Instantiate the encoders
scaler = StandardScaler()

# OneHotEncoder's `sparse` argument was renamed to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4, where passing it
# raises TypeError. Try the current name first and fall back to the legacy
# one so this cell runs on both old and new kernels.
try:
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')

In [31]:
# Pair each transformer with the selector for the columns it applies to;
# these (transformer, columns) tuples are what make_column_transformer takes
cat_tuple = ohe, cat_selector
num_tuple = scaler, num_selector

In [39]:
# Combine the numeric and categorical preprocessing into one transformer
# (numeric tuple first, then categorical)
column_transformer = make_column_transformer(
    num_tuple,
    cat_tuple,
)

In [36]:
# Instantiate the linear regression model.
# No arguments are passed, so the estimator uses its default settings.
lin_reg = LinearRegression()

In [45]:
# Chain preprocessing and the regressor so both are fit and applied together
reg_pipe = make_pipeline(
    column_transformer,
    lin_reg,
)

In [46]:
# Train the model.
# Only the training split is passed to fit, so the test split plays no part
# in fitting the preprocessing steps or the regressor. fit() is the cell's
# last expression, so the fitted pipeline's repr is displayed as output.
reg_pipe.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('standardscaler',
                                                  StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7fdb0ea5d990>),
                                                 ('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7fdb0ea5d750>)])),
                ('linearregression', LinearRegression())])

In [53]:
# Predict on the training split first, then on the held-out test split
train_pred, test_pred = map(reg_pipe.predict, (X_train, X_test))

In [61]:
# Mean squared error on each split (printed as: train, then test)
X_train_MSE = mean_squared_error(
    y_true=y_train,
    y_pred=train_pred,
)
X_test_MSE = mean_squared_error(
    y_true=y_test,
    y_pred=test_pred,
)
print(X_train_MSE, X_test_MSE)

37182283.84844755 35290411.81217839


In [60]:
# Get the R2 score
X_train_r2 = r2_score(y_train, train_pred)
X_test_r2 = r2_score(y_test, test_pred)
print(X_train_r2, X_test_r2)

0.7297484324637916 0.795902783953728
