In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sales_data ='/content/drive/MyDrive/Week2_data/Machine Learning Data Sets/insurance.csv'
df = pd.read_csv(sales_data)
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [5]:
# Check for null values
null_values = df.isnull()

# Count the number of null values in each column
null_counts = null_values.sum()

# Display columns with null values, if any
total_null_count = null_counts.sum()
print("Total number of null values:", total_null_count)

Total number of null values: 0


In [11]:
# Define features and target
# Import packages
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns',100)
import missingno
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
# Set pandas as the default output for sklearn
from sklearn import set_config
set_config(transform_output='pandas')
X = df.drop(columns = 'charges')
y = df['charges']
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [4]:
X_train.dtypes

age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
dtype: object

In [12]:
#Create a ColumnTransformer to preprocess the data.
#=============================================================================
# Making a numeric tuple for ColumnTransformer
# PREPROCESSING PIPELINE FOR NUMERIC DATA
# Save list of column names
numeric_columns = X_train.select_dtypes("number").columns
# Instantiate the imputer object from the SimpleImputer class with strategy 'median'
impute_median = SimpleImputer(strategy='median')
scaler = StandardScaler()
# Fit the imputer object on the numeric training data with .fit()
impute_median.fit(X_train[numeric_columns])
# Transform the training data
#X_train_num_imputed = impute_median.transform(X_train[numeric_columns])
X_train_processed = impute_median.transform(X_train[numeric_columns])
# Transfrom the testing data
#X_test_num_imputed = impute_median.transform(X_test[numeric_columns])
X_test_processed = impute_median.transform(X_test[numeric_columns])
#==============================================================================
#Create a OneHotEncoder for one-hot encoding the categorical columns.
# Identify categorical columns
categorical_columns = X_train.select_dtypes(exclude=['number']).columns
#categorical_columns = df.select_dtypes(include=['object']).columns
# Create a OneHotEncoder instance
encoder = OneHotEncoder(sparse=False, drop='first')

# Fit and transform the encoder on the categorical columns
encoded_data = encoder.fit_transform(X_train[categorical_columns])

# Convert the encoded data into a DataFrame
encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(categorical_columns))
encoded_df

# Concatenate the encoded DataFrame with the original DataFrame
df_encoded = pd.concat([X_train, encoded_df], axis=1)
df_encoded

# Drop the original categorical columns from the DataFrame
df_encoded.drop(categorical_columns, axis=1, inplace=True)

print("Original x_train_data:")
print(X_train)
print("\nEncoded x_train_data:")
print(df_encoded)
#===============================================================================================================
# Make a numeric preprocessing pipeline
numeric_pipeline = make_pipeline(impute_median, scaler)
numeric_pipeline

# Pipeline 1: For numeric columns
numeric_pipeline = make_pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])
numeric_pipeline_tuple = ('numeric_pipeline', numeric_pipeline, numeric_columns)
# Pipeline 2: For categorical columns
categorical_pipeline = make_pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent'))
])
categorical_pipeline_tuple = ('categorical_pipeline', categorical_pipeline, categorical_columns)

# Pipeline 3: For both numeric and categorical columns (using ColumnTransformer)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numeric_columns),
        ('cat', categorical_pipeline, categorical_columns)
    ]
)
column_transformer_tuple = ('Column_Transformer', preprocessor, numeric_columns, categorical_columns)

# Print the tuples
print("Pipeline 1:", numeric_pipeline_tuple)
print("Pipeline 2:", categorical_pipeline_tuple)
print("Pipeline 3:", column_transformer_tuple)

Original x_train_data:
      age     sex     bmi  children smoker     region
693    24    male  23.655         0     no  northwest
1297   28  female  26.510         2     no  southeast
634    51    male  39.700         1     no  southwest
1022   47    male  36.080         1    yes  southeast
178    46  female  28.900         2     no  southwest
...   ...     ...     ...       ...    ...        ...
1095   18  female  31.350         4     no  northeast
1130   39  female  23.870         5     no  southeast
1294   58    male  25.175         0     no  northeast
860    37  female  47.600         2    yes  southwest
1126   55    male  29.900         0     no  southwest

[1003 rows x 6 columns]

Encoded x_train_data:
      age     bmi  children  sex_male  smoker_yes  region_northwest  \
693    24  23.655         0       1.0         0.0               1.0   
1297   28  26.510         2       0.0         0.0               0.0   
634    51  39.700         1       1.0         0.0               0.0 



In [13]:
from sklearn.linear_model import LinearRegression

# Instantiate the LinearRegression model
linreg = LinearRegression()

# Now you can use the model for fitting to your data and making predictions
linreg_pipe = make_pipeline(preprocessor, linreg)
linreg_pipe


In [14]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1003 entries, 693 to 1126
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1003 non-null   int64  
 1   sex       1003 non-null   object 
 2   bmi       1003 non-null   float64
 3   children  1003 non-null   int64  
 4   smoker    1003 non-null   object 
 5   region    1003 non-null   object 
dtypes: float64(1), int64(2), object(3)
memory usage: 54.9+ KB


In [10]:
linreg_pipe.fit(X_train, y_train)

TypeError: All estimators should implement fit and transform, or can be 'drop' or 'passthrough' specifiers. 'Pipeline(steps=[('list',
                 [('imputer', SimpleImputer()), ('scaler', StandardScaler())])])' (type <class 'sklearn.pipeline.Pipeline'>) doesn't.

In [None]:
def regression_metrics(y_true, y_pred, label='', verbose = True, output_dict=False):
  # Get metrics
  mae = mean_absolute_error(y_true, y_pred)
  mse = mean_squared_error(y_true, y_pred)
  rmse = mean_squared_error(y_true, y_pred, squared=False)
  r_squared = r2_score(y_true, y_pred)
  if verbose == True:
    # Print Result with Label and Header
    header = "-"*60
    print(header, f"Regression Metrics: {label}", header, sep='\n')
    print(f"- MAE = {mae:,.3f}")
    print(f"- MSE = {mse:,.3f}")
    print(f"- RMSE = {rmse:,.3f}")
    print(f"- R^2 = {r_squared:,.3f}")
  if output_dict == True:
      metrics = {'Label':label, 'MAE':mae,
                 'MSE':mse, 'RMSE':rmse, 'R^2':r_squared}
      return metrics

def evaluate_regression(reg, X_train, y_train, X_test, y_test, verbose = True,
                        output_frame=False):
  # Get predictions for training data
  y_train_pred = reg.predict(X_train)

  # Call the helper function to obtain regression metrics for training data
  results_train = regression_metrics(y_train, y_train_pred, verbose = verbose,
                                     output_dict=output_frame,
                                     label='Training Data')
  print()
  # Get predictions for test data
  y_test_pred = reg.predict(X_test)
  # Call the helper function to obtain regression metrics for test data
  results_test = regression_metrics(y_test, y_test_pred, verbose = verbose,
                                  output_dict=output_frame,
                                    label='Test Data' )

  # Store results in a dataframe if ouput_frame is True
  if output_frame:
    results_df = pd.DataFrame([results_train,results_test])
    # Set the label as the index
    results_df = results_df.set_index('Label')
    # Set index.name to none to get a cleaner looking result
    results_df.index.name=None
    # Return the dataframe
    return results_df.round(3)

