In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from scipy.stats import boxcox
from sklearn import preprocessing
from sklearn import linear_model
from sklearn.tree import DecisionTreeRegressor  

import pandas as pd
import numpy as np
import sklearn.metrics as metrics

In [2]:
# reading our dataframe

dataset = pd.read_csv('data.csv')

In [3]:
# splitting data in training and test data

X, y = dataset.drop(columns=['charges']), dataset['charges']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

print(f'Size of train data: {len(X_train)}')
print(f'Size of test data: {len(X_test)}')

Size of train data: 936
Size of test data: 402


In [4]:
# transform data using dummies

X_train = pd.get_dummies(X_train, columns=['sex', 'smoker', 'region'])
X_test = pd.get_dummies(X_test, columns=['sex', 'smoker', 'region'])

In [5]:
# Check for zero and near-zero variance variables [TRAIN]
threshold = 0.01 
variances = X_train.var()
low_variance = []

for var, value in variances.items():
    if value <= threshold:
        low_variance.append(var)
        
X_train = X_train.drop(low_variance, axis=1)

In [6]:
# Check for zero and near-zero variance variables [TEST]
threshold = 0.01 
variances = X_test.var()
low_variance = []

for var, value in variances.items():
    if value <= threshold:
        low_variance.append(var)
        
X_test = X_test.drop(low_variance, axis=1)

In [7]:
# Check for highly correlated variables [TRAIN]

# Create correlation matrix
corr_matrix = X_train.corr().abs()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool_))

# Find index of feature columns with correlation greater than a set threshold
# This threshold is a subjective value depending on the use case
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]

# Drop highly correlated features 
X_train = X_train.drop(X_train[to_drop], axis=1)

In [8]:
# Check for highly correlated variables [TEST]

# Create correlation matrix
corr_matrix = X_test.corr().abs()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool_))

# Find index of feature columns with correlation greater than a set threshold
# This threshold is a subjective value depending on the use case
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]

# Drop highly correlated features 
X_test = X_test.drop(X_test[to_drop], axis=1)

In [9]:
# Check for linear combinations [TRAIN]

x_train_numeric = X_train.select_dtypes(include=[np.number]) 

# Get column rank
x_train_rank = np.linalg.matrix_rank(x_train_numeric.values)

# Compare rank to number of columns 
if x_train_rank == x_train_numeric.shape[1]:
    print("No linear combinations detected")
else:
    print("Linear combinations detected")

No linear combinations detected


In [10]:
# Check for linear combinations [TEST]

x_test_numeric = X_test.select_dtypes(include=[np.number]) 

# Get column rank
x_test_rank = np.linalg.matrix_rank(x_test_numeric.values)

# Compare rank to number of columns 
if x_test_rank == x_test_numeric.shape[1]:
    print("No linear combinations detected")
else:
    print("Linear combinations detected")

No linear combinations detected


In [11]:
# Centering -> Scaling -> Box-Cox transformation [TRAIN]

# Centering and Scaling
scaler = StandardScaler()
scaler.fit(X_train)

train_data = scaler.transform(X_train)
X_train = pd.DataFrame(train_data, columns = X_train.columns)

# Box-Cox Transformation
numeric_features = X_train.select_dtypes(include=[np.number]).columns

for feature in numeric_features:
    X_train[feature] = preprocessing.MinMaxScaler().fit_transform(pd.DataFrame(X_train[feature])) + 1

# Applying Box-Cox Transformation
for feature in numeric_features:
    X_train[feature], _ = boxcox(X_train[feature])

In [12]:
# Centering -> Scaling -> Box-Cox transformation [TEST]

# Centering and Scaling
scaler = StandardScaler()
scaler.fit(X_test)

test_data = scaler.transform(X_test)
X_test = pd.DataFrame(test_data, columns = X_test.columns)

# Box-Cox Transformation
numeric_features = X_test.select_dtypes(include=[np.number]).columns

for feature in numeric_features:
    X_test[feature] = preprocessing.MinMaxScaler().fit_transform(pd.DataFrame(X_test[feature])) + 1

# Applying Box-Cox Transformation
for feature in numeric_features:
    X_test[feature], _ = boxcox(X_test[feature])

In [13]:
X_train = X_train.to_numpy()
X_test = X_test.to_numpy()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 936 entries, 0 to 935
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               936 non-null    float64
 1   bmi               936 non-null    float64
 2   children          936 non-null    float64
 3   sex_female        936 non-null    float64
 4   smoker_no         936 non-null    float64
 5   region_northeast  936 non-null    float64
 6   region_northwest  936 non-null    float64
 7   region_southeast  936 non-null    float64
 8   region_southwest  936 non-null    float64
dtypes: float64(9)
memory usage: 65.9 KB


In [14]:
# Linear regression model fitting

lr_model = linear_model.LinearRegression()
lr_model.fit(X_train, y_train)
    
lr_model_pred = lr_model.predict(X_test)
mse=metrics.mean_squared_error(y_test, lr_model_pred) 
    
print(f'Linear regression model MSE: {round(mse,4)}')

Linear regression model MSE: 600032299.8115


In [15]:
# Regression tree model fitting

rt_model = DecisionTreeRegressor()
rt_model.fit(X_train, y_train)
    
rt_model_pred = rt_model.predict(X_test)
mse=metrics.mean_squared_error(y_test, rt_model_pred) 
    
print(f'Regression tree model MSE: {round(mse,4)}')

Regression tree model MSE: 50980698.0048
