Goal: Improve the linear model by combining sequential feature selection (SFS) with standardization of the target variable.

In [5]:
##Load Modules
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupKFold
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn import set_config
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import RobustScaler
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt

In [6]:
##Load the training data and one-hot encode RegionCode, Destination, and HHSize (household size)
##as 0/1 integer indicator columns, then preview the first rows.
##NOTE(review): the CSV path is an absolute local path; confirm it exists on the machine running this.
reg = pd.get_dummies(
    pd.read_csv('C:/Users/morel/Downloads/Training Data 7-28-2025.csv'),
    columns=['RegionCode', 'Destination', 'HHSize'],
    dtype=int,
)
print(reg.head())

   FamilyID  ResidentID  Bednights  HHEmployed  HHTANF  Slotted  SlottedCCM  \
0       243         957        957           1       0        0           0   
1       243         958        957           1       0        0           0   
2       243         959        957           1       0        0           0   
3       243         960        957           1       0        0           0   
4       242         973        944           1       0        1           1   

   SlottedPME  SingleMom  SingleDad  ...  Destination_10  Destination_11  \
0           0          1          0  ...               0               0   
1           0          1          0  ...               0               0   
2           0          1          0  ...               0               0   
3           0          1          0  ...               0               0   
4           0          0          0  ...               0               0   

   HHSize_2  HHSize_3  HHSize_4  HHSize_5  HHSize_6  HHSize_7  HHSiz

In [7]:
##EDA: sanity checks. We expect zero missing values in every column and no duplicated rows.
missing_by_column = reg.isna().sum()        # count of NaN entries per column
has_duplicate_rows = reg.duplicated().any()  # True if any row repeats an earlier row
print(missing_by_column, has_duplicate_rows)

FamilyID          0
ResidentID        0
Bednights         0
HHEmployed        0
HHTANF            0
Slotted           0
SlottedCCM        0
SlottedPME        0
SingleMom         0
SingleDad         0
SingleParent      0
Disability        0
RegionCode_1      0
RegionCode_2      0
RegionCode_3      0
RegionCode_4      0
RegionCode_5      0
Destination_1     0
Destination_2     0
Destination_3     0
Destination_4     0
Destination_5     0
Destination_6     0
Destination_7     0
Destination_8     0
Destination_9     0
Destination_10    0
Destination_11    0
HHSize_2          0
HHSize_3          0
HHSize_4          0
HHSize_5          0
HHSize_6          0
HHSize_7          0
HHSize_8          0
HHSize_9          0
dtype: int64 False


In [9]:
##EDA: summary statistics for every column.
##The column display limit is lifted first so that no columns are elided from the table.
pd.options.display.max_columns = None
reg.describe()

Unnamed: 0,FamilyID,ResidentID,Bednights,HHEmployed,HHTANF,Slotted,SlottedCCM,SlottedPME,SingleMom,SingleDad,SingleParent,Disability,RegionCode_1,RegionCode_2,RegionCode_3,RegionCode_4,RegionCode_5,Destination_1,Destination_2,Destination_3,Destination_4,Destination_5,Destination_6,Destination_7,Destination_8,Destination_9,Destination_10,Destination_11,HHSize_2,HHSize_3,HHSize_4,HHSize_5,HHSize_6,HHSize_7,HHSize_8,HHSize_9
count,241.0,241.0,241.0,241.0,241.0,241.0,241.0,241.0,241.0,241.0,241.0,241.0,241.0,241.0,241.0,241.0,241.0,241.0,241.0,241.0,241.0,241.0,241.0,241.0,241.0,241.0,241.0,241.0,241.0,241.0,241.0,241.0,241.0,241.0,241.0,241.0
mean,459.344398,1758.165975,530.497925,0.726141,0.435685,0.680498,0.427386,0.253112,0.311203,0.062241,0.373444,0.004149,0.614108,0.099585,0.128631,0.149378,0.008299,0.124481,0.024896,0.26556,0.016598,0.136929,0.016598,0.06639,0.091286,0.029046,0.008299,0.219917,0.062241,0.248963,0.26556,0.195021,0.128631,0.029046,0.033195,0.037344
std,140.954579,467.250109,310.345548,0.446866,0.496878,0.467254,0.495729,0.435699,0.463949,0.242095,0.484725,0.064416,0.487818,0.300069,0.335488,0.357202,0.090908,0.330817,0.156133,0.44255,0.128024,0.344488,0.128024,0.249481,0.288615,0.168284,0.090908,0.415052,0.242095,0.433312,0.44255,0.397041,0.335488,0.168284,0.179518,0.189999
min,223.0,957.0,80.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,281.0,1200.0,282.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,531.0,2028.0,448.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,574.0,2110.0,877.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
max,596.0,2210.0,1078.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [10]:
##Restore pandas' default column display limit now that the full describe() table has been shown.
pd.reset_option('display.max_columns')
##FamilyID and ResidentID are not features. They are individual IDs and can't be interpreted.
##Individuals stay at the facility for an average of 530 nights and for a median of 448 nights.
##The bednight distribution is right-skewed (mean > median): most stays are concentrated at fewer
##total bednights, with a tail of longer stays pulling the mean upward.
##The range is about 1000 nights with a minimum stay of 80 nights and a maximum stay of 1078 nights.
##The IQR is 595 nights (282 to 877), meaning the middle half of our clients' stays span a window
##of just over a year and a half.
##The range between 2 STDs under and over the median is -172 to 1068, a width of 1240 bednights.
##The maximum stay (1078) is an outlier above the top of that range and severely affects the STD and mean.
##Because it's impossible to have a stay of negative bednights, the STD being pulled upward to
##accommodate our longer stays forces an uninterpretable (negative) lower bound below the median.