In [2]:
# importing libraries
import pandas as pd # data science essentials
import matplotlib.pyplot as plt # essential graphical output
import seaborn as sns # enhanced graphical output
import numpy as np # mathematical essentials
import statsmodels.formula.api as smf # regression modeling
import sklearn.linear_model # linear models
from sklearn.model_selection import train_test_split # train/test split
from sklearn.linear_model import LinearRegression # linear regression (scikit-learn)

# setting pandas print options
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)


# specifying file name
file = "./__datasets/birthweight_low.xlsx"


# reading the file into Python
weight = pd.read_excel(io    = file,
                       header = 0,
                       sheet_name = 0)
weight.head(n=10)

Unnamed: 0,mage,meduc,monpre,npvis,fage,feduc,omaps,fmaps,cigs,drink,male,mwhte,mblck,moth,fwhte,fblck,foth,bwght
0,69,,5,2.0,62,,4,7,23,9,1,0,1,0,0,1,0,697
1,68,12.0,3,10.0,61,11.0,4,6,25,11,1,1,0,0,1,0,0,1290
2,71,12.0,3,6.0,46,12.0,2,7,21,12,1,0,1,0,0,1,0,1490
3,59,16.0,1,8.0,48,16.0,7,8,21,10,0,0,0,1,0,0,1,1720
4,48,12.0,4,6.0,39,12.0,2,9,17,13,0,1,0,0,1,0,0,1956
5,67,11.0,4,8.0,40,8.0,4,9,16,14,0,1,0,0,1,0,0,1984
6,54,12.0,2,12.0,46,12.0,9,9,17,12,1,0,1,0,0,1,0,2050
7,71,14.0,4,7.0,51,11.0,9,8,15,13,0,1,0,0,1,0,0,2068
8,56,12.0,1,9.0,53,14.0,8,9,14,9,1,1,0,0,1,0,0,2148
9,58,12.0,2,12.0,61,16.0,9,9,13,6,0,0,1,0,0,1,0,2180


In [3]:
for column in weight:
    print(column)

mage
meduc
monpre
npvis
fage
feduc
omaps
fmaps
cigs
drink
male
mwhte
mblck
moth
fwhte
fblck
foth
bwght


In [4]:
weight.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 196 entries, 0 to 195
Data columns (total 18 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   mage    196 non-null    int64  
 1   meduc   193 non-null    float64
 2   monpre  196 non-null    int64  
 3   npvis   193 non-null    float64
 4   fage    196 non-null    int64  
 5   feduc   189 non-null    float64
 6   omaps   196 non-null    int64  
 7   fmaps   196 non-null    int64  
 8   cigs    196 non-null    int64  
 9   drink   196 non-null    int64  
 10  male    196 non-null    int64  
 11  mwhte   196 non-null    int64  
 12  mblck   196 non-null    int64  
 13  moth    196 non-null    int64  
 14  fwhte   196 non-null    int64  
 15  fblck   196 non-null    int64  
 16  foth    196 non-null    int64  
 17  bwght   196 non-null    int64  
dtypes: float64(3), int64(15)
memory usage: 27.7 KB


In [5]:
weight.isnull().sum()

mage      0
meduc     3
monpre    0
npvis     3
fage      0
feduc     7
omaps     0
fmaps     0
cigs      0
drink     0
male      0
mwhte     0
mblck     0
moth      0
fwhte     0
fblck     0
foth      0
bwght     0
dtype: int64

In [6]:
meduc_median = weight['meduc'].median()
weight['meduc'].fillna(value = meduc_median, inplace = True)

npvis_median = weight['npvis'].median()
weight['npvis'].fillna(value = npvis_median, inplace = True)

feduc_median = weight['feduc'].median()
weight['feduc'].fillna(value = feduc_median, inplace = True)

In [7]:
weight.isnull().sum()

mage      0
meduc     0
monpre    0
npvis     0
fage      0
feduc     0
omaps     0
fmaps     0
cigs      0
drink     0
male      0
mwhte     0
mblck     0
moth      0
fwhte     0
fblck     0
foth      0
bwght     0
dtype: int64

In [8]:
# Create cigs + drinks and cigs * drinks
weight['cigsdrinks2'] = weight['cigs'] * weight['drink']
weight['cigsdrinks'] = weight['cigs'] + weight['drink']

In [9]:
## Make some spread out variables into logs
weight['log_meduc']      = np.log(weight['meduc'])
weight['log_monpre']     = np.log(weight['monpre'])
weight['log_fage']       = np.log(weight['fage'])
weight['log_mage']       = np.log(weight['mage'])
weight['log_feduc']      = np.log(weight['feduc'])
weight['log_npvis']      = np.log(weight['npvis'])
weight['log_bwght']      = np.log(weight['bwght'])


#Add a value to the end since they ended up as 0/-inf
weight['log_cigs']    = np.log(weight['cigs'] + 0.00001)
weight['log_drink']   = np.log(weight['drink'] + 0.00001)
weight['log_cigsdrinks'] = np.log(weight['cigsdrinks'] + 0.000001)
weight['log_cigsdrinks2'] = np.log(weight['cigsdrinks2'] + 0.000001)

weight.describe()

Unnamed: 0,mage,meduc,monpre,npvis,fage,feduc,omaps,fmaps,cigs,drink,male,mwhte,mblck,moth,fwhte,fblck,foth,bwght,cigsdrinks2,cigsdrinks,log_meduc,log_monpre,log_fage,log_mage,log_feduc,log_npvis,log_bwght,log_cigs,log_drink,log_cigsdrinks,log_cigsdrinks2
count,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0
mean,40.153061,13.913265,2.341837,11.607143,39.290816,13.852041,8.193878,8.964286,10.928571,5.397959,0.55102,0.270408,0.382653,0.346939,0.346939,0.341837,0.311224,3334.086735,65.47449,16.326531,2.62183,0.718551,3.647016,3.662376,2.597524,2.392537,8.088802,1.615902,0.857949,2.48548,2.251543
std,10.250055,2.040017,1.355136,4.234625,8.982725,2.586661,1.576482,0.651428,6.101282,3.001674,0.498664,0.445308,0.487279,0.477215,0.477215,0.47554,0.46418,646.700904,58.627677,7.698519,0.150067,0.503378,0.216647,0.244219,0.313256,0.352281,0.232252,2.972775,3.081299,1.774365,5.221131
min,23.0,8.0,1.0,2.0,23.0,1.0,2.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,697.0,0.0,0.0,2.079442,0.0,3.135494,3.135494,0.0,0.693147,6.546785,-11.512925,-11.512925,-13.815511,-13.815511
25%,33.0,12.0,2.0,10.0,34.75,12.0,8.0,9.0,6.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2916.25,20.0,11.0,2.484907,0.693147,3.548101,3.496508,2.484907,2.302585,7.978051,1.791761,1.386297,2.397895,2.995732
50%,39.0,14.0,2.0,12.0,38.0,14.0,9.0,9.0,11.0,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,3452.0,48.0,16.0,2.639057,0.693147,3.637586,3.663562,2.639057,2.484907,8.146709,2.397896,1.60944,2.772589,3.871201
75%,46.0,16.0,3.0,12.0,43.0,16.0,9.0,9.0,15.25,7.25,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3759.5,85.75,21.0,2.772589,1.098612,3.7612,3.828641,2.772589,2.484907,8.23204,2.724185,1.979294,3.044522,4.451323
max,71.0,17.0,8.0,35.0,73.0,17.0,10.0,10.0,25.0,14.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4933.0,275.0,36.0,2.833213,2.079442,4.290459,4.26268,2.833213,3.555348,8.503703,3.218876,2.639058,3.583519,5.616771


In [13]:
# Using a for loop to make bins and then check for any NA's
for index, col in weight.iterrows():

#Create bins for mage
    if weight.loc[index, "mage"] <= 30: 
        weight.loc[index, "bin_mage"] = "f_0to30"
    elif weight.loc[index, "mage"] <= 50: 
        weight.loc[index, "bin_mage"] = "f_31to50"
    elif weight.loc[index, "mage"] > 50: 
        weight.loc[index, "bin_mage"] = "f_51plus"
    else:
        print('Mage Bin Error')
        
#Create bins for meduc                
    if weight.loc[index, "meduc"] <= 12:
        weight.loc[index, "bin_meduc"] = '0'
    elif weight.loc[index, "meduc"] > 12:
        weight.loc[index, "bin_meduc"] = '1'
    else:
        print("Meduc Bin Error")
        
#Create bins for feduc        
    if weight.loc[index, "feduc"] <=  12: 
        weight.loc[index, "bin_feduc"] = '0'
    elif weight.loc[index, "feduc"] >  12: 
        weight.loc[index, "bin_feduc"] = '1'
    else:
        print('Feduc Bin Error')
        
#Create bins for cigs            
    if weight.loc[index, "cigs"] <= 2:
        weight.loc[index, "bin_cigs"] ='0'
    elif weight.loc[index, "cigs"] <=20:
        weight.loc[index, "bin_cigs"] = '1'
    elif weight.loc[index, "cigs"] >=21:
        weight.loc[index, "bin_cigs"] = '2'
    else:
        print("Cigs Bin Error")       
   
 #Create bins for drink
    if weight.loc[index, "drink"] <=  6: 
        weight.loc[index, "bin_drink"] = '0'
    elif weight.loc[index, "drink"] >  6: 
        weight.loc[index, "bin_drink"] = '1'
    else:
        print('Drink Bin Error')

        
        
#Create dummies for the variables with only a few categories
dummy_weight = pd.get_dummies(columns = ["bin_mage", "bin_cigs", "bin_drink", "bin_meduc", "bin_feduc"], data = weight)
weight = dummy_weight
weight.head(n = 10)

Unnamed: 0,mage,meduc,monpre,npvis,fage,feduc,omaps,fmaps,cigs,drink,male,mwhte,mblck,moth,fwhte,fblck,foth,bwght,cigsdrinks2,cigsdrinks,log_meduc,log_monpre,log_fage,log_mage,log_feduc,log_npvis,log_bwght,log_cigs,log_drink,log_cigsdrinks,log_cigsdrinks2,bin_mage_f_0to30,bin_mage_f_31to50,bin_mage_f_51plus,bin_cigs_0,bin_cigs_1,bin_cigs_2,bin_drink_0,bin_drink_1,bin_meduc_0,bin_meduc_1,bin_feduc_0,bin_feduc_1,bin_mage_f_0to30.1,bin_mage_f_31to50.1,bin_mage_f_51plus.1,bin_cigs_0.1,bin_cigs_1.1,bin_cigs_2.1,bin_drink_0.1,bin_drink_1.1,bin_meduc_0.1,bin_meduc_1.1,bin_feduc_0.1,bin_feduc_1.1
0,69,14.0,5,2.0,62,14.0,4,7,23,9,1,0,1,0,0,1,0,697,207,32,2.639057,1.609438,4.127134,4.234107,2.639057,0.693147,6.546785,3.135495,2.197226,3.465736,5.332719,0,0,1,0,0,1,0,1,0,1,0,1,0,0,1,0,0,1,0,1,0,1,0,1
1,68,12.0,3,10.0,61,11.0,4,6,25,11,1,1,0,0,1,0,0,1290,275,36,2.484907,1.098612,4.110874,4.219508,2.397895,2.302585,7.162397,3.218876,2.397896,3.583519,5.616771,0,0,1,0,0,1,0,1,1,0,1,0,0,0,1,0,0,1,0,1,1,0,1,0
2,71,12.0,3,6.0,46,12.0,2,7,21,12,1,0,1,0,0,1,0,1490,252,33,2.484907,1.098612,3.828641,4.26268,2.484907,1.791759,7.306531,3.044523,2.484907,3.496508,5.529429,0,0,1,0,0,1,0,1,1,0,1,0,0,0,1,0,0,1,0,1,1,0,1,0
3,59,16.0,1,8.0,48,16.0,7,8,21,10,0,0,0,1,0,0,1,1720,210,31,2.772589,0.0,3.871201,4.077537,2.772589,2.079442,7.45008,3.044523,2.302586,3.433987,5.347108,0,0,1,0,0,1,0,1,0,1,0,1,0,0,1,0,0,1,0,1,0,1,0,1
4,48,12.0,4,6.0,39,12.0,2,9,17,13,0,1,0,0,1,0,0,1956,221,30,2.484907,1.386294,3.663562,3.871201,2.484907,1.791759,7.578657,2.833214,2.56495,3.401197,5.398163,0,1,0,0,1,0,0,1,1,0,1,0,0,1,0,0,1,0,0,1,1,0,1,0
5,67,11.0,4,8.0,40,8.0,4,9,16,14,0,1,0,0,1,0,0,1984,224,30,2.397895,1.386294,3.688879,4.204693,2.079442,2.079442,7.59287,2.772589,2.639058,3.401197,5.411646,0,0,1,0,1,0,0,1,1,0,1,0,0,0,1,0,1,0,0,1,1,0,1,0
6,54,12.0,2,12.0,46,12.0,9,9,17,12,1,0,1,0,0,1,0,2050,204,29,2.484907,0.693147,3.828641,3.988984,2.484907,2.484907,7.625595,2.833214,2.484907,3.367296,5.31812,0,0,1,0,1,0,0,1,1,0,1,0,0,0,1,0,1,0,0,1,1,0,1,0
7,71,14.0,4,7.0,51,11.0,9,8,15,13,0,1,0,0,1,0,0,2068,195,28,2.639057,1.386294,3.931826,4.26268,2.397895,1.94591,7.634337,2.708051,2.56495,3.332205,5.273,0,0,1,0,1,0,0,1,0,1,1,0,0,0,1,0,1,0,0,1,0,1,1,0
8,56,12.0,1,9.0,53,14.0,8,9,14,9,1,1,0,0,1,0,0,2148,126,23,2.484907,0.0,3.970292,4.025352,2.639057,2.197225,7.672292,2.639058,2.197226,3.135494,4.836282,0,0,1,0,1,0,0,1,1,0,0,1,0,0,1,0,1,0,0,1,1,0,0,1
9,58,12.0,2,12.0,61,16.0,9,9,13,6,0,0,1,0,0,1,0,2180,78,19,2.484907,0.693147,4.110874,4.060443,2.772589,2.484907,7.68708,2.56495,1.791761,2.944439,4.356709,0,0,1,0,1,0,1,0,1,0,0,1,0,0,1,0,1,0,1,0,1,0,0,1


In [12]:
weight.head()

Unnamed: 0,mage,meduc,monpre,npvis,fage,feduc,omaps,fmaps,cigs,drink,male,mwhte,mblck,moth,fwhte,fblck,foth,bwght,cigsdrinks2,cigsdrinks,log_meduc,log_monpre,log_fage,log_mage,log_feduc,log_npvis,log_bwght,log_cigs,log_drink,log_cigsdrinks,log_cigsdrinks2,bin_mage_f_0to30,bin_mage_f_31to50,bin_mage_f_51plus,bin_cigs_0,bin_cigs_1,bin_cigs_2,bin_drink_0,bin_drink_1,bin_meduc_0,bin_meduc_1,bin_feduc_0,bin_feduc_1
0,69,14.0,5,2.0,62,14.0,4,7,23,9,1,0,1,0,0,1,0,697,207,32,2.639057,1.609438,4.127134,4.234107,2.639057,0.693147,6.546785,3.135495,2.197226,3.465736,5.332719,0,0,1,0,0,1,0,1,0,1,0,1
1,68,12.0,3,10.0,61,11.0,4,6,25,11,1,1,0,0,1,0,0,1290,275,36,2.484907,1.098612,4.110874,4.219508,2.397895,2.302585,7.162397,3.218876,2.397896,3.583519,5.616771,0,0,1,0,0,1,0,1,1,0,1,0
2,71,12.0,3,6.0,46,12.0,2,7,21,12,1,0,1,0,0,1,0,1490,252,33,2.484907,1.098612,3.828641,4.26268,2.484907,1.791759,7.306531,3.044523,2.484907,3.496508,5.529429,0,0,1,0,0,1,0,1,1,0,1,0
3,59,16.0,1,8.0,48,16.0,7,8,21,10,0,0,0,1,0,0,1,1720,210,31,2.772589,0.0,3.871201,4.077537,2.772589,2.079442,7.45008,3.044523,2.302586,3.433987,5.347108,0,0,1,0,0,1,0,1,0,1,0,1
4,48,12.0,4,6.0,39,12.0,2,9,17,13,0,1,0,0,1,0,0,1956,221,30,2.484907,1.386294,3.663562,3.871201,2.484907,1.791759,7.578657,2.833214,2.56495,3.401197,5.398163,0,1,0,0,1,0,0,1,1,0,1,0


In [10]:
# List all of the columns and add a comma
print("Columns: ")
for col in weight.columns:
    print(col + ',') 


Columns: 
mage,
meduc,
monpre,
npvis,
fage,
feduc,
omaps,
fmaps,
cigs,
drink,
male,
mwhte,
mblck,
moth,
fwhte,
fblck,
foth,
bwght,
cigsdrinks2,
cigsdrinks,
log_meduc,
log_monpre,
log_fage,
log_mage,
log_feduc,
log_npvis,
log_bwght,
log_cigs,
log_drink,
log_cigsdrinks,
log_cigsdrinks2,
bin_mage_f_0to30,
bin_mage_f_31to50,
bin_mage_f_51plus,
bin_cigs_0,
bin_cigs_1,
bin_cigs_2,
bin_drink_0,
bin_drink_1,
bin_meduc_0,
bin_meduc_1,
bin_feduc_0,
bin_feduc_1,


In [14]:
for column in weight:
    print(column + ' +')

mage +
meduc +
monpre +
npvis +
fage +
feduc +
omaps +
fmaps +
cigs +
drink +
male +
mwhte +
mblck +
moth +
fwhte +
fblck +
foth +
bwght +
cigsdrinks2 +
cigsdrinks +
log_meduc +
log_monpre +
log_fage +
log_mage +
log_feduc +
log_npvis +
log_bwght +
log_cigs +
log_drink +
log_cigsdrinks +
log_cigsdrinks2 +
bin_mage_f_0to30 +
bin_mage_f_31to50 +
bin_mage_f_51plus +
bin_cigs_0 +
bin_cigs_1 +
bin_cigs_2 +
bin_drink_0 +
bin_drink_1 +
bin_meduc_0 +
bin_meduc_1 +
bin_feduc_0 +
bin_feduc_1 +
bin_mage_f_0to30 +
bin_mage_f_31to50 +
bin_mage_f_51plus +
bin_cigs_0 +
bin_cigs_1 +
bin_cigs_2 +
bin_drink_0 +
bin_drink_1 +
bin_meduc_0 +
bin_meduc_1 +
bin_feduc_0 +
bin_feduc_1 +


In [15]:
weight.head(n=5)

Unnamed: 0,mage,meduc,monpre,npvis,fage,feduc,omaps,fmaps,cigs,drink,male,mwhte,mblck,moth,fwhte,fblck,foth,bwght,cigsdrinks2,cigsdrinks,log_meduc,log_monpre,log_fage,log_mage,log_feduc,log_npvis,log_bwght,log_cigs,log_drink,log_cigsdrinks,log_cigsdrinks2,bin_mage_f_0to30,bin_mage_f_31to50,bin_mage_f_51plus,bin_cigs_0,bin_cigs_1,bin_cigs_2,bin_drink_0,bin_drink_1,bin_meduc_0,bin_meduc_1,bin_feduc_0,bin_feduc_1,bin_mage_f_0to30.1,bin_mage_f_31to50.1,bin_mage_f_51plus.1,bin_cigs_0.1,bin_cigs_1.1,bin_cigs_2.1,bin_drink_0.1,bin_drink_1.1,bin_meduc_0.1,bin_meduc_1.1,bin_feduc_0.1,bin_feduc_1.1
0,69,14.0,5,2.0,62,14.0,4,7,23,9,1,0,1,0,0,1,0,697,207,32,2.639057,1.609438,4.127134,4.234107,2.639057,0.693147,6.546785,3.135495,2.197226,3.465736,5.332719,0,0,1,0,0,1,0,1,0,1,0,1,0,0,1,0,0,1,0,1,0,1,0,1
1,68,12.0,3,10.0,61,11.0,4,6,25,11,1,1,0,0,1,0,0,1290,275,36,2.484907,1.098612,4.110874,4.219508,2.397895,2.302585,7.162397,3.218876,2.397896,3.583519,5.616771,0,0,1,0,0,1,0,1,1,0,1,0,0,0,1,0,0,1,0,1,1,0,1,0
2,71,12.0,3,6.0,46,12.0,2,7,21,12,1,0,1,0,0,1,0,1490,252,33,2.484907,1.098612,3.828641,4.26268,2.484907,1.791759,7.306531,3.044523,2.484907,3.496508,5.529429,0,0,1,0,0,1,0,1,1,0,1,0,0,0,1,0,0,1,0,1,1,0,1,0
3,59,16.0,1,8.0,48,16.0,7,8,21,10,0,0,0,1,0,0,1,1720,210,31,2.772589,0.0,3.871201,4.077537,2.772589,2.079442,7.45008,3.044523,2.302586,3.433987,5.347108,0,0,1,0,0,1,0,1,0,1,0,1,0,0,1,0,0,1,0,1,0,1,0,1
4,48,12.0,4,6.0,39,12.0,2,9,17,13,0,1,0,0,1,0,0,1956,221,30,2.484907,1.386294,3.663562,3.871201,2.484907,1.791759,7.578657,2.833214,2.56495,3.401197,5.398163,0,1,0,0,1,0,0,1,1,0,1,0,0,1,0,0,1,0,0,1,1,0,1,0


In [16]:
# Step 1: build a model
lm_best = smf.ols(formula =  """bwght ~ mage +
                                       
                                        
                                        
                                        
                                        
                                        cigs +
                                        drink +
                                        
                                   
                                        cigsdrinks2 +
                                        cigsdrinks +
                                       
                                    
                                        
                                        
                                        
                                       
                                       
                                       
                                        bin_mage_f_31to50 
                                        
                                        
                                        
                                        
                                        
                                       

                                           """,
                                data = weight)


# Step 2: fit the model based on the data
results = lm_best.fit()



# Step 3: analyze the summary output
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                  bwght   R-squared:                       0.723
Model:                            OLS   Adj. R-squared:                  0.716
Method:                 Least Squares   F-statistic:                     99.39
Date:                Sun, 05 Dec 2021   Prob (F-statistic):           4.14e-51
Time:                        00:17:00   Log-Likelihood:                -1420.1
No. Observations:                 196   AIC:                             2852.
Df Residuals:                     190   BIC:                             2872.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
Intercept             4595.4128 

In [15]:
# preparing explanatory variable data
weight_data   = weight.drop(['bwght', 'log_bwght', 'fmaps', 'omaps'],
                            axis = 1)


# preparing response variables
weight_target = weight.loc[ : , "bwght"]
log_weight_target = weight.loc[ : , 'log_bwght']


# preparing training and testing sets (all letters are lowercase)
x_train, x_test, y_train, y_test = train_test_split(
            weight_data,
            weight_target,
            test_size = 0.25,
            random_state = 219)


# checking the shapes of the datasets
print(f"""
Training Data
-------------
X-side: {x_train.shape}
y-side: {y_train.shape}


Testing Data
------------
X-side: {x_test.shape}
y-side: {y_test.shape}
""")


Training Data
-------------
X-side: (147, 39)
y-side: (147,)


Testing Data
------------
X-side: (49, 39)
y-side: (49,)



In [59]:
# Declaring the set of x variables
x_variables =  [
'feduc',
'drink',
'male',
'mwhte',
'moth',
'cigsdrinks2',
'cigsdrinks',
'log_meduc',
'log_fage',
'log_feduc',
'log_npvis',
'log_cigsdrinks',
'bin_mage_f_31to50',
'bin_mage_f_51plus',
'bin_cigs_2',
'bin_drink_0',
'bin_meduc_0',
'log_mage']

# For loop that adds a + to the end of each of the variables
for val in x_variables:
    print(f"{val} +")

feduc +
drink +
male +
mwhte +
moth +
cigsdrinks2 +
cigsdrinks +
log_meduc +
log_fage +
log_feduc +
log_npvis +
log_cigsdrinks +
bin_mage_f_31to50 +
bin_mage_f_51plus +
bin_cigs_2 +
bin_drink_0 +
bin_meduc_0 +
log_mage +


In [60]:
# merging X_train and y_train so that they can be used in statsmodels
weight_train = pd.concat([x_train, y_train], axis = 1)


# Step 1: build a model
lm_best = smf.ols(formula =  """bwght ~ mage +
                                        drink +
                                        cigs +
                                        cigsdrinks2 +
                                        cigsdrinks +
                                        bin_mage_f_31to50 
                                        """,
                                data = weight_train)


# Step 2: fit the model based on the data
results = lm_best.fit()



# Step 3: analyze the summary output
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                  bwght   R-squared:                       0.735
Model:                            OLS   Adj. R-squared:                  0.725
Method:                 Least Squares   F-statistic:                     78.15
Date:                Wed, 24 Nov 2021   Prob (F-statistic):           6.62e-39
Time:                        21:13:59   Log-Likelihood:                -1067.1
No. Observations:                 147   AIC:                             2146.
Df Residuals:                     141   BIC:                             2164.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept          4681.6582    169.75

In [63]:
# applying modelin scikit-learn

# preparing x-variables from the OLS model
weight_ols_data = weight.loc[ : , x_variables]


# preparing response variable
weight_target = weight.loc[ : , 'bwght']


###############################################
## setting up more than one train-test split ##
###############################################
# FULL X-dataset (normal Y)
x_train_FULL, x_test_FULL, y_train_FULL, y_test_FULL = train_test_split(
            weight_data,     # x-variables
            weight_target,   # y-variable
            test_size = 0.25,
            random_state = 219)


# OLS p-value x-dataset (normal Y)
x_train_OLS, x_test_OLS, y_train_OLS, y_test_OLS = train_test_split(
            weight_ols_data,         # x-variables
            weight_target,   # y-variable
            test_size = 0.25,
            random_state = 219)


In [64]:
# INSTANTIATING a model object
lr = LinearRegression()


# FITTING to the training data
lr_fit = lr.fit(x_train_OLS, y_train_OLS)


# PREDICTING on new data
lr_pred = lr_fit.predict(x_test_OLS)


# SCORING the results
print('OLS Training Score :', lr.score(x_train_OLS, y_train_OLS).round(4))  # using R-square
print('OLS Testing Score  :',  lr.score(x_test_OLS, y_test_OLS).round(4)) # using R-square

lr_train_score = lr.score(x_train_OLS, y_train_OLS).round(4)
lr_test_score = lr.score(x_test_OLS, y_test_OLS).round(4)

# displaying and saving the gap between training and testing
print('OLS Train-Test Gap :', abs(lr_train_score - lr_test_score).round(4))
lr_test_gap = abs(lr_train_score - lr_test_score).round(4)

OLS Training Score : 0.7607
OLS Testing Score  : 0.5374
OLS Train-Test Gap : 0.2233


In [65]:
import sklearn.linear_model # linear models

# INSTANTIATING a model object
lasso_model = sklearn.linear_model.Lasso(alpha = 1.0,
                                         normalize = True) # default magitude


# FITTING to the training data
lasso_fit = lasso_model.fit(x_train_FULL, y_train_FULL)


# PREDICTING on new data
lasso_pred = lasso_fit.predict(x_test_FULL)


# SCORING the results
print('Lasso Training Score :', lasso_model.score(x_train_FULL, y_train_FULL).round(4))
print('Lasso Testing Score  :', lasso_model.score(x_test_FULL, y_test_FULL).round(4))

lasso_train_score = lasso_model.score(x_train_FULL, y_train_FULL).round(4)
lasso_test_score  = lasso_model.score(x_test_FULL, y_test_FULL).round(4)

## the following code has been provided for you ##

# saving scoring data for future use
lasso_train_score = lasso_model.score(x_train_FULL, y_train_FULL).round(4) # using R-square
lasso_test_score  = lasso_model.score(x_test_FULL, y_test_FULL).round(4)   # using R-square


# displaying and saving the gap between training and testing
print('Lasso Train-Test Gap :', abs(lasso_train_score - lasso_test_score).round(4))
lasso_test_gap = abs(lasso_train_score - lasso_test_score).round(4)

Lasso Training Score : 0.7617
Lasso Testing Score  : 0.6124
Lasso Train-Test Gap : 0.1493


In [34]:
# zipping each feature name to its coefficient
lasso_model_values = zip(weight_data.columns, lasso_fit.coef_.round(decimals = 2))


# setting up a placeholder list to store model features
lasso_model_lst = [('intercept', lasso_fit.intercept_.round(decimals = 2))]


# printing out each feature-coefficient pair one by one
for val in lasso_model_values:
    lasso_model_lst.append(val)
    

# checking the results
for pair in lasso_model_lst:
    print(pair)

('intercept', 3703.9)
('mage', -10.02)
('meduc', 9.63)
('monpre', 8.54)
('npvis', -4.97)
('fage', -0.41)
('feduc', 0.0)
('cigs', -0.0)
('drink', -37.62)
('male', 43.43)
('mwhte', 2.23)
('mblck', 0.0)
('moth', -66.26)
('fwhte', 0.0)
('fblck', -0.0)
('foth', -0.0)
('cigsdrinks2', -3.1)
('cigsdrinks', -20.79)
('log_meduc', 0.0)
('log_monpre', 0.0)
('log_fage', -0.0)
('log_mage', -0.0)
('log_feduc', 119.99)
('log_npvis', 159.74)
('log_cigs', -0.0)
('log_drink', 0.79)
('log_cigsdrinks', 3.13)
('log_cigsdrinks2', -0.0)
('bin_mage_f_0to30', -0.0)
('bin_mage_f_31to50', 64.13)
('bin_mage_f_51plus', -165.38)
('bin_cigs_0', -0.0)
('bin_cigs_1', 0.0)
('bin_cigs_2', -73.24)
('bin_drink_0', 37.73)
('bin_drink_1', -0.0)
('bin_meduc_0', -54.3)
('bin_meduc_1', 0.0)
('bin_feduc_0', -0.0)
('bin_feduc_1', 0.0)


In [66]:
# dropping coefficients that are equal to zero

# printing out each feature-coefficient pair one by one
for feature, coefficient in lasso_model_lst:
        
        if coefficient == 0:
            lasso_model_lst.remove((feature, coefficient))

            
# checking the results
for pair in lasso_model_lst:
    print(pair)

('intercept', 3703.9)
('mage', -10.02)
('meduc', 9.63)
('monpre', 8.54)
('npvis', -4.97)
('fage', -0.41)
('drink', -37.62)
('male', 43.43)
('mwhte', 2.23)
('moth', -66.26)
('cigsdrinks2', -3.1)
('cigsdrinks', -20.79)
('log_mage', -0.0)
('log_feduc', 119.99)
('log_npvis', 159.74)
('log_drink', 0.79)
('log_cigsdrinks', 3.13)
('bin_mage_f_31to50', 64.13)
('bin_mage_f_51plus', -165.38)
('bin_cigs_2', -73.24)
('bin_drink_0', 37.73)
('bin_meduc_0', -54.3)


In [36]:
# INSTANTIATING a model object
ard_model = sklearn.linear_model.ARDRegression()


# FITTING the training data
ard_fit = ard_model.fit(x_train_FULL, y_train_FULL)


# PREDICTING on new data
ard_pred = ard_fit.predict(x_test_FULL)


print('Training Score:', ard_model.score(x_train_FULL, y_train_FULL).round(4))
print('Testing Score :',  ard_model.score(x_test_FULL, y_test_FULL).round(4))


# saving scoring data for future use
ard_train_score = ard_model.score(x_train_FULL, y_train_FULL).round(4)
ard_test_score  = ard_model.score(x_test_FULL, y_test_FULL).round(4)


# displaying and saving the gap between training and testing
print('ARD Train-Test Gap :', abs(ard_train_score - ard_test_score).round(4))
ard_test_gap = abs(ard_train_score - ard_test_score).round(4)

Training Score: 0.7507
Testing Score : 0.6373
ARD Train-Test Gap : 0.1134


In [67]:
# import libraries for KNN 
from sklearn.neighbors import KNeighborsRegressor # KNN for Regression
from sklearn.preprocessing import StandardScaler # standard scaler


In [68]:
# INSTANTIATING a StandardScaler() object
scaler = StandardScaler()


# FITTING the scaler with housing_data
scaler.fit(weight_data)


# TRANSFORMING our data after fit
X_scaled = scaler.transform(weight_data)


# converting scaled data into a DataFrame
X_scaled_df = pd.DataFrame(X_scaled)


# checking the results
X_scaled_df.describe().round(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38
count,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0
mean,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,-0.0,-0.0,0.0,-0.0,-0.0,0.0,0.0,0.0,-0.0,-0.0,0.0,-0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,-0.0,-0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-1.68,-2.91,-0.99,-2.27,-1.82,-4.98,-1.8,-1.8,-1.11,-0.61,-0.79,-0.73,-0.73,-0.72,-0.67,-1.12,-2.13,-3.62,-1.43,-2.37,-2.16,-8.31,-4.84,-4.43,-4.03,-9.21,-3.09,-0.45,-1.51,-0.4,-0.35,-2.18,-0.27,-1.37,-0.73,-0.76,-1.31,-0.82,-1.22
25%,-0.7,-0.94,-0.25,-0.38,-0.51,-0.72,-0.81,-0.47,-1.11,-0.61,-0.79,-0.73,-0.73,-0.72,-0.67,-0.78,-0.69,-0.91,-0.05,-0.46,-0.68,-0.36,-0.26,0.06,0.17,-0.05,0.14,-0.45,-1.51,-0.4,-0.35,0.46,-0.27,-1.37,-0.73,-0.76,-1.31,-0.82,-1.22
50%,-0.11,0.04,-0.25,0.09,-0.14,0.06,0.01,-0.13,0.9,-0.61,-0.79,-0.73,-0.73,-0.72,-0.67,-0.3,-0.04,0.12,-0.05,-0.04,0.0,0.13,0.26,0.26,0.24,0.16,0.31,-0.45,0.66,-0.4,-0.35,0.46,-0.27,0.73,-0.73,-0.76,0.76,-0.82,0.82
75%,0.57,1.03,0.49,0.09,0.41,0.83,0.71,0.62,0.9,1.64,1.27,1.37,1.37,1.39,1.49,0.35,0.61,1.01,0.76,0.53,0.68,0.56,0.26,0.37,0.36,0.32,0.42,-0.45,0.66,-0.4,-0.35,0.46,-0.27,0.73,1.37,1.31,0.76,1.22,0.82
max,3.02,1.52,4.19,5.54,3.76,1.22,2.31,2.87,0.9,1.64,1.27,1.37,1.37,1.39,1.49,3.58,2.56,1.41,2.71,2.98,2.46,0.75,3.31,0.54,0.58,0.62,0.65,2.22,0.66,2.5,2.89,0.46,3.75,0.73,1.37,1.31,0.76,1.22,0.82


In [69]:
# adding labels to the scaled DataFrame
X_scaled_df.columns = weight_data.columns

#  Checking pre- and post-scaling of the data
print(f"""
Dataset BEFORE Scaling
----------------------
{np.var(weight_data)}


Dataset AFTER Scaling
----------------------
{np.var(X_scaled_df)}
""")


Dataset BEFORE Scaling
----------------------
mage                  104.527593
meduc                   4.140436
monpre                  1.827025
npvis                  17.840561
fage                   80.277671
feduc                   6.656680
cigs                   37.035714
drink                   8.964077
male                    0.247397
mwhte                   0.197288
mblck                   0.236230
moth                    0.226572
fwhte                   0.226572
fblck                   0.224984
foth                    0.214364
cigsdrinks2          3419.667717
cigsdrinks             58.964806
log_meduc               0.022405
log_monpre              0.252097
log_fage                0.046697
log_mage                0.059339
log_feduc               0.097629
log_npvis               0.123468
log_cigs                8.792300
log_drink               9.445960
log_cigsdrinks          3.132308
log_cigsdrinks2        27.121126
bin_mage_f_0to30        0.140020
bin_mage_f_31to50       0.212

In [70]:
# this is the exact code we were using before
x_train, x_test, y_train, y_test = train_test_split(
            weight_data,
            weight_target,
            test_size = 0.25,
            random_state = 219)

In [71]:
# INSTANTIATING a KNN model object
knn_reg = KNeighborsRegressor(algorithm = 'auto',
                              n_neighbors = 5)


# FITTING to the training data
knn_fit = knn_reg.fit(x_train, y_train)


# PREDICTING on new data
knn_reg_pred = knn_fit.predict(x_test)


# SCORING the results
print('KNN Training Score:', knn_reg.score(x_train, y_train).round(4))
print('KNN Testing Score :',  knn_reg.score(x_test, y_test).round(4))


# saving scoring data for future use
knn_reg_score_train = knn_reg.score(x_train, y_train).round(4)
knn_reg_score_test  = knn_reg.score(x_test, y_test).round(4)


# displaying and saving the gap between training and testing
print('KNN Train-Test Gap:', abs(knn_reg_score_train - knn_reg_score_test).round(4))
knn_reg_test_gap = abs(knn_reg_score_train - knn_reg_score_test).round(4)

KNN Training Score: 0.7552
KNN Testing Score : 0.6183
KNN Train-Test Gap: 0.1369


In [72]:
# Compare the results of each test

print(f"""
Model       Train Score      Test Score       Gap
-----       -----------      ----------      -----
OLS         {lr_train_score}           {lr_test_score}           {abs(lr_train_score-lr_test_score).round(4)}
Lasso       {lasso_train_score}           {lasso_test_score}           {abs(lasso_train_score-lasso_test_score).round(4)}
ARD - BEST  {ard_train_score}           {ard_test_score}           {abs(ard_train_score-ard_test_score).round(4)}
KNN         {knn_reg_score_train}           {knn_reg_score_test}           {abs(knn_reg_score_train - knn_reg_score_test).round(4)}""")



Model       Train Score      Test Score       Gap
-----       -----------      ----------      -----
OLS         0.7607           0.5374           0.2233
Lasso       0.7617           0.6124           0.1493
ARD - BEST  0.7507           0.6373           0.1134
KNN         0.7552           0.6183           0.1369
