# Author: Makayla McKibben
## Course: DSC530 Data Exploration and Analysis
## Exercise: 9.2
## Date: 10.27.2024

In [2]:
# Import relevant libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from os.path import basename, exists
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
import statsmodels.formula.api as smf
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.model_selection import train_test_split


In [3]:
# Get data
def download(url):
    filename = basename(url)
    if not exists(filename):
        from urllib.request import urlretrieve

        local, _ = urlretrieve(url, filename)
        print("Downloaded " + local)

download("https://github.com/AllenDowney/ThinkStats2/raw/master/code/thinkstats2.py")
download("https://github.com/AllenDowney/ThinkStats2/raw/master/code/thinkplot.py")
download("https://github.com/AllenDowney/ThinkStats2/raw/master/code/nsfg.py")
download("https://github.com/AllenDowney/ThinkStats2/raw/master/code/first.py")
download("https://github.com/AllenDowney/ThinkStats2/raw/master/code/2002FemPreg.dct")
download("https://github.com/AllenDowney/ThinkStats2/raw/master/code/2002FemPreg.dat.gz")

In [4]:
# Import relevant datasets and libraries
from __future__ import print_function, division
import nsfg
import first
import thinkstats2
import thinkplot
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

## Ex. 11-1

In [6]:
# Select all live births
preg = nsfg.ReadFemPreg()
live = preg[preg.outcome == 1]
live.head(18)

Unnamed: 0,caseid,pregordr,howpreg_n,howpreg_p,moscurrp,nowprgdk,pregend1,pregend2,nbrnaliv,multbrth,...,laborfor_i,religion_i,metro_i,basewgt,adj_mod_basewgt,finalwgt,secu_p,sest,cmintvw,totalwgt_lb
0,1,1,,,,,6.0,,1.0,,...,0,0,0,3410.389399,3869.349602,6448.271112,2,9,,8.8125
1,1,2,,,,,6.0,,1.0,,...,0,0,0,3410.389399,3869.349602,6448.271112,2,9,,7.875
2,2,1,,,,,5.0,,3.0,5.0,...,0,0,0,7226.30174,8567.54911,12999.542264,2,12,,9.125
3,2,2,,,,,6.0,,1.0,,...,0,0,0,7226.30174,8567.54911,12999.542264,2,12,,7.0
4,2,3,,,,,6.0,,1.0,,...,0,0,0,7226.30174,8567.54911,12999.542264,2,12,,6.1875
5,6,1,,,,,6.0,,1.0,,...,0,0,0,4870.926435,5325.196999,8874.440799,1,23,,8.5625
6,6,2,,,,,6.0,,1.0,,...,0,0,0,4870.926435,5325.196999,8874.440799,1,23,,9.5625
7,6,3,,,,,6.0,,1.0,,...,0,0,0,4870.926435,5325.196999,8874.440799,1,23,,8.375
8,7,1,,,,,5.0,,1.0,,...,0,0,0,3409.579565,3787.539,6911.879921,2,14,,7.5625
9,7,2,,,,,5.0,,1.0,,...,0,0,0,3409.579565,3787.539,6911.879921,2,14,,6.625


In [7]:
live.columns.to_list()

['caseid',
 'pregordr',
 'howpreg_n',
 'howpreg_p',
 'moscurrp',
 'nowprgdk',
 'pregend1',
 'pregend2',
 'nbrnaliv',
 'multbrth',
 'cmotpreg',
 'prgoutcome',
 'cmprgend',
 'flgdkmo1',
 'cmprgbeg',
 'ageatend',
 'hpageend',
 'gestasun_m',
 'gestasun_w',
 'wksgest',
 'mosgest',
 'dk1gest',
 'dk2gest',
 'dk3gest',
 'bpa_bdscheck1',
 'bpa_bdscheck2',
 'bpa_bdscheck3',
 'babysex',
 'birthwgt_lb',
 'birthwgt_oz',
 'lobthwgt',
 'babysex2',
 'birthwgt_lb2',
 'birthwgt_oz2',
 'lobthwgt2',
 'babysex3',
 'birthwgt_lb3',
 'birthwgt_oz3',
 'lobthwgt3',
 'cmbabdob',
 'kidage',
 'hpagelb',
 'birthplc',
 'paybirth1',
 'paybirth2',
 'paybirth3',
 'knewpreg',
 'trimestr',
 'ltrimest',
 'priorsmk',
 'postsmks',
 'npostsmk',
 'getprena',
 'bgnprena',
 'pnctrim',
 'lpnctri',
 'workpreg',
 'workborn',
 'didwork',
 'matweeks',
 'weeksdk',
 'matleave',
 'matchfound',
 'livehere',
 'alivenow',
 'cmkidied',
 'cmkidlft',
 'lastage',
 'wherenow',
 'legagree',
 'parenend',
 'anynurse',
 'fedsolid',
 'frsteatd_n',


In [8]:
live = live[['wksgest', 'babysex', 'birthord', 'agepreg', 'race', 'hispanic', 'pregnum']]
live

Unnamed: 0,wksgest,babysex,birthord,agepreg,race,hispanic,pregnum
0,39.0,1.0,1.0,33.16,2,2,2
1,39.0,2.0,2.0,39.25,2,2,2
2,39.0,1.0,1.0,14.33,1,2,3
3,39.0,2.0,2.0,17.83,1,2,3
4,39.0,2.0,3.0,18.33,1,2,3
...,...,...,...,...,...,...,...
13581,39.0,1.0,1.0,30.66,3,2,3
13584,34.0,2.0,1.0,26.91,2,2,2
13588,39.0,1.0,1.0,17.91,2,1,5
13591,39.0,1.0,2.0,21.58,2,1,5


In [9]:
live.dropna(inplace = True)
live

Unnamed: 0,wksgest,babysex,birthord,agepreg,race,hispanic,pregnum
0,39.0,1.0,1.0,33.16,2,2,2
1,39.0,2.0,2.0,39.25,2,2,2
2,39.0,1.0,1.0,14.33,1,2,3
3,39.0,2.0,2.0,17.83,1,2,3
4,39.0,2.0,3.0,18.33,1,2,3
...,...,...,...,...,...,...,...
13581,39.0,1.0,1.0,30.66,3,2,3
13584,34.0,2.0,1.0,26.91,2,2,2
13588,39.0,1.0,1.0,17.91,2,1,5
13591,39.0,1.0,2.0,21.58,2,1,5


In [10]:
# Set features and target variables
target = live['wksgest']
live.drop('wksgest', axis = 1, inplace = True)
features = live[['babysex', 'birthord', 'agepreg', 'race', 'hispanic', 'pregnum']]

In [11]:
# Split into train and test sets
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size = 0.2, random_state = 18)

In [12]:
# Create min max scaler object
minmax_scaler = preprocessing.MinMaxScaler()

In [13]:
# Scale test data
minmax_train = minmax_scaler.fit_transform(features_train)

In [14]:
# Scale test data
minmax_test = minmax_scaler.fit_transform(features_test)

In [15]:
# Set variance threshold and create object
threshold = VarianceThreshold(threshold = 0.25)

In [16]:
# Create linear regression object
regressor = LinearRegression()

In [17]:
# Fit the regression model
just_lin = regressor.fit(features_train, target_train)

In [18]:
just_lin_pred = just_lin.predict(features_test)
just_lin_pred

array([38.25934661, 38.70051709, 38.34643332, ..., 38.41039125,
       38.72637671, 38.57437017])

In [19]:
# Fit the training data using the threshold
features_train_var25 = threshold.fit_transform(features_train)
features_train_var25

array([[ 2.  , 18.  ,  2.  ,  3.  ],
       [ 3.  , 37.16,  2.  ,  7.  ],
       [ 2.  , 22.66,  2.  ,  3.  ],
       ...,
       [ 3.  , 30.41,  2.  ,  3.  ],
       [ 1.  , 28.08,  1.  ,  2.  ],
       [ 2.  , 23.33,  2.  ,  3.  ]])

In [20]:
# Fit the training data using the threshold
features_test_var25 = threshold.fit_transform(features_test)
features_test_var25

array([[ 2.  , 26.08,  1.  ,  2.  ],
       [ 1.  , 18.25,  2.  ,  1.  ],
       [ 1.  , 22.75,  1.  ,  5.  ],
       ...,
       [ 4.  , 22.16,  1.  ,  4.  ],
       [ 1.  , 26.  ,  2.  ,  3.  ],
       [ 3.  , 31.41,  2.  ,  4.  ]])

In [21]:
# Fit the regression model
high_var = regressor.fit(features_train_var25, target_train)

In [22]:
# Predict test values using the high variance scaled test data
lin_high_var_test_pred = high_var.predict(features_test_var25)
lin_high_var_test_pred

array([38.40140242, 38.73363866, 38.3802433 , ..., 38.34778163,
       38.66066457, 38.58129859])

In [23]:
features_train_var25

array([[ 2.  , 18.  ,  2.  ,  3.  ],
       [ 3.  , 37.16,  2.  ,  7.  ],
       [ 2.  , 22.66,  2.  ,  3.  ],
       ...,
       [ 3.  , 30.41,  2.  ,  3.  ],
       [ 1.  , 28.08,  1.  ,  2.  ],
       [ 2.  , 23.33,  2.  ,  3.  ]])

Sex of the baby did not have a variance higher that 25% like the other five features selected. So the selection for features to build a model with would be 'birthord', 'agepreg', 'race', 'hispanic', and 'pregnum'.

## 11-3

In [26]:
resp = nsfg.ReadFemResp()

In [27]:
resp.index = resp.caseid

In [28]:
live = preg[preg.outcome == 1]
join = live.join(resp, on = 'caseid', rsuffix = '_r')

In [29]:
model_data = join[['numbabes', 'agepreg', 'race', 'educat', 'totincr']]
model_data

Unnamed: 0,numbabes,agepreg,race,educat,totincr
0,2,33.16,2,16,14
1,2,39.25,2,16,14
2,5,14.33,1,11,4
3,5,17.83,1,11,4
4,5,18.33,1,11,4
...,...,...,...,...,...
13581,1,30.66,3,17,8
13584,1,26.91,2,12,8
13588,3,17.91,2,13,10
13591,3,21.58,2,13,10


In [30]:
target = model_data['numbabes']
model_data.drop('numbabes', inplace = True, axis = 1)
features = model_data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_data.drop('numbabes', inplace = True, axis = 1)


In [31]:
# Split into train and test sets
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size = 0.2, random_state = 18)

In [32]:
# Create decision tree
decision_tree = DecisionTreeClassifier(random_state = 18)

In [33]:
# Fit decision tree model to mushroom data
tree_model = decision_tree.fit(features_train, target_train)

In [34]:
pred_stats = [[35, 1, 16, 14]]

In [35]:
# Predict test results using our model
tree_model.predict(pred_stats)



array([2], dtype=int64)

I predict that she has two children.

## 11-4

In [38]:
marital_model = join[['rmarital', 'age_r', 'race', 'totincr', 'educat']]

In [39]:
target = marital_model['rmarital']
marital_model.drop('rmarital', inplace = True, axis = 1)
features = marital_model

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  marital_model.drop('rmarital', inplace = True, axis = 1)


In [40]:
# Split into train and test sets
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size = 0.2, random_state = 18)

In [41]:
# Fit decision tree model to mushroom data
tree_model = decision_tree.fit(features_train, target_train)

In [42]:
pred_stats = [[25, 2, 11, 12]]

In [43]:
# Predict test results using our model
tree_model.predict(pred_stats)



array([1], dtype=int64)

I would guess that the woman is currently married to a partner of the opposite sex.