In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Childhood Respiratory Disease

Keywords: polynomial regression, multiple regression.

## Description

FEV (forced expiratory volume) is an index of pulmonary function that measures the volume of air expelled after one second of constant effort. The data contains determinations of FEV on 654 children ages 6-22 who were seen in the Childhood Respiratory Desease Study in 1980 in East Boston, Massachusetts. The data are part of a larger study to follow the change in pulmonary function over time in children.

ID	 - 	ID number
Age	 - 	years
FEV	 - 	litres
Height	 - 	inches
Sex	 - 	Male or Female
Smoker	 - 	Non = nonsmoker, Current = current smoker


## Source

Tager, I. B., Weiss, S. T., Rosner, B., and Speizer, F. E. (1979). Effect of parental cigarette smoking on pulmonary function in children. American Journal of Epidemiology, 110, 15-26.
Rosner, B. (1990). Fundamentals of Biostatistics, 3rd Edition. PWS-Kent, Boston, Massachusetts.


In [4]:
# Read the csv file into a pandas DataFrame

ufcData= pd.read_csv('UFCdatav1.csv')
ufcData.head()

Unnamed: 0,Prev,Age,Height,Hometown,ID,Location,Name,Weight,Date,Event_ID,Fight_ID,Last_round,Max_round,winby,winner,win_loss
0,0,38.0,193.0,Hounslow England,808,Amsterdam The Netherlands,Alistair Overeem,120.0,2/3/14,646,4580,3,3,DEC,win,1
1,0,36.0,172.0,"Chicago, Illinois United States",1054,"Chicago, Illinois United States",Ricardo Lamas,65.0,2/3/14,646,4589,5,5,DEC,loss,0
2,0,39.0,167.0,"Isla Vista , California USA",959,"Sacramento, California USA",Urijah Faber,61.0,2/3/14,646,4590,1,5,KO/TKO,loss,0
3,0,33.0,167.0,"San Diego, CA USA",1056,"San Diego, CA USA",Danny Martinez,56.0,2/3/14,646,4605,3,3,DEC,loss,0
4,0,36.0,185.0,Southampton England,2005,Southampton England,Tom Watson,84.0,2/3/14,646,4631,3,3,DEC,loss,0


In [5]:
data2 = ufcData[['Age', 'Height', 'Weight', 'ID', 'Date', 'Event_ID', 'win_loss']]
data2.head()

Unnamed: 0,Age,Height,Weight,ID,Date,Event_ID,win_loss
0,38.0,193.0,120.0,808,2/3/14,646,1
1,36.0,172.0,65.0,1054,2/3/14,646,0
2,39.0,167.0,61.0,959,2/3/14,646,0
3,33.0,167.0,56.0,1056,2/3/14,646,0
4,36.0,185.0,84.0,2005,2/3/14,646,0


In [9]:
# Use Pandas get_dummies to convert categorical data
# YOUR CODE HERE
#ufcData = pd.get_dummies(ufcData)
#ufcData.head()

In [10]:
# Drop extra dummy columns
# YOUR CODE HERE
#ufcData = ufcData.drop(columns=['Location', 'Name'])
#ufcData.head()

In [24]:
# Assign X (data) and y (target)
# YOUR CODE HERE
X = data2[['Age']]
y = data2['ID'].values.reshape(-1, 1)
print(X.shape, y.shape)

(4636, 1) (4636, 1)


In [25]:
# Split the data into training and testing

# YOUR CODE HERE
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [26]:
from sklearn.preprocessing import StandardScaler

# Create a StandardScater model and fit it to the training data

# YOUR CODE HERE
X_scaler = StandardScaler().fit(X_train)
y_scaler = StandardScaler().fit(y_train)

In [27]:
# Transform the training and testing data using the X_scaler and y_scaler models

# YOUR CODE HERE
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
y_train_scaled = y_scaler.transform(y_train)
y_test_scaled = y_scaler.transform(y_test)

In [28]:
# Create a LinearRegression model and fit it to the scaled training data
from sklearn.linear_model import LinearRegression

# YOUR CODE HERE
model = LinearRegression()
model.fit(X_train_scaled, y_train_scaled)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [22]:
# Make predictions using a fitted model
# Plot the difference between the model predicted values and actual y values, versus the model predicted values
# Hint: You can predict values of X training and testing data using the model.predict() method on a fitted model

# YOUR CODE HERE
predictions = model.predict(X_test_scaled)
model.fit(X_train_scaled, y_train_scaled)
plt.scatter(model.predict(X_train_scaled), model.predict(X_train_scaled) - y_train_scaled, c="orange", label="Trainind Data")
plt.scatter(model.predict(X_test_scaled), model.predict(X_test_scaled) - y_test_scaled, c="green", label="Test Data")
plt.legend()
plt.hlines(y=0, xmin=y_test_scaled.min(), xmax=y_test_scaled.max())
plt.title("Residual Plot")
plt.show()

NotFittedError: This LinearRegression instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.

In [23]:
# Used X_test_scaled, y_test_scaled, and model.predict(X_test_scaled) to calculate MSE and R2
from sklearn.metrics import mean_squared_error
MSE = mean_squared_error(y_test_scaled, predictions)
r2 = model.score(X_test_scaled, y_test_scaled)
# YOUR CODE HERE

print(f"MSE: {MSE}, R2: {r2}")

NameError: name 'predictions' is not defined

In [17]:
# LASSO model
# Note: Use an alpha of .01 when creating the model for this activity
from sklearn.linear_model import Lasso
lasso = Lasso(alpha=.01).fit(X_train_scaled, y_train_scaled)
# YOUR CODE HERE
predictions = lasso.predict(X_test_scaled)

MSE = mean_squared_error(y_test_scaled, predictions)
r2 = lasso.score(X_test_scaled, y_test_scaled)
print(f"MSE: {MSE}, R2: {r2}")

MSE: 0.23077814864424268, R2: 0.7931316896870506


In [18]:
# Ridge model
# Note: Use an alpha of .01 when creating the model for this activity
from sklearn.linear_model import Ridge
ridge = Ridge(alpha=.01).fit(X_train_scaled, y_train_scaled)
# YOUR CODE HERE
predictions = ridge.predict(X_test_scaled)
MSE = mean_squared_error(y_test_scaled, predictions)
r2 = ridge.score(X_test_scaled, y_test_scaled)

print(f"MSE: {MSE}, R2: {r2}")

MSE: 0.22950315102692473, R2: 0.79427459079921


In [19]:
# ElasticNet model
# Note: Use an alpha of .01 when creating the model for this activity
from sklearn.linear_model import ElasticNet
elasticnet = ElasticNet(alpha=.01).fit(X_train_scaled, y_train_scaled)

# YOUR CODE HERE
predictions = elasticnet.predict(X_test_scaled)
MSE = mean_squared_error(y_test_scaled, predictions)
r2 = elasticnet.score(X_test_scaled, y_test_scaled)
print(f"MSE: {MSE}, R2: {r2}")

MSE: 0.23023666224743816, R2: 0.7936170752255969
