In [None]:
import pandas as pd
import seaborn as sns

In [None]:
# Load the "penguins" example dataset through seaborn
penguins = sns.load_dataset(name="penguins")

# Preview the first five rows of the loaded frame
penguins.head(n=5)

Data Cleaning

In [None]:
# Keep Adelie and Gentoo penguins by excluding the third species.
# The label must be spelled exactly "Chinstrap": a misspelled label matches
# no rows, so the != filter would silently keep every penguin.
penguins_sub = penguins[penguins["species"] != "Chinstrap"]

# Drop rows with missing values, then renumber the index from 0.
# Chaining (instead of inplace=True) builds penguins_final in one step from
# penguins_sub, so re-running this cell always gives the same result.
penguins_final = penguins_sub.dropna().reset_index(drop=True)

Exploratory Data Analysis

In [None]:
# Draw a grid of pairwise scatterplots for the cleaned penguin data
sns.pairplot(data=penguins_final)

Model Construction

In [None]:
# Import the formula-based OLS constructor (package name is "statsmodels")
from statsmodels.formula.api import ols

# Subset the data to the predictor and the response used by the model
ols_data = penguins_final[["bill_length_mm", "body_mass_g"]]

# Write out the formula: response on the left of "~", predictor on the right
ols_formula = "body_mass_g ~ bill_length_mm"

# Build the OLS model and fit it to the data.
# fit must be *called*: `OLS.fit` without parentheses only references the
# method, leaving `model` without .summary(), .predict() or .resid.
OLS = ols(formula = ols_formula, data = ols_data)
model = OLS.fit()

# Print the regression summary explicitly; a bare expression in the middle of
# a cell is not displayed (only a cell's last expression is).
print(model.summary())

# Scatterplot of the two variables with the fitted regression line overlaid
sns.regplot(x = "bill_length_mm", y = "body_mass_g", data = ols_data)

Finish checking model assumptions

In [None]:
# Predictor column taken from the modelling data
X = ols_data.loc[:, "bill_length_mm"]

# Model predictions for each value of the predictor
# (assumes `model` is the fitted results object from the model-construction cell)
fitted_values = model.predict(exog=X)

# Residuals as stored on the fitted model
residuals = model.resid

Check the normality assumption

In [None]:
import matplotlib.pyplot as plt

# Histogram of the residuals; the returned object is used to label the plot
fig = sns.histplot(data=residuals)
fig.set(xlabel="Residual Value", title="Histogram of Residuals")

# Render the figure
plt.show()

In [None]:
import matplotlib.pyplot as plt
import statsmodels.api as sm

# Q-Q plot of the model residuals; line='s' adds statsmodels' standardized
# reference line for comparison
fig = sm.qqplot(model.resid, line = 's')

# plt.show must be called with parentheses; a bare `plt.show` only evaluates
# to the function object and never renders the figure explicitly.
plt.show()

Check the homoscedasticity assumption

In [None]:
import matplotlib.pyplot as plt

# Plot residuals against fitted values
fig = sns.scatterplot(x=fitted_values, y=residuals)

# Horizontal reference line where residuals equal 0
fig.axhline(y=0)

# Label both axes in one call
fig.set(xlabel="Fitted Values", ylabel="Residuals")

# Render the figure
plt.show()