

# Feature Engineering and Overfitting

In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
import seaborn as sns
import numpy as np

# E. coli monitoring data (ecoli-huntington.csv), hosted in the
# middcs/data-science-notes GitHub repository.
url = (
    "https://raw.githubusercontent.com/middcs/data-science-notes/"
    "refs/heads/main/data/e-coli/ecoli-huntington.csv"
)

# Read the CSV straight from the URL, then show the column names.
df = pd.read_csv(url)
df.columns

From the data dictionary: 

- `EcoliAve_CFU`: Escherichia coli concentration measured in a discrete sample collected by Cuyahoga County Board of Health, Ohio, and analyzed by the Northeast Ohio Regional Sewer District, Ohio, USGS parameter codes 90902 or 50468.  This variable needs to be log10 transformed for modeling.
- `Lake_Temp_C`: Lake temperature in degrees Celsius
- `WaveHt_Ft`: Wave height, measured with a yardstick at the site at the same time the E. coli sample was collected, USGS parameter code 70224.  The square root of this variable is used in the model.
- `Lake_Turb_NTRU`: Turbidity, measured with a portable turbidimeter from a sample collected at the same time the E. coli sample was collected, USGS parameter code 63676. The log10 of this variable is used in the model.
- `LL_PreDay`: Lake level change over the last 24 hours (8 a.m. today minus 8 a.m. yesterday), NOAA Station 9063063 at Cleveland, Ohio, accessed at https://tidesandcurrents.noaa.gov/
- `AirportRain48W_in`: Rainfall, a weighted sum over the past 48 hours that gives the most weight to the most recent value, as follows: (2 * Dm1) + Dm2.  Dm1 is the 24-hour total up to 8 a.m. on the day the E. coli sample was collected; Dm2 is the rain 2 days ago, Cleveland-Hopkins International Airport, Ohio, site USW00014820, accessed at https://www.ncdc.noaa.gov/. The square root of this variable is used in the model.


In [None]:
# Put the E. coli counts on a log scale for modeling. log1p(x) computes
# ln(1 + x), so a zero count maps to 0 rather than -inf.
# NOTE(review): the data dictionary above calls for a log10 transform, while
# this uses the natural log of (1 + count) -- confirm which is intended.
df["log_ecoli"] = np.log1p(df["EcoliAve_CFU"])

# Pairwise scatter plots of the columns, with kernel density estimates on
# the diagonal.
sns.pairplot(df, diag_kind="kde")

In [None]:
# Baseline: ordinary least squares on the raw (untransformed) predictors.
predictor_columns = [
    "Lake_Temp_C",
    "Lake_Turb_NTRU",
    "WaveHt_Ft",
    "LL_PreDay",
    "AirportRain48W_in",
]
X = df[predictor_columns]
y = df["log_ecoli"]

# fit() returns the fitted estimator, so construction and fitting can be
# chained into one assignment.
LR = LinearRegression().fit(X, y)

# R^2 computed on the same rows used for fitting (training score).
LR.score(X, y)

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline

# Two-stage model: expand the predictors into all polynomial terms up to
# degree 4 (no constant column; LinearRegression fits its own intercept),
# then run ordinary least squares on the expanded features.
pipeline_steps = [
    ("preprocessor", PolynomialFeatures(degree=4, include_bias=False)),
    ("estimator", LinearRegression()),
]
pipeline = Pipeline(steps=pipeline_steps)

# Pipeline.fit returns the pipeline itself, so `model` and `pipeline` refer
# to the same fitted object.
model = pipeline.fit(X, y)

# R^2 on the training data -- the same X and y the model was fit on.
model.score(X, y)