# Feature Engineering Exercise (Core)
- Student: Michael McCann
- Date: 30 MAR 2022

In [230]:
## Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn import set_config
set_config(display='diagram')


In [231]:
## Improt Data
url = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vROUXPkYUkX-2W7JbJ0-oNKaXzpg4NtmU9IeWEY6yFKm32ZEJOpRh_soHD4BeIcuHjYik3SEoXmkgwj/pub?output=csv'
df = pd.read_csv(url)

df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 0:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 1:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 2:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 3:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 4:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [232]:
# Drop redundant columns (casual / registered).
df.drop(columns = ['casual', 'registered'], inplace=True)

In [233]:
# create engineer DF
df_eng = df.copy()

# Convert datetime... make columns for month, day, and hour.
df_eng['datetime'] = pd.to_datetime(df_eng['datetime'])
df_eng['month (name)'] = df_eng['datetime'].dt.month_name()
df_eng['day (name)'] = df_eng['datetime'].dt.day_name()
df_eng['hour'] = df_eng['datetime'].dt.hour

# Drop datetime and season column as they are no longer necessary
df_eng.drop(columns = ['datetime', 'season'], inplace = True)

display(df_eng.head())
print('\n\n')
df_eng.info()

Unnamed: 0,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,month (name),day (name),hour
0,0,0,1,9.84,14.395,81,0.0,16,January,Saturday,0
1,0,0,1,9.02,13.635,80,0.0,40,January,Saturday,1
2,0,0,1,9.02,13.635,80,0.0,32,January,Saturday,2
3,0,0,1,9.84,14.395,75,0.0,13,January,Saturday,3
4,0,0,1,9.84,14.395,75,0.0,1,January,Saturday,4





<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   holiday       10886 non-null  int64  
 1   workingday    10886 non-null  int64  
 2   weather       10886 non-null  int64  
 3   temp          10886 non-null  float64
 4   atemp         10886 non-null  float64
 5   humidity      10886 non-null  int64  
 6   windspeed     10886 non-null  float64
 7   count         10886 non-null  int64  
 8   month (name)  10886 non-null  object 
 9   day (name)    10886 non-null  object 
 10  hour          10886 non-null  int64  
dtypes: float64(3), int64(6), object(2)
memory usage: 935.6+ KB


3. The temperatures in the 'temp' and 'atemp' column are in Celsius.  Use `.apply()` to convert them to Fahrenheit.



In [234]:
df_eng['temp'] = df_eng['temp'].apply(lambda x: round(x*1.8+32,1))
df_eng['atemp'] = df_eng['atemp'].apply(lambda x: round(x*1.8+32, 1))

4. Create a new column, 'temp_variance' that is the difference between 'temp' and 'atemp'.  Drop the 'atemp' column.

In [235]:
df_eng['temp_var'] = df_eng['atemp'] - df_eng['temp']
df_eng.head()

Unnamed: 0,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,month (name),day (name),hour,temp_var
0,0,0,1,49.7,57.9,81,0.0,16,January,Saturday,0,8.2
1,0,0,1,48.2,56.5,80,0.0,40,January,Saturday,1,8.3
2,0,0,1,48.2,56.5,80,0.0,32,January,Saturday,2,8.3
3,0,0,1,49.7,57.9,75,0.0,13,January,Saturday,3,8.2
4,0,0,1,49.7,57.9,75,0.0,1,January,Saturday,4,8.2


In [236]:
df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count
0,2011-01-01 0:00:00,1,0,0,1,9.84,14.395,81,0.0,16
1,2011-01-01 1:00:00,1,0,0,1,9.02,13.635,80,0.0,40
2,2011-01-01 2:00:00,1,0,0,1,9.02,13.635,80,0.0,32
3,2011-01-01 3:00:00,1,0,0,1,9.84,14.395,75,0.0,13
4,2011-01-01 4:00:00,1,0,0,1,9.84,14.395,75,0.0,1


In [237]:
scaler = StandardScaler()
ohe = OneHotEncoder(handle_unknown = 'ignore', sparse = False)

num_sel = make_column_selector(dtype_include= 'number')
cat_sel = make_column_selector(dtype_include= 'object')

num_tuple = (scaler, num_sel)
cat_tuple = (ohe, cat_sel)

preprocessor = make_column_transformer(num_tuple, cat_tuple, remainder = 'passthrough')

In [238]:
X = df.drop(columns = 'count')
y = df['count']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state= 42)

In [239]:
%%time
rf = RandomForestRegressor()
rf_pipe = make_pipeline(preprocessor, rf)
rf_pipe.fit(X_train, y_train)

print(f'RF without Engineering (Training): {rf_pipe.score(X_train, y_train):.3f}')
print(f'RF Regression without Engineering (Testing): {rf_pipe.score(X_test, y_test):.3f}')

RF without Engineering (Training): 0.898
RF Regression without Engineering (Testing): 0.243
CPU times: user 4min 37s, sys: 241 ms, total: 4min 37s
Wall time: 4min 37s


In [240]:
X = df_eng.drop(columns = 'count')
y = df_eng['count']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state= 42)

In [241]:
%%time
rf = RandomForestRegressor()
rf_pipe = make_pipeline(preprocessor, rf)
rf_pipe.fit(X_train, y_train)

print(f'RF with Engineering (Training): {rf_pipe.score(X_train, y_train):.3f}')
print(f'RF with Engineering (Testing): {rf_pipe.score(X_test, y_test):.3f}')

RF with Engineering (Training): 0.981
RF with Engineering (Testing): 0.865
CPU times: user 4.2 s, sys: 10.8 ms, total: 4.21 s
Wall time: 4.21 s
