<a href="https://colab.research.google.com/github/mvince33/Coding-Dojo/blob/main/week10/feature_engineering_exercise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Feature Engineering Exercise
- Michael Vincent
- 8/21/22

## Imports

In [58]:
# Imports
import numpy as np
import pandas as pd

## Load the data

In [59]:
# Read the dataset straight from the published Google Sheet (CSV export)
url = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vSWcs7d0Hz9D4QsdQrMVoYA1jH7uRiYk2SzPr0AH6gB0FyqphhumdJAM4ga-Ebg9vzfKGmW751pXHJ2/pub?output=csv'
df = pd.read_csv(filepath_or_buffer=url)
# Preview the first rows to confirm the load worked
df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 0:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 1:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 2:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 3:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 4:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


## Clean the data

In [60]:
# Count fully duplicated rows and missing cells across the whole frame
n_duplicates = df.duplicated().sum()
n_missing = df.isna().sum().sum()
print('Duplicates:', n_duplicates)
print('Missing Values:', n_missing)

Duplicates: 0
Missing Values: 0


## Process the data

In [61]:
# Build the working frame for feature engineering: a new frame based on
# the raw data with `datetime` parsed into real timestamps.
# `assign` returns a new DataFrame, so `df` itself is left unchanged.
fe_df = df.assign(datetime=pd.to_datetime(df['datetime']))
fe_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   datetime    10886 non-null  datetime64[ns]
 1   season      10886 non-null  int64         
 2   holiday     10886 non-null  int64         
 3   workingday  10886 non-null  int64         
 4   weather     10886 non-null  int64         
 5   temp        10886 non-null  float64       
 6   atemp       10886 non-null  float64       
 7   humidity    10886 non-null  int64         
 8   windspeed   10886 non-null  float64       
 9   casual      10886 non-null  int64         
 10  registered  10886 non-null  int64         
 11  count       10886 non-null  int64         
dtypes: datetime64[ns](1), float64(3), int64(8)
memory usage: 1020.7 KB


In [62]:
# Create new columns for month, day of the week, and hour.
# The timestamps are parsed from the raw frame `df` (not from `fe_df`) so
# this cell can be re-run after `datetime` has already been dropped below.
timestamps = pd.to_datetime(df['datetime'])

fe_df = (
    fe_df
    .assign(
        month=timestamps.dt.month_name(),
        day=timestamps.dt.day_name(),
        # Stored as object so the hour is treated as a category, not a number
        hour=timestamps.dt.hour.astype('object'),
    )
    # Drop the datetime and season columns as they are now redundant.
    # errors='ignore' makes a second run of this cell a no-op for the drop.
    .drop(columns=['datetime', 'season'], errors='ignore')
)

# Make sure the changes were made
fe_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   holiday     10886 non-null  int64  
 1   workingday  10886 non-null  int64  
 2   weather     10886 non-null  int64  
 3   temp        10886 non-null  float64
 4   atemp       10886 non-null  float64
 5   humidity    10886 non-null  int64  
 6   windspeed   10886 non-null  float64
 7   casual      10886 non-null  int64  
 8   registered  10886 non-null  int64  
 9   count       10886 non-null  int64  
 10  month       10886 non-null  object 
 11  day         10886 non-null  object 
 12  hour        10886 non-null  object 
dtypes: float64(3), int64(7), object(3)
memory usage: 1.1+ MB


In [63]:
# Convert the temperatures to degrees Fahrenheit.
# The Celsius values are read from the raw frame `df` rather than from
# `fe_df`, so re-running this cell cannot apply the conversion twice.
# (Assumes no other cell modifies or reorders `df` after it is loaded.)
# Only the temperature columns still present in `fe_df` are converted, so a
# re-run after `atemp` has been dropped does not bring that column back.
temp_cols = [col for col in ('temp', 'atemp') if col in fe_df.columns]
fe_df[temp_cols] = df[temp_cols] * 1.8 + 32
# Make sure the changes were applied
fe_df[temp_cols].head()

Unnamed: 0,temp,atemp
0,49.712,57.911
1,48.236,56.543
2,48.236,56.543
3,49.712,57.911
4,49.712,57.911


In [64]:
# Make a new column temp_variance that is the
# difference of temp and atemp, then drop atemp as it is now redundant.
# The guard makes a re-run of this cell a no-op instead of a KeyError
# once atemp has already been folded into temp_variance.
if 'atemp' in fe_df.columns:
    fe_df = (
        fe_df
        .assign(temp_variance=lambda frame: frame['temp'] - frame['atemp'])
        .drop(columns='atemp')
    )

# Fail loudly if the frame is not in the expected state after this step.
assert 'temp_variance' in fe_df.columns, 'temp_variance was not created'
assert 'atemp' not in fe_df.columns, 'atemp should have been dropped'

# Make sure the changes were made.
fe_df.head()

Unnamed: 0,holiday,workingday,weather,temp,humidity,windspeed,casual,registered,count,month,day,hour,temp_variance
0,0,0,1,49.712,81,0.0,3,13,16,January,Saturday,0,-8.199
1,0,0,1,48.236,80,0.0,8,32,40,January,Saturday,1,-8.307
2,0,0,1,48.236,80,0.0,5,27,32,January,Saturday,2,-8.307
3,0,0,1,49.712,75,0.0,3,10,13,January,Saturday,3,-8.199
4,0,0,1,49.712,75,0.0,0,1,1,January,Saturday,4,-8.199
