In [1]:
# Imports
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler,OrdinalEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer

from sklearn import set_config
set_config(transform_output='pandas')
pd.set_option('display.max_columns',100)

In [3]:
# Load Ames for ML
fpath = "Data/bikeshare_train - bikeshare_train.csv"
df = pd.read_csv(fpath)
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   datetime    10886 non-null  object 
 1   season      10886 non-null  int64  
 2   holiday     10886 non-null  int64  
 3   workingday  10886 non-null  int64  
 4   weather     10886 non-null  int64  
 5   temp        10886 non-null  float64
 6   atemp       10886 non-null  float64
 7   humidity    10886 non-null  int64  
 8   windspeed   10886 non-null  float64
 9   casual      10886 non-null  int64  
 10  registered  10886 non-null  int64  
 11  count       10886 non-null  int64  
dtypes: float64(3), int64(8), object(1)
memory usage: 1020.7+ KB


Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 0:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 1:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 2:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 3:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 4:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [4]:
# drop the 'casual' and 'registered' columns
df = df.drop(columns = ['casual', 'registered'])
df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count
0,2011-01-01 0:00:00,1,0,0,1,9.84,14.395,81,0.0,16
1,2011-01-01 1:00:00,1,0,0,1,9.02,13.635,80,0.0,40
2,2011-01-01 2:00:00,1,0,0,1,9.02,13.635,80,0.0,32
3,2011-01-01 3:00:00,1,0,0,1,9.84,14.395,75,0.0,13
4,2011-01-01 4:00:00,1,0,0,1,9.84,14.395,75,0.0,1


In [6]:
# Transform the 'datetime' column into a datetime type  
df['datetime'] = pd.to_datetime(df['datetime'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   datetime    10886 non-null  datetime64[ns]
 1   season      10886 non-null  int64         
 2   holiday     10886 non-null  int64         
 3   workingday  10886 non-null  int64         
 4   weather     10886 non-null  int64         
 5   temp        10886 non-null  float64       
 6   atemp       10886 non-null  float64       
 7   humidity    10886 non-null  int64         
 8   windspeed   10886 non-null  float64       
 9   count       10886 non-null  int64         
dtypes: datetime64[ns](1), float64(3), int64(6)
memory usage: 850.6 KB


In [9]:
# use datetime column to create 3 new columns in the data frame containing the: Name of the Month, Name of the Day of the Week, Hour of the Day. 
# Make sure all 3 new columns are 'object' datatype
df['Name of the Month'] = df['datetime'].dt.month_name().astype('str')
df['Name of the Day of the Week'] = df['datetime'].dt.day_name().astype('str')
df['Hour of the Day'] = df['datetime'].dt.hour.astype('str')
df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,Name of the Month,Name of the Day of the Week,Hour of the Day
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,16,January,Saturday,0
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,40,January,Saturday,1
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,32,January,Saturday,2
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,13,January,Saturday,3
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,1,January,Saturday,4


In [10]:
# Drop the 'datetime' and 'season' columns
df = df.drop(columns = ['datetime', 'season'])
df.head()

Unnamed: 0,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,Name of the Month,Name of the Day of the Week,Hour of the Day
0,0,0,1,9.84,14.395,81,0.0,16,January,Saturday,0
1,0,0,1,9.02,13.635,80,0.0,40,January,Saturday,1
2,0,0,1,9.02,13.635,80,0.0,32,January,Saturday,2
3,0,0,1,9.84,14.395,75,0.0,13,January,Saturday,3
4,0,0,1,9.84,14.395,75,0.0,1,January,Saturday,4


In [11]:
# The temperatures in the 'temp' and 'atemp' columns are in Celsius. Use `.apply()` and a Lambda function to convert them to Fahrenheit.

# Convert the 'temp' column to Fahrenheit 
df['temp'] = df['temp'].apply(lambda x: (x * 9/5) + 32 )

# Convert the 'atemp' column to Fahrenheit 
df['atemp'] = df['atemp'].apply(lambda x: (x * 9/5) + 32 )

df.head()

Unnamed: 0,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,Name of the Month,Name of the Day of the Week,Hour of the Day
0,0,0,1,49.712,57.911,81,0.0,16,January,Saturday,0
1,0,0,1,48.236,56.543,80,0.0,40,January,Saturday,1
2,0,0,1,48.236,56.543,80,0.0,32,January,Saturday,2
3,0,0,1,49.712,57.911,75,0.0,13,January,Saturday,3
4,0,0,1,49.712,57.911,75,0.0,1,January,Saturday,4


In [12]:
# Create a new column, 'temp_variance',
# which shows how much warmer or colder the current temperature ('temp') is than the average temperate for that day of the year ('atemp'). 
# If the current temperature is warmer than average ('atemp'), the value in 'temp_variance' should be positive.

df['temp_variance'] = df['temp'] - df['atemp']
df['temp_variance'] = df['temp_variance'].apply(lambda x: 'warmer' if x>0 else 'colder')

df.head()

Unnamed: 0,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,Name of the Month,Name of the Day of the Week,Hour of the Day,temp_variance
0,0,0,1,49.712,57.911,81,0.0,16,January,Saturday,0,colder
1,0,0,1,48.236,56.543,80,0.0,40,January,Saturday,1,colder
2,0,0,1,48.236,56.543,80,0.0,32,January,Saturday,2,colder
3,0,0,1,49.712,57.911,75,0.0,13,January,Saturday,3,colder
4,0,0,1,49.712,57.911,75,0.0,1,January,Saturday,4,colder


In [13]:
# Drop the 'atemp' column.
df = df.drop(columns = ['atemp'])
df.head()

Unnamed: 0,holiday,workingday,weather,temp,humidity,windspeed,count,Name of the Month,Name of the Day of the Week,Hour of the Day,temp_variance
0,0,0,1,49.712,81,0.0,16,January,Saturday,0,colder
1,0,0,1,48.236,80,0.0,40,January,Saturday,1,colder
2,0,0,1,48.236,80,0.0,32,January,Saturday,2,colder
3,0,0,1,49.712,75,0.0,13,January,Saturday,3,colder
4,0,0,1,49.712,75,0.0,1,January,Saturday,4,colder
