In [2]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
from patsy import dmatrices
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_val_score
import warnings


warnings.filterwarnings("ignore")
%matplotlib inline
sns.set_style('whitegrid')

### Bike Share

Both hour.csv and day.csv have the following fields, except hr which is not available in day.csv
	
	- instant: record index
	- dteday : date
	- season : season (1:springer, 2:summer, 3:fall, 4:winter)
	- yr : year (0: 2011, 1:2012)
	- mnth : month ( 1 to 12)
	- hr : hour (0 to 23)
	- holiday : weather day is holiday or not (extracted from http://dchr.dc.gov/page/holiday-schedule)
	- weekday : day of the week
	- workingday : if day is neither weekend nor holiday is 1, otherwise is 0.
	+ weathersit : 
		- 1: Clear, Few clouds, Partly cloudy, Partly cloudy
		- 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
		- 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
		- 4: Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog
	- temp : Normalized temperature in Celsius. The values are divided to 41 (max)
	- atemp: Normalized feeling temperature in Celsius. The values are divided to 50 (max)
	- hum: Normalized humidity. The values are divided to 100 (max)
	- windspeed: Normalized wind speed. The values are divided to 67 (max)
	- casual: count of casual users
	- registered: count of registered users
	- cnt: count of total rental bikes including both casual and registered
    
    
#### Using the model, make predictions of the demand for bikes when the windspeed is minimum and maximum. Plot the ROC curve.

In [3]:
bike_day = pd.read_csv('C:\\Users\\mpagrawa\\Desktop\\Training\\Data Science\\Acad\\BootCamp\\Sessions\\Session24\\Bike-Sharing-Dataset\\day.csv', parse_dates=['dteday'])
bike_hour = pd.read_csv('C:\\Users\\mpagrawa\\Desktop\\Training\\Data Science\\Acad\\BootCamp\\Sessions\\Session24\\Bike-Sharing-Dataset\\hour.csv', parse_dates=['dteday'])

In [4]:
bike_day.head()

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,2011-01-04,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,5,2011-01-05,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


In [5]:
#instant column can be deleted as it is serving as an index
# season, yr, mnth, holiday, weekday, workingday, weathersit are caegorical columns

In [6]:
bike_day.describe()

Unnamed: 0,instant,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
count,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0
mean,366.0,2.49658,0.500684,6.519836,0.028728,2.997264,0.683995,1.395349,0.495385,0.474354,0.627894,0.190486,848.176471,3656.172367,4504.348837
std,211.165812,1.110807,0.500342,3.451913,0.167155,2.004787,0.465233,0.544894,0.183051,0.162961,0.142429,0.077498,686.622488,1560.256377,1937.211452
min,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.05913,0.07907,0.0,0.022392,2.0,20.0,22.0
25%,183.5,2.0,0.0,4.0,0.0,1.0,0.0,1.0,0.337083,0.337842,0.52,0.13495,315.5,2497.0,3152.0
50%,366.0,3.0,1.0,7.0,0.0,3.0,1.0,1.0,0.498333,0.486733,0.626667,0.180975,713.0,3662.0,4548.0
75%,548.5,3.0,1.0,10.0,0.0,5.0,1.0,2.0,0.655417,0.608602,0.730209,0.233214,1096.0,4776.5,5956.0
max,731.0,4.0,1.0,12.0,1.0,6.0,1.0,3.0,0.861667,0.840896,0.9725,0.507463,3410.0,6946.0,8714.0


In [7]:
bike_day.isna().mean()
#No missing data

instant       0.0
dteday        0.0
season        0.0
yr            0.0
mnth          0.0
holiday       0.0
weekday       0.0
workingday    0.0
weathersit    0.0
temp          0.0
atemp         0.0
hum           0.0
windspeed     0.0
casual        0.0
registered    0.0
cnt           0.0
dtype: float64

In [8]:
bike_day = pd.get_dummies(bike_day, columns=['season', 'yr', 'mnth', 'holiday', 'weekday', 'workingday', 'weathersit'], drop_first=True)

In [9]:
bike_day.head()

Unnamed: 0,instant,dteday,temp,atemp,hum,windspeed,casual,registered,cnt,season_2,...,holiday_1,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,workingday_1,weathersit_2,weathersit_3
0,1,2011-01-01,0.344167,0.363625,0.805833,0.160446,331,654,985,0,...,0,0,0,0,0,0,1,0,1,0
1,2,2011-01-02,0.363478,0.353739,0.696087,0.248539,131,670,801,0,...,0,0,0,0,0,0,0,0,1,0
2,3,2011-01-03,0.196364,0.189405,0.437273,0.248309,120,1229,1349,0,...,0,1,0,0,0,0,0,1,0,0
3,4,2011-01-04,0.2,0.212122,0.590435,0.160296,108,1454,1562,0,...,0,0,1,0,0,0,0,1,0,0
4,5,2011-01-05,0.226957,0.22927,0.436957,0.1869,82,1518,1600,0,...,0,0,0,1,0,0,0,1,0,0


In [10]:
bike_day.isna().mean()

instant         0.0
dteday          0.0
temp            0.0
atemp           0.0
hum             0.0
windspeed       0.0
casual          0.0
registered      0.0
cnt             0.0
season_2        0.0
season_3        0.0
season_4        0.0
yr_1            0.0
mnth_2          0.0
mnth_3          0.0
mnth_4          0.0
mnth_5          0.0
mnth_6          0.0
mnth_7          0.0
mnth_8          0.0
mnth_9          0.0
mnth_10         0.0
mnth_11         0.0
mnth_12         0.0
holiday_1       0.0
weekday_1       0.0
weekday_2       0.0
weekday_3       0.0
weekday_4       0.0
weekday_5       0.0
weekday_6       0.0
workingday_1    0.0
weathersit_2    0.0
weathersit_3    0.0
dtype: float64

In [11]:
#bike_day['dteday'] = pd.datetime.strptime(bike_day['dteday'])
mindate = min(bike_day.dteday)
type(bike_day.dteday[0]), mindate

(pandas._libs.tslibs.timestamps.Timestamp, Timestamp('2011-01-01 00:00:00'))

In [12]:
bike_day['dteday'] = bike_day['dteday'].apply(lambda x: ((x - mindate).days))

In [13]:
bike_day.head()

Unnamed: 0,instant,dteday,temp,atemp,hum,windspeed,casual,registered,cnt,season_2,...,holiday_1,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,workingday_1,weathersit_2,weathersit_3
0,1,0,0.344167,0.363625,0.805833,0.160446,331,654,985,0,...,0,0,0,0,0,0,1,0,1,0
1,2,1,0.363478,0.353739,0.696087,0.248539,131,670,801,0,...,0,0,0,0,0,0,0,0,1,0
2,3,2,0.196364,0.189405,0.437273,0.248309,120,1229,1349,0,...,0,1,0,0,0,0,0,1,0,0
3,4,3,0.2,0.212122,0.590435,0.160296,108,1454,1562,0,...,0,0,1,0,0,0,0,1,0,0
4,5,4,0.226957,0.22927,0.436957,0.1869,82,1518,1600,0,...,0,0,0,1,0,0,0,1,0,0


In [14]:
bike_hour.describe()
#No missing data

Unnamed: 0,instant,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
count,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0
mean,8690.0,2.50164,0.502561,6.537775,11.546752,0.02877,3.003683,0.682721,1.425283,0.496987,0.475775,0.627229,0.190098,35.676218,153.786869,189.463088
std,5017.0295,1.106918,0.500008,3.438776,6.914405,0.167165,2.005771,0.465431,0.639357,0.192556,0.17185,0.19293,0.12234,49.30503,151.357286,181.387599
min,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.02,0.0,0.0,0.0,0.0,0.0,1.0
25%,4345.5,2.0,0.0,4.0,6.0,0.0,1.0,0.0,1.0,0.34,0.3333,0.48,0.1045,4.0,34.0,40.0
50%,8690.0,3.0,1.0,7.0,12.0,0.0,3.0,1.0,1.0,0.5,0.4848,0.63,0.194,17.0,115.0,142.0
75%,13034.5,3.0,1.0,10.0,18.0,0.0,5.0,1.0,2.0,0.66,0.6212,0.78,0.2537,48.0,220.0,281.0
max,17379.0,4.0,1.0,12.0,23.0,1.0,6.0,1.0,4.0,1.0,1.0,1.0,0.8507,367.0,886.0,977.0


In [15]:
del bike_day['instant']
del bike_day['casual']
del bike_day['registered']

del bike_hour['instant']
del bike_hour['casual']
del bike_hour['registered']

In [16]:
X_day = bike_day.drop(['cnt'], axis=1)
y_day = bike_day.cnt

In [17]:
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn import metrics

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X_day, y_day, test_size=0.3)

In [19]:
y_test.shape

(220,)

In [20]:
from sklearn.linear_model import LinearRegression

In [21]:
lm = LinearRegression()

In [22]:
lm.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [23]:
y_pred = lm.predict(X_test)

In [24]:
y_pred

array([3180.7546392 , 4120.9245436 , 4071.43404855, 6214.03473645,
       4398.79213186, 3870.07870043, 4853.08081291, 5368.45242284,
       5587.55960765, 3890.30324209, 1818.26131851, 6420.92542493,
       2880.97335171, 4623.68322386, 7046.97416816, 6019.98013868,
       4003.31087527, 6319.48896691, 3778.19950395, 7181.57651982,
       5129.54438899, 5563.68605902, 4846.79632974, 2240.5526768 ,
       3027.00512879, 4159.80474576, 4739.59181095, 4800.36113564,
       1058.38470878, 3360.05081784, 1742.61656886, 6485.00361268,
       4245.46451128, 1177.05917451, 1815.20967296, 5430.32625312,
       3146.76954159, 3307.76042479, 4580.73868573, 5133.08937677,
       4495.3022681 , 1999.91193316, 4658.97484121, 5139.33511928,
       7368.36125464, 5200.38379686, 5646.00585192, 3457.65809815,
       5031.86505324, 2561.28731126, 5502.67520132,  924.25187872,
       4918.11391418, 3607.82912742, 6301.75296311, 6606.79695566,
       1628.96791741, 6858.20528652, 4327.30085204, 4152.06543

In [25]:
lm.score(X_test, y_test)

0.8226988518685621