In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime 
import seaborn as sb
import scipy.stats as stat
%matplotlib inline
import random

In [2]:
bikeshare = pd.read_csv('bikeshare_reduced.csv', 
                        parse_dates=['Start date', 'End date', 'start_date_short', 'end_date_short'])
bikeshare.drop('Unnamed: 0', 1, inplace=True)

## Registered Riders vs Casual Riders

In [3]:
casual = bikeshare[bikeshare['Member Type'] == 'Casual']['time_diff']
registered = bikeshare[bikeshare['Member Type'] == 'Registered']['time_diff']
casual_mean = np.mean(casual)
registered_mean = np.mean(registered)
difference = casual_mean - registered_mean
print('The difference the ride times for casual riders and the registered riders is ' + str(round(difference, 2)) + ' minutes.')

The difference the ride times for casual riders and the registered riders is 13.73 minutes.


The differences between the means are 13.73 minutes. Knowing that, we'll need to do a hypothesis test to determine is this difference was due to random chance or if they are actually different.
<br>
**Null Hypothesis**: The differences in ride time means between Registered Riders and Casual Riders are equal.
<br>
**Alternative Hypothesis**: The difference in ride time means between Registered Riders and Casual Riders are different.
<br>
**Significance**: 0.05

In [5]:
time_test = stat.ttest_ind(casual, registered, equal_var=False)
p_val = time_test[1]
print(p_val)

0.0


With this result (p-value < 0.00001), we can reject the null hypothesis that the two means between registered riders and casual riders are equal. We can argue that the means are different and that the type of rider might effect the ride time.

## Seasonal Ride Times
The next question to ask is if the season effects the ride time. By just looking at the differences in means for each of the different seasons shows that there is a difference of roughly 1 to 3 minutes

In [35]:
spring = bikeshare[bikeshare['season'] == 1]['time_diff']
summer = bikeshare[bikeshare['season'] == 2]['time_diff']
fall = bikeshare[bikeshare['season'] == 3]['time_diff']
winter = bikeshare[bikeshare['season'] == 4]['time_diff']

spring_mean = np.mean(spring)
summer_mean = np.mean(summer)
fall_mean = np.mean(fall)
winter_mean = np.mean(winter)

print('The mean ride time for the spring is ' + str(round(spring_mean, 2)) + ' minutes.')
print('The mean ride time for the summer is ' + str(round(summer_mean, 2)) + ' minutes.')
print('The mean ride time for the fall is ' + str(round(fall_mean, 2)) + ' minutes.')
print('The mean ride time for the winter is ' + str(round(winter_mean, 2)) + ' minutes.')

The mean ride time for the spring is 12.34 minutes.
The mean ride time for the summer is 15.21 minutes.
The mean ride time for the fall is 14.8 minutes.
The mean ride time for the winter is 13.27 minutes.


A hypothesis test, using the Analysis of Variance (ANOVA) test, is the best way to determine if these means are by chance or if the mean are actually different.
<br>
**Null Hypothesis**: The mean ride time for the 4 different seasons are not different.
<br>
**Alternative Hypothesis**: The mean ride time for the different seasons are different.
<br>
**Significance**: 0.05

In [6]:
season = stat.f_oneway(spring, summer, fall, winter)
p_val_season = season[1]
p_val_season

0.0

The p-value for this ANOVA is very low (less than 0.00000001) so we can reject the null hypothesis and conclude that there are differences in the mean ride time for the 4 different seasons. In order to determine which of the means are different, more hypothesis tests should be completed, using the Student's T Test. Because multiple t-tests will be done, the normal significance will of 0.05 will need to be lowered due to significance being lost. The new significance level for these tests is 
<br>
<br>
**Null Hypothesis (For each test)**: The mean ride time between 2 seasons are not different.
<br>
**Alternative Hypothesis (For each test)**: The mean ride time between 2 seasons are different.
<br>
**Significance**: 

In [34]:
spring_summer = stat.ttest_ind(spring, summer, equal_var=False)
spring_summer_p = spring_summer[1]

spring_fall = stat.ttest_ind(spring, fall, equal_var=False)
spring_fall_p = spring_fall[1]

spring_winter = stat.ttest_ind(spring, winter, equal_var=False)
spring_winter_p = spring_winter[1]

summer_fall = stat.ttest_ind(summer, fall, equal_var=False)
summer_fall_p = summer_fall[1]

summer_winter = stat.ttest_ind(summer, winter, equal_var=False)
summer_winter_p = summer_winter[1]

fall_winter = stat.ttest_ind(fall, winter, equal_var=False)
fall_winter_p = fall_winter[1]

print('The p-value for the difference between the ride times between spring and summer is ' + str(spring_summer_p) + '.')
print('The p-value for the difference between the ride times between spring and summer is ' + str(spring_fall_p) + '.')
print('The p-value for the difference between the ride times between spring and summer is ' + str(spring_winter_p) + '.')
print('The p-value for the difference between the ride times between spring and summer is ' + str(summer_fall_p) + '.')
print('The p-value for the difference between the ride times between spring and summer is ' + str(summer_winter_p) + '.')
print('The p-value for the difference between the ride times between spring and summer is ' + str(fall_winter_p) + '.')

The p-value for the difference between the ride times between spring and summer is 0.0.
The p-value for the difference between the ride times between spring and summer is 0.0.
The p-value for the difference between the ride times between spring and summer is 2.78861876229e-272.
The p-value for the difference between the ride times between spring and summer is 2.01962674215e-48.
The p-value for the difference between the ride times between spring and summer is 0.0.
The p-value for the difference between the ride times between spring and summer is 0.0.


In each t-test for the difference between seasons, the p-values are extremely low or roughly equal to 0.0. For each case, we can reject null hypothesis and claim that there are differences between each of the 4 seasons.

# Seasonal Ride Times: Casual vs Registered
In the hypothesis above, we determined that the mean ride time is different when factoring in the season. What happens when we factor in season and the member type (casual vs. registered)? Are the ride times different within the each season when broken down by the member type?

In [37]:
casual_spring = bikeshare[(bikeshare['Member Type'] == 'Casual') & (bikeshare['season'] == 1)]['time_diff']
casual_summer = bikeshare[(bikeshare['Member Type'] == 'Casual') & (bikeshare['season'] == 2)]['time_diff']
casual_fall = bikeshare[(bikeshare['Member Type'] == 'Casual') & (bikeshare['season'] == 3)]['time_diff']
casual_winter = bikeshare[(bikeshare['Member Type'] == 'Casual') & (bikeshare['season'] == 4)]['time_diff']
registered_spring = bikeshare[(bikeshare['Member Type'] == 'Registered') & (bikeshare['season'] == 1)]['time_diff']
registered_summer = bikeshare[(bikeshare['Member Type'] == 'Registered') & (bikeshare['season'] == 2)]['time_diff']
registered_fall = bikeshare[(bikeshare['Member Type'] == 'Registered') & (bikeshare['season'] == 3)]['time_diff']
registered_winter = bikeshare[(bikeshare['Member Type'] == 'Registered') & (bikeshare['season'] == 4)]['time_diff']

mean_casual_spring = np.mean(casual_spring)
mean_reg_spring = np.mean(registered_spring)
print('The difference in ride times in the spring for registered and casual riders is ' + 
      str(round(mean_casual_spring - mean_reg_spring, 2)) + ' minutes.')

mean_casual_summer = np.mean(casual_summer)
mean_reg_summer = np.mean(registered_summer)
print('The difference in ride times in the summer for registered and casual riders is ' + 
      str(round(mean_casual_summer - mean_reg_summer, 2)) + ' minutes.')

mean_casual_fall = np.mean(casual_fall)
mean_reg_fall = np.mean(registered_fall)
print('The difference in ride times in the fall for registered and casual riders is ' + 
      str(round(mean_casual_fall - mean_reg_fall, 2)) + ' minutes.')

mean_casual_winter = np.mean(casual_winter)
mean_reg_winter = np.mean(registered_winter)
print('The difference in ride times in the winter for registered and casual riders is ' + 
      str(round(mean_casual_winter - mean_reg_winter, 2)) + ' minutes.')

The difference in ride times in the spring for registered and casual riders is 14.41 minutes.
The difference in ride times in the summer for registered and casual riders is 13.79 minutes.
The difference in ride times in the fall for registered and casual riders is 13.05 minutes.
The difference in ride times in the winter for registered and casual riders is 13.41 minutes.


In looking at the differences within each season, the mean ride time is higher for casual riders and registered riders is between 13-15 minutes, which is about the same for overall ride times between the two groups. However, just looking at these differences does not actually prove that the differences are significant. A hypothesis test, using the Student's T Test, is needed to prove that these differences are not just by chance. 
<br>
<br>
**Null Hypothesis (For each test)**: The mean ride time between casual and registered riders within the same season are not different.
<br>
**Alternative Hypothesis (For each test)**: The mean ride time between casual and registered riders within the same season are different.
<br>
**Significance**: 0.05

In [38]:
reg_cas_spring = stat.ttest_ind(casual_spring, registered_spring, equal_var=False)
reg_cas_spring_p = reg_cas_spring[1]

reg_cas_summer = stat.ttest_ind(casual_summer, registered_summer, equal_var=False)
reg_cas_summer_p = reg_cas_summer[1]

reg_cas_fall = stat.ttest_ind(casual_fall, registered_fall, equal_var=False)
reg_cas_fall_p = reg_cas_fall[1]

reg_cas_winter = stat.ttest_ind(casual_winter, registered_winter, equal_var=False)
reg_cas_winter_p = reg_cas_winter[1]

print('The p-value for the t-test of mean ride times between registered and casual riders in spring is ' + 
      str(reg_cas_spring_p))
print('The p-value for the t-test of mean ride times between registered and casual riders in summer is ' + 
      str(reg_cas_summer_p))
print('The p-value for the t-test of mean ride times between registered and casual riders in fall is ' + 
      str(reg_cas_fall_p))
print('The p-value for the t-test of mean ride times between registered and casual riders in winter is ' + 
      str(reg_cas_winter_p))

The p-value for the t-test of mean ride times between registered and casual riders in spring is 0.0
The p-value for the t-test of mean ride times between registered and casual riders in summer is 0.0
The p-value for the t-test of mean ride times between registered and casual riders in fall is 0.0
The p-value for the t-test of mean ride times between registered and casual riders in winter is 0.0


The p-values for the tests were all roughly 0.0 so we can reject the null hypotheses that the means were equal. We can conclude that the means between registered riders and casual riders are different in each of the 4 seasons.

## Weather Category
The next tests done were to determine if there were differences in mean ride times based on the type of weather. Categories for the weather are sunny, cloudy/misty, and rainy/stormy. 

In [41]:
sunny = bikeshare[bikeshare['weathersit'] == 1]['time_diff']
less_sunny = bikeshare[bikeshare['weathersit'] == 2]['time_diff']
lousy = bikeshare[bikeshare['weathersit'] == 3]['time_diff']

sunny_mean = np.mean(sunny)
less_sunny_mean = np.mean(less_sunny)
lousy_mean = np.mean(lousy)

print('The mean ride time for sunny weather is ' + str(round(sunny_mean, 2)) + ' minutes.')
print('The mean ride time for cloudy/misty weather is ' + str(round(less_sunny_mean, 2)) + ' minutes.')
print('The mean ride time for rainy/stormy weather is ' + str(round(lousy_mean, 2)) + ' minutes.')

The mean ride time for sunny weather is 14.25 minutes.
The mean ride time for cloudy/misty weather is 13.65 minutes.
The mean ride time for rainy/stormy weather is 10.95 minutes.


The differences in means based on the weather are between 1-4 minutes, which is roughly the same as the differences for the mean ride times for the different seasons. A hypothesis test, using ANOVA, will be conducted to determine if there are actual differences or if the differences happened by chance.
<br>
<br>
**Null Hypothesis**: The mean ride time for the 3 different weather categories are not different.
<br>
**Alternative Hypothesis**: The mean ride time for the 3 different weather categories are different.
<br>
**Significance**: 0.05

In [42]:
weather_anova = stat.f_oneway(sunny, less_sunny, lousy)
weather_anova_p = weather_anova[1]
print('The p-value for the ANOVA test of whether mean ride times based on the 3 different weather categories is ' +
      str(weather_anova_p))

The p-value for the ANOVA test of whether mean ride times based on the 3 different weather categories is 0.0


The p-value in this scenario is roughly equal to 0.0 so we can reject the null hypothesis that there are no differences between the different weather categories. Additional T tests need to be done to determine which means are different.
<br>
<br>
**Null Hypothesis (For each test)**: The mean ride time between the 3 weather categories are not different.
<br>
**Alternative Hypothesis (For each test)**: The mean ride time between the 3 weather categories are different.
<br>
**Significance**: 

In [43]:
sunny_less = stat.ttest_ind(sunny, less_sunny, equal_var=False)
sunny_less_p = sunny_less[1]

sunny_lousy = stat.ttest_ind(sunny, lousy, equal_var=False)
sunny_lousy_p = sunny_lousy[1]

less_lousy = stat.ttest_ind(less_sunny, lousy, equal_var=False)
less_lousy_p = less_lousy[1]

print('The p-value for the test between the mean ride times for sunny and cloudy/misty weather is ' + str(sunny_less_p))
print('The p-value for the test between the mean ride times for sunny and rainy/snowy/stormy weather is ' + str(sunny_lousy_p))
print('The p-value for the test between the mean ride times for cloudy and rainy/snowy/stormy weather is ' + str(less_lousy_p))

The p-value for the test between the mean ride times for sunny and cloudy/misty weather is 2.80975478368e-185
The p-value for the test between the mean ride times for sunny and rainy/snowy/stormy weather is 0.0
The p-value for the test between the mean ride times for cloudy and rainy/snowy/stormy weather is 0.0


The p-values for each scenario are relatively close to 0.0 or are 0.0, so we can reject the null hypotheses that the means are the same for each of the different categories. 

## Weather Category: Casual vs Registered
In the test above, we determined that there are differences between the ride times based on the different weather categories. The next set of tests show what happens when the member type is factored into the weather categories.

In [47]:
sunny_casual = bikeshare[(bikeshare['weathersit'] == 1) & (bikeshare['Member Type'] == 'Casual')]['time_diff']
sunny_reg = bikeshare[(bikeshare['weathersit'] == 1) & (bikeshare['Member Type'] == 'Registered')]['time_diff']
sunny_casual_mean = np.mean(sunny_casual)
sunny_reg_mean = np.mean(sunny_reg)
print('The difference between mean ride times between casual riders and registered riders on sunny days is ' + 
     str(round(sunny_casual_mean - sunny_reg_mean, 2)) + ' minutes.')

less_casual = bikeshare[(bikeshare['weathersit'] == 2) & (bikeshare['Member Type'] == 'Casual')]['time_diff']
less_reg = bikeshare[(bikeshare['weathersit'] == 2) & (bikeshare['Member Type'] == 'Registered')]['time_diff']
less_casual_mean = np.mean(less_casual)
less_reg_mean = np.mean(less_reg)
print('The difference between mean ride times between casual riders and registered riders on cloudy/misty days is ' + 
     str(round(less_casual_mean - less_reg_mean, 2)) + ' minutes.')

lousy_casual = bikeshare[(bikeshare['weathersit'] == 3) & (bikeshare['Member Type'] == 'Casual')]['time_diff']
lousy_reg = bikeshare[(bikeshare['weathersit'] == 3) & (bikeshare['Member Type'] == 'Registered')]['time_diff']
lousy_casual_mean = np.mean(lousy_casual)
lousy_reg_mean = np.mean(lousy_reg)
print('The difference between mean ride times between casual riders and registered riders on rainy/stormy/snowy days is ' + 
     str(round(lousy_casual_mean - lousy_reg_mean, 2)) + ' minutes.')

The difference between mean ride times between casual riders and registered riders on sunny days is 13.86 minutes.
The difference between mean ride times between casual riders and registered riders on cloudy/misty days is 13.5 minutes.
The difference between mean ride times between casual riders and registered riders on rainy/stormy/snowy days is 8.46 minutes.


The differences in ride times between the member type and weather categories fall between 8.5 and 14 minutes. Hypothesis tests will be done to determine where the differences occur.
<br>
<br>
**Null Hypothesis (For each test)**: The mean ride time between casual and registered riders within the same weather category are not different.
<br>
**Alternative Hypothesis (For each test)**: The mean ride time between casual and registered riders within the same weather category are different.
<br>
**Significance**: 0.05

In [48]:
sunny_diff = stat.ttest_ind(sunny_casual, sunny_reg, equal_var=False)
sunny_diff_p = sunny_diff[1]

less_diff = stat.ttest_ind(less_casual, less_reg, equal_var=False)
less_diff_p = less_diff[1]

lousy_diff = stat.ttest_ind(lousy_casual, lousy_reg, equal_var=False)
lousy_diff_p = lousy_diff[1]

print('The p-value for the difference in mean ride times between casual and registered riders on sunny days is ' +
      str(sunny_diff_p))
print('The p-value for the difference in mean ride times between casual and registered riders on cloudy/misty days is ' +
      str(less_diff_p))
print('The p-value for the difference in mean ride times between casual and registered riders on rainy/stormy/snowy days is ' +
      str(lousy_diff_p))

The p-value for the difference in mean ride times between casual and registered riders on sunny days is 0.0
The p-value for the difference in mean ride times between casual and registered riders on cloudy/misty days is 0.0
The p-value for the difference in mean ride times between casual and registered riders on rainy/stormy/snowy days is 8.05155049416e-105


The p-values are relatively close to zero, therefore we can reject null hypothesis in favor of the alternative. There are differences in the mean ride times for casual and registered riders based on the weather category.

## Holiday
Based on the visual exploratory analysis done, there seemed to be a difference in average ride times for holidays and non-holidays. The analysis below looks to determine if this difference is by chance or if there is actually a difference.

In [50]:
no_holiday = bikeshare[bikeshare['holiday'] == 0]['time_diff']
holiday = bikeshare[bikeshare['holiday'] == 1]['time_diff']

no_holiday_mean = np.mean(no_holiday)
holiday_mean = np.mean(holiday)
diff = holiday_mean - no_holiday_mean
print('The difference between the mean ride times for holidays and non-holidays is ' + str(round(diff, 2)) + ' minutes.')

The difference between the mean ride times for holidays and non-holidays is 1.59 minutes.


Although the differences in means is about 1.6 minutes, a hypothesis test needs to be conducted to determine if there actually is a difference in the mean ride times based on holidays and non-holidays.
<br>
<br>
**Null Hypothesis**: The mean ride time for holidays and non-holidays are not different.
<br>
**Alternative Hypothesis**: The mean ride time for holidays and non-holidays are different.
<br>
**Significance**: 0.05

In [53]:
holiday_t = stat.ttest_ind(no_holiday, holiday, equal_var=False)
holiday_t_p = holiday_t[1]
print(holiday_t_p)

5.94089337013e-98


## Holiday: Casual vs Registered
How does the member type effect the ride time on holidays and non-holidays? The hypothesis tests below determine whether there are differences.

In [52]:
no_hol_cas = bikeshare[(bikeshare['holiday'] == 0) & (bikeshare['Member Type'] == 'Casual')]['time_diff']
no_hol_reg = bikeshare[(bikeshare['holiday'] == 0) & (bikeshare['Member Type'] == 'Registered')]['time_diff']
hol_cas = bikeshare[(bikeshare['holiday'] == 1) & (bikeshare['Member Type'] == 'Casual')]['time_diff']
hol_reg = bikeshare[(bikeshare['holiday'] == 1) & (bikeshare['Member Type'] == 'Registered')]['time_diff']

no_hol_cas_mean = np.mean(no_hol_cas)
no_hol_reg_mean = np.mean(no_hol_reg)
print('The difference between mean ride times between member types on a holiday is ' + 
     str(round(no_hol_cas_mean - no_hol_reg_mean, 2)) + ' minutes.')

hol_cas_mean = np.mean(hol_cas)
hol_reg_mean = np.mean(hol_reg)
print('The difference between mean ride times between member types on non-holidays is ' + 
     str(round(hol_cas_mean - hol_reg_mean, 2)) + ' minutes.')

The difference between mean ride times between member types on a holiday is 13.69 minutes.
The difference between mean ride times between member types on non-holidays is 14.74 minutes.


In [17]:
no_hol_t = stat.ttest_ind(no_hol_cas, no_hol_reg, equal_var=False)
no_hol_t_p = no_hol_t[1]

hol_t = stat.ttest_ind(hol_cas, hol_reg, equal_var=False)
hol_t_p = hol_t[1]

print(no_hol_t_p)
print(hol_t_p)

0.0
0.0


## Workday

In [18]:
workday = bikeshare[bikeshare['workingday'] == 1]['time_diff']
no_workday = bikeshare[bikeshare['workingday'] == 0]['time_diff']


In [19]:
workday_t = stat.ttest_ind(workday, no_workday, equal_var=False)
workday_t_p = workday_t[1]

print(workday_t_p)

0.0


## Workday: Registered vs Casual

In [20]:
no_work_cas = bikeshare[(bikeshare['workingday'] == 0) & (bikeshare['Member Type'] == 'Casual')]['time_diff']
no_work_reg = bikeshare[(bikeshare['workingday'] == 0) & (bikeshare['Member Type'] == 'Registered')]['time_diff']
work_cas = bikeshare[(bikeshare['workingday'] == 1) & (bikeshare['Member Type'] == 'Casual')]['time_diff']
work_reg = bikeshare[(bikeshare['workingday'] == 1) & (bikeshare['Member Type'] == 'Registered')]['time_diff']

In [21]:
no_working = stat.ttest_ind(no_work_cas, no_work_reg, equal_var=False)
no_working_p = no_working[1]

working_t = stat.ttest_ind(work_cas, work_reg, equal_var=False)
working_t_p = working_t[1]

print(no_working_p)
print(working_t_p)

0.0
0.0


## Ride Times vs. Temperature

In [22]:
time_temp = np.corrcoef(bikeshare['temp'], bikeshare['time_diff'])
time_temp[0,1]

0.095882386933345623

In [23]:
casual_corr = bikeshare[bikeshare['Member Type'] == 'Casual']
registered_corr = bikeshare[bikeshare['Member Type'] == 'Registered']

In [24]:
time_temp_cas = np.corrcoef(casual_corr['temp'], casual_corr['time_diff'])
time_temp_cas[0,1]

0.0076346430014268855

In [25]:
time_temp_reg = np.corrcoef(registered_corr['temp'], registered_corr['time_diff'])
time_temp_reg[0,1]

0.083729130500907911

## Ride Times vs. Humidity

In [26]:
time_hum = np.corrcoef(bikeshare['hum'], bikeshare['time_diff'])
time_hum[0,1]

0.008031838150257169

In [27]:
time_hum_cas = np.corrcoef(casual_corr['hum'], casual_corr['time_diff'])
time_hum_cas[0,1]

-0.0041247100896568334

In [28]:
time_hum_reg = np.corrcoef(registered_corr['hum'], registered_corr['time_diff'])
time_hum_reg[0,1]

0.0018546164180973683

## Ride Times vs Wind Speed

In [29]:
time_wind = np.corrcoef(bikeshare['windspeed'], bikeshare['time_diff'])
time_wind[0,1]

-0.031826321312461395

In [30]:
time_wind_cas = np.corrcoef(casual_corr['windspeed'], casual_corr['time_diff'])
time_wind_cas[0,1]

-0.011978496978161309

In [31]:
time_wind_reg = np.corrcoef(registered_corr['windspeed'], registered_corr['time_diff'])
time_wind_reg[0,1]

-0.01943016317449368