In [1]:
# Dependencies
import pandas as pd
import plotly.express as plt
import os
import statsmodels.api as sm

In [2]:
# Build the path to the bike-count CSV (relative to the notebook) and load it,
# converting the 'Day' column to datetime as part of the same chain.
bikecounts_file = os.path.join('../', 'Resources', 'bikecounts.csv')
bikecounts_df = pd.read_csv(bikecounts_file).assign(
    Day=lambda frame: pd.to_datetime(frame['Day'])
)
# Preview the first rows of the loaded frame
bikecounts_df.head()

Unnamed: 0.1,Unnamed: 0,Date,Day,High Temp (°F),Low Temp (°F),Precipitation,Brooklyn Bridge,Manhattan Bridge,Williamsburg Bridge,Queensboro Bridge,Total
0,0,2016-04-01,2016-04-01,78.1,66.0,0.01,1704.0,3126,4115.0,2552.0,11497
1,1,2016-04-02,2016-04-02,55.0,48.9,0.15,827.0,1646,2565.0,1884.0,6922
2,2,2016-04-03,2016-04-03,39.9,34.0,0.09,526.0,1232,1695.0,1306.0,4759
3,3,2016-04-04,2016-04-04,44.1,33.1,0.47 (S),521.0,1067,1440.0,1307.0,4335
4,4,2016-04-05,2016-04-05,42.1,26.1,0,1416.0,2617,3081.0,2357.0,9471


In [3]:
# Show the column dtypes and non-null counts of the loaded frame
bikecounts_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   Unnamed: 0           30 non-null     int64         
 1   Date                 30 non-null     object        
 2   Day                  30 non-null     datetime64[ns]
 3   High Temp (°F)       30 non-null     float64       
 4   Low Temp (°F)        30 non-null     float64       
 5   Precipitation        30 non-null     object        
 6   Brooklyn Bridge      30 non-null     float64       
 7   Manhattan Bridge     30 non-null     int64         
 8   Williamsburg Bridge  30 non-null     float64       
 9   Queensboro Bridge    30 non-null     float64       
 10  Total                30 non-null     int64         
dtypes: datetime64[ns](1), float64(5), int64(3), object(2)
memory usage: 2.7+ KB


In [4]:
#The "Date" column is converted to a datetime format and compared to the "Day" column.
#"Day" is only dropped after the two columns are confirmed to be identical, so the
#drop can never silently discard differing data.
bikecounts_df["Date"] = pd.to_datetime(bikecounts_df["Date"])
#vectorised comparison; bool() keeps the printed value a plain True/False
dates_match = bool((bikecounts_df["Date"] == bikecounts_df["Day"]).all())
print(dates_match)
assert dates_match, '"Date" and "Day" differ; refusing to drop "Day"'
bikecounts_df = bikecounts_df.drop(columns=["Day", "Unnamed: 0"])

#The next step is to clean the data. There are some unwanted characters in the "Precipitation" column which
#make the column an object instead of a float, so we cannot yet find the relationship between the column and our target

#this function will be used to clean up the "Precipitation" column
def cleanString(string):
    """Normalise raw precipitation readings so they can be cast to float.

    Parameters
    ----------
    string : pandas.Series
        Raw "Precipitation" values as read from the CSV (strings such as
        "0.01", "T" or "0.47 (S)"). Despite the name, this is a Series,
        not a single Python string.

    Returns
    -------
    pandas.Series
        A new Series in which any trailing "(S)" marker has been removed
        (e.g. "0.47 (S)" -> "0.47") and a value consisting only of "T"
        has been replaced by "0". The input Series is not modified.

    Notes
    -----
    "T" presumably denotes trace precipitation and "(S)" a snow flag;
    confirm against the data source.
    """
    # Strip a trailing "(S)" marker from ANY reading, not just "0.47 (S)",
    # so other flagged values do not break the later float conversion.
    without_flag = string.replace(r"\s*\(S\)\s*$", "", regex=True)
    # Anchored so that only a value made up solely of "T" is replaced.
    return without_flag.replace(r"^\s*T\s*$", "0", regex=True)

#clean the raw "Precipitation" values and store them back as "float64" in a single step
bikecounts_df["Precipitation"] = cleanString(
    bikecounts_df["Precipitation"]
).astype("float64")

True


In [5]:
#Average temperature: the midpoint of the daily high and low temperatures
bikecounts_df["Average Temp (°F)"] = (
    bikecounts_df["High Temp (°F)"]
    .add(bikecounts_df["Low Temp (°F)"])
    .div(2)
)
bikecounts_df.head()

Unnamed: 0,Date,High Temp (°F),Low Temp (°F),Precipitation,Brooklyn Bridge,Manhattan Bridge,Williamsburg Bridge,Queensboro Bridge,Total,Average Temp (°F)
0,2016-04-01,78.1,66.0,0.01,1704.0,3126,4115.0,2552.0,11497,72.05
1,2016-04-02,55.0,48.9,0.15,827.0,1646,2565.0,1884.0,6922,51.95
2,2016-04-03,39.9,34.0,0.09,526.0,1232,1695.0,1306.0,4759,36.95
3,2016-04-04,44.1,33.1,0.47,521.0,1067,1440.0,1307.0,4335,38.6
4,2016-04-05,42.1,26.1,0.0,1416.0,2617,3081.0,2357.0,9471,34.1


## Hypothesis
The number of bikes that cross the Brooklyn Bridge every day depends on the temperature and the precipitation on that day: the higher the temperature and the lower the precipitation, the higher the number of bikes that cross the Brooklyn Bridge.

## Null Hypothesis
The number of bikes that cross the Brooklyn Bridge does not depend on the temperature or the precipitation; that is, an increase in temperature and a decrease in precipitation will not lead to a higher number of bikes crossing the Brooklyn Bridge.

In [6]:
#Test the hypothesis with an ordinary least squares regression:
#predictors are precipitation and average temperature, target is the Brooklyn Bridge count
feature_columns = ["Precipitation", "Average Temp (°F)"]
X = sm.add_constant(bikecounts_df[feature_columns])  #adds the intercept column
y = bikecounts_df["Brooklyn Bridge"]
reg_model = sm.OLS(y, X)
results = reg_model.fit()

  x = pd.concat(x[::order], 1)


In [7]:
#Print the text summary of the fitted regression
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:        Brooklyn Bridge   R-squared:                       0.605
Model:                            OLS   Adj. R-squared:                  0.576
Method:                 Least Squares   F-statistic:                     20.70
Date:                Fri, 08 Oct 2021   Prob (F-statistic):           3.55e-06
Time:                        21:49:27   Log-Likelihood:                -235.22
No. Observations:                  30   AIC:                             476.4
Df Residuals:                      27   BIC:                             480.6
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
const              -237.9800    697.75

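The regression shows a statistically significant relationship between the independent variables and the target variable (R-squared = 0.605, Prob (F-statistic) = 3.55e-06). Looking at the p-values for both "Precipitation" (0.001) and "Average Temperature" (reported as 0.000), both are below the conventional 0.05 significance level, so we can assert that it is safe to reject the null hypothesis.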