# Chicago Bike Sharing Exploration

## Set-up

First we import the required libraries.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from datetime import timedelta

Then we read in the dataset of the Chicago bike-sharing company of the year 2018.

In [None]:
data_messy = pd.read_csv("chicago_2018.csv")

## Some first insights

I want to see some first samples:

In [None]:
data_messy.head(5)

Now I am interested in how many observations there are in the dataset.

In [None]:
data_messy.count()

I want to know the time of the first and last measurement.

In [None]:
# Earliest rental start and latest rental end. The values are the raw strings
# from the CSV; they are named so that the builtins min() and max() are not
# shadowed for the rest of the notebook.
first_start = data_messy["start_time"].min()
last_end = data_messy["end_time"].max()

print(f"The measurements took place from {first_start} until {last_end}.")

How many *start* and *end* stations are there?

In [None]:
# Distinct station IDs that occur as the start / as the end of a trip.
start_stations = list(pd.unique(data_messy["start_station_id"]))
end_stations = list(pd.unique(data_messy["end_station_id"]))
print(f"There are {len(start_stations)} start stations, and {len(end_stations)} end stations")

Thus there are 2 more start stations than end stations. This might be because there are stations
* at which you can only rent a bike, but not return one
* which have multiple IDs

Or because
* start station IDs and end station IDs are assigned independently of each other, combined with one of the reasons above

I was wondering how big the fleet is:

In [None]:
def getTime(str):
    """Parse a ``YYYY-MM-DD HH:MM:SS`` timestamp string into a ``datetime``.

    ``datetime.strptime`` raises ``ValueError`` when the text does not match
    that layout.
    """
    timestamp_layout = '%Y-%m-%d %H:%M:%S'
    parsed = datetime.strptime(str, timestamp_layout)
    return parsed

In [None]:
# Every distinct bike_id counts as one bike of the fleet.
num_bikes = len(data_messy["bike_id"].unique())

print(f"The fleet consists of {num_bikes} bikes.")

What is the ratio of subscribed users as compared to all users in 2018?

In [None]:
# Rows whose user_type is "Subscriber", relative to all rows of the dataset.
is_subscriber = data_messy["user_type"] == "Subscriber"
num_subs = len(data_messy[is_subscriber])
num_tot = len(data_messy)

subscriber_share = round(num_subs / num_tot * 100, 2)
print(subscriber_share, "% of users are actual subscribers")

Check for null values.

In [None]:
data_messy[data_messy["user_type"].isnull()]

Engineering a new feature "duration":

In [None]:
# Trip duration as a timedelta: parsed end time minus parsed start time.
trip_start = pd.to_datetime(data_messy["start_time"])
trip_end = pd.to_datetime(data_messy["end_time"])
data_messy["duration"] = trip_end - trip_start

In [None]:
data_messy.head()

In [None]:
data_messy["duration"].sort_values().head(10)

7 consecutive observations have a negative duration. Looking at the dates, it becomes obvious that the underlying reason is the time change:

In [None]:
data_messy.loc[2946878, "start_time"]

In [None]:
data_messy.loc[2946878, "end_time"]

You can find information about the **negative** time change (summer to winter time on the 4th of November 2018: 2 AM to 1 AM) [here](https://www.timeanddate.de/stadt/zeitumstellung/usa/chicago).

It may be necessary to adjust values for the other **positive** time change (winter to summer time on the 11th of March 2018: 2 AM to 3 AM).

How high is the maximum and minimum utilization of the fleet?

For this, the start and end time strings first have to be converted into datetime values.

In [None]:
# Total rental time per bike in hours, stored in an array indexed by bike_id.
# The durations are computed and summed per bike in one vectorised pass
# instead of parsing two timestamps per row inside an iterrows() loop.
trip_hours = (
    pd.to_datetime(data_messy["end_time"]) - pd.to_datetime(data_messy["start_time"])
).dt.total_seconds() / 3600
hours_per_bike = trip_hours.groupby(data_messy["bike_id"]).sum()

# Keep the historical minimum of 6500 slots, but grow the array when a larger
# bike_id occurs so that the assignment below cannot run out of bounds.
largest_bike_id = int(hours_per_bike.index.max()) if len(hours_per_bike) else -1
time_bike = np.zeros(max(6500, largest_bike_id + 1))
time_bike[hours_per_bike.index.to_numpy(dtype=int)] = hours_per_bike.to_numpy()

In [None]:
time_bike_series = pd.DataFrame(time_bike)

In [None]:
time_bike_series.describe()

In [None]:
sns.boxplot(data = time_bike_series, palette = "magma")
plt.show()

Which bike was used the most?

In [None]:
# Show the row with the largest accumulated rental time; the row label is the
# bike_id because the underlying array is indexed by bike_id. This replaces a
# hard-coded ">= 4000" hours filter that only fits one particular dataset.
time_bike_series.loc[[time_bike_series[0].idxmax()]]

Average utilization: (The actual average utilization is considerably higher, because this aggregation also includes a lot of zero entries for IDs without any recorded trips.)

In [None]:
# Mean rental hours per entry, as a percentage of the hours of a non-leap
# year. Column 0 is selected explicitly so that .mean() yields a scalar;
# calling float() on a one-element Series is deprecated in pandas.
hours_per_year = 24 * 365
av_ut = float((time_bike_series[0].mean() / hours_per_year) * 100)

print("The average utilization is:", av_ut, "%")

Which stations are used the most?

In [None]:
sns.countplot(x = "start_station_id", data = data_messy)
plt.show()

# An attempt to visualize the most used stations geographically

First I import a dataset that I found on the [internet](https://data.cityofchicago.org/Transportation/Divvy-Bicycle-Stations-In-Service/67g3-8ig8) to describe the position (long, lat) of the stations.

In [None]:
data_stations = pd.read_csv("Divvy_Bicycle_Stations_-_In_Service.csv")

In [None]:
data_stations.head(10)

Joining (merging) the initial dataset with the dataset, which contains the coordinates for the stations on the column 'start_station_id', respectively 'ID'

In [None]:
merged_data = pd.merge(data_messy, data_stations, left_on = "start_station_id", right_on = "ID")

In [None]:
merged_data.head(3)

Adding a new column "Coordinates" because the location, lat' and long' variables cannot be simply plotted by folium.

In [None]:
merged_data["Coordinates"] = list(zip(merged_data["Latitude"].round(4), merged_data["Longitude"].round(4)))

In [None]:
merged_data.head(3)

## Importing the relevant libraries:

In [None]:
from pandas.plotting import register_matplotlib_converters
sns.set()
sns.set_style("white")
sns.set_palette("GnBu_d")

import folium
from folium import plugins
from folium.plugins import HeatMap
from datetime import datetime #for working with times objects
from datetime import timedelta #for working with times objects
import math
import random
import timeit

# First visualization attempts

I will select a random bike for showing the locations to avoid overplotting

In [None]:
list_bikes = data_messy["bike_id"].unique()
random_bike = random.choice(list_bikes)
print(random_bike)

In [None]:
heat_map = folium.Map(location=(41.8695, -87.6555), tiles='OpenStreetMap', zoom_start=9, control_scale=True, max_zoom=20)

heat_map.add_child(plugins.HeatMap(merged_data[merged_data["bike_id"] == random_bike]["Coordinates"], radius = 15))

heat_map

In [None]:
# Base map centred on Chicago with a heat layer for the randomly chosen bike.
heat_map = folium.Map(location=(41.8695, -87.6555), tiles='OpenStreetMap', zoom_start=9, control_scale=True, max_zoom=20)

bike_trips = merged_data[merged_data["bike_id"] == random_bike]
heat_map.add_child(plugins.HeatMap(bike_trips["Coordinates"], radius = 15))

# One small marker per distinct station position.
positions = list(merged_data["Coordinates"].unique())
for position in positions:
    marker = folium.RegularPolygonMarker(radius=1, location=position, popup='The Waterfront',
                                         color='crimson', fill_color='crimson')
    marker.add_to(heat_map)

heat_map

Plotting the density of used bike stations for bikes with an ID lower than 1000 (because plotting this for all bikes would just not terminate)

In [None]:
heat_map

In [None]:
heat_map = folium.Map(location=(41.8695, -87.6555), tiles='OpenStreetMap', zoom_start=9, control_scale=True, max_zoom=20)

heat_map.add_child(plugins.HeatMap(merged_data[merged_data["bike_id"] < 1000]["Coordinates"], radius = 15))

heat_map

# Geographical Findings

## Communities with most Divvy Bikes traffic

![test](Chicago_community_areas_map.svg.png)

* all of Center

## Locations / POI with most traffic

In some zoom-scales (and the given radius) one can identify the following highly used stations:

* [Ogilvie Transportation Center](https://en.wikipedia.org/wiki/Ogilvie_Transportation_Center)
* [Chicago Union Station](https://en.wikipedia.org/wiki/Chicago_Union_Station)
* Jane Addams Memorial, see [googlemaps](https://www.google.de/maps/place/Divvy/@41.8922479,-87.6121172,88m/data=!3m1!1e3!4m13!1m7!3m6!1s0x880e2c3cd0f4cbed:0xafe0a6ad09c0c000!2sChicago,+IL,+USA!3b1!8m2!3d41.8781136!4d-87.6297982!3m4!1s0x880e2b53a71cd513:0xd04bc26796dcb054!8m2!3d41.892278!4d-87.612043) for position


# Attempt to take POI into consideration

The list of POI was found on [Wikipedia](https://en.wikipedia.org/wiki/List_of_Chicago_Landmarks) and can be downloaded under this [Link](https://tools.wmflabs.org/kmlexport?article=List_of_Chicago_Landmarks).

In [None]:
POI_data = pd.read_csv("convertcsvtest.csv")

In [None]:
POI_data.head()

In [None]:
POI_data.info()

In [None]:
POI_data.head()

In [None]:
# folium locations are (latitude, longitude) pairs, and every other
# "Coordinates" column in this notebook is built in that order, so latitude
# goes first here as well.
# NOTE(review): assumes the "lat"/"long" columns of the CSV are labelled correctly — confirm.
POI_data["Coordinates"] = list(zip(POI_data["lat"].round(4), POI_data["long"].round(4)))

In [None]:
POI_data.count()

In [None]:
heat_map = folium.Map(location=(41.8695, -87.6555), tiles='OpenStreetMap', zoom_start=9, control_scale=True, max_zoom=20)

heat_map.add_child(plugins.HeatMap(merged_data[merged_data["bike_id"] < 1000]["Coordinates"], radius = 15))

positions = list(POI_data["Coordinates"])
for position in positions:
    folium.CircleMarker(radius=1, location=position, popup='The Waterfront', 
                                 color='crimson', fill_color='crimson').add_to(heat_map)

heat_map

## Finding out which stations are used the most.

In [None]:
data_stations.head()

In [None]:
data_stations.info()

In [None]:
data_stations.loc[2, "ID"]

In [None]:
# Trips started per station, counted once with value_counts() instead of
# filtering merged_data again for every station. Every row of data_stations is
# covered (not just index labels 0..692); stations that never occur as a start
# station get 0. The column stays float, as before.
trips_per_station = merged_data["start_station_id"].value_counts()
data_stations["Usage"] = data_stations["ID"].map(trips_per_station).fillna(0).astype(float)

In [None]:
data_stations["Coordinates"] = list(zip(data_stations["Latitude"].round(4), data_stations["Longitude"].round(4)))

In [None]:
data_stations.head()

In [None]:
data_stations["Usage"].describe()

In [None]:
sns.displot(data = data_stations["Usage"] , kde=True)

plt.show()

In [None]:
heat_map = folium.Map(location=(41.8695, -87.6555), tiles='OpenStreetMap', zoom_start=9, control_scale=True, max_zoom=20)

# Draw one marker per station and scale its radius by that station's own
# usage: 1 px for an unused station up to 10 px for the busiest one.
# Previously the whole "Usage" column was passed as a `size` keyword, which
# CircleMarker has no parameter for, so every marker had the same size.
peak_usage = data_stations["Usage"].max()
for position, usage in zip(data_stations["Coordinates"], data_stations["Usage"]):
    if pd.notna(usage) and peak_usage > 0:
        relative_usage = usage / peak_usage
        label = f"{usage:.0f} trips started here"
    else:
        # Missing usage (or no trips at all): smallest marker, neutral label.
        relative_usage = 0.0
        label = "no usage data"
    folium.CircleMarker(radius=1 + 9 * relative_usage, location=position, popup=label,
                        color='crimson', fill_color='crimson').add_to(heat_map)

heat_map

## Which stations are most popular?

In [None]:
len(data_messy["start_station_id"].unique())

In [None]:
# Stations with ID <= 621, ordered by ascending usage. The filter and sort are
# done once; .copy() makes both selections independent frames so that columns
# can be added to them later without a SettingWithCopyWarning.
ranked_by_usage = data_stations[data_stations["ID"] <= 621].sort_values("Usage")
topTwy = ranked_by_usage.tail(100).copy()  # the 100 most used stations
lowTwy = ranked_by_usage.head(100).copy()  # the 100 least used stations

In [None]:
# Tag each selection with its marker colour, then stack them (top stations
# first) into one frame for plotting.
for frame, marker_colour in ((topTwy, "crimson"), (lowTwy, "blue")):
    frame["Popular"] = marker_colour

topLowTwy = pd.concat([topTwy, lowTwy])
topLowTwy

In [None]:
heat_map = folium.Map(location=(41.8695, -87.6555), tiles='OpenStreetMap', zoom_start=9, control_scale=True, max_zoom=20)

# One marker per selected station, coloured by its "Popular" value. Outline
# and fill use the same colour; the fill used to be crimson for every marker.
# Iterating over the columns avoids one positional row lookup per attribute.
for position, colour in zip(topLowTwy["Coordinates"], topLowTwy["Popular"]):
    folium.CircleMarker(radius=1, location=position,
                        color=colour, fill_color=colour).add_to(heat_map)

heat_map

In [None]:
# Restrict to stations with ID < 621 and order them by usage.
# NOTE(review): the top/bottom-100 selection filters with "<= 621" while this
# cell uses "< 621" — confirm which bound is intended.
data_stations = data_stations[data_stations["ID"] < 621].sort_values("Usage")

# Three usage classes. .copy() turns each selection into an independent frame
# so that adding the "Color" column does not trigger SettingWithCopyWarning.
g1 = data_stations[data_stations["Usage"] < 1000].copy()
g2 = data_stations[(data_stations["Usage"] >= 1000) & (data_stations["Usage"] < 10000)].copy()
g3 = data_stations[data_stations["Usage"] >= 10000].copy()

g1["Color"] = "green"
g2["Color"] = "yellow"
g3["Color"] = "red"

# Rows with a missing "Usage" match none of the classes and are left out.
stat = pd.concat([g1, g2, g3])
stat

In [None]:
heat_map = folium.Map(location=(41.8695, -87.6555), tiles='OpenStreetMap', zoom_start=9, control_scale=True, max_zoom=20)

# One marker per station, coloured by usage class, with the usage count as
# popup text. Outline and fill share the class colour (the fill used to be
# crimson for every class).
for position, colour, usage in zip(stat["Coordinates"], stat["Color"], stat["Usage"]):
    folium.CircleMarker(radius=2, location=position,
                        color=colour, fill_color=colour,
                        popup=str(usage)).add_to(heat_map)

heat_map

In [None]:
data_messy[data_messy["start_station_id"] == 542].sort_values("start_time")

In [None]:
stat[stat["ID"] == 542]["Usage"]

### Sights:

* University of Chicago (in an area of unpopular stations)

Todo:

* find out why some stations are so unpopular

### Finding out when new stations were established

In [None]:
data_messy[data_messy["start_station_id"] == 345]["start_time"].min()

In [None]:
def getFirstDate(ID):
    """Return the earliest start time of any trip that began at station ``ID``.

    The smallest "start_time" string among the matching rows of ``data_messy``
    is parsed with ``getTime``. When no trip started at the station there is
    nothing to parse, so ``pd.NaT`` is returned instead of passing a missing
    value to ``getTime`` (which would raise ``TypeError``).
    """
    first_start = data_messy.loc[data_messy["start_station_id"] == ID, "start_time"].min()
    if pd.isna(first_start):
        return pd.NaT
    return getTime(first_start)

In [None]:
getFirstDate(562)

In [None]:
data_stations["First Use"] = data_stations["ID"].apply(lambda x: getFirstDate(x))

In [None]:
data_stations.head()

In [None]:
# Spot check: was the station at position 34 first used before the first cut-off date?
# NOTE: cutDate1 is only assigned in a later cell, so that cell has to be run before this one.
data_stations.iloc[34]["First Use"] < cutDate1

In [None]:
def categorize(x, a, b):
    """Return the marker colour for the station at position ``x`` of ``data_stations``.

    The station's "First Use" timestamp is compared with the two cut-off dates:

    * ``"green"``  - first used before ``a``
    * ``"yellow"`` - first used from ``a`` up to, but not including, ``b``
    * ``"red"``    - first used at ``b`` or later

    ``None`` is returned when "First Use" is missing.
    """
    first_use = data_stations.iloc[x]["First Use"]
    if pd.isna(first_use):
        return None
    if first_use < a:
        return "green"
    if first_use < b:
        return "yellow"
    # Also covers first_use == b, which previously matched no branch.
    return "red"

In [None]:
# Cut-off dates for colouring stations by first use. Written as ISO dates so
# the result does not depend on how free-text dates happen to be interpreted.
cutDate1 = pd.Timestamp("2018-02-01")
cutDate2 = pd.Timestamp("2018-05-01")

In [None]:
cutDate1 > cutDate2

In [None]:
heat_map = folium.Map(location=(41.8695, -87.6555), tiles='OpenStreetMap', zoom_start=9, control_scale=True, max_zoom=20)

# One marker per station, coloured by when the station was first used; the
# popup shows the station ID. categorize() expects the positional row number,
# hence enumerate(). Outline and fill share the category colour (the fill
# used to be crimson for every category).
for i, (position, station_id) in enumerate(zip(data_stations["Coordinates"], data_stations["ID"])):
    colour = categorize(i, cutDate1, cutDate2)
    folium.CircleMarker(radius=2, location=position,
                        color=colour, fill_color=colour,
                        popup=str(station_id)).add_to(heat_map)

heat_map

## Considering auto traffic

Again some data I found on the [internet](https://www.chicago.gov/city/en/depts/cdot/dataset/average_daily_trafficcounts.html)

### Average Daily Traffic

Average Daily Traffic refers to the number of vehicles traveling through a particular point on the city streets in a 24-hour period. Average Daily Traffic (ADT) counts are analogous to a census count of vehicles on city streets. These counts provide a close approximation to the actual number of vehicles passing through a given location on an average weekday.

In [None]:
traffic = pd.read_csv("Average_Daily_Traffic_Counts.csv", sep = ",")

In [None]:
traffic["Coordinates"] = list(zip(traffic["Latitude"].round(4), traffic["Longitude"].round(4)))

In [None]:
traffic.head()

In [None]:
heat_map = folium.Map(location=(41.8695, -87.6555), tiles='OpenStreetMap', zoom_start=15, control_scale=True, max_zoom=20)

# heat_map.add_child(plugins.HeatMap(merged_data[merged_data["bike_id"] < 1000]["Coordinates"], radius = 25))

# One circle per counting location; its radius is 0.005 times the counted
# vehicle volume. The three columns are walked in lockstep instead of doing
# three positional row lookups per iteration.
for latitude, longitude, volume in zip(traffic["Latitude"], traffic["Longitude"],
                                       traffic["Total Passing Vehicle Volume"]):
    folium.Circle(
        location=[latitude, longitude],
        #popup=traffic.iloc[i]['Traffic Volume Count Location Address'],
        radius=float(volume) * 0.005,
        color='crimson',
        fill=True,
        fill_color='crimson',
    ).add_to(heat_map)

heat_map