In [7]:
# import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from datetime import datetime as dt
import geopandas as gpd
import json
import seaborn as sns
from shapely.geometry import Point, Polygon
from pyproj import CRS
import plotly.graph_objects as go


# Raise pandas' display limits so wide/long frames are printed with less truncation.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)
del option_name, option_value

ModuleNotFoundError: No module named 'geopandas'

In [None]:
# import data

# bike rentals (https://data.london.gov.uk/dataset/number-bicycle-hires)
# metadata = pd.read_excel(Path.cwd().parent/"files"/"tfl-daily-cycle-hires.xlsx")
bike_rentals = pd.read_excel(
    Path.cwd().parent.joinpath("Assignment3", "files", "bike_rentals_london.xlsx"),
    sheet_name='Data',
)

# covid (https://data.london.gov.uk/dataset/coronavirus--covid-19--cases)
covid = pd.read_csv(Path.cwd().parent.joinpath("Assignment3", "files", "corona_london.csv"))

# weather (https://data.ceda.ac.uk/badc/ukmo-midas-open/data/uk-hourly-weather-obs/dataset-version-202107/greater-london/00697_london-st-jamess-park/qc-version-1)
# metadata of the weather data (https://artefacts.ceda.ac.uk/badc_datadocs/ukmo-midas/WH_Table.html)
# One CSV per year for 2010..2019 (the range end is exclusive). header=280 makes row 280
# the column header and skips everything above it - presumably the file's metadata
# preamble; verify against the raw files.
weather = pd.concat(
    [
        pd.read_csv(
            Path.cwd().parent.joinpath("Assignment3", "files", f"London_weather_{year}_heathrow.csv"),
            header=280,
        )
        for year in range(2010, 2020)
    ],
    axis=0,
    ignore_index=True,
)



In [None]:
# Preprocessing bike rental data
# The loaded sheet carries daily, monthly and yearly figures in separate column groups;
# split the monthly and yearly ones into their own frames and keep only the daily
# columns in bike_rentals.

bike_rentals_month = bike_rentals[['Month', 'Number of Bicycle Hires.1', 'Average Hire Time (mins)']]
# drop NaN rows
bike_rentals_month = bike_rentals_month[bike_rentals_month['Number of Bicycle Hires.1'].notna()]

bike_rentals_year = bike_rentals[['Year', 'Number of Bicycle Hires.2']]
# Keep the rows above the first missing year. argmax() on an all-False mask returns 0,
# which would slice the frame down to nothing, so only truncate when a gap really exists.
year_is_missing = bike_rentals_year['Year'].isna().to_numpy()
if year_is_missing.any():
    bike_rentals_year = bike_rentals_year.iloc[:year_is_missing.argmax()]
del year_is_missing

# Remove the spacer columns and the monthly/yearly columns extracted above.
bike_rentals = bike_rentals.drop(columns=[
    'Unnamed: 2', 'Unnamed: 5', 'Unnamed: 8', 'Month.1', 'Month', 'Number of Bicycle Hires.1',
    'Average Hire Time (mins)', 'Year', 'Number of Bicycle Hires.2',
])

bike_rentals.head()

In [None]:
# Preprocessing weather data

# drop columns that contain only NaN
weather = weather.dropna(axis=1, how='all')
# drop rows that contain only NaN
weather = weather.dropna(axis=0, how='all')
# drop columns version_num as all values = 1.0, id as all values = 3770 or NaN, met_domain_name as all values = SYNOP,
# id_type = WMO, 'src_id' = 697
weather = weather.drop(columns=['version_num', 'id', 'met_domain_name', 'id_type', 'src_id'])
# drop columns wind_direction, wind_speed, wind_direction_q, wind_speed_q only two values in these columns
weather = weather.drop(columns=['wind_direction', 'wind_speed', 'wind_direction_q', 'wind_speed_q'])
# drop column wind_speed_unit_id as all the wind columns consist of mostly NaNs
weather = weather.drop(columns=['wind_speed_unit_id'])
# drop columns as the method and manner of measurement are not relevant for this analysis
weather = weather.drop(columns=[
    'rec_st_ind', 'src_opr_type', 'air_temperature_q', 'dewpoint_q', 'wetb_temp_q', 'dewpoint_j',
    'wetb_temp_j', 'rltv_hum_j', 'midas_stmp_etime', 'meto_stmp_time',
])
# drop rows where air_temperature is NaN; copy so the column assignment below writes to
# an independent frame instead of a slice of the old one
weather = weather[weather['air_temperature'].notna()].copy()
# converting date to datetime
weather['ob_time'] = pd.to_datetime(weather['ob_time'])
# Average all observations of a calendar day so the result can be merged with the daily
# bike rental data. numeric_only=True is required on pandas >= 2: without it the
# ob_time column itself is aggregated (and then clashes with the index in reset_index)
# and any remaining text column raises. normalize() keeps the key as datetime64 at midnight.
weather = weather.groupby(weather['ob_time'].dt.normalize()).mean(numeric_only=True).reset_index()

## optional
# drop rows where dewpoint is NaN -> needed if dewpoint, wetb_temp, rltv_hum are used for
# analysis; if they are not, skipping this keeps more temperature measurements
weather = weather[weather['dewpoint'].notna()]

## checks
# NaN check 
weather.isna().sum()

In [None]:
# Preprocessing covid data

covid = (
    covid
    # keep only rows that carry an age band
    .loc[lambda frame: frame['age_band'] != 'unassigned']
    # the area columns are not needed further on
    .drop(columns=['area_name', 'area_code'])
    # parse the date column into datetimes
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)

In [None]:
# Preprocessing: combine bike rentals and weather data by joining on the date
# (default inner join, so only days present in both frames survive)

bike_rentals_weather = bike_rentals.merge(
    weather,
    left_on='Day',
    right_on='ob_time',
)

In [None]:
# Preprocessing: combine bike rentals and covid data by joining on the date
# (default inner join, so only days present in both frames survive)

bike_rentals_covid = bike_rentals.merge(
    covid,
    left_on='Day',
    right_on='date',
)

In [None]:
# Display the merged bike rental / weather frame for inspection.
bike_rentals_weather

In [None]:
# Bucket the lower bound of each age band into four groups. right=False makes the
# intervals left-closed: [0, 15), [15, 30), [30, 60), [60, 91).
bins = [0.0, 15.0, 30.0, 60.0, 91.0]
labels = ['Child', 'Young Adult', 'Middle-Age Adult', 'Senior-Citizen']
bike_rentals_covid['AgeGroup'] = pd.cut(
    x=bike_rentals_covid['age_lower'],
    bins=bins,
    right=False,
    labels=labels,
)

In [None]:
# The raw age band column is dropped from the merged frame.
bike_rentals_covid = bike_rentals_covid.drop(columns=["age_band"])

In [None]:
# Remove the unnamed columns that came along from the bike rental sheet.
bike_rentals_covid = bike_rentals_covid.drop(
    columns=[
        "Unnamed: 11",
        "Unnamed: 12",
        "Unnamed: 13",
        "Unnamed: 14",
        "Unnamed: 15",
        "Unnamed: 16",
        "Unnamed: 17",
    ]
)

In [None]:
# Display the merged bike rental / covid frame for inspection.
bike_rentals_covid

In [None]:
# Box plot of the 'cases' column, one box per age group.
bike_rentals_covid.boxplot(column='cases', by='AgeGroup')

In [None]:
# Derive calendar features from the 'Day' timestamp of every row.
bike_rentals_covid = bike_rentals_covid.assign(
    year=lambda frame: frame['Day'].dt.year,
    pickup_date=lambda frame: frame['Day'].dt.date,
    pickup_day=lambda frame: frame['Day'].dt.day,
    pickup_month=lambda frame: frame['Day'].dt.month,
    # day of the week (dow): Monday = 0 ... Sunday = 6
    pickup_dow=lambda frame: frame['Day'].dt.dayofweek,
    # type of day, stored as a string flag: '0' for Monday-Friday, '1' otherwise
    pickup_weekend=lambda frame: frame['Day'].dt.dayofweek.lt(5).map({True: '0', False: '1'}),
)

In [None]:
# Adding the season (meteorological seasons, looked up by month number)
bike_rentals_covid['season'] = bike_rentals_covid['Day'].dt.month.map({
    12: 'Winter', 1: 'Winter', 2: 'Winter',
    3: 'Spring', 4: 'Spring', 5: 'Spring',
    6: 'Summer', 7: 'Summer', 8: 'Summer',
    9: 'Autumn', 10: 'Autumn', 11: 'Autumn',
})

In [None]:
# The two raw date columns are no longer needed once the calendar features exist.
bike_rentals_covid = bike_rentals_covid.drop(columns=["Day", "date"])

In [None]:
# Distribution of the daily hire counts, one box per calendar month.
# The merge with the covid table repeats each day once per covid row of that date (the
# covid data has several age-band rows per date), so collapse to one row per day first;
# otherwise a day is weighted by how many age bands it happens to have.
daily_hires = bike_rentals_covid.drop_duplicates(subset='pickup_date')
bikes_month = [
    daily_hires.loc[daily_hires['pickup_month'] == month, 'Number of Bicycle Hires']
    for month in range(1, 13)
]

# Creating plot
figure, ax = plt.subplots(figsize=(15, 6))
ax.boxplot(bikes_month, showmeans=True, meanline=True)
# Empty proxy lines: they only exist so the legend can explain the mean/median markers.
ax.plot([], [], '--', linewidth=1, color='Green', label='mean')
ax.plot([], [], '-', linewidth=1, color='orange', label='median')
ax.set_xlabel("Months", fontsize=12)
# The plotted values are hires per day (grouped by month), not monthly totals.
ax.set_ylabel("Hires per day", fontsize=12)
ax.tick_params(axis='both', labelsize=12)
ax.legend(fontsize=12)
ax.set_title("Boxplot for the distribution of daily hires per month during corona time")
plt.show()

In [None]:
# Mean number of hires per (day of week, season).
# The hires column is selected *before* aggregating: calling .mean() on the whole group
# first tries to average the text/date columns as well, which raises on pandas >= 2.
covid_hires_per_season = (
    bike_rentals_covid
    .groupby(['pickup_dow', 'season'])['Number of Bicycle Hires']
    .mean()
    .reset_index()
)

In [None]:
# Display the per-season, per-weekday mean hires computed above.
covid_hires_per_season

In [None]:
# Mean number of hires per (day of week, weekend flag).
# The hires column is selected *before* aggregating: calling .mean() on the whole group
# first tries to average the text/date columns as well, which raises on pandas >= 2.
covid_hires_per_dow = (
    bike_rentals_covid
    .groupby(['pickup_dow', 'pickup_weekend'])['Number of Bicycle Hires']
    .mean()
    .reset_index()
)
covid_hires_per_dow

In [None]:
# Plotting pickups based on the type of the day and the season

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))

# Left panel: one line per value of the weekend flag.
for day_type in covid_hires_per_dow['pickup_weekend'].unique():
    day_type_rows = covid_hires_per_dow[covid_hires_per_dow['pickup_weekend'] == day_type]
    day_type_rows.plot(
        kind='line',
        x='pickup_dow',
        y='Number of Bicycle Hires',
        ax=axes[0],
        xlabel='Days of the week',
        ylabel='Hires',
    )
axes[0].legend(['Weekday', 'Weekend'])
axes[0].set_title('Hires per type of the day')

# Right panel: one line per season, legend placed outside the axes on the right.
for season_name in covid_hires_per_season['season'].unique():
    season_rows = covid_hires_per_season[covid_hires_per_season['season'] == season_name]
    season_rows.plot(
        kind='line',
        x='pickup_dow',
        y='Number of Bicycle Hires',
        ax=axes[1],
        xlabel='Days of the week',
        ylabel='Hires',
    )
axes[1].legend(
    covid_hires_per_season['season'].unique(),
    loc='center left',
    bbox_to_anchor=(1, 0.5),
)
axes[1].set_title('Hires per season')

In [None]:
# Mean number of covid cases per (day of week, age group).
# The cases column is selected *before* aggregating: calling .mean() on the whole group
# first tries to average the text/date columns as well, which raises on pandas >= 2.
covid_cases_per_agegroup = (
    bike_rentals_covid
    .groupby(['pickup_dow', 'AgeGroup'])['cases']
    .mean()
    .reset_index()
)
covid_cases_per_agegroup

In [None]:
# One line per age group: mean covid cases against the day of the week.
ax = plt.gca()
for agegroup in covid_cases_per_agegroup['AgeGroup'].unique():
    covid_cases_per_agegroup[covid_cases_per_agegroup.AgeGroup == agegroup].plot(
        kind='line',
        x='pickup_dow',
        y='cases',
        ax=ax,
        # the x axis carries pickup_dow, so label it as the day of the week
        xlabel='Day of the week',
        ylabel='Covid-19 cases',
        figsize=(6.5, 6),
    )
# Legend entries follow the plotting order above; placed outside the axes on the right.
plt.legend(covid_cases_per_agegroup['AgeGroup'].unique(), loc='center left', bbox_to_anchor=(1, 0.5))
plt.title('Covid-19 cases per day of the week per age group')

In [None]:
# Pairwise correlation of the numeric columns, drawn as an annotated heatmap.
# The frame also holds text, date and categorical columns (season, pickup_date,
# AgeGroup, ...); pandas >= 2 no longer drops those silently in corr() and raises
# instead, so restrict the input to numeric/bool columns explicitly.
corrMatrix = bike_rentals_covid.select_dtypes(include=['number', 'bool']).corr()
plt.figure(figsize=(20, 20))
sns.heatmap(corrMatrix, annot=True)
plt.show()

In [None]:
# Load the index of cycling usage files; the context manager closes the file handle
# (json.load(open(...)) left it open until garbage collection).
with open(Path.cwd().parent/"Assignment3"/"files"/"cycling-load.json") as cycling_load_file:
    cycling_load = json.load(cycling_load_file)

# insert loop here to download and concate data
# For now only the file referenced by entry 1 of the index is read.
tfl = pd.read_csv(cycling_load['entries'][1]['url'])

In [None]:
# Re-wrap tfl in a DataFrame (pd.read_csv already returns one where tfl is loaded) and display it.
tfl = pd.DataFrame(tfl)
tfl

In [None]:
# Load the bike point (docking station) records; the context manager closes the file
# handle (json.load(open(...)) left it open until garbage collection).
with open(Path.cwd().parent/"Assignment3"/"files"/"bikepoint.json") as bikepoint_file:
    bikepoint = json.load(bikepoint_file)

In [None]:
# Turn the loaded records into a table and discard the fields that are not used later.
bikepoint = pd.DataFrame(bikepoint).drop(
    columns=['$type', 'url', 'placeType', 'additionalProperties', 'children', 'childrenUrls']
)
bikepoint

In [None]:
# Give every bike point an integer station id that can be joined against the
# "StartStation Id" / "EndStation Id" columns of the trip data.
# Numbering the rows 1..N by position only matches the real station numbers while the
# list has no gaps and is perfectly ordered; any retired or missing station shifts every
# later row onto the wrong coordinates. Take the number from the existing id instead.
# NOTE(review): assumes the 'id' values end in the station number (TfL's BikePoint API
# uses the form "BikePoints_<n>") - confirm against bikepoint.json.
station_numbers = bikepoint['id'].astype(str).str.extract(r'(\d+)', expand=False)
if station_numbers.isna().any():
    raise ValueError("bikepoint 'id' values without a station number; cannot derive join keys")
bikepoint['id'] = station_numbers.astype(int)
# kept for backward compatibility with code that reads the id list
list_id = bikepoint['id'].tolist()
del station_numbers
bikepoint

In [None]:
# Attach station details to every trip: first those of the start station, then those
# of the end station. The id/lon/lat columns are renamed after each join so the two
# sets stay distinguishable.
data = (
    tfl
    .merge(bikepoint, left_on="StartStation Id", right_on="id")
    .rename(columns={'id': 'id_start', 'lon': 'lon_start', 'lat': 'lat_start'})
    .merge(bikepoint, left_on="EndStation Id", right_on="id")
    .rename(columns={'id': 'id_end', 'lon': 'lon_end', 'lat': 'lat_end'})
)
#data = data.dropna()
# Null count per column of the joined frame.
data.isnull().sum()

In [None]:
# Preview the first rows of the trip frame with start/end station details.
data.head()

In [None]:
# Read the ward boundary shapefile that serves as the background map.
street_map = gpd.read_file(
    Path.cwd().parent.joinpath("Assignment3", "London-wards-2014_ESRI", "London_Ward_CityMerged.shp")
)

# Empty frame that will collect the data needed for the geo-visualization.
data_location = pd.DataFrame()

In [None]:
#creating new column which contains combined latitude and longitude
data["location"] = list(zip(data["lat_start"], data["lon_start"]))

# Count how many trips start at each distinct location ("consumption").
# The count has to be computed per location: value_counts() returns the counts sorted
# from most to least frequent, so pasting that list next to the locations in
# first-appearance order attaches the numbers to the wrong stations.
# sort=False keeps the locations in order of first appearance.
station_counts = (
    data.groupby(["lat_start", "lon_start"], sort=False)
    .size()
    .reset_index(name="consumption")
)
# Storing only unique locations together with their own count
data_location = pd.DataFrame({
    "location": list(zip(station_counts["lat_start"], station_counts["lon_start"])),
    "consumption": station_counts["consumption"].to_list(),
})
del station_counts

In [None]:
# Split the (latitude, longitude) tuples stored in "location" into two coordinate lists.
# (The tuples are built as (lat, lon), so the first element is the latitude.)
latitude = [lat for lat, _ in data_location["location"]]
longitude = [lon for _, lon in data_location["location"]]

# Designate coordinate system: the values are geographic degrees, so the CRS must be a
# geographic one. EPSG:4544 is a projected Chinese grid (CGCS2000 Gauss-Kruger) and
# mislabels them. NOTE(review): WGS84 / EPSG:4326 assumed for the lat/lon values - confirm.
# CRS.from_epsg also replaces the deprecated CRS(init=...) form.
crs = CRS.from_epsg(4326)
# points_from_xy takes x first, then y: x is the longitude, y is the latitude.
geometry = gpd.points_from_xy(longitude, latitude)
# Create GeoPandas dataframe
geo_data = gpd.GeoDataFrame(data_location,
 crs = crs,
 geometry = geometry)

In [None]:
# Create figure and axes, assign to subplot
fig, ax = plt.subplots(figsize=(15, 15))
# The station points are longitude/latitude degrees, so bring the ward boundaries into
# the same degree-based coordinates (EPSG:4326) before drawing; layers in different
# coordinate systems do not line up. A layer without CRS information is drawn as is.
wards_in_degrees = street_map.to_crs("EPSG:4326") if street_map.crs is not None else street_map
# Add .shp mapfile to axes
wards_in_degrees.plot(ax=ax, alpha=0.4, color='grey')
# Add geodataframe to axes
geo_data.plot(column='consumption', ax=ax, alpha=0.5, legend=True, markersize=10)
ax.set_title("Consumption of bike stations", fontsize=15, fontweight='bold')
# Set longitude and latitude boundaries for map display from the stations themselves.
# (The previous fixed limits, -74.02..-73.9 / 40.64..40.84, lie around New York City and
# left the London map empty.)
if not geo_data.empty:
    min_lon, min_lat, max_lon, max_lat = geo_data.total_bounds
    margin = 0.01  # degrees of padding around the outermost stations
    ax.set_xlim(min_lon - margin, max_lon + margin)
    ax.set_ylim(min_lat - margin, max_lat + margin)
plt.show()