In [74]:
# imports 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime as dt
from haversine import haversine
import plotly.express as px
from keplergl import KeplerGl
import pandas as pd
import numpy as np
import swifter
from tqdm.auto import tqdm
import geopandas as gpd
# see all columns
pd.set_option('display.max_columns', None)
# set seaborn style
sns.set_style('white')
# set context to notebook
sns.set_context('notebook')

In [75]:
# load in the weather data from pickle
weather = pd.read_csv('../data/processed/2017_2018_municipality_weather.csv', parse_dates=['calculatedAt','from', 'to'])
weather.head()

Unnamed: 0,longitude,latitude,calculatedAt,from,to,municipalityName,parameterId,value
0,12.334461,55.622375,2019-02-16T05:04:43.709000,2017-07-01T23:00:00+00:00,2017-07-02 00:00:00+00:00,Ishøj,mean_wind_dir,281.0
1,12.368402,55.727751,2017-07-02T00:48:11.230000,2017-07-01T23:00:00+00:00,2017-07-02 00:00:00+00:00,Ballerup,max_wind_speed_10min,2.9
2,12.245738,55.585514,2017-09-26T12:43:00.311000,2017-07-01T23:00:00+00:00,2017-07-02 00:00:00+00:00,Greve,mean_temp,12.0
3,12.650228,55.593807,2017-07-02T00:48:58.307000,2017-07-01T23:00:00+00:00,2017-07-02 00:00:00+00:00,Dragør,mean_cloud_cover,0.0
4,12.072539,55.457299,2017-09-26T12:43:00.407000,2017-07-01T23:00:00+00:00,2017-07-02 00:00:00+00:00,Køge,mean_temp,12.5


| Parameter ID | Description |
| --- | --- |
| mean_wind_dir | Mean wind direction in degrees |
| max_wind_speed_10min | Maximum wind speed (10 minutes average) in m/s |
| mean_temp | Mean temperature in °C |
| mean_cloud_cover | Fraction of the sky covered by clouds of any type or height above the ground in % |
| max_temp_w_date | Maximum temperature with associated date in °C |
| bright_sunshine | Minutes/hours of bright sunshine |
| mean_relative_hum | Mean relative humidity in % |
| temp_grass | Air temperature measured at grass height (5-20 cm over terrain) in °C |
| mean_wind_speed | Mean wind speed in m/s |
| acc_precip | Accumulated precipitation in mm |
| min_temp | Minimum temperature in °C |
| no_summer_days | Number of summer days (maximum temperature > 25°C) |
| no_ice_days | Number of ice days (maximum temperature < 0°C) |
| acc_heating_degree_days_17 | Accumulated heating degree days (17°C - mean_temp) |
| no_frost_days | Number of days with frost (minimum temperature < 0°C) |
| max_precip_30m | Maximum 30 minutes intensity in 24 hours with date in mm |
| no_days_acc_precip_10 | Number of days with accumulated precipitation >= 10mm |


In [76]:
# drop calculatedAt column
weather.drop(columns='calculatedAt', inplace=True)

In [77]:
# save first 200000 rows to csv as weather_short.csv
weather[:200000].to_csv('../data/processed/weather_short.csv', index=False)

In [78]:
# count values inside parameterId column
weather.parameterId.value_counts()

parameterId
mean_wind_dir                 267274
mean_relative_hum             267274
max_wind_speed_10min          267274
acc_precip                    267274
mean_wind_speed               267274
temp_grass                    267274
min_temp                      267274
max_temp_w_date               267274
mean_cloud_cover              267274
mean_temp                     267274
bright_sunshine               267240
acc_heating_degree_days_17     11050
no_frost_days                  11050
max_precip_30m                 11050
no_summer_days                 10880
no_ice_days                    10880
snow_depth                      7446
no_days_acc_precip_10           4760
no_tropical_nights              4658
mean_daily_min_temp             4658
mean_daily_max_temp             4658
Name: count, dtype: int64

In [79]:
# Re-import the required pandas library after environment reset
import pandas as pd

# Convert 'from' column to datetime
weather['from'] = pd.to_datetime(weather['from'], errors='coerce', infer_datetime_format=True)

# Pivot the table again without 'calculatedAt'
weather_wide = weather.pivot_table(index=['longitude', 'latitude', 'from', 'to', 'municipalityName'],
                                   columns='parameterId',
                                   values='value',
                                   aggfunc='first').reset_index()

# Now, we will fill NaN values for each parameter for a specific 'from' and 'municipalityName'
# We group by 'from' and 'municipalityName' and then apply a forward-fill followed by backward-fill within each group

# Define a function to apply forward-fill followed by backward-fill
def fill_missing_values(group):
    return group.ffill().bfill()

# Group by 'from' and 'municipalityName' and apply the fill function
weather_filled = weather_wide.groupby(['from', 'municipalityName'], sort=False).apply(fill_missing_values).reset_index(drop=True)

# Check the result after filling
weather_filled.head(30)



The argument 'infer_datetime_format' is deprecated and will be removed in a future version. A strict version of it is now the default, see https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. You can safely remove this argument.



parameterId,longitude,latitude,from,to,municipalityName,acc_precip,bright_sunshine,max_precip_30m,max_temp_w_date,max_wind_speed_10min,mean_cloud_cover,mean_relative_hum,mean_temp,mean_wind_dir,mean_wind_speed,min_temp,snow_depth,temp_grass
0,11.815109,55.469094,2017-07-01 00:00:00+00:00,2017-07-01 01:00:00+00:00,Ringsted,0.8,0.0,,13.8,2.4,95.0,98.8,13.6,25.0,1.9,13.4,,13.6
1,11.815109,55.469094,2017-07-01 01:00:00+00:00,2017-07-01 02:00:00+00:00,Ringsted,0.8,0.0,,13.8,3.0,91.0,98.8,13.5,13.0,2.3,13.3,,13.5
2,11.815109,55.469094,2017-07-01 02:00:00+00:00,2017-07-01 03:00:00+00:00,Ringsted,0.2,0.0,,13.7,3.4,99.0,99.1,13.4,6.0,2.7,13.2,,13.4
3,11.815109,55.469094,2017-07-01 03:00:00+00:00,2017-07-01 04:00:00+00:00,Ringsted,0.2,0.0,,13.6,3.7,98.0,98.9,13.4,5.0,2.9,13.1,,13.5
4,11.815109,55.469094,2017-07-01 04:00:00+00:00,2017-07-01 05:00:00+00:00,Ringsted,0.3,0.0,,13.8,4.1,98.0,98.5,13.5,8.0,3.3,13.2,,13.5
5,11.815109,55.469094,2017-07-01 05:00:00+00:00,2017-07-01 06:00:00+00:00,Ringsted,0.2,0.1,,14.1,4.1,93.0,97.9,13.6,10.0,3.3,13.3,,13.7
6,11.815109,55.469094,2017-07-01 06:00:00+00:00,2017-07-01 07:00:00+00:00,Ringsted,0.0,0.2,,14.4,4.2,98.0,96.3,13.9,7.0,3.5,13.6,,14.1
7,11.815109,55.469094,2017-07-01 07:00:00+00:00,2017-07-01 08:00:00+00:00,Ringsted,0.0,0.4,,14.8,5.0,95.0,94.7,14.2,4.0,3.8,13.8,,14.5
8,11.815109,55.469094,2017-07-01 08:00:00+00:00,2017-07-01 09:00:00+00:00,Ringsted,0.0,0.1,,15.1,4.6,95.0,93.1,14.4,5.0,3.9,13.9,,14.8
9,11.815109,55.469094,2017-07-01 09:00:00+00:00,2017-07-01 10:00:00+00:00,Ringsted,0.0,0.1,,15.5,5.2,96.0,91.0,14.8,5.0,4.0,14.1,,15.3


In [80]:
# sort values by from column and municipalityName column and reset index
weather_filled.sort_values(by=['from', 'municipalityName'], inplace=True)
weather_filled.reset_index(drop=True, inplace=True)
weather_filled.head()

parameterId,longitude,latitude,from,to,municipalityName,acc_precip,bright_sunshine,max_precip_30m,max_temp_w_date,max_wind_speed_10min,mean_cloud_cover,mean_relative_hum,mean_temp,mean_wind_dir,mean_wind_speed,min_temp,snow_depth,temp_grass
0,12.352321,55.684977,2017-07-01 00:00:00+00:00,2017-07-01 01:00:00+00:00,Albertslund,0.6,0.0,,13.8,2.4,93.0,98.7,13.6,15.0,2.0,13.4,,13.5
1,12.315178,55.851931,2017-07-01 00:00:00+00:00,2017-07-01 01:00:00+00:00,Allerød,0.2,0.0,,13.6,1.6,99.0,99.3,13.4,8.0,1.0,13.3,,13.2
2,12.368402,55.727751,2017-07-01 00:00:00+00:00,2017-07-01 01:00:00+00:00,Ballerup,0.3,0.0,,13.8,2.0,94.0,99.2,13.5,13.0,1.6,13.3,,13.4
3,12.404382,55.645037,2017-07-01 00:00:00+00:00,2017-07-01 01:00:00+00:00,Brøndby,0.9,0.0,,14.0,3.3,93.0,98.8,13.7,14.0,2.2,13.5,,13.6
4,12.650228,55.593807,2017-07-01 00:00:00+00:00,2017-07-01 01:00:00+00:00,Dragør,2.1,0.0,,14.1,4.1,100.0,96.9,13.8,11.0,3.7,13.7,,13.7


In [81]:
weather_filled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 274720 entries, 0 to 274719
Data columns (total 18 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   longitude             274720 non-null  float64
 1   latitude              274720 non-null  float64
 2   from                  274720 non-null  object 
 3   to                    274720 non-null  object 
 4   municipalityName      274720 non-null  object 
 5   acc_precip            263670 non-null  float64
 6   bright_sunshine       263636 non-null  float64
 7   max_precip_30m        11050 non-null   float64
 8   max_temp_w_date       263670 non-null  float64
 9   max_wind_speed_10min  263670 non-null  float64
 10  mean_cloud_cover      263670 non-null  float64
 11  mean_relative_hum     263670 non-null  float64
 12  mean_temp             263670 non-null  float64
 13  mean_wind_dir         263670 non-null  float64
 14  mean_wind_speed       263670 non-null  float64
 15  

In [None]:
# save to csv as weather_compact.csv
weather_filled.to_csv('../data/processed/weather_compact.csv', index=False)

In [69]:
# drop the rows with nan values for acc_precip
#weather_filled.dropna(subset=['acc_precip'], inplace=True)
#weather_filled.info()

<class 'pandas.core.frame.DataFrame'>
Index: 263670 entries, 0 to 274719
Data columns (total 18 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   longitude             263670 non-null  float64
 1   latitude              263670 non-null  float64
 2   from                  263670 non-null  object 
 3   to                    263670 non-null  object 
 4   municipalityName      263670 non-null  object 
 5   acc_precip            263670 non-null  float64
 6   bright_sunshine       263636 non-null  float64
 7   max_precip_30m        0 non-null       float64
 8   max_temp_w_date       263670 non-null  float64
 9   max_wind_speed_10min  263670 non-null  float64
 10  mean_cloud_cover      263670 non-null  float64
 11  mean_relative_hum     263670 non-null  float64
 12  mean_temp             263670 non-null  float64
 13  mean_wind_dir         263670 non-null  float64
 14  mean_wind_speed       263670 non-null  float64
 15  min_t

In [None]:
'''# Before pivoting, let's ensure that 'from' column is of datetime type for proper sorting.
weather['from'] = pd.to_datetime(weather['from'], errors='coerce')

# Now let's pivot the table
weather_wide = weather.pivot_table(index=['longitude', 'latitude','from', 'to', 'municipalityName'],
                                        columns='parameterId',
                                        values='value',
                                        aggfunc='first', ).reset_index()

# Fill NaN values with 0
#weather_wide = weather_wide.fillna(0)

# Let's also check the result to ensure it's transformed as expected
weather_wide.head()'''


parameterId,longitude,latitude,from,to,municipalityName,acc_precip,bright_sunshine,max_precip_30m,max_temp_w_date,max_wind_speed_10min,mean_cloud_cover,mean_relative_hum,mean_temp,mean_wind_dir,mean_wind_speed,min_temp,snow_depth,temp_grass
0,11.815109,55.469094,2017-07-01 00:00:00+00:00,2017-07-01 01:00:00+00:00,Ringsted,0.8,0.0,,13.8,2.4,95.0,98.8,13.6,25.0,1.9,13.4,,13.6
1,11.815109,55.469094,2017-07-01 01:00:00+00:00,2017-07-01 02:00:00+00:00,Ringsted,0.8,0.0,,13.8,3.0,91.0,98.8,13.5,13.0,2.3,13.3,,13.5
2,11.815109,55.469094,2017-07-01 02:00:00+00:00,2017-07-01 03:00:00+00:00,Ringsted,0.2,0.0,,13.7,3.4,99.0,99.1,13.4,6.0,2.7,13.2,,13.4
3,11.815109,55.469094,2017-07-01 03:00:00+00:00,2017-07-01 04:00:00+00:00,Ringsted,0.2,0.0,,13.6,3.7,98.0,98.9,13.4,5.0,2.9,13.1,,13.5
4,11.815109,55.469094,2017-07-01 04:00:00+00:00,2017-07-01 05:00:00+00:00,Ringsted,0.3,0.0,,13.8,4.1,98.0,98.5,13.5,8.0,3.3,13.2,,13.5


In [None]:
# for each row of the dataframe, populate the nan value with the value recorded at the same time in the previous or next rows. most important is to be the same time 
#weather_wide = weather_wide.sort_values(by= 'from').reset_index(drop=True)
#weather_wide = weather_wide.fillna(method='ffill')

In [None]:
# sort data by 'from' column
#weather_wide.sort_values(by='from', inplace=True)

In [None]:
#weather_wide.info()

<class 'pandas.core.frame.DataFrame'>
Index: 274720 entries, 0 to 274719
Data columns (total 18 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   longitude             274720 non-null  float64
 1   latitude              274720 non-null  float64
 2   from                  274720 non-null  object 
 3   to                    274720 non-null  object 
 4   municipalityName      274720 non-null  object 
 5   acc_precip            256224 non-null  float64
 6   bright_sunshine       256190 non-null  float64
 7   max_precip_30m        11050 non-null   float64
 8   max_temp_w_date       256224 non-null  float64
 9   max_wind_speed_10min  256224 non-null  float64
 10  mean_cloud_cover      256224 non-null  float64
 11  mean_relative_hum     256224 non-null  float64
 12  mean_temp             256224 non-null  float64
 13  mean_wind_dir         256224 non-null  float64
 14  mean_wind_speed       256224 non-null  float64
 15  min_t

In [71]:
# from and to coluns are of this format 2017-07-01 00:00:00+00:00
# we need to remove the time zone information
# we will use the apply function to remove the time zone information
weather_wide['from'] = weather_wide['from'].apply(lambda x: x.replace(tzinfo=None))

In [72]:
# find out how namy days are in the dataset. the from is a timespamo, so we can use the dt.date function to extract the date
weather_wide['from'].dt.date.nunique()

314

In [73]:
# find teh first and last date in the dataset
weather_wide['from'].min(), weather_wide['from'].max()

(Timestamp('2017-07-01 00:00:00'), Timestamp('2018-05-10 23:31:00'))

In [None]:
# load gm dataset
gm = pd.read_csv('../data/processed/gm_raw.csv', parse_dates=['tripStart','tripEnd'])

In [None]:
# create new column "tripStarthour" and round up to nearest hour
gm['tripStarthour'] = gm['tripStart'].dt.round('H')

In [None]:
import plotly.graph_objects as go

fig = go.Figure()

# Trim the data until July 10th
trimmed_weather = weather_wide[weather_wide['from'] <= '2017-07-10']
trimmed_gm = gm[gm['tripStarthour'] <= '2017-07-10']

# Plot Accumulated Precipitation
fig.add_trace(go.Scatter(x=trimmed_weather['from'], y=trimmed_weather['acc_precip'],
                         mode='lines',
                         marker=dict(symbol='circle'),
                         name='Accumulated Precipitation',
                         ))

# Plot Count of Trips
fig.add_trace(go.Scatter(x=trimmed_gm['tripStarthour'].value_counts().index, y=trimmed_gm['tripStarthour'].value_counts().values,
                         mode='markers',
                         name='Count of Trips',
                         yaxis='y2'))

fig.update_layout(title='Accumulated Precipitation and Count of Trips', width=1500, height=800,
                  xaxis=dict(title='Date'),
                  yaxis=dict(title='Accumulated Precipitation'),
                  yaxis2=dict(title='Count of Trips', overlaying='y', side='right'))

fig.show()



TypeError: '<=' not supported between instances of 'Timestamp' and 'str'

In [None]:
# use plotly to plot the acc_precip against from column
#fig = px.line(weather_wide, x='from', y='acc_precip', title='Accumulated Precipitation')
#fig.show()

In [None]:
weather_wide.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2836926 entries, 0 to 2836925
Data columns (total 19 columns):
 #   Column                Dtype  
---  ------                -----  
 0   longitude             float64
 1   latitude              float64
 2   calculatedAt          object 
 3   from                  object 
 4   to                    object 
 5   municipalityName      object 
 6   acc_precip            float64
 7   bright_sunshine       float64
 8   max_precip_30m        float64
 9   max_temp_w_date       float64
 10  max_wind_speed_10min  float64
 11  mean_cloud_cover      float64
 12  mean_relative_hum     float64
 13  mean_temp             float64
 14  mean_wind_dir         float64
 15  mean_wind_speed       float64
 16  min_temp              float64
 17  snow_depth            float64
 18  temp_grass            float64
dtypes: float64(15), object(4)
memory usage: 411.2+ MB


In [None]:
weather.municipalityName.value_counts()

municipalityName
Ishøj             88855
Frederiksberg     88855
Hørsholm          88855
Egedal            88855
Gentofte          88855
Allerød           88855
Helsingør         88855
Herlev            88855
Albertslund       88855
Ballerup          88855
Lejre             88855
Rudersdal         88855
Hillerød          88855
Rødovre           88855
Brøndby           88855
Frederikssund     88855
Ringsted          88855
Roskilde          88855
Halsnæs           88855
Høje-Taastrup     88855
Fredensborg       88855
Solrød            88855
København         88855
Gribskov          88855
Gladsaxe          88855
Lyngby-Taarbæk    88855
Hvidovre          88855
Glostrup          88855
Vallensbæk        88855
Furesø            88855
Køge              88855
Dragør            88855
Greve             88855
Tårnby            88855
Name: count, dtype: int64

In [None]:
# read in the shape file for the municipalities
municipalities = gpd.read_file('../data/processed/cop_area.shp')

In [None]:
map_1 = KeplerGl(height = 800)
map_1

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


KeplerGl(height=800)

In [None]:
#map_1.add_data(data=weather.head(40000), name='weather')
#map_1.add_data(data=municipalities, name='municipalities')


In [None]:
weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3021070 entries, 0 to 3021069
Data columns (total 8 columns):
 #   Column            Dtype  
---  ------            -----  
 0   longitude         float64
 1   latitude          float64
 2   calculatedAt      object 
 3   from              object 
 4   to                object 
 5   municipalityName  object 
 6   parameterId       object 
 7   value             float64
dtypes: float64(3), object(5)
memory usage: 184.4+ MB


In [None]:
# ispect the te