# Data Processing

The purpose of this notebook is to download the data and process it. As a result, we will get a dataset ready for further analysis and modeling.

# Table of Contents

[1. Imports & Environment Configuration](#Imports-&-Environment-Configuration)

# Imports & Environment Configuration

In [1]:
%load_ext autoreload

import json
import matplotlib.pyplot as plt
import os
import pandas as pd

if os.path.basename(os.getcwd()) == 'notebooks':
    # Make it possible to import modules
    import sys
    sys.path.append("../") 

from src.data.data_loading.bike_rental_data_downloader import BikeRentalDataDownloader
from src.data.data_loading.bike_rental_records import BikeRentalRecords
from src.data.data_loading.bike_station_locations import BikeStationsLocations

In [2]:
# Render matplotlib plots inline, as part of the Jupyter Notebook output
%matplotlib inline

# Ask the inline backend for 'retina' figures, i.e. higher-resolution plots
%config InlineBackend.figure_format = 'retina'

## Environment Configuration Variables

In [3]:
# Resolve the root folder of the project. The working directory differs
# between front ends (Jupyter Notebook vs. VS Code): when the notebook runs
# from inside the 'notebooks' folder the root is its parent folder, otherwise
# the working directory itself is taken as the root.
project_root_folder = (
    os.path.abspath(os.path.join(os.getcwd(), '..'))
    if os.path.basename(os.getcwd()) == 'notebooks'
    else os.getcwd()
)

# Data folders, both located under '<project root>/data'
data_raw_folder = os.path.join(project_root_folder, 'data', 'raw')
data_interim_folder = os.path.join(project_root_folder, 'data', 'interim')

# Load data

## Bike rental records

### Configuration

In [4]:
# Connection details for the Azure Storage container with the data.
# These are placeholders: fill them in by hand, or let them be overridden
# below from the Azure Functions settings file.
account_name = '<storage_account_name>'
account_key = '<storage_account_key>'
container_name = '<storage_container_name>'

# Alternatively, the details are taken from the 'Values' section of the
# 'local.settings.json' file (used by Azure Functions), if that file exists.
local_settings_file_path = os.path.join(project_root_folder,
                                        'src', 'azurefunctions', 'local.settings.json')
if os.path.exists(local_settings_file_path):
    # Read with an explicit encoding: JSON is UTF-8, whereas open() would
    # otherwise fall back to the platform's locale encoding.
    with open(local_settings_file_path, 'r', encoding='utf-8') as f:
        local_settings = json.load(f)

    account_name = local_settings['Values']['storage_account_name']
    account_key = local_settings['Values']['storage_account_key']
    container_name = local_settings['Values']['storage_container_name']

### Load

In [5]:
# Download all data from the configured Azure Blob Storage container
# and save it locally in the raw data folder
blob_downloader = BikeRentalDataDownloader(account_name, account_key, container_name)
blob_downloader.download_blobs_and_save(data_raw_folder)

In [6]:
# Read all local csv files from the raw data folder and combine them into one dataframe
brr_loader = BikeRentalRecords(data_raw_folder)
bike_rentals_df = brr_loader.load_data()

In [7]:
# Sanity check: make sure the column data types are what we want
bike_rentals_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 430531 entries, 76160681 to 88278425
Data columns (total 6 columns):
Bike number        430531 non-null int64
Rental datetime    430531 non-null datetime64[ns]
Return datetime    430531 non-null datetime64[ns]
Rental station     430531 non-null object
Return station     430531 non-null object
Duration           430531 non-null timedelta64[ns]
dtypes: datetime64[ns](2), int64(1), object(2), timedelta64[ns](1)
memory usage: 23.0+ MB


In [8]:
# Peek at a few random records. The fixed random_state makes the sample (and
# therefore the stored cell output) the same on every run of the notebook.
bike_rentals_df.sample(5, random_state=42)

Unnamed: 0_level_0,Bike number,Rental datetime,Return datetime,Rental station,Return station,Duration
UID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
84025572,650956,2019-09-03 07:32:50,2019-09-03 07:45:20,Pereca / Grabiszyńska,Kurkowa / Dubois,00:12:30
76288747,650933,2019-06-26 22:07:02,2019-06-26 22:13:41,Strzegomska / Estońska,Rogowska / Zemska,00:06:39
87578775,651017,2019-10-15 09:46:00,2019-10-15 10:14:00,Żernicka,Grabiszyńska / Aleja Hallera,00:28:00
78504310,650783,2019-07-15 20:49:00,2019-07-15 21:09:00,Tarnogajska / Klimasa,Powstańcow Śląskich / Aleja Hallera,00:20:00
85864555,650637,2019-09-22 14:30:00,2019-09-22 14:53:00,Borowska (Uniw. Szpital Kliniczny),Ślężna / pętla tramwajowa,00:23:00


In [9]:
# Check for missing data: count the NaN values in each column of the rental records
bike_rentals_df.isna().sum()

Bike number        0
Rental datetime    0
Return datetime    0
Rental station     0
Return station     0
Duration           0
dtype: int64

## Bike station details

In [10]:
# Load the bike station details into a dataframe
bike_stations_df = BikeStationsLocations().load_data()

In [11]:
# Preview the station table: station name with its latitude and longitude
bike_stations_df

Unnamed: 0,Bike station,Latitude,Longitude
0,Plac Dominikański (Galeria Dominikańska),51.108004,17.039528
1,"Dworzec Główny, południe",51.097108064432,17.036109566688538
2,Rynek,51.109782,17.030175
3,Dworzec Główny,51.09975,17.036228
4,Nowowiejska / Jedności Narodowej,51.124879,17.045844
...,...,...,...
198,pl. Orląt Lwowskich,51.10823,17.02138
199,Komandorska / Kamienna,51.09009,17.0235
200,"Wrocław Stadion, stacja kolejowa",51.13707,16.94095
201,"Wrocław Leśnica, stacja kolejowa",51.14323,16.86627


In [12]:
# Check for missing data: count the NaN values in each column of the station table
bike_stations_df.isna().sum()

Bike station    0
Latitude        0
Longitude       0
dtype: int64

## Join datasets

In [13]:
# Attach the coordinates of the rental station to every rental record.
# - how='left' keeps rentals whose station name has no match in the station
#   table; their coordinates are left as NaN.
# - validate='many_to_one' makes pandas raise a MergeError if a station name
#   occurs more than once in the station table, instead of silently
#   duplicating rental records.
# NOTE: merging on columns does not keep the index of either frame; the result
# gets a fresh default index.
bike_rentals_df = pd.merge(bike_rentals_df,
                           bike_stations_df,
                           how='left',
                           left_on='Rental station',
                           right_on='Bike station',
                           validate='many_to_one')

In [14]:
# Give the joined coordinate columns names that say which station they belong
# to, and discard the join key coming from the station table (it duplicates
# the 'Rental station' column).
bike_rentals_df = (
    bike_rentals_df
    .rename(columns={'Latitude': 'Rental station latitude',
                     'Longitude': 'Rental station longitude'})
    .drop(columns=['Bike station'])
)

In [15]:
# Preview the first rows after adding the rental station coordinates
bike_rentals_df.head()

Unnamed: 0,Bike number,Rental datetime,Return datetime,Rental station,Return station,Duration,Rental station latitude,Rental station longitude
0,57719,2019-06-26 00:00:10,2019-06-26 00:06:19,Skarbowców / Wietrzna,Krzycka / Aleja Karkonoska (Park Południowy),00:06:09,51.07329,16.99485
1,650480,2019-06-26 00:00:13,2019-06-26 00:06:59,Rynek,Plac Legionów,00:06:46,51.109782,17.030175
2,650988,2019-06-26 00:00:15,2019-06-26 00:13:32,Poza oficjalną stacją,Wałbrzyska - pętla tramwajowa,00:13:17,,
3,57603,2019-06-26 00:00:21,2019-06-26 00:23:53,Plac Uniwersytecki (UWr),Legnicka / Wejherowska,00:23:32,51.113871,17.034484
4,650067,2019-06-26 00:00:40,2019-06-26 00:04:40,Powstańców Śląskich (Arkady Wrocławskie),Powstańców Śląskich (Arkady Wrocławskie),00:04:00,51.099713,17.027905


In [16]:
# Attach the coordinates of the return station to every rental record.
# - how='left' keeps rentals whose station name has no match in the station
#   table; their coordinates are left as NaN.
# - validate='many_to_one' makes pandas raise a MergeError if a station name
#   occurs more than once in the station table, instead of silently
#   duplicating rental records.
bike_rentals_df = pd.merge(bike_rentals_df,
                           bike_stations_df,
                           how='left',
                           left_on='Return station',
                           right_on='Bike station',
                           validate='many_to_one')

In [17]:
# Give the joined coordinate columns names that say which station they belong
# to, and discard the join key coming from the station table (it duplicates
# the 'Return station' column).
bike_rentals_df = (
    bike_rentals_df
    .rename(columns={'Latitude': 'Return station latitude',
                     'Longitude': 'Return station longitude'})
    .drop(columns=['Bike station'])
)

In [18]:
# Preview the first rows after adding the return station coordinates
bike_rentals_df.head()

Unnamed: 0,Bike number,Rental datetime,Return datetime,Rental station,Return station,Duration,Rental station latitude,Rental station longitude,Return station latitude,Return station longitude
0,57719,2019-06-26 00:00:10,2019-06-26 00:06:19,Skarbowców / Wietrzna,Krzycka / Aleja Karkonoska (Park Południowy),00:06:09,51.07329,16.99485,51.074992,17.007058
1,650480,2019-06-26 00:00:13,2019-06-26 00:06:59,Rynek,Plac Legionów,00:06:46,51.109782,17.030175,51.104413,17.022536
2,650988,2019-06-26 00:00:15,2019-06-26 00:13:32,Poza oficjalną stacją,Wałbrzyska - pętla tramwajowa,00:13:17,,,51.06577709842137,16.98857545852661
3,57603,2019-06-26 00:00:21,2019-06-26 00:23:53,Plac Uniwersytecki (UWr),Legnicka / Wejherowska,00:23:32,51.113871,17.034484,51.125276,16.984447
4,650067,2019-06-26 00:00:40,2019-06-26 00:04:40,Powstańców Śląskich (Arkady Wrocławskie),Powstańców Śląskich (Arkady Wrocławskie),00:04:00,51.099713,17.027905,51.099713,17.027905


In [19]:
# Count the missing values per column after both joins:
# - missing rental station coordinates -> bikes were rented outside official bike stations
# - missing return station coordinates -> bikes were returned outside official bike stations
bike_rentals_df.isna().sum()

Bike number                     0
Rental datetime                 0
Return datetime                 0
Rental station                  0
Return station                  0
Duration                        0
Rental station latitude     55813
Rental station longitude    55813
Return station latitude     53030
Return station longitude    53030
dtype: int64

## Save interim results

In [None]:
# Make sure the interim data folder exists before writing: to_csv does not
# create missing folders and would fail on a checkout where 'data/interim'
# has not been created yet.
os.makedirs(data_interim_folder, exist_ok=True)

# Write the joined rental records to the interim file (the dataframe index is
# not written)
bike_rentals_records_interim_path = os.path.join(data_interim_folder, 'bike_rental_records.csv')
bike_rentals_df.to_csv(path_or_buf=bike_rentals_records_interim_path, index=False)