# Data Processing

The purpose of this notebook is to download data and process it. As a result we will get a dataset ready for further analyses and modeling.

# Table of Contents

[1. Imports & Environment Configuration](#Imports-&-Environment-Configuration)

# Imports & Environment Configuration

In [1]:
%load_ext autoreload

import glob
import json
import matplotlib.pyplot as plt
import os
import pandas as pd

if os.path.basename(os.getcwd()) == 'notebooks':
    # Make it possible to import modules
    import sys
    sys.path.append("../") 

from src.data.bike_stations_data_downloader import get_bike_stations_data
from src.data.blob_downloader import BlobDownloader

In [2]:
# let matplotlib plots be part of Jupyter Notebook
%matplotlib inline

# set up higher resolution for matplotlib plots
%config InlineBackend.figure_format = 'retina'

## Environment Configuration Variables

In [3]:
# Determine the root folder of the project. The current working directory 
# may vary depending on whether the notebook is run in Jupyter Notebook or VS Code
if os.path.basename(os.getcwd()) == 'notebooks':
    project_root_folder = os.path.abspath(os.path.join(os.getcwd(), '..'))   
else:
    project_root_folder = os.getcwd()
    
# Set up paths to data folders
data_raw_folder     = os.path.join(project_root_folder, 'data', 'raw')
data_interim_folder = os.path.join(project_root_folder, 'data', 'interim')

# Download data

Download all needed data and clean it.

## Bike Station Details data

### Download

In [4]:
bike_stations_path = os.path.join(project_root_folder, 'data', 'raw', 'bike_stations.csv')

if not os.path.exists(bike_stations_path):
    stations_data = get_bike_stations_data()
    stations_data.to_csv(bike_stations_path)
else:
    stations_data = pd.read_csv(bike_stations_path)

### Clean

In [5]:
# drop first column
stations_data.drop(stations_data.columns[0], axis=1, inplace=True)

print(stations_data.columns)

Index(['Station no', 'Nazwa stacji', 'Available bikes', 'Coordinates',
       'Racks count', 'Numers of available bikes'],
      dtype='object')


In [6]:
# Remove records that have no station number. 
# These records concern bikes that were returned to some places outside bike stations.
stations_data.dropna(subset=['Station no'], inplace=True)

In [7]:
# Get only bike station name and coordinates
stations_data = stations_data[['Nazwa stacji', 'Coordinates']]

In [8]:
# rename columns to make it legible for English-speaking audience
stations_data.columns = ['Bike station', 'Coordinates']

In [9]:
stations_data.head()

Unnamed: 0,Bike station,Coordinates
0,Plac Dominikański (Galeria Dominikańska),"51.108004, 17.039528"
1,"Dworzec Główny, południe","51.097108064432, 17.036109566688538"
2,Rynek,"51.109782, 17.030175"
3,Dworzec Główny,"51.09975, 17.036228"
4,Nowowiejska / Jedności Narodowej,"51.124879, 17.045844"


In [10]:
gps_coordinates = stations_data['Coordinates'].str.split(', ', n = 1, expand = True)

In [11]:
stations_data.loc[:, 'Latitude'] = gps_coordinates[0]
stations_data.loc[:, 'Longitude'] = gps_coordinates[1]

In [12]:
stations_data.drop("Coordinates", axis=1, inplace=True)

In [13]:
stations_data

Unnamed: 0,Bike station,Latitude,Longitude
0,Plac Dominikański (Galeria Dominikańska),51.108004,17.039528
1,"Dworzec Główny, południe",51.097108064432,17.036109566688538
2,Rynek,51.109782,17.030175
3,Dworzec Główny,51.09975,17.036228
4,Nowowiejska / Jedności Narodowej,51.124879,17.045844
...,...,...,...
198,pl. Orląt Lwowskich,51.10823,17.02138
199,Komandorska / Kamienna,51.09009,17.0235
200,"Wrocław Stadion, stacja kolejowa",51.13707,16.94095
201,"Wrocław Leśnica, stacja kolejowa",51.14323,16.86627


### Save interim results

In [14]:
bike_station_details_interim_path = os.path.join(data_interim_folder, 'bike_station_coordinates.csv')
stations_data.to_csv(path_or_buf=bike_station_details_interim_path, index=False)

## Bike rental records

### Download

In [15]:
# Provide config details for the Azure Storage container with data.
account_name='<storage_account_name>'
account_key='<storage_account_key>'
container_name='<storage_container_name>' 

# Alternatively, they can be grabbed from the 'local.settings.json' file 
# (used by Azure Functions), if it exists.
local_settings_file_path = os.path.join(project_root_folder, 
                                        'src', 'azurefunctions', 'local.settings.json')
if os.path.exists(local_settings_file_path):
    with open(local_settings_file_path, 'r') as f:
        local_settings = json.load(f)
        
    account_name = local_settings['Values']['storage_account_name']
    account_key = local_settings['Values']['storage_account_key']
    container_name = local_settings['Values']['storage_container_name']

In [16]:
# Download all data from Azure Blob Storage and save it locally 
blob_downloader = BlobDownloader(account_name, account_key, container_name)
blob_downloader.download_blobs_and_save(data_raw_folder)

File already downloaded: Historia_przejazdow_2019-10-10_15_37_27.csv
File already downloaded: Historia_przejazdow_2019-10-11_15_34_34.csv
File already downloaded: Historia_przejazdow_2019-10-16_14_42_31.csv
File already downloaded: Historia_przejazdow_2019-10-17_14_45_15.csv
File already downloaded: Historia_przejazdow_2019-10-18_14_45_31.csv
File already downloaded: Historia_przejazdow_2019-10-19_14_44_34.csv
File already downloaded: Historia_przejazdow_2019-10-1_15_28_35.csv
File already downloaded: Historia_przejazdow_2019-10-20_14_46_37.csv
File already downloaded: Historia_przejazdow_2019-10-21_14_49_54.csv
File already downloaded: Historia_przejazdow_2019-10-22_14_49_13.csv
File already downloaded: Historia_przejazdow_2019-10-23_14_48_12.csv
File already downloaded: Historia_przejazdow_2019-10-2_15_32_45.csv
File already downloaded: Historia_przejazdow_2019-10-3_15_31_46.csv
File already downloaded: Historia_przejazdow_2019-10-4_15_31_0.csv
File already downloaded: Historia_przej

### Load multiple csv files

In [17]:
dfs = []

# Get filenames and load data to 
for filename in glob.glob(os.path.join(data_raw_folder, 'Historia_przejazdow_*.csv')):
    dfs.append(pd.read_csv(filename, parse_dates=['Data wynajmu', 'Data zwrotu']))

# Concatenate all data into one DataFrame
big_frame = pd.concat(dfs, ignore_index=True)

In [18]:
print(big_frame.shape)

(5553983, 7)


In [19]:
big_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5553983 entries, 0 to 5553982
Data columns (total 7 columns):
UID wynajmu       int64
Numer roweru      int64
Data wynajmu      datetime64[ns]
Data zwrotu       datetime64[ns]
Stacja wynajmu    object
Stacja zwrotu     object
Czas trwania      object
dtypes: datetime64[ns](2), int64(2), object(3)
memory usage: 296.6+ MB


### Clean

#### Drop duplicates

In [20]:
# Last record is considered as unique and rest of the same values as duplicate
big_frame.drop_duplicates(subset="UID wynajmu", keep='last', inplace=True)

# Sort it
big_frame.sort_values(by='UID wynajmu', inplace=True)

In [21]:
print(big_frame.shape)

(422385, 7)


In [22]:
big_frame.head()

Unnamed: 0,UID wynajmu,Numer roweru,Data wynajmu,Data zwrotu,Stacja wynajmu,Stacja zwrotu,Czas trwania
5541096,76160681,57719,2019-06-26 00:00:10,2019-06-26 00:06:19,Skarbowców / Wietrzna,Krzycka / Aleja Karkonoska (Park Południowy),00:06:09
5541097,76160684,650480,2019-06-26 00:00:13,2019-06-26 00:06:59,Rynek,Plac Legionów,00:06:46
5541098,76160686,650988,2019-06-26 00:00:15,2019-06-26 00:13:32,Poza oficjalną stacją,Wałbrzyska - pętla tramwajowa,00:13:17
5541099,76160697,57603,2019-06-26 00:00:21,2019-06-26 00:23:53,Plac Uniwersytecki (UWr),Legnicka / Wejherowska,00:23:32
5541100,76160714,650067,2019-06-26 00:00:40,2019-06-26 00:04:40,Powstańców Śląskich (Arkady Wrocławskie),Powstańców Śląskich (Arkady Wrocławskie),00:04:00


In [23]:
big_frame.tail()

Unnamed: 0,UID wynajmu,Numer roweru,Data wynajmu,Data zwrotu,Stacja wynajmu,Stacja zwrotu,Czas trwania
2243494,88181620,650421,2019-10-21 23:48:00,2019-10-21 23:59:00,Plac Grunwaldzki (DS Ołówek),Powstańców Śląskich (Arkady Wrocławskie),00:11:00
2243495,88181629,650725,2019-10-21 23:48:00,2019-10-21 23:59:00,Rynek,Gliniana / Gajowa,00:11:00
2243496,88181658,57266,2019-10-21 23:49:00,2019-10-21 23:58:00,Tarnogajska / Klimasa,al. Armii Krajowej / Borowska,00:09:00
2243497,88181742,57610,2019-10-21 23:53:00,2019-10-21 23:57:00,Grabiszyńska / Stalowa,Krucza / Mielecka / Stalowa,00:04:00
2243498,88181774,57494,2019-10-21 23:54:00,2019-10-21 23:54:00,Plac Grunwaldzki / Polaka,Plac Grunwaldzki / Polaka,00:00:00


In [24]:
# rename columns to make it legible for English-speaking audience
big_frame.columns = ['UID', 'Bike number', 'Rental datetime', 'Return datetime', 
                     'Rental station', 'Return station', 'Duration']

In [25]:
big_frame.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 422385 entries, 5541096 to 2243498
Data columns (total 7 columns):
UID                422385 non-null int64
Bike number        422385 non-null int64
Rental datetime    422385 non-null datetime64[ns]
Return datetime    422385 non-null datetime64[ns]
Rental station     422385 non-null object
Return station     422385 non-null object
Duration           422385 non-null object
dtypes: datetime64[ns](2), int64(2), object(3)
memory usage: 25.8+ MB


### Save interim results

In [26]:
# write interim file
bike_rentals_interim_path = os.path.join(data_interim_folder, 'bike_rental_records.csv')
big_frame.to_csv(path_or_buf=bike_rentals_interim_path, index=False)

# Load data

## Bike rentals records

In [27]:
# Load data and set proper data types
bike_rentals_df = pd.read_csv(bike_rentals_interim_path, 
                              parse_dates=['Rental datetime', 'Return datetime'], 
                              index_col='UID')

bike_rentals_df['Duration'] = pd.to_timedelta(bike_rentals_df['Duration'])

In [28]:
# just checking - to make sure data types are as we want
bike_rentals_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 422385 entries, 76160681 to 88181774
Data columns (total 6 columns):
Bike number        422385 non-null int64
Rental datetime    422385 non-null datetime64[ns]
Return datetime    422385 non-null datetime64[ns]
Rental station     422385 non-null object
Return station     422385 non-null object
Duration           422385 non-null timedelta64[ns]
dtypes: datetime64[ns](2), int64(1), object(2), timedelta64[ns](1)
memory usage: 22.6+ MB


In [29]:
bike_rentals_df.sample(5)

Unnamed: 0_level_0,Bike number,Rental datetime,Return datetime,Rental station,Return station,Duration
UID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
80980874,57517,2019-08-06 14:19:00,2019-08-06 14:31:00,Fabryczna / Wagonowa,Zaporoska / Grabiszyńska,00:12:00
82267013,57991,2019-08-18 12:21:00,2019-08-18 12:38:00,Świeradowska / Krynicka,Buforowa - Vivaldiego,00:17:00
82267150,57888,2019-08-18 12:23:00,2019-08-18 13:03:00,Aleja Kromera,Krzywoustego / Korona,00:40:00
85625533,650826,2019-09-19 16:39:00,2019-09-19 17:10:00,Poza oficjalną stacją,Pilczycka / Kozanowska,00:31:00
83178338,650180,2019-08-26 17:49:00,2019-08-26 18:14:00,Osobowicka - pętla tramwajowa,Żmigrodzka / Marino,00:25:00


In [30]:
# check if we have any missing data
bike_rentals_df.isna().sum()

Bike number        0
Rental datetime    0
Return datetime    0
Rental station     0
Return station     0
Duration           0
dtype: int64

## Bike station details

In [31]:
bike_stations_df = pd.read_csv(bike_station_details_interim_path)

In [32]:
bike_stations_df

Unnamed: 0,Bike station,Latitude,Longitude
0,Plac Dominikański (Galeria Dominikańska),51.108004,17.039528
1,"Dworzec Główny, południe",51.097108,17.036110
2,Rynek,51.109782,17.030175
3,Dworzec Główny,51.099750,17.036228
4,Nowowiejska / Jedności Narodowej,51.124879,17.045844
...,...,...,...
198,pl. Orląt Lwowskich,51.108230,17.021380
199,Komandorska / Kamienna,51.090090,17.023500
200,"Wrocław Stadion, stacja kolejowa",51.137070,16.940950
201,"Wrocław Leśnica, stacja kolejowa",51.143230,16.866270


In [33]:
# check if we have any missing data
bike_stations_df.isna().sum()

Bike station    0
Latitude        0
Longitude       0
dtype: int64

## Join datasets

In [34]:
bike_rentals_df = pd.merge(bike_rentals_df, 
                           bike_stations_df, 
                           how='left',
                           left_on='Rental station',
                           right_on='Bike station')

In [35]:
# define more meaningful names for coordinates columns
cols = [f'Rental station {str.lower(column)}' 
            if column in ['Latitude', 'Longitude'] else column
                for column in bike_rentals_df.columns]
bike_rentals_df.columns = cols

# drop unnecessary
bike_rentals_df.drop(['Bike station'], axis=1, inplace=True)

In [36]:
bike_rentals_df.head()

Unnamed: 0,Bike number,Rental datetime,Return datetime,Rental station,Return station,Duration,Rental station latitude,Rental station longitude
0,57719,2019-06-26 00:00:10,2019-06-26 00:06:19,Skarbowców / Wietrzna,Krzycka / Aleja Karkonoska (Park Południowy),00:06:09,51.07329,16.99485
1,650480,2019-06-26 00:00:13,2019-06-26 00:06:59,Rynek,Plac Legionów,00:06:46,51.109782,17.030175
2,650988,2019-06-26 00:00:15,2019-06-26 00:13:32,Poza oficjalną stacją,Wałbrzyska - pętla tramwajowa,00:13:17,,
3,57603,2019-06-26 00:00:21,2019-06-26 00:23:53,Plac Uniwersytecki (UWr),Legnicka / Wejherowska,00:23:32,51.113871,17.034484
4,650067,2019-06-26 00:00:40,2019-06-26 00:04:40,Powstańców Śląskich (Arkady Wrocławskie),Powstańców Śląskich (Arkady Wrocławskie),00:04:00,51.099713,17.027905


In [37]:
bike_rentals_df = pd.merge(bike_rentals_df, 
                           bike_stations_df, 
                           how='left',
                           left_on='Return station',
                           right_on='Bike station')

In [38]:
# define more meaningful names for coordinates columns
cols = [f'Return station {str.lower(column)}' 
            if column in ['Latitude', 'Longitude'] else column
                for column in bike_rentals_df.columns]
bike_rentals_df.columns = cols

# drop unnecessary
bike_rentals_df.drop(['Bike station'], axis=1, inplace=True)

In [39]:
bike_rentals_df.head()

Unnamed: 0,Bike number,Rental datetime,Return datetime,Rental station,Return station,Duration,Rental station latitude,Rental station longitude,Return station latitude,Return station longitude
0,57719,2019-06-26 00:00:10,2019-06-26 00:06:19,Skarbowców / Wietrzna,Krzycka / Aleja Karkonoska (Park Południowy),00:06:09,51.07329,16.99485,51.074992,17.007058
1,650480,2019-06-26 00:00:13,2019-06-26 00:06:59,Rynek,Plac Legionów,00:06:46,51.109782,17.030175,51.104413,17.022536
2,650988,2019-06-26 00:00:15,2019-06-26 00:13:32,Poza oficjalną stacją,Wałbrzyska - pętla tramwajowa,00:13:17,,,51.065777,16.988575
3,57603,2019-06-26 00:00:21,2019-06-26 00:23:53,Plac Uniwersytecki (UWr),Legnicka / Wejherowska,00:23:32,51.113871,17.034484,51.125276,16.984447
4,650067,2019-06-26 00:00:40,2019-06-26 00:04:40,Powstańców Śląskich (Arkady Wrocławskie),Powstańców Śląskich (Arkady Wrocławskie),00:04:00,51.099713,17.027905,51.099713,17.027905


In [40]:
# Check if we have any missing data:
# - if rental station coordinates are missing -> bikes were rented outside official bike stations
# - if return station coordinates are missing -> bikes were returned outside official bike stations
bike_rentals_df.isna().sum()

Bike number                     0
Rental datetime                 0
Return datetime                 0
Rental station                  0
Return station                  0
Duration                        0
Rental station latitude     54901
Rental station longitude    54901
Return station latitude     52171
Return station longitude    52171
dtype: int64