# Data Processing

The purpose of this notebook is to download data and process it. As a result we will get a dataset ready for further analyses and modeling.

# Table of Contents

[1. Imports & Environment Configuration](#Imports-&-Environment-Configuration)

# Imports & Environment Configuration

In [1]:
%load_ext autoreload

import glob
import json
import matplotlib.pyplot as plt
import os
import pandas as pd

if os.path.basename(os.getcwd()) == 'notebooks':
    # Make it possible to import modules
    import sys
    sys.path.append("../") 

from src.data.blob_downloader import BlobDownloader

In [2]:
# let matplotlib plots be part of Jupyter Notebook
%matplotlib inline

# set up higher resolution for matplotlib plots
%config InlineBackend.figure_format = 'retina'

## Environment Configuration Variables

In [3]:
# Determine the root folder of the project. The current working directory 
# may vary depending on whether the notebook is run in Jupyter Notebook or VS Code
if os.path.basename(os.getcwd()) == 'notebooks':
    project_root_folder = os.path.abspath(os.path.join(os.getcwd(), '..'))   
else:
    project_root_folder = os.getcwd()
    
# Set up paths to data folders
data_raw_folder = os.path.join(project_root_folder, 'data', 'raw')

# Load data

## Download data from the Azure Blob Storage

In [4]:
# Provide config details for the Azure Storage container with data.
account_name='<storage_account_name>'
account_key='<storage_account_key>'
container_name='<storage_container_name>' 

# Alternatively, they can be grabbed from the 'local.settings.json' file (used by Azure Functions), if it exists.
local_settings_file_path = os.path.join(project_root_folder, 'src', 'azurefunctions', 'local.settings.json')
if os.path.exists(local_settings_file_path):
    with open(local_settings_file_path, 'r') as f:
        local_settings = json.load(f)
        
    account_name = local_settings['Values']['storage_account_name']
    account_key = local_settings['Values']['storage_account_key']
    container_name = local_settings['Values']['storage_container_name']

In [5]:
# Download all data from Azure Blob Storage and save it locally 
blob_downloader = BlobDownloader(account_name, account_key, container_name)
blob_downloader.download_blobs_from_storage_and_save_to_folder(data_raw_folder)

File already downloaded: Historia_przejazdow_2019-10-10_15_37_27.csv
File already downloaded: Historia_przejazdow_2019-10-11_15_34_34.csv
File already downloaded: Historia_przejazdow_2019-10-16_14_42_31.csv
File already downloaded: Historia_przejazdow_2019-10-17_14_45_15.csv
File already downloaded: Historia_przejazdow_2019-10-18_14_45_31.csv
File already downloaded: Historia_przejazdow_2019-10-19_14_44_34.csv
File already downloaded: Historia_przejazdow_2019-10-1_15_28_35.csv
File already downloaded: Historia_przejazdow_2019-10-2_15_32_45.csv
File already downloaded: Historia_przejazdow_2019-10-3_15_31_46.csv
File already downloaded: Historia_przejazdow_2019-10-4_15_31_0.csv
File already downloaded: Historia_przejazdow_2019-10-5_15_30_33.csv
File already downloaded: Historia_przejazdow_2019-10-6_15_31_37.csv
File already downloaded: Historia_przejazdow_2019-10-7_15_31_37.csv
File already downloaded: Historia_przejazdow_2019-10-8_15_32_34.csv
File already downloaded: Historia_przejazdo

## Load multiple csv files

In [6]:
dfs = []

# Get filenames and load data to 
for filename in glob.glob(os.path.join(data_raw_folder, '*.csv')):
    dfs.append(pd.read_csv(filename, parse_dates=['Data wynajmu', 'Data zwrotu']))

# Concatenate all data into one DataFrame
big_frame = pd.concat(dfs, ignore_index=True)

In [7]:
print(big_frame.shape)

(5524358, 7)


In [8]:
big_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5524358 entries, 0 to 5524357
Data columns (total 7 columns):
UID wynajmu       int64
Numer roweru      int64
Data wynajmu      datetime64[ns]
Data zwrotu       datetime64[ns]
Stacja wynajmu    object
Stacja zwrotu     object
Czas trwania      object
dtypes: datetime64[ns](2), int64(2), object(3)
memory usage: 295.0+ MB


## Drop duplicates

In [9]:
# Last record is considered as unique and rest of the same values as duplicate
big_frame.drop_duplicates(subset="UID wynajmu", keep='last', inplace=True)

# Sort it
big_frame.sort_values(by='UID wynajmu', inplace=True)

In [10]:
print(big_frame.shape)

(407576, 7)


In [11]:
big_frame.head()

Unnamed: 0,UID wynajmu,Numer roweru,Data wynajmu,Data zwrotu,Stacja wynajmu,Stacja zwrotu,Czas trwania
5511471,76160681,57719,2019-06-26 00:00:10,2019-06-26 00:06:19,Skarbowców / Wietrzna,Krzycka / Aleja Karkonoska (Park Południowy),00:06:09
5511472,76160684,650480,2019-06-26 00:00:13,2019-06-26 00:06:59,Rynek,Plac Legionów,00:06:46
5511473,76160686,650988,2019-06-26 00:00:15,2019-06-26 00:13:32,Poza oficjalną stacją,Wałbrzyska - pętla tramwajowa,00:13:17
5511474,76160697,57603,2019-06-26 00:00:21,2019-06-26 00:23:53,Plac Uniwersytecki (UWr),Legnicka / Wejherowska,00:23:32
5511475,76160714,650067,2019-06-26 00:00:40,2019-06-26 00:04:40,Powstańców Śląskich (Arkady Wrocławskie),Powstańców Śląskich (Arkady Wrocławskie),00:04:00


In [12]:
big_frame.tail()

Unnamed: 0,UID wynajmu,Numer roweru,Data wynajmu,Data zwrotu,Stacja wynajmu,Stacja zwrotu,Czas trwania
4320987,87836299,57011,2019-10-17 23:47:00,2019-10-17 23:52:00,Drobnera / Plac Bema,Sienkiewicza / Piastowska,00:05:00
4320988,87836359,651021,2019-10-17 23:49:00,2019-10-17 23:58:00,Dworzec Główny,Nyska / Jesionowa,00:09:00
4320989,87836392,650485,2019-10-17 23:51:00,2019-10-17 23:57:00,Plac Grunwaldzki / Polaka,Plac Dominikański (Galeria Dominikańska),00:06:00
4320990,87836437,57735,2019-10-17 23:52:00,2019-10-17 23:59:00,Skarbowców / Wietrzna,Skarbowców / Wietrzna,00:07:00
4320991,87836540,57059,2019-10-17 23:56:00,2019-10-17 23:56:00,Ślężna / Kamienna (Uniw. Ekonomiczny),Ślężna / Kamienna (Uniw. Ekonomiczny),00:00:00


In [13]:
big_frame.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 407576 entries, 5511471 to 4320991
Data columns (total 7 columns):
UID wynajmu       407576 non-null int64
Numer roweru      407576 non-null int64
Data wynajmu      407576 non-null datetime64[ns]
Data zwrotu       407576 non-null datetime64[ns]
Stacja wynajmu    407576 non-null object
Stacja zwrotu     407576 non-null object
Czas trwania      407576 non-null object
dtypes: datetime64[ns](2), int64(2), object(3)
memory usage: 24.9+ MB
