In [1]:
from pandas_profiling import ProfileReport
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
import seaborn as sns

import os
import time
import datetime
import urllib.request
import pytz

In [None]:
# Fill in the next few rows with the actual settings.
# By default, all existing Oslo data (2019-04 through 2022-06) is downloaded
# into `destination_folder` below.
city_name = 'oslo'        # possible values: oslo, bergen, trondheim (inserted into the download URL)

start_month = 4           # 1 - 12
start_year = 2019         # 2019 - 2022

end_month = 6             # 1 - 12
end_year = 2022           # 2019 - 2022

destination_folder = '/Users/chedmurray/Desktop/безіменна папка 2'
# End of the settings to fill in.

In [None]:
def to_date_str(date_arr):
    """Format a ``[year, month]`` pair as a ``'YYYY/MM'`` string.

    Only the first two items of ``date_arr`` are read. The month is
    left-padded with a zero so that it is always two characters long.
    """
    year_value, month_value = date_arr[0], date_arr[1]
    # Prepend a zero and keep the last two characters: 4 -> '04', 12 -> '12'.
    padded_month = ("0" + str(month_value))[-2:]
    return "{}/{}".format(str(year_value), padded_month)

def add_month(date_str):
    """Return the ``'YYYY/MM'`` string of the month that follows ``date_str``.

    ``date_str`` is split on ``'/'`` and its parts are parsed as integers;
    December rolls over to January of the next year.
    """
    parts = list(map(int, date_str.split('/')))
    year_value, month_value = parts[0], parts[1]
    if month_value == 12:
        # Year boundary: wrap the month and bump the year.
        year_value, month_value = year_value + 1, 1
    else:
        month_value += 1
    return to_date_str([year_value, month_value])

def download_file(download_url, filename):
    """Download ``download_url`` and store the response body in ``filename``.

    Args:
        download_url: URL passed to ``urllib.request.urlopen``.
        filename: Path of the output file. It is opened in binary write mode,
            so an existing file with the same name is overwritten.

    Raises:
        urllib.error.URLError: If the request fails (raised by ``urlopen``).
        OSError: If the output file cannot be opened or written.
    """
    # Function-scope import keeps this block self-contained.
    import shutil

    # Context managers close both the HTTP response and the file even when
    # the transfer fails half-way; the original leaked both in that case.
    with urllib.request.urlopen(download_url) as response:
        with open(filename, 'wb') as file:
            # Stream in chunks instead of holding the whole payload in memory.
            shutil.copyfileobj(response, file)

In [None]:
# Generate every 'YYYY/MM' string within the selected period (both ends included).

# Reject settings that would make the loop below run forever: add_month() only
# ever produces months 1-12 and only moves forward in time.
if not (1 <= start_month <= 12 and 1 <= end_month <= 12):
    raise ValueError("start_month and end_month must be between 1 and 12")
if (start_year, start_month) > (end_year, end_month):
    raise ValueError("The start date must not be later than the end date")

cur_date_str = to_date_str([start_year, start_month])
end_date_str = to_date_str([end_year, end_month])

# Start with the first month and keep stepping until the end month is reached.
dates_str_list = [cur_date_str]
while cur_date_str != end_date_str:
    cur_date_str = add_month(cur_date_str)
    dates_str_list.append(cur_date_str)

In [None]:
# Download one CSV per month into `destination_folder` and report the elapsed time.
start = time.time()
# isdir (not exists): a regular file at this path must not pass the check,
# otherwise every download below would fail when opening the output file.
if os.path.isdir(destination_folder):
    for date in dates_str_list:
        download_url = f"https://data.urbansharing.com/{city_name}bysykkel.no/trips/v1/{date}.csv"
        # '2019/04' -> '<city>_2019_04.csv'
        filename = f"{city_name}_{date.replace('/', '_')}.csv"
        print("in progres: ", filename[:-4])
        destination_full_path = os.path.join(destination_folder, filename)
        download_file(download_url, destination_full_path)
else:
    print("Destination folder not found")
end = time.time()
print('Time: ', end - start, ' s')

In [None]:
# Dataset creation: load every downloaded CSV into a single DataFrame.

# Use the configured folder instead of a second hard-coded copy of the path.
# The working-directory switch of the original cell is kept.
os.chdir(destination_folder)

# Pure-Python listing instead of the IPython-only `!ls`: keep only CSV files so
# that stray files or sub-folders are not passed to read_csv, and sort the
# names so the concatenation order is deterministic.
csv = sorted(name for name in os.listdir('.') if name.lower().endswith('.csv'))

dataframes = (pd.read_csv(f) for f in csv)
df = pd.concat(dataframes, ignore_index=True)
df.info()

In [None]:
# data transformation

# new timezone
strptime_pattern = "%Y-%m-%d %H:%M:%S%z"
timezone = pytz.timezone('Europe/Oslo')


def _to_local_time(raw):
    """Parse a Series of timestamp strings and convert them to `timezone`.

    Mirrors the previous per-row logic: the last six characters are treated as
    the UTC offset, anything after the first '.' in the remainder (fractional
    seconds) is dropped, and the result is parsed with `strptime_pattern`.
    The whole column is parsed in one vectorized call instead of one Python
    `strptime` call per row.
    """
    trimmed = raw.str[:-6].str.split('.', n=1).str[0] + raw.str[-6:]
    parsed = pd.to_datetime(trimmed, format=strptime_pattern, utc=True)
    return parsed.dt.tz_convert(timezone)


start = time.time()
df['started_at_dt'] = _to_local_time(df['started_at'])
df['ended_at_dt'] = _to_local_time(df['ended_at'])
end = time.time()
print("Трансформування: ", end - start, "с.")
df.head()

# Add calendar columns derived from the local start time.
# The vectorized `.dt` accessors replace the per-row lambdas; the values are
# the same (the integer columns may use a narrower integer dtype).
df['day_of_week'] = df['started_at_dt'].dt.day_name()
df['month'] = df['started_at_dt'].dt.month
df['year'] = df['started_at_dt'].dt.year
df['hour'] = df['started_at_dt'].dt.hour
df['date'] = df['started_at_dt'].dt.date

# Month numbers grouped by the season they belong to.
seasons = {
    (1, 12, 2): 'winter',
    (3, 4, 5): 'spring',
    (6, 7, 8): 'summer',
    (9, 10, 11): 'autumn',
}


def season(ser):
    """Return the season name for month number ``ser``, or ``None`` if no group contains it."""
    matches = (name for months, name in seasons.items() if ser in months)
    return next(matches, None)

# Label every trip with the season of its start month.
df['season'] = df['month'].apply(season)

# Store the trip duration as an integer.
df['duration'] = df['duration'].astype(int)

# The raw timestamp strings are dropped; the parsed *_dt columns remain.
df.drop(columns=['started_at', 'ended_at'], inplace=True)



In [None]:
# info by station

def _duration_stats(group_cols):
    """Summarise trip durations in ``df`` for each combination of ``group_cols``.

    Args:
        group_cols: List of column names of ``df`` to group by.

    Returns:
        A DataFrame indexed by ``group_cols`` with the trip count and the
        min / max / mean / median / standard deviation / variance of
        ``duration``.
    """
    return (
        df[group_cols + ['duration']]
        .groupby(group_cols)
        .agg(
            number_of_trips=('duration', 'count'),
            # String names select the pandas groupby implementations. Passing
            # the builtins / numpy callables is deprecated in pandas >= 2.1,
            # and the strings keep the results those callables give today.
            min_duration=('duration', 'min'),
            max_duration=('duration', 'max'),
            mean_duration=('duration', 'mean'),
            median_duration=('duration', 'median'),
            standard_deviation_duration=('duration', 'std'),
            # Holds the variance; the original column name is kept as-is.
            variance_quantil=('duration', 'var'),
        )
    )


# As before, only by_station and station_by_hour are flattened with
# reset_index(); the other tables keep their (station, period) index.
by_station = _duration_stats(['start_station_name']).reset_index()

station_by_year = _duration_stats(['start_station_name', 'year'])

station_by_season = _duration_stats(['start_station_name', 'season'])

station_by_month = _duration_stats(['start_station_name', 'month'])

station_by_day_of_week = _duration_stats(['start_station_name', 'day_of_week'])

station_by_hour = _duration_stats(['start_station_name', 'hour']).reset_index()

