In [None]:
import requests
import pandas as pd

# CoinGecko API endpoint for simple price data
url = "https://api.coingecko.com/api/v3/simple/price"

# Parameters: which coins and currencies you want
params = {
    "ids": "bitcoin,ethereum,cardano",  # coins
    "vs_currencies": "usd",            # currency
    "include_market_cap": "true",      # optional: include market cap
    "include_24hr_vol": "true",        # optional: include 24h volume
    "include_24hr_change": "true"      # optional: include 24h change
}

# Bound the wait so a stalled connection cannot hang the kernel, and fail
# loudly on an HTTP error status instead of parsing an error payload as if it
# were price data.
response = requests.get(url, params=params, timeout=30)
response.raise_for_status()
data = response.json()  # Convert JSON to Python dictionary

# One row per coin: transpose so coins become rows, then expose the former
# index as a regular "coin" column.
df = pd.DataFrame(data).T.rename_axis("coin").reset_index()

print(df)


In [None]:
import requests
import pandas as pd
from datetime import datetime

# CoinGecko API endpoint for historical market data
url = "https://api.coingecko.com/api/v3/coins/bitcoin/market_chart"

# Parameters
params = {
    "vs_currency": "usd",  # currency
    "days": "30",          # last 30 days
}

# Bound the wait and stop on an HTTP error status; otherwise an error payload
# would only surface later as a confusing KeyError on 'prices'.
response = requests.get(url, params=params, timeout=30)
response.raise_for_status()
data = response.json()

# The 'prices' field contains a list of [timestamp, price]
prices = data['prices']

# Build the frame in one chain: parse the millisecond timestamps into a
# datetime column, use it as the index, and drop the raw timestamp.
df = (
    pd.DataFrame(prices, columns=["timestamp", "price_usd"])
    .assign(date=lambda frame: pd.to_datetime(frame["timestamp"], unit="ms"))
    .set_index("date")
    .drop(columns="timestamp")
)

print(df.head())


In [None]:
import requests
import pandas as pd

# PokéAPI endpoint for a specific Pokémon
pokemon_name = "pikachu"
url = f"https://pokeapi.co/api/v2/pokemon/{pokemon_name}"

# Bound the wait and stop on an HTTP error status (for example an unknown
# name) so the failure is reported here rather than while decoding or
# indexing the response body.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.json()

# Extract some useful info: stats, types, abilities
pokemon_info = {
    "name": data["name"],
    "height": data["height"],
    "weight": data["weight"],
    "types": [t["type"]["name"] for t in data["types"]],
    "abilities": [a["ability"]["name"] for a in data["abilities"]],
    "base_stats": {stat["stat"]["name"]: stat["base_stat"] for stat in data["stats"]}
}

# Convert base stats to a two-column DataFrame ("stat", "base_stat") for
# easier analysis.
stats_df = (
    pd.DataFrame.from_dict(pokemon_info["base_stats"], orient="index", columns=["base_stat"])
    .rename_axis("stat")
    .reset_index()
)

print(pokemon_info)
print(stats_df)


-------

## PID DATA

### stahování

In [45]:
import requests
import zipfile
import io
import pandas as pd

GTFS_URL = "http://data.pid.cz/PID_GTFS.zip"

def download_and_extract(gtfs_url, files=None, timeout=120):
    """Download a GTFS ZIP archive and load selected tables into DataFrames.

    Parameters
    ----------
    gtfs_url : str
        URL of the GTFS ZIP archive.
    files : iterable of str, optional
        Names of the archive members to load. When omitted, the seven tables
        used by this notebook are loaded (stops, routes, trips, stop_times,
        calendar, calendar_dates, transfers).
    timeout : float, optional
        Timeout in seconds passed to ``requests.get`` so a stalled connection
        cannot hang the kernel indefinitely.

    Returns
    -------
    dict
        Maps each requested member name that exists in the archive to a
        DataFrame in which every column is read as a string. Requested
        members that are missing are skipped with a printed warning.

    Raises
    ------
    Exception
        If the server responds with a status code other than 200.
    """
    if files is None:
        files = [
            "stops.txt",
            "routes.txt",
            "trips.txt",
            "stop_times.txt",
            "calendar.txt",
            "calendar_dates.txt",
            "transfers.txt",
        ]

    response = requests.get(gtfs_url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to download GTFS: {response.status_code}")
    print("Download successful — size:", len(response.content), "bytes")

    data = {}
    # Read the zip straight from the downloaded bytes; the context manager
    # closes the archive even if parsing one of the members fails.
    # (To keep the files on disk instead, call z.extractall("pid_gtfs") here.)
    with zipfile.ZipFile(io.BytesIO(response.content)) as z:
        members = z.namelist()
        print("Files in ZIP:", members)
        available = set(members)  # list the archive once, not per lookup

        for fname in files:
            if fname not in available:
                print(f"Warning: {fname} not found in ZIP")
                continue
            with z.open(fname) as f:
                df = pd.read_csv(f, dtype=str)  # use dtype=str to avoid mixed types
            data[fname] = df
            print(f"Loaded {fname} — {len(df)} rows")

    return data

# Download the feed when this cell is executed. `data` maps each loaded GTFS
# file name (e.g. "stops.txt") to its DataFrame and is read by the cells below.
if __name__ == "__main__":
    data = download_and_extract(GTFS_URL)

    # Example: show first few stops
    # print(data["stops.txt"].head())
    # Example: show first few routes
    # print(data["routes.txt"].head())
    # Example: show first few trips
    # print(data["trips.txt"].head())


Download successful — size: 43040511 bytes
Files in ZIP: ['agency.txt', 'calendar.txt', 'calendar_dates.txt', 'fare_attributes.txt', 'fare_rules.txt', 'feed_info.txt', 'levels.txt', 'pathways.txt', 'routes.txt', 'route_stops.txt', 'route_sub_agencies.txt', 'shapes.txt', 'stops.txt', 'stop_times.txt', 'transfers.txt', 'trips.txt', 'vehicle_allocations.txt', 'vehicle_boardings.txt', 'vehicle_categories.txt', 'vehicle_couplings.txt']
Loaded stops.txt — 18529 rows
Loaded routes.txt — 849 rows
Loaded trips.txt — 72199 rows
Loaded stop_times.txt — 1459306 rows
Loaded calendar.txt — 548 rows
Loaded calendar_dates.txt — 10448 rows
Loaded transfers.txt — 9879 rows


In [23]:
# Requires `data` from the GTFS download cell.
stops_df = data["stops.txt"]

# Boolean mask for an exact, case-sensitive match on the stop name "Muzeum".
is_muzeum = stops_df["stop_name"].eq("Muzeum")
muzeum_stop = stops_df.loc[is_muzeum]
print(muzeum_stop)

         stop_id stop_name   stop_lat   stop_lon zone_id stop_url  \
1308     U400Z3P    Muzeum  50.080032  14.430883       P      NaN   
1309     U400Z4P    Muzeum  50.079617  14.433166       P      NaN   
1310     U400Z5P    Muzeum  50.078922  14.433838       P      NaN   
1311     U400Z6P    Muzeum  50.078777  14.434217       P      NaN   
1312   U400Z101P    Muzeum  50.079413  14.431540       P      NaN   
1313   U400Z102P    Muzeum  50.079530  14.431780       P      NaN   
1314   U400Z121P    Muzeum  50.080291  14.430980       P      NaN   
1315   U400Z122P    Muzeum  50.080191  14.431195       P      NaN   
18017     U400S1    Muzeum  50.080048  14.430857       P      NaN   

      location_type parent_station wheelchair_boarding level_id platform_code  \
1308              0            NaN                   0      NaN             C   
1309              0            NaN                   0      NaN             D   
1310              0            NaN                   2      NaN   

In [24]:
# List the column names available in the stops table.
stops_df.columns

Index(['stop_id', 'stop_name', 'stop_lat', 'stop_lon', 'zone_id', 'stop_url',
       'location_type', 'parent_station', 'wheelchair_boarding', 'level_id',
       'platform_code', 'asw_node_id', 'asw_stop_id', 'zone_region_type'],
      dtype='object')

In [11]:
# Bind the routes table to its own name (no copy is made) and list its columns.
routes_df = data["routes.txt"]

routes_df.columns

Index(['route_id', 'agency_id', 'route_short_name', 'route_long_name',
       'route_type', 'route_url', 'route_color', 'route_text_color',
       'is_night', 'is_regional', 'is_substitute_transport'],
      dtype='object')

In [14]:
# Summary statistics for every column, transposed so each row describes one column.
routes_df.describe(include='all').T

Unnamed: 0,count,unique,top,freq
route_id,843,843,L1399,1
agency_id,843,1,99,843
route_short_name,843,838,452,2
route_long_name,838,831,"Praha,Smíchovské nádraží - Mníšek pod Brdy - D...",2
route_type,843,6,3,709
route_url,830,828,https://pid.cz/linka/452,2
route_color,843,8,007DA8,709
route_text_color,843,3,FFFFFF,826
is_night,843,2,0,804
is_regional,843,2,1,583


In [15]:
# Preview the first rows of the routes table.
routes_df.head()

Unnamed: 0,route_id,agency_id,route_short_name,route_long_name,route_type,route_url,route_color,route_text_color,is_night,is_regional,is_substitute_transport
0,L991,99,A,Nemocnice Motol - Petřiny - Skalka - Depo Host...,1,https://pid.cz/linka/A,00A562,FFFFFF,0,0,0
1,L992,99,B,Zličín - Černý Most,1,https://pid.cz/linka/B,F8B322,000000,0,0,0
2,L993,99,C,Letňany - Ládví - Háje,1,https://pid.cz/linka/C,CF003D,FFFFFF,0,0,0
3,L1,99,1,Sídliště Petřiny - Výstaviště,0,https://pid.cz/linka/1,7A0603,FFFFFF,0,0,0
4,L2,99,2,Sídliště Petřiny - Nádraží Braník,0,https://pid.cz/linka/2,7A0603,FFFFFF,0,0,0


In [17]:
# Bind the trips table to its own name (no copy is made) and list its columns.
trips_df = data["trips.txt"]

trips_df.columns

Index(['route_id', 'service_id', 'trip_id', 'trip_headsign', 'trip_short_name',
       'direction_id', 'block_id', 'shape_id', 'wheelchair_accessible',
       'bikes_allowed', 'exceptional', 'sub_agency_id', 'headsign_icons'],
      dtype='object')

In [18]:
# Preview the first rows of the trips table.
trips_df.head()

Unnamed: 0,route_id,service_id,trip_id,trip_headsign,trip_short_name,direction_id,block_id,shape_id,wheelchair_accessible,bikes_allowed,exceptional,sub_agency_id,headsign_icons
0,L991,1111100-1,991_9400_250901,Nemocnice Motol,,0,,L991V1,1,1,0,1,
1,L991,1111100-1,991_9401_250901,Depo Hostivař,,1,,L991V2,1,1,0,1,
2,L991,1111100-1,991_9402_250901,Nemocnice Motol,,0,,L991V3,1,1,0,1,
3,L991,1111100-1,991_9403_250901,Skalka,,1,,L991V4,1,1,0,1,
4,L991,1111100-1,991_9404_250901,Nemocnice Motol,,0,,L991V1,1,1,0,1,


In [20]:
# All trips whose route_id is exactly "L991".
trips_df[trips_df["route_id"] == "L991"]

Unnamed: 0,route_id,service_id,trip_id,trip_headsign,trip_short_name,direction_id,block_id,shape_id,wheelchair_accessible,bikes_allowed,exceptional,sub_agency_id,headsign_icons
0,L991,1111100-1,991_9400_250901,Nemocnice Motol,,0,,L991V1,1,1,0,1,
1,L991,1111100-1,991_9401_250901,Depo Hostivař,,1,,L991V2,1,1,0,1,
2,L991,1111100-1,991_9402_250901,Nemocnice Motol,,0,,L991V3,1,1,0,1,
3,L991,1111100-1,991_9403_250901,Skalka,,1,,L991V4,1,1,0,1,
4,L991,1111100-1,991_9404_250901,Nemocnice Motol,,0,,L991V1,1,1,0,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2710,L991,0000001-1,991_656_251005,Nemocnice Motol,,0,,L991V6,1,1,0,1,
2711,L991,0000001-1,991_657_251005,Depo Hostivař,,1,,L991V5,1,1,0,1,
2712,L991,0000001-1,991_658_251005,Nemocnice Motol,,0,,L991V6,1,1,0,1,
2713,L991,0000001-1,991_9144_251005,Depo Hostivař,,1,,L991V5,1,1,0,1,


In [21]:
# Bind the stop_times table to its own name (no copy is made) and list its columns.
stop_times_df = data["stop_times.txt"]

stop_times_df.columns

Index(['trip_id', 'arrival_time', 'departure_time', 'stop_id', 'stop_sequence',
       'stop_headsign', 'pickup_type', 'drop_off_type', 'shape_dist_traveled',
       'trip_operation_type', 'bikes_allowed', 'stop_icons', 'headsign_icons'],
      dtype='object')

In [22]:
# Preview the first rows of the stop_times table.
stop_times_df.head()

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled,trip_operation_type,bikes_allowed,stop_icons,headsign_icons
0,991_9400_250901,07:29:15,07:29:15,U953Z102P,1,,0,0,0.0,1,1,,
1,991_9400_250901,07:31:15,07:31:35,U713Z102P,2,,0,0,1.401974,1,1,,
2,991_9400_250901,07:33:00,07:33:20,U921Z102P,3,,0,0,2.627027,1,1,,
3,991_9400_250901,07:34:35,07:34:55,U118Z102P,4,,0,0,3.595281,1,1,,
4,991_9400_250901,07:36:05,07:36:25,U209Z102P,5,,0,0,4.439155,1,1,,


In [26]:
# Bind the calendar table to its own name (no copy is made) and list its columns.
calendar_df = data["calendar.txt"]

calendar_df.columns

Index(['service_id', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday',
       'saturday', 'sunday', 'start_date', 'end_date'],
      dtype='object')

In [30]:
# Preview the first rows of the calendar table.
calendar_df.head()

Unnamed: 0,service_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,start_date,end_date
0,1111100-1,1,1,1,1,1,0,0,20260107,20260120
1,1111111-1,1,1,1,1,1,1,1,20260107,20260120
2,1111111-2,1,1,1,1,1,1,1,20260107,20260120
3,1111101-1,1,1,1,1,1,0,1,20260107,20260120
4,1111110-1,1,1,1,1,1,1,0,20260107,20260120


### úprava dat
(zkopírováno z Githubu)

In [33]:
# Read from the raw table without modifying it: data["routes.txt"] is the same
# object as routes_df, so writing columns back into it would silently change
# a frame that earlier cells already displayed.
routes_raw = data["routes.txt"]

# only keeping useful columns (any column the feed does not provide is skipped)
keep = [
    "route_id",
    "route_short_name",
    "route_long_name",
    "route_type",
    "is_night",
    "is_regional",
    "is_substitute_transport",
]
keep = [c for c in keep if c in routes_raw.columns]
routes_transformed = routes_raw[keep].copy()

# dealing with missing long names
routes_transformed["route_long_name"] = routes_transformed["route_long_name"].fillna("")

# changing type to numeric for 0/1 flag columns and route_type;
# unparseable values become <NA> in the nullable Int64 dtype
for c in ["route_type", "is_night", "is_regional", "is_substitute_transport"]:
    if c in routes_transformed.columns:
        routes_transformed[c] = pd.to_numeric(routes_transformed[c], errors="coerce").astype("Int64")

In [34]:
# Preview the cleaned routes table.
routes_transformed.head()

Unnamed: 0,route_id,route_short_name,route_long_name,route_type,is_night,is_regional,is_substitute_transport
0,L991,A,Nemocnice Motol - Petřiny - Skalka - Depo Host...,1,0,0,0
1,L992,B,Zličín - Černý Most,1,0,0,0
2,L993,C,Letňany - Ládví - Háje,1,0,0,0
3,L1,1,Sídliště Petřiny - Výstaviště,0,0,0,0
4,L2,2,Sídliště Petřiny - Nádraží Braník,0,0,0,0


In [37]:
# Work on a copy: data["calendar.txt"] is the same object as calendar_df, and
# the result must not be just another name for the raw table.
calendar_transformed = data["calendar.txt"].copy()

# changing type to numeric for flag 0/1 columns;
# unparseable values become <NA> in the nullable Int64 dtype
day_cols = ["monday","tuesday","wednesday","thursday","friday","saturday","sunday"]
for c in day_cols:
    if c in calendar_transformed.columns:
        calendar_transformed[c] = pd.to_numeric(calendar_transformed[c], errors="coerce").astype("Int64")

In [38]:
# Preview the cleaned calendar table.
calendar_transformed.head()

Unnamed: 0,service_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,start_date,end_date
0,1111100-1,1,1,1,1,1,0,0,20260119,20260130
1,1111111-1,1,1,1,1,1,1,1,20260119,20260201
2,1111101-1,1,1,1,1,1,0,1,20260119,20260201
3,1111110-1,1,1,1,1,1,1,0,20260119,20260131
4,0000010-1,0,0,0,0,0,1,0,20260124,20260131


In [40]:
# Read from the raw table without modifying it: data["stop_times.txt"] is the
# same object as stop_times_df.
stop_times_raw = data["stop_times.txt"]

# only keeping useful columns (any column the feed does not provide is skipped);
# selecting before converting means only the kept columns are copied
keep = [
    "trip_id",
    "stop_id",
    "stop_sequence",
    "arrival_time",
    "departure_time",
    "pickup_type",
    "drop_off_type",
]
keep = [c for c in keep if c in stop_times_raw.columns]
stop_times_transformed = stop_times_raw[keep].copy()

# changing type to numeric for purely numeric columns;
# unparseable values become <NA> in the nullable Int64 dtype
for c in ["stop_sequence", "pickup_type", "drop_off_type"]:
    if c in stop_times_transformed.columns:
        stop_times_transformed[c] = pd.to_numeric(stop_times_transformed[c], errors="coerce").astype("Int64")

In [41]:
# Preview the cleaned stop_times table.
stop_times_transformed.head()

Unnamed: 0,trip_id,stop_id,stop_sequence,arrival_time,departure_time,pickup_type,drop_off_type
0,991_9400_250901,U953Z102P,1,07:29:15,07:29:15,0,0
1,991_9400_250901,U713Z102P,2,07:31:15,07:31:35,0,0
2,991_9400_250901,U921Z102P,3,07:33:00,07:33:20,0,0
3,991_9400_250901,U118Z102P,4,07:34:35,07:34:55,0,0
4,991_9400_250901,U209Z102P,5,07:36:05,07:36:25,0,0


In [42]:
# Work on a copy: data["stops.txt"] is the same object as stops_df, and the
# steps below rewrite several of its columns.
stops_work = data["stops.txt"].copy()

# location_type normalization: blank, missing or unparseable values become 0
stops_work["location_type"] = stops_work["location_type"].replace({"": "0"}).fillna("0")
stops_work["location_type"] = pd.to_numeric(stops_work["location_type"], errors="coerce").fillna(0).astype(int)

# keep only 0 (stop) / 1 (station)
stops_work = stops_work[stops_work["location_type"].isin([0, 1])].copy()

# setting parent station (if exists -> parent_station, if not -> fallback to stop_id )
stops_work["parent_station"] = stops_work["parent_station"].replace({"": pd.NA})
stops_work["parent_station"] = stops_work["parent_station"].fillna(stops_work["stop_id"])

# does parent exist? if not -> fallback to stop_id
# (only stop_ids that survived the location_type filter count as valid)
valid_ids = set(stops_work["stop_id"].astype(str))
bad_parent = ~stops_work["parent_station"].astype(str).isin(valid_ids)
stops_work.loc[bad_parent, "parent_station"] = stops_work.loc[bad_parent, "stop_id"]

# more general grouping: the digits between the leading "U" and the first "Z"/"N"
# NOTE(review): IDs that do not follow that pattern (the outputs in this
# notebook show e.g. "U400S1" and "T53041") get <NA> here — confirm that
# leaving them ungrouped is intended.
stops_work["stop_num"] = stops_work["stop_id"].str.extract(r"^U(\d+)[ZN]", expand=False)
stops_work["stop_num"] = pd.to_numeric(stops_work["stop_num"], errors="coerce").astype("Int64")

# only keeping useful columns (any column the feed does not provide is skipped)
keep = [
    "stop_id",
    "stop_name",
    "stop_lat",
    "stop_lon",
    "zone_id",
    "location_type",
    "parent_station",
    "stop_num",
    "wheelchair_boarding",
    "platform_code",
]
keep = [c for c in keep if c in stops_work.columns]
stops_transformed = stops_work[keep].copy()

In [43]:
# Preview the cleaned stops table.
stops_transformed.head()

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon,zone_id,location_type,parent_station,stop_num,wheelchair_boarding,platform_code
0,U1Z1P,Boletická,50.132732,14.513702,P,0,U1Z1P,1,0,A
1,U1Z2P,Boletická,50.133011,14.513584,P,0,U1Z2P,1,0,B
2,U3Z1P,Břetislavka,50.123241,14.389533,P,0,U3Z1P,3,0,A
3,U3Z1,Břetislavka,50.123241,14.389533,B,0,U3Z1,3,0,A
4,U3Z2P,Břetislavka,50.123013,14.389885,P,0,U3Z2P,3,0,B


In [46]:
# Read from the raw table without modifying it, so the frame stored in `data`
# keeps its original string columns.
transfers_raw = data["transfers.txt"]

# keep only useful columns (any column the feed does not provide is skipped)
keep = [
    "from_stop_id",
    "to_stop_id",
    "from_trip_id",
    "to_trip_id",
    "transfer_type",
]
keep = [c for c in keep if c in transfers_raw.columns]
transfers_transformed = transfers_raw[keep].copy()

# changing type to numeric for purely numeric columns; only those that were
# kept above are present, the others are skipped by the membership check
for c in ["transfer_type", "min_transfer_time", "max_waiting_time"]:
    if c in transfers_transformed.columns:
        transfers_transformed[c] = pd.to_numeric(transfers_transformed[c], errors="coerce").astype("Int64")

In [47]:
# Preview the cleaned transfers table.
transfers_transformed.head()

Unnamed: 0,from_stop_id,to_stop_id,from_trip_id,to_trip_id,transfer_type
0,U142Z301,U142Z301,,,2
1,U480Z301,U480Z301,,,2
2,U2816Z301,U2816Z301,,,2
3,U458Z301,U458Z301,,,2
4,U2236Z301,U2236Z301,,,2


In [76]:
# Read from the raw table without modifying it: data["trips.txt"] is the same
# object as trips_df.
trips_raw = data["trips.txt"]

# only keeping useful columns (any column the feed does not provide is skipped)
keep = [
    "route_id",
    "trip_id",
    "service_id",
    "direction_id",
    "sub_agency_id",
    "shape_id",
    "trip_headsign",
    "trip_short_name",
    "wheelchair_accessible",
    "bikes_allowed",
    "exceptional"
]
keep = [c for c in keep if c in trips_raw.columns]
trips_transformed = trips_raw[keep].copy()

# handling missing values: empty string instead of NaN, surrounding whitespace removed
for c in ["trip_headsign", "trip_short_name", "shape_id"]:
    if c in trips_transformed.columns:
        trips_transformed[c] = trips_transformed[c].fillna("").astype(str).str.strip()

# changing type to numeric for purely numeric columns;
# unparseable values become <NA> in the nullable Int64 dtype
for c in ["direction_id", "wheelchair_accessible", "bikes_allowed", "sub_agency_id", "exceptional"]:
    if c in trips_transformed.columns:
        trips_transformed[c] = pd.to_numeric(trips_transformed[c], errors="coerce").astype("Int64")

In [77]:
# Preview the cleaned trips table.
trips_transformed.head()

Unnamed: 0,route_id,trip_id,service_id,direction_id,sub_agency_id,shape_id,trip_headsign,trip_short_name,wheelchair_accessible,bikes_allowed,exceptional
0,L991,991_9400_250901,1111100-1,0,1,L991V1,Nemocnice Motol,,1,1,0
1,L991,991_9401_250901,1111100-1,1,1,L991V2,Depo Hostivař,,1,1,0
2,L991,991_9402_250901,1111100-1,0,1,L991V3,Nemocnice Motol,,1,1,0
3,L991,991_9403_250901,1111100-1,1,1,L991V4,Skalka,,1,1,0
4,L991,991_9404_250901,1111100-1,0,1,L991V1,Nemocnice Motol,,1,1,0


-----
### route type for each dataset

(ignore this)

In [112]:
# Walk routes -> trips -> stop_times -> stops with inner joins, producing one
# row per stop_time that carries the route type and the stop name.
route_type_df = (
    routes_transformed[["route_type", "route_id"]]
    .merge(trips_transformed[["route_id", "trip_id"]], on="route_id")
    .merge(stop_times_transformed[["stop_id", "trip_id"]], on="trip_id")
    .merge(stops_transformed[["stop_id", "stop_name"]], on="stop_id")
)
route_type_df.head()

Unnamed: 0,route_type,route_id,trip_id,stop_id,stop_name
0,1,L991,991_9400_250901,U953Z102P,Skalka
1,1,L991,991_9400_250901,U713Z102P,Strašnická
2,1,L991,991_9400_250901,U921Z102P,Želivského
3,1,L991,991_9400_250901,U118Z102P,Flora
4,1,L991,991_9400_250901,U209Z102P,Jiřího z Poděbrad


In [133]:
# All joined rows whose stop name is exactly "Skalka".
route_type_df[route_type_df["stop_name"] == "Skalka"]

Unnamed: 0,route_type,route_id,trip_id,stop_id,stop_name
0,1,L991,991_9400_250901,U953Z102P,Skalka
31,1,L991,991_9401_250901,U953Z101P,Skalka
34,1,L991,991_9402_250901,U953Z102P,Skalka
65,1,L991,991_9403_250901,U953Z102P,Skalka
66,1,L991,991_9404_250901,U953Z102P,Skalka
...,...,...,...,...,...
909879,3,L906,906_10_251110,U953Z1P,Skalka
909917,3,L906,906_69_251029,U953Z1P,Skalka
909918,3,L906,906_3_251110,U953Z7P,Skalka
910018,3,L906,906_4_251110,U953Z1P,Skalka


In [96]:
# Attach each trip's route_type by joining on route_id (inner join).
route_types = routes_transformed[["route_type", "route_id"]]
trips_trans = trips_transformed.merge(route_types, on="route_id")
trips_trans.head()

Unnamed: 0,route_id,trip_id,service_id,direction_id,sub_agency_id,shape_id,trip_headsign,trip_short_name,wheelchair_accessible,bikes_allowed,exceptional,route_type
0,L991,991_9400_250901,1111100-1,0,1,L991V1,Nemocnice Motol,,1,1,0,1
1,L991,991_9401_250901,1111100-1,1,1,L991V2,Depo Hostivař,,1,1,0,1
2,L991,991_9402_250901,1111100-1,0,1,L991V3,Nemocnice Motol,,1,1,0,1
3,L991,991_9403_250901,1111100-1,1,1,L991V4,Skalka,,1,1,0,1
4,L991,991_9404_250901,1111100-1,0,1,L991V1,Nemocnice Motol,,1,1,0,1


In [98]:
# Attach the route_type of the owning trip to every stop_time row (inner join on trip_id).
stop_times_trans = stop_times_transformed.merge(
    trips_trans[["route_type", "trip_id"]],
    on="trip_id",
)
stop_times_trans.head()

Unnamed: 0,trip_id,stop_id,stop_sequence,arrival_time,departure_time,pickup_type,drop_off_type,route_type
0,991_9400_250901,U953Z102P,1,07:29:15,07:29:15,0,0,1
1,991_9400_250901,U713Z102P,2,07:31:15,07:31:35,0,0,1
2,991_9400_250901,U921Z102P,3,07:33:00,07:33:20,0,0,1
3,991_9400_250901,U118Z102P,4,07:34:35,07:34:55,0,0,1
4,991_9400_250901,U209Z102P,5,07:36:05,07:36:25,0,0,1


In [100]:
# Attach route_type to the stops table (inner join on stop_id).
# NOTE(review): the right-hand side has one row per stop_time, so every stop is
# repeated once per visit (the preview shows identical rows) — de-duplicate the
# (route_type, stop_id) pairs first if one row per stop and type is wanted.
stop_trans = stops_transformed.merge(
    stop_times_trans[["route_type", "stop_id"]],
    on="stop_id",
)
stop_trans.head()

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon,zone_id,location_type,parent_station,stop_num,wheelchair_boarding,platform_code,route_type
0,U1Z1P,Boletická,50.132732,14.513702,P,0,U1Z1P,1,0,A,3
1,U1Z1P,Boletická,50.132732,14.513702,P,0,U1Z1P,1,0,A,3
2,U1Z1P,Boletická,50.132732,14.513702,P,0,U1Z1P,1,0,A,3
3,U1Z1P,Boletická,50.132732,14.513702,P,0,U1Z1P,1,0,A,3
4,U1Z1P,Boletická,50.132732,14.513702,P,0,U1Z1P,1,0,A,3


### HIGH VISIBILITY

In [261]:
# Number of distinct trips that call at each stop_id. The resulting column is
# still named "trip_id" but holds a count, not an identifier.
trips_count = (
    stop_times_transformed
    .groupby("stop_id")["trip_id"]
    .nunique()
    .to_frame()
)
trips_count

Unnamed: 0_level_0,trip_id
stop_id,Unnamed: 1_level_1
T53041,94
T53047,444
T53051,32
T53068,44
T53097,61
...,...
U999Z2P,84
U99Z1P,738
U99Z2P,629
U9Z1,78


In [262]:
high_visibility = stops_transformed[["stop_id", "stop_name"]]

# Join the per-stop_id trip counts onto the stop names; trips_count carries
# stop_id as its index, which the merge key resolves against.
per_stop_counts = high_visibility.merge(trips_count, on="stop_id")
# Alternative kept for reference — rank individual stop_ids instead of names:
#   per_stop_counts.groupby(["stop_id", "stop_name"])["trip_id"].sum().sort_values(ascending=False).reset_index()

# Add up the counts of all stop_ids sharing a name and rank names by the total.
# "trip_id" holds the summed trip counts here, not identifiers.
high_visibility_merged = (
    per_stop_counts
    .groupby("stop_name")["trip_id"]
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)
high_visibility_merged.head(10)

Unnamed: 0,stop_name,trip_id
0,Želivského,7086
1,Smíchovské nádraží,6986
2,Anděl,6954
3,Palmovka,5966
4,Kobylisy,5900
5,Lihovar,5891
6,Černý Most,5769
7,Karlovo náměstí,5504
8,Letňany,5392
9,Florenc,5204


In [134]:
## trying to sort it per route type ##
# Reduce route_type_df to its distinct (stop_name, route_type) pairs BEFORE
# merging. Merging first builds one row per stop_time and only afterwards
# throws nearly all of them away. The resulting rows and their order are the
# same as before; only the index differs (consecutive instead of sparse).
stop_route_types = route_type_df[["stop_name", "route_type"]].drop_duplicates()
high = pd.merge(high_visibility_merged, stop_route_types, on="stop_name")
high.head(30)

Unnamed: 0,stop_name,trip_id,route_type
0,Želivského,7086,1
858,Želivského,7086,0
3867,Želivského,7086,3
7086,Smíchovské nádraží,6986,1
8055,Smíchovské nádraží,6986,0
8903,Smíchovské nádraží,6986,3
14072,Anděl,6954,1
15040,Anděl,6954,0
19365,Anděl,6954,3
21026,Palmovka,5966,1


### LONGEST EXPOSURE

#### 1. longest amount of time

In [146]:
from datetime import datetime

In [219]:
def time_to_seconds(t):
    """Convert an ``H:M:S`` time string into a total number of seconds.

    The hour field is not wrapped at 24, so a value such as ``"25:00:00"``
    yields 90000. Raises ``ValueError`` if the string does not consist of
    exactly three integer fields separated by colons.
    """
    hours, minutes, seconds = (int(part) for part in t.split(':'))
    return hours * 3600 + minutes * 60 + seconds

In [266]:
def seconds_to_time(seconds):
    """Format a duration in seconds as an ``HH:MM:SS`` string.

    Parameters
    ----------
    seconds : int or float
        Duration in seconds. A fractional part is truncated (not rounded).

    Returns
    -------
    str
        Zero-padded ``HH:MM:SS``. Hours are not wrapped at 24 and may use more
        than two digits. Negative durations are rendered as the absolute value
        with a leading ``-`` (e.g. ``-5`` -> ``"-00:00:05"``); floor division
        on a negative total would otherwise give a result like ``"-1:59:55"``.
    """
    # Convert to integer (in case it's float); int() truncates toward zero.
    total = int(seconds)
    sign = "-" if total < 0 else ""

    minutes, secs = divmod(abs(total), 60)
    hours, minutes = divmod(minutes, 60)

    # Format as HH:MM:SS with leading zeros
    return f"{sign}{hours:02d}:{minutes:02d}:{secs:02d}"

- groupby trip_id
- find the first and last row per unique trip_id
- convert to seconds
- find the difference
- average per route_id

In [263]:
## defining first rows and computing seconds ##
# Pick each trip's first stop by its position in the trip (stop_sequence), not
# by the smallest arrival_time: arrival_time is a string, so min/max compare
# it character by character, which misorders hours written without a leading
# zero ("9:59:00" sorts after "10:00:00").
first_rows = (
    stop_times_transformed
    .sort_values(["trip_id", "stop_sequence"])
    .drop_duplicates("trip_id", keep="first")
    [["trip_id", "arrival_time"]]
    .copy()  # independent frame, so adding a column below is unambiguous
)
first_rows['arrival_seconds'] = first_rows['arrival_time'].apply(time_to_seconds)

In [264]:
## defining last rows and computing seconds ##
# Pick each trip's last stop by its position in the trip (stop_sequence), not
# by the largest arrival_time: arrival_time is a string, so min/max compare
# it character by character, which misorders hours written without a leading
# zero ("9:59:00" sorts after "10:00:00").
last_rows = (
    stop_times_transformed
    .sort_values(["trip_id", "stop_sequence"])
    .drop_duplicates("trip_id", keep="last")
    [["trip_id", "arrival_time"]]
    .copy()  # independent frame, so adding a column below is unambiguous
)
last_rows['arrival_seconds'] = last_rows['arrival_time'].apply(time_to_seconds)

In [268]:
## computing duration per trip_id ##
# Index both sides by trip_id so the subtraction pairs up the same trip.
trip_end = last_rows.set_index("trip_id")["arrival_seconds"]
trip_start = first_rows.set_index("trip_id")["arrival_seconds"]
duration_series = trip_end - trip_start

# trip_id becomes a column again; then attach each trip's route_id (inner join).
duration = (
    duration_series
    .rename("duration_seconds")
    .reset_index()
    .merge(trips_transformed[["trip_id", "route_id"]], on="trip_id")
)

In [270]:
## finding average per route ##
# Mean trip duration per route, with a readable HH:MM:SS column and the route's
# short name, ordered from the longest average duration to the shortest.
avg_duration_per_route = (
    duration
    .groupby("route_id")["duration_seconds"]
    .mean()
    .reset_index()
    .assign(duration_hms=lambda frame: frame["duration_seconds"].apply(seconds_to_time))
    .merge(routes_transformed[["route_short_name", "route_id"]], on="route_id")
    .sort_values(by="duration_seconds", ascending=False)
    .reset_index(drop=True)
)

In [271]:
# Routes with the longest average trip duration.
avg_duration_per_route.head()

Unnamed: 0,route_id,duration_seconds,duration_hms,route_short_name
0,L1106,9210.0,02:33:30,T6
1,L730,8796.0,02:26:36,730
2,L416,8700.0,02:25:00,416
3,L700,8592.857143,02:23:12,700
4,L1110,8145.0,02:15:45,T10


#### 2. highest number of stops

In [294]:
## counting stops per trip ##
# Count the stop_times rows of every trip, order trips from most to fewest
# stops, then attach each trip's route_id (inner join).
stops_per_trip = (
    stop_times_transformed
    .groupby("trip_id")["stop_id"]
    .count()
    .reset_index(name="num_stops")
    .sort_values(by="num_stops", ascending=False)
)

stops_counts = stops_per_trip.merge(trips_transformed[["trip_id", "route_id"]], on="trip_id")

In [295]:
## computing average number of stops per route ##
# Mean stops per trip for every route, rounded to a whole number, labelled
# with the route's short name and ordered from most to fewest stops.
avg_stops_per_route = (
    stops_counts
    .groupby("route_id")["num_stops"]
    .mean()
    .round(0)
    .reset_index()
    .merge(routes_transformed[["route_short_name", "route_id"]], on="route_id")
    .sort_values(by="num_stops", ascending=False)
    .reset_index(drop=True)
)

In [296]:
# Routes with the highest average number of stops per trip.
avg_stops_per_route.head()

Unnamed: 0,route_id,num_stops,route_short_name
0,L908,60.0,908
1,L910,58.0,910
2,L911,54.0,911
3,L913,52.0,913
4,L617,52.0,617


### HIGHEST REPETITION

In [300]:
# Number of distinct trips per route. The resulting column is still named
# "trip_id" but holds a count, not an identifier.
repetition = (
    trips_transformed
    .groupby("route_id")["trip_id"]
    .nunique()
    .to_frame()
)

In [301]:
# Show the per-route trip counts (pandas truncates the display of long frames).
repetition

Unnamed: 0_level_0,trip_id
route_id,Unnamed: 1_level_1
L1,356
L10,351
L100,162
L1004,52
L1005,2
...,...
L98,37
L99,21
L991,858
L992,969


In [312]:
# Label every route's trip count with its short name and rank routes from the
# most trips to the fewest. `repetition` carries route_id as its index, which
# the merge key resolves against.
route_labels = routes_transformed[["route_id", "route_short_name"]]

high_repetition = (
    route_labels
    .merge(repetition, on="route_id")
    .sort_values(by="trip_id", ascending=False)
    .reset_index(drop=True)
)
high_repetition.head(10)

Unnamed: 0,route_id,route_short_name,trip_id
0,L993,C,1022
1,L992,B,969
2,L812,X12,954
3,L991,A,858
4,L22,22,771
5,L17,17,717
6,L59,59,708
7,L9,9,699
8,L137,137,589
9,L5,5,581


## TO DO

1. __high visibility:__
- sort by route type -> when one stop has metro, tram, and bus, do we just say that it's metro
- in other words, do we sort by metro > tram > trolley > bus (?)

2. __longest exposure__
- do we sort by route type, since now trains have the longest duration
- or do we just exclude regional routes outside the city of Prague (and night routes)
- for the highest number of stops, do we once again sort by route type, or not

3. __highest repetition__
- do we sort for weekdays, weekends, full weeks
- and do we sort by route type