# Event Data

### Notebook Set Up

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import matplotlib.pyplot as plt

### Read the data

In [22]:
import os

# Muestra la ruta actual de tu notebook
os.getcwd()


'/Users/nicoletondu/Desktop/data-science-thesis-2025/eda-notebooks'

In [None]:
events_df = pd.read_csv("../data/processed/raw_event_sessions_2023.csv")

In [30]:
events_df.tail()

Unnamed: 0,Fecha,Tipo,Evento/Festividad,Líneas afectadas (parseadas),Ubicación,Asistencia / Comentario,Estaciones afectadas / Notas TfL,Asistentes estimados,Día semana,Es fin de semana,...,Es festivo (UK),Es festival,Recinto (normalizado),Tipo de recinto,Capacidad típica recinto (aprox.),Afluencia esperada (categoría),Eventos el mismo día,Nº líneas afectadas,Impacto temporal,Códigos de Estaciones
76,2023-06-24,Evento deportivo,Wimbledon Championships,"District, Jubilee, Piccadilly, Victoria","All England Club, SW19",Grand Slam de tenis,"Wimbledon (District/National Rail), Southfield...",,Sábado,1,...,0,0,"All England Club, SW19",Otro,,Alta (30–60k),1.0,4.0,Entrada/Salida (según horario),"940GZZLUWIM, 940GZZLUSWM"
77,2023-06-30,Festival/Evento al aire libre,BST Hyde Park 2023,"Jubilee, Piccadilly, Victoria, Central","Hyde Park, Londres","Conciertos de verano, alta afluencia","Hyde Park Corner (Piccadilly), Marble Arch (Ce...",,Viernes,0,...,0,1,"Hyde Park, Londres",Parque,,Masiva (>60k),1.0,4.0,"Mixto (largo periodo, salidas PM)","940GZZLUHPC, 940GZZLUMBA, 940GZZLUGPK, 940GZZL..."
78,2023-07-01,Evento cultural,Pride in London 2023,"Central, Jubilee, Victoria, Piccadilly",Centro de Londres,"Más de 1 millón de personas, desfile, múltiple...","Estaciones centrales (Oxford Circus, Bond Stre...",1000000.0,Sábado,1,...,0,0,Centro de Londres,Otro,,Masiva (>60k),1.0,4.0,Entrada/Salida (según horario),"940GZZLUOXC, 940GZZLUBDS, 940GZZLUVIC, 940GZZL..."
79,2023-07-29,Evento deportivo,Formula E London ePrix,"Jubilee, DLR, Overground",Calles de Londres,"Carrera urbana, alta afluencia en zonas cercanas","Cierres parciales: Canary Wharf, Greenwich; es...",,Sábado,1,...,0,0,Calles de Londres,Otro,,Alta (30–60k),1.0,3.0,Entrada/Salida (según horario),"940GZZLUCFW, 910GGNWICH"
80,2023-12-31,Festivo,Año Nuevo (fuegos artificiales),"Jubilee, District, Circle",Centro de Londres,Alta afluencia en el centro para ver fuegos ar...,"Embankment, Westminster, Waterloo y Charing Cr...",,Domingo,1,...,1,0,Centro de Londres,Otro,,Masiva (>60k),1.0,3.0,"Mixto (largo periodo, salidas PM)","940GZZLUVX, 940GZZLUEMB, 940GZZLULSQ"


In [28]:
events_df.dtypes

Fecha                                 object
Tipo                                  object
Evento/Festividad                     object
Líneas afectadas (parseadas)          object
Ubicación                             object
Asistencia / Comentario               object
Estaciones afectadas / Notas TfL      object
Asistentes estimados                 float64
Día semana                            object
Es fin de semana                       int64
Mes                                   object
Estación                              object
Es festivo (UK)                        int64
Es festival                            int64
Recinto (normalizado)                 object
Tipo de recinto                       object
Capacidad típica recinto (aprox.)    float64
Afluencia esperada (categoría)        object
Eventos el mismo día                 float64
Nº líneas afectadas                  float64
Impacto temporal                      object
Códigos de Estaciones                 object
dtype: obj

In [32]:
### Data Cleaning and Transformation for Event Sessions Table

# Create a copy of the original dataframe for cleaning
events_clean = events_df.copy()

# 1. Convert Fecha to datetime and rename to date
events_clean['date'] = pd.to_datetime(events_clean['Fecha'])
events_clean = events_clean.drop('Fecha', axis=1)

# 2. Rename 'Tipo' to 'event_type'
events_clean = events_clean.rename(columns={'Tipo': 'event_type'})

# 3. Rename 'Evento/Festividad' to 'event_name'
events_clean = events_clean.rename(columns={'Evento/Festividad': 'event_name'})

# 4. Rename 'Asistentes estimados' to 'expected_attendance'
events_clean = events_clean.rename(columns={'Asistentes estimados': 'expected_attendance'})

# 5. Handle the affected lines transformation
# First, let's examine the current structure of the affected lines column
print("Sample of affected lines data:")
print(events_clean['Líneas afectadas (parseadas)'].head(10))
print("\nUnique values in affected lines:")
print(events_clean['Líneas afectadas (parseadas)'].value_counts())


Sample of affected lines data:
0                      N/A (Impacto general en la red)
1    Jubilee, Metropolitan, Bakerloo, Central, Over...
2                      N/A (Impacto general en la red)
3                      N/A (Impacto general en la red)
4               Piccadilly, Jubilee, Circle & District
5                      N/A (Impacto general en la red)
6                    Jubilee, Central, Overground, DLR
7    Jubilee, Metropolitan, Bakerloo, Central, Over...
8                           Central, Circle & District
9    Jubilee, Metropolitan, Bakerloo, Central, Over...
Name: Líneas afectadas (parseadas), dtype: object

Unique values in affected lines:
Líneas afectadas (parseadas)
Jubilee                                                                16
Jubilee, Metropolitan, Bakerloo, Central, Overground, National Rail    13
N/A (Impacto general en la red)                                         7
Victoria, Overground, National Rail                                     6
Piccadilly

In [34]:
# 6. Transform affected lines to one row per tube line
# This is the most complex transformation - we need to split comma-separated lines and create separate rows

def split_affected_lines(df):
    """
    Split the affected lines column and create one row per affected line
    """
    # Create a list to store the expanded rows
    expanded_rows = []
    
    for idx, row in df.iterrows():
        affected_lines = row['Líneas afectadas (parseadas)']
        
        # Handle different cases for affected lines
        if pd.isna(affected_lines) or affected_lines == 'N/A (Impacto general en la red)':
            # For general impact events, create a single row with 'General' as the line
            new_row = row.copy()
            new_row['affected_lines'] = 'General'
            expanded_rows.append(new_row)
        else:
            # Split by comma and clean up each line name
            lines = [line.strip() for line in str(affected_lines).split(',')]
            
            # Create a row for each affected line
            for line in lines:
                new_row = row.copy()
                new_row['affected_lines'] = line
                expanded_rows.append(new_row)
    
    return pd.DataFrame(expanded_rows)

# Apply the transformation
events_expanded = split_affected_lines(events_clean)

# Drop the original column
events_expanded = events_expanded.drop('Líneas afectadas (parseadas)', axis=1)

print("Original number of rows:", len(events_clean))
print("Expanded number of rows:", len(events_expanded))
print("\nSample of expanded data:")
print(events_expanded[['date', 'event_type', 'event_name', 'affected_lines', 'expected_attendance']].head(10))


Original number of rows: 81
Expanded number of rows: 230

Sample of expanded data:
        date        event_type                                  event_name  \
0 2023-01-02           Festivo                  New Year’s Day (sustituto)   
1 2023-04-06  Evento deportivo  Women's Finalissima (Inglaterra vs Brasil)   
1 2023-04-06  Evento deportivo  Women's Finalissima (Inglaterra vs Brasil)   
1 2023-04-06  Evento deportivo  Women's Finalissima (Inglaterra vs Brasil)   
1 2023-04-06  Evento deportivo  Women's Finalissima (Inglaterra vs Brasil)   
1 2023-04-06  Evento deportivo  Women's Finalissima (Inglaterra vs Brasil)   
1 2023-04-06  Evento deportivo  Women's Finalissima (Inglaterra vs Brasil)   
2 2023-04-07           Festivo                                 Good Friday   
3 2023-04-10           Festivo                               Easter Monday   
4 2023-04-23  Evento deportivo                             London Marathon   

  affected_lines  expected_attendance  
0        General  

In [36]:
# 7. Create the final event sessions table with only the required columns
event_sessions = events_expanded[['date', 'event_type', 'event_name', 'affected_lines', 'expected_attendance']].copy()

# Sort by date for better organization
event_sessions = event_sessions.sort_values('date').reset_index(drop=True)

# Display summary information
print("Event Sessions Table Summary:")
print(f"Total rows: {len(event_sessions)}")
print(f"Date range: {event_sessions['date'].min()} to {event_sessions['date'].max()}")
print(f"Number of unique events: {event_sessions['event_name'].nunique()}")
print(f"Number of unique event types: {event_sessions['event_type'].nunique()}")
print(f"Number of unique affected lines: {event_sessions['affected_lines'].nunique()}")

print("\nEvent types distribution:")
print(event_sessions['event_type'].value_counts())

print("\nTop 10 most affected lines:")
print(event_sessions['affected_lines'].value_counts().head(10))

print("\nSample of final event sessions table:")
print(event_sessions.head(15))


Event Sessions Table Summary:
Total rows: 230
Date range: 2023-01-02 00:00:00 to 2023-12-31 00:00:00
Number of unique events: 59
Number of unique event types: 7
Number of unique affected lines: 15

Event types distribution:
event_type
Concierto masivo                 150
Evento deportivo                  26
Festivo                           18
Evento cultural                   15
Festival/Evento al aire libre     11
Evento deportivo/espectáculo       6
Evento deportivo/benéfico          4
Name: count, dtype: int64

Top 10 most affected lines:
affected_lines
Jubilee          45
Overground       28
Central          24
National Rail    23
Piccadilly       23
Victoria         21
Bakerloo         14
Metropolitan     14
District         14
General           7
Name: count, dtype: int64

Sample of final event sessions table:
         date        event_type                                  event_name  \
0  2023-01-02           Festivo                  New Year’s Day (sustituto)   
1  2023-02-14

In [38]:
# 8. Save the cleaned event sessions table
output_path = "../data/processed/event_sessions_2023.csv"
event_sessions.to_csv(output_path, index=False)

print(f"Event sessions table saved to: {output_path}")
print(f"File contains {len(event_sessions)} rows and {len(event_sessions.columns)} columns")

# Display the data types of the final table
print("\nData types:")
print(event_sessions.dtypes)

# Show a few examples of the transformed data
print("\nExamples of the transformed data:")
print("="*50)
for i, row in event_sessions.head(10).iterrows():
    print(f"Date: {row['date'].strftime('%Y-%m-%d')}, Event: {row['event_name']}, Type: {row['event_type']}, Line: {row['affected_lines']}, Attendance: {row['expected_attendance']}")


Event sessions table saved to: ../data/processed/event_sessions_2023.csv
File contains 230 rows and 5 columns

Data types:
date                   datetime64[ns]
event_type                     object
event_name                     object
affected_lines                 object
expected_attendance           float64
dtype: object

Examples of the transformed data:
Date: 2023-01-02, Event: New Year’s Day (sustituto), Type: Festivo, Line: General, Attendance: nan
Date: 2023-02-14, Event: Chris Brown - Under the Influence Tour, Type: Concierto masivo, Line: Jubilee, Attendance: nan
Date: 2023-02-15, Event: Chris Brown - Under the Influence Tour, Type: Concierto masivo, Line: Jubilee, Attendance: nan
Date: 2023-02-19, Event: Chris Brown - Under the Influence Tour, Type: Concierto masivo, Line: Jubilee, Attendance: nan
Date: 2023-02-20, Event: Chris Brown - Under the Influence Tour, Type: Concierto masivo, Line: Jubilee, Attendance: nan
Date: 2023-03-22, Event: King Gizzard & the Lizard Wizard, 

In [44]:
event_sessions.head(10)

Unnamed: 0,date,event_type,event_name,affected_lines,expected_attendance
0,2023-01-02,Festivo,New Year’s Day (sustituto),General,
1,2023-02-14,Concierto masivo,Chris Brown - Under the Influence Tour,Jubilee,
2,2023-02-15,Concierto masivo,Chris Brown - Under the Influence Tour,Jubilee,
3,2023-02-19,Concierto masivo,Chris Brown - Under the Influence Tour,Jubilee,
4,2023-02-20,Concierto masivo,Chris Brown - Under the Influence Tour,Jubilee,
5,2023-03-22,Concierto masivo,King Gizzard & the Lizard Wizard,Victoria,
6,2023-03-23,Concierto masivo,King Gizzard & the Lizard Wizard,Victoria,
7,2023-04-06,Evento deportivo,Women's Finalissima (Inglaterra vs Brasil),National Rail,83000.0
8,2023-04-06,Festivo,Semana Santa,Jubilee,
9,2023-04-06,Evento deportivo,Women's Finalissima (Inglaterra vs Brasil),Central,83000.0
