## Converting the RDF files into a single CSV file

In [1]:
#importing the required libraries
import pandas as pd
import os
import glob

# Locations of the input data (the working directory itself is not changed).
# NOTE: `data` is an absolute path on the original author's machine; adjust it
# when running the notebook elsewhere.
data = "C:/Users/hp/Desktop/monday_chall/Data"
# The RDF export folder lives inside the data directory, so derive it from
# `data` instead of repeating the absolute path.
rdf_path = os.path.join(data, "rdf_CTA__Ridership__Daily_by_Route_routes_2001_2025")

# Collect the RDF files. glob returns matches in arbitrary order, so sort them
# to make the processing order reproducible from run to run.
rdf_files = sorted(glob.glob(os.path.join(rdf_path, "*.rdf")))


**Parse every RDF file, combine the extracted records into one DataFrame, and save the result as a CSV file.**

In [None]:
import xml.etree.ElementTree as ET
from tqdm import tqdm

# XML namespace prefix -> URI mapping. It is handed to ElementTree's
# find/findall so that prefixed tag names such as 'rdf:Description' or
# 'ds:route' can be resolved against the RDF documents.
namespaces = dict(
    rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#',
    ds='https://data.cityofchicago.org/resource/jyb9-n7fm/',
    socrata='http://www.socrata.com/rdf/terms#',
    rdfs='http://www.w3.org/2000/01/rdf-schema#',
)

def _field_text(desc, tag, ns):
    """Return the text of child element ``tag`` of ``desc``, or None if the child is absent."""
    child = desc.find(tag, ns)
    return child.text if child is not None else None


def parse_rdf_file(file_path, ns=None):
    """Parse a single RDF file and extract ridership data.

    Parameters
    ----------
    file_path : str or path-like
        Path of the RDF/XML file to read.
    ns : dict, optional
        Prefix -> URI mapping used to resolve the ``rdf:`` and ``ds:`` tag
        prefixes. Defaults to the module-level ``namespaces`` mapping.

    Returns
    -------
    list of dict
        One dict per ``rdf:Description`` element found directly under the
        document root, with the keys ``route``, ``date``, ``daytype`` (the
        element text, or None when the element is missing) and ``rides``
        (an int, or None when the element is missing, empty, or does not
        contain an integer). An empty list is returned when the file cannot
        be read or is not well-formed XML; the error is printed, not raised.
    """
    if ns is None:
        ns = namespaces

    # Only reading/parsing the document is treated as a file-level failure.
    try:
        root = ET.parse(file_path).getroot()
    except (ET.ParseError, OSError) as e:
        print(f"Error parsing {file_path}: {e}")
        return []

    records = []
    bad_rides = 0

    # Each Description element represents one record.
    for desc in root.findall('rdf:Description', ns):
        record = {
            'route': _field_text(desc, 'ds:route', ns),
            'date': _field_text(desc, 'ds:date', ns),
            'daytype': _field_text(desc, 'ds:daytype', ns),
        }

        # Convert rides per record: one malformed value must not discard the
        # remaining records of the file.
        raw_rides = _field_text(desc, 'ds:rides', ns)
        if raw_rides is None:
            record['rides'] = None
        else:
            try:
                record['rides'] = int(raw_rides)
            except ValueError:
                record['rides'] = None
                bad_rides += 1

        records.append(record)

    if bad_rides:
        print(f"Warning: {bad_rides} record(s) in {file_path} had a non-integer rides value")

    return records

# Parse all RDF files and combine into a single list
print(f"Found {len(rdf_files)} RDF files to process")

# Fail early with an actionable message: with no input the DataFrame built
# below has no columns and df['date'] would raise an opaque KeyError.
if not rdf_files:
    raise FileNotFoundError(
        "No RDF files to process - check the RDF folder path defined in the setup cell"
    )

all_records = []
for rdf_file in tqdm(rdf_files, desc="Processing RDF files"):
    all_records.extend(parse_rdf_file(rdf_file))

print(f"Total records extracted: {len(all_records)}")

# Same reasoning as above: stop before building an empty, column-less frame
# and writing a meaningless CSV.
if not all_records:
    raise ValueError("No records could be extracted from the RDF files")

# Create a DataFrame from all records
df = pd.DataFrame(all_records)

# Clean up the date column (remove the time portion)
df['date'] = pd.to_datetime(df['date']).dt.date

# Display
print("\nSample data:")
display(df.head())
print(f"\nDataFrame shape: {df.shape}")

# to CSV
output_file = os.path.join(data, "CTA_Ridership_Daily_by_Route_2001_2025.csv")
df.to_csv(output_file, index=False)
print(f"\nData saved to: {output_file}")

Found 19 RDF files to process


Processing RDF files: 100%|██████████| 19/19 [00:28<00:00,  1.50s/it]


Total records extracted: 1092474

Sample data:


Unnamed: 0,route,date,daytype,rides
0,1,2004-06-16,W,3174
1,100,2021-06-29,W,176
2,100,2018-05-21,W,497
3,106,2009-02-25,W,3146
4,100,2021-12-30,W,204



DataFrame shape: (1092474, 4)

Data saved to: C:/Users/hp/Desktop/monday_chall/Data\CTA_Ridership_Daily_by_Route_2001_2025.csv


Now that the CSV file has been written, let's see what the data looks like.