In [3]:
import pandas as pd
import os
import re

In [4]:
# Directory that is scanned for the input CSV files.
# Set the TRAFFIC_VOLUME_DATA_DIR environment variable to run the notebook
# against another location; when it is unset or empty, the original
# machine-specific path is used so existing runs behave as before.
folder_path = (
    os.environ.get('TRAFFIC_VOLUME_DATA_DIR')
    or '/Users/nicholasfarkas/Downloads/Projects/DPD-Traffic-Safety/viz/traffic_volumes/data'
)

In [5]:
# Names (not full paths) of every entry in folder_path ending in '.csv'.
# os.listdir returns entries in arbitrary order, so the matches are sorted to
# keep the row order of the merged output stable from run to run.
csv_files = sorted(file for file in os.listdir(folder_path) if file.endswith('.csv'))

In [6]:
# Inclusive bounds for what counts as a plausible year in a filename.
# 4-digit runs outside this range (other numbers embedded in the name) are skipped.
MIN_YEAR, MAX_YEAR = 1900, 2100


def _year_from_filename(filename):
    """Return the first plausible year embedded in ``filename``, or None.

    Scans the non-overlapping 4-digit runs in the name from left to right and
    returns the first one that lies within MIN_YEAR..MAX_YEAR as an int.
    Returns None when the name has no such run.
    """
    for match in re.finditer(r'\d{4}', filename):
        candidate = int(match.group())
        if MIN_YEAR <= candidate <= MAX_YEAR:
            return candidate
    return None


# Read every CSV and tag its rows with the year taken from the filename.
dfs = []
for file in csv_files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_csv(file_path)

    year = _year_from_filename(file)
    if year is None:
        # Warn but keep going: the file is still merged, with an empty year.
        # A name that ends in digits gets the "valid year" wording (a number
        # was there but is not a plausible year); otherwise nothing was found.
        filename_without_ext = os.path.splitext(file)[0]
        if re.search(r'\d$', filename_without_ext):
            print(f"Warning: Could not extract valid year from filename '{file}'")
        else:
            print(f"Warning: Could not extract year from filename '{file}'")

    # Add year column to the dataframe (same value on every row of this file)
    df['year'] = year
    dfs.append(df)

In [7]:
# Stack the per-file frames into a single table, discarding each frame's
# own index in favour of a fresh 0..n-1 index.
merged_df = pd.concat(objs=dfs, ignore_index=True)

In [8]:
# Write the combined table to a CSV without the index column.
# The relative name resolves against the current working directory.
# NOTE(review): if that directory is the same as the input folder, a re-run
# would pick this file up as an input — confirm where the notebook is run from.
merged_df.to_csv(
    'yearly_traffic_volume_2017-2023.csv',
    index=False,
)