In [2]:
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd

In [3]:
# Labels for the first 12 fields of a PeMS 5-minute station file, in file order.
column_names = [
    "Timestamp",
    "Station",
    "District",
    "Freeway #",
    "Direction of Travel",
    "Lane Type",
    "Station Length",
    "Samples",
    "% Observed",
    "Total Flow",
    "Avg Occupancy",
    "Avg Speed",
]

# Load one sample day (2018-02-28); the file has no header row, so the labels
# are supplied to the parser directly for the 12 leading columns.
data2001 = pd.read_csv(
    './pems10/d10_text_station_5min_2018_02_28.txt.gz',
    header=None,
    usecols=range(12),
    names=column_names,
)
data2001

Unnamed: 0,Timestamp,Station,District,Freeway #,Direction of Travel,Lane Type,Station Length,Samples,% Observed,Total Flow,Avg Occupancy,Avg Speed
0,02/28/2018 00:00:00,1000110,10,5,S,ML,0.554,2,67,6.0,0.0045,64.2
1,02/28/2018 00:00:00,1000210,10,5,S,ML,0.478,3,67,4.0,0.0036,64.0
2,02/28/2018 00:00:00,1000310,10,5,S,ML,0.389,1,33,10.0,0.0092,62.0
3,02/28/2018 00:00:00,1000410,10,5,S,ML,0.397,0,0,18.0,0.0056,67.4
4,02/28/2018 00:00:00,1000510,10,5,S,ML,0.474,11,33,14.0,0.0082,65.4
...,...,...,...,...,...,...,...,...,...,...,...,...
334075,02/28/2018 23:55:00,10126210,10,4,E,ML,0.471,27,100,49.0,0.0143,68.0
334076,02/28/2018 23:55:00,10126310,10,4,W,ML,0.285,30,100,46.0,0.0152,63.2
334077,02/28/2018 23:55:00,10126410,10,4,E,ML,0.481,24,100,57.0,0.0174,67.1
334078,02/28/2018 23:55:00,10126510,10,4,W,ML,2.545,30,100,50.0,0.0161,63.5


In [4]:
# Station IDs loaded from a saved NumPy array. Later cells use `comm` to select
# and order the station columns of every daily table.
comm = np.load('pems10_comm.npy')
comm

array([1000110, 1000210, 1000310, 1000410, 1000510, 1000610, 1000710,
       1000810, 1000910, 1001010, 1001110, 1001210, 1001310, 1001410,
       1001510, 1001610, 1001710, 1001810, 1001910, 1002010, 1002110,
       1002210, 1002310, 1002410, 1002510, 1002610, 1002710, 1002810,
       1002910, 1003010, 1003110, 1003210, 1003310, 1003410, 1003510,
       1003610, 1003710, 1003810, 1003910, 1004010, 1004110, 1004210,
       1004310, 1004410, 1004510, 1005010, 1005110, 1005210, 1005310,
       1005410, 1005510, 1005810, 1005910, 1006210, 1006310, 1006410,
       1006510, 1006610, 1006710, 1007010, 1007110, 1007510, 1007810,
       1007910, 1008010, 1008110, 1008210, 1008310, 1008410, 1008510,
       1008810, 1008910, 1009010, 1009110, 1009410, 1009510, 1009610,
       1009710, 1009810, 1009910, 1010210, 1010310, 1010610, 1010710,
       1010810, 1010910, 1011410, 1011510, 1011610, 1011710, 1012310,
       1012810, 1012910, 1013110, 1013210, 1013310, 1013410, 1013510,
       1013610, 1013

In [6]:
import pandas as pd
import calendar
from datetime import date, timedelta

def read_and_process_data(file_path):
    """Load one daily PeMS 5-minute station file as a timestamp x station flow table.

    Only three fields of the header-less CSV are read: column 0 (timestamp),
    column 1 (station ID) and column 9 (total flow).

    Parameters
    ----------
    file_path : str or path-like
        Path to a gzip-compressed CSV file without a header row.

    Returns
    -------
    pandas.DataFrame or None
        A frame indexed by timestamp (index name ``'Timestamp'``) with one
        column per station ID holding that station's total flow. ``None`` is
        returned, after printing a message, when the file is truncated, is not
        valid gzip data, or is empty.

    Raises
    ------
    ValueError
        If a timestamp does not match ``MM/DD/YYYY HH:MM:SS`` or if the file
        holds more than one row for the same (timestamp, station) pair.
    """
    # Imported locally so the function does not rely on other notebook cells.
    import gzip
    import zlib

    try:
        data = pd.read_csv(
            file_path,
            header=None,
            usecols=[0, 1, 9],  # timestamp, station ID, total flow
            names=['Timestamp', 'Station', 'Total Flow'],
            compression='gzip',
        )
    except (EOFError, gzip.BadGzipFile, zlib.error, pd.errors.EmptyDataError) as exc:
        # Unreadable downloads are reported and skipped rather than aborting a
        # multi-year batch run; the caller must handle the None result.
        print(f"Error processing file: {file_path} ({exc!r})")
        return None

    # An explicit format avoids per-file format inference and any
    # day/month ambiguity in the MM/DD/YYYY timestamps.
    data['Timestamp'] = pd.to_datetime(data['Timestamp'], format='%m/%d/%Y %H:%M:%S')

    # Long -> wide: one row per timestamp, one column per station.
    return data.set_index('Timestamp').pivot(columns='Station', values='Total Flow')

# Read the data for February 28 to get the column names (Station IDs)
# feb_28_data = read_and_process_data('./pems03/d03_text_station_5min_2018_02_28.txt.gz')
# feb_28_columns = feb_28_data.columns

# Build one wide table (rows: 5-minute timestamps, columns: the station IDs in
# `comm`) by stacking every daily district-10 file found in the date range.
start_date = date(2007, 6, 27)  # first day to look for (inclusive)
end_date = date(2024, 3, 20)    # last day to look for (inclusive)

all_data = []  # one column-aligned DataFrame per day that was read successfully

current_date = start_date
while current_date <= end_date:
    file_path = f'./pems10/d10_text_station_5min_{current_date:%Y_%m_%d}.txt.gz'
    # Advance first so that none of the `continue` branches below can leave the
    # loop stuck on the same day.
    current_date += timedelta(days=1)

    # Days without a downloaded file are reported and skipped.
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}, skipping...")
        continue

    day_data = read_and_process_data(file_path)
    if day_data is None:
        # The reader returns None (after printing the reason) for unreadable files.
        continue

    # Keep exactly the stations in `comm`, in that order. Stations absent from
    # this day's file become all-zero columns.
    all_data.append(day_data.reindex(columns=comm, fill_value=0))

if not all_data:
    raise FileNotFoundError(
        f"No readable daily files found under ./pems10 between {start_date} and {end_date}"
    )

# Stack the days and replace the remaining gaps (readings missing inside a file)
# with 0. NOTE(review): a filled 0 is indistinguishable from a genuine zero
# flow - confirm this is acceptable for downstream use.
combined_data = pd.concat(all_data).fillna(0)

# The timestamp index is written under the header 'date', which is the column
# name the hourly/daily aggregation cell reads back from this file.
combined_data.to_csv('pems10_all_common_flow.csv', index_label='date')

In [None]:
# Display the merged table built by the previous cell for a quick visual check.
combined_data

In [1]:
import pandas as pd
# Load the merged 5-minute flow table once. The file must contain a 'date'
# column holding the timestamps; it becomes the DatetimeIndex used for resampling.
pems10 = pd.read_csv('pems10_all_common_flow.csv')
pems10['date'] = pd.to_datetime(pems10['date'])
pems10 = pems10.set_index('date')

# Hourly totals per station. The lowercase 'h' alias replaces the deprecated
# 'H', which emits a FutureWarning on recent pandas releases.
pems10_hourly = pems10.resample('h').sum().reset_index()
pems10_hourly.to_csv('pems10_h.csv', index=False)

# Daily totals per station, computed from the same in-memory frame instead of
# re-reading and re-parsing the CSV.
pems10_daily = pems10.resample('D').sum().reset_index()
pems10_daily.to_csv('pems10_d.csv', index=False)

  pems10_hourly = pems10.resample('H').sum()
