# In [1]:
import pandas as pd
from datetime import datetime, timedelta

# Column names applied to the headerless daily station files: the station-level
# fields first, followed by the per-lane fields for lane 1.
new_headers = [
    'Timestamp',
    'Station',
    'District',
    'Freeway',
    'Direction',
    'Lane Type',
    'Station Length',
    'Samples',
    '% Observed',
    'Total Flow',
    'Avg Occupancy',
    'Avg Speed',
    # Lane-1 columns share a common prefix, so generate them from the suffixes.
    *(
        f'Lane 1 {metric}'
        for metric in ('Samples', 'Flow', 'Avg Occ', 'Avg Speed', 'Observed')
    ),
]

# File-name prefix of the daily files and the date range to load.
# Both start_date and end_date are included in the range.
file_prefix = 'd12_text_station_5min_'
start_date = datetime(2024, 1, 1)
end_date = datetime(2024, 2, 1)

# Empty DataFrame that the rows of every processed day are accumulated into.
accumulated_data = pd.DataFrame()

# Build one datetime per day, stepping a whole day at a time from start_date.
one_day = timedelta(days=1)
day_count = (end_date - start_date).days + 1  # +1 makes end_date inclusive
date_list = [start_date + offset * one_day for offset in range(day_count)]

# Load every daily file, keep only the rows of the station of interest, and
# combine them into ``accumulated_data``. Files that are missing or cannot be
# processed are reported and skipped so one bad day does not abort the run.
target_station = 1223083
daily_frames = []  # per-day filtered frames, concatenated once after the loop

for date in date_list:
    file_suffix = date.strftime('%Y_%m_%d')
    file_path = f"{file_prefix}{file_suffix}.txt.gz"
    try:
        daily_data = pd.read_csv(
            file_path,
            compression='gzip',
            header=None,
            names=new_headers,
            # Read only as many leading columns as there are names.
            usecols=range(len(new_headers))
        )
        # Boolean-mask selection already returns a new frame for this day.
        filtered_data = daily_data[daily_data['Station'] == target_station]
        daily_frames.append(filtered_data)
    except FileNotFoundError:
        print(f"File not found: {file_path}")
    except Exception as e:
        # Deliberately best-effort: report the failure and move on to the next day.
        print(f"Error processing file {file_path}: {e}")

# Concatenate once. Calling pd.concat inside the loop re-copies every row
# accumulated so far on each iteration and concatenates against an initially
# empty frame, which recent pandas versions may warn about.
if daily_frames:
    accumulated_data = pd.concat(daily_frames, ignore_index=True)
else:
    # No file could be loaded: fall back to an empty DataFrame.
    accumulated_data = pd.DataFrame()

# Original dataset: keep the timestamp, the lane-1 flow, the lane-1 observed
# column and the '% Observed' column, renaming the first three for the output.
original_column_names = {
    'Timestamp': '5 Minutes',
    'Lane 1 Flow': 'Lane 1 Flow (Veh/5 Minutes)',
    'Lane 1 Observed': '# Lane Points'
}
filtered_data = (
    accumulated_data[['Timestamp', 'Lane 1 Flow', 'Lane 1 Observed', '% Observed']]
    .rename(columns=original_column_names)
)

# Parse the timestamps and write them back as 'MM/DD/YYYY HH:MM' strings.
filtered_data['5 Minutes'] = (
    pd.to_datetime(filtered_data['5 Minutes']).dt.strftime('%m/%d/%Y %H:%M')
)

# Split in the existing row order (no shuffling): first 80% train, rest test.
split_point = int(len(filtered_data) * 0.8)
train_data, test_data = filtered_data[:split_point], filtered_data[split_point:]

for subset, csv_name in ((train_data, 'train.csv'), (test_data, 'test.csv')):
    subset.to_csv(csv_name, index=False)

# SAEs dataset: keep every accumulated column except the ones listed below,
# and rename the timestamp and lane-1 flow columns for the output.
saes_data = (
    accumulated_data
    .drop(columns=['Station Length', 'Avg Speed', 'Lane 1 Avg Speed', 'Lane 1 Observed', '% Observed'])
    .rename(columns={
        'Timestamp': '5 Minutes',
        'Lane 1 Flow': 'Lane 1 Flow (Veh/5 Minutes)'
    })
)

# Parse the timestamps and write them back as 'MM/DD/YYYY HH:MM' strings.
saes_data['5 Minutes'] = (
    pd.to_datetime(saes_data['5 Minutes']).dt.strftime('%m/%d/%Y %H:%M')
)

# Split in the existing row order (no shuffling): first 80% train, rest test.
split_point_saes = int(len(saes_data) * 0.8)
train_data_saes, test_data_saes = saes_data[:split_point_saes], saes_data[split_point_saes:]

for subset, csv_name in ((train_data_saes, 'train_saes.csv'), (test_data_saes, 'test_saes.csv')):
    subset.to_csv(csv_name, index=False)
