In [1]:
import os
import glob
import re
from tqdm import tqdm
import pandas as pd
from easydict import EasyDict as edict

In [3]:
def analyze_city_data(input_folder):
    """Collect per-file statistics for every CSV found under ``input_folder``.

    The folder is searched recursively for ``*.csv`` files. Only the
    anonymized-ID column (``脱敏ID``) of each file is loaded. A file that
    cannot be processed (unreadable, column missing, ...) is reported on
    stdout and skipped, so one bad file does not abort the whole scan.

    Args:
        input_folder: Root directory to search.

    Returns:
        pandas.DataFrame with one row per successfully processed file and
        the columns ``filename``, ``city`` (file name up to its first dot),
        ``file_path``, ``user_count`` (distinct ``脱敏ID`` values),
        ``record_count`` (number of rows) and ``file_size_mb``. The columns
        are present even when no file could be processed.
    """
    columns = [
        'filename',
        'city',
        'file_path',
        'user_count',
        'record_count',
        'file_size_mb',
    ]

    # Sort so the row order of the result is reproducible between runs;
    # glob itself returns matches in arbitrary order.
    csv_files = sorted(
        glob.glob(os.path.join(input_folder, "**/*.csv"), recursive=True)
    )

    result_data = []

    for csv_file in tqdm(csv_files, desc="Processing files"):
        try:
            basename = os.path.basename(csv_file)
            city_name = basename.split(".")[0]

            # Load only the ID column to keep memory usage low.
            city_df = pd.read_csv(csv_file, usecols=['脱敏ID'])

            result_data.append({
                'filename': basename,
                'city': city_name,
                'file_path': csv_file,
                'user_count': city_df['脱敏ID'].nunique(),
                'record_count': len(city_df),
                'file_size_mb': os.path.getsize(csv_file) / (1024 * 1024),
            })

        except Exception as e:
            # Deliberate best-effort: report the file and keep going.
            print(f"Error processing {csv_file}: {str(e)}")
            continue

    # Passing the columns explicitly keeps the schema stable when
    # result_data is empty (no CSVs found, or every file failed).
    stats_df = pd.DataFrame(result_data, columns=columns)

    return stats_df

In [5]:
# Nationwide user-data statistics: scan the CSV folder and save one summary
# table per (area, date) pair.
# Earlier runs used: areas = ["福州", "厦门", "泉州"]
areas = ["六城市"]
dataset = "Timing"
month = 9
# Raw string: "\全" and "\统" are not valid escape sequences, so a plain
# string literal triggers an invalid-escape warning on recent Python.
output_folder = r"H:\全国城市\统计"
# Earlier runs used: dates = [f"{month}_{day}" for day in range(3, 10)]
# (days 3 through 9 of `month`).
dates = ["9_25"]
for area in areas:
    for date in dates:
        # Per-area/date layout used by earlier runs:
        # input_folder = rf"H:\全国城市\{area}\{dataset}\{month}月\Timing_{date}"
        # NOTE: the input folder below is fixed, so every (area, date) pair
        # analyses the same files; area/date only select the output location.
        input_folder = r"H:\全国城市\合并\6城市\9月"
        city_stats = analyze_city_data(input_folder)
        # Save the result under <output_folder>/<area>/<dataset>/<month>/.
        output_path = os.path.join(output_folder, area, dataset, str(month))
        os.makedirs(output_path, exist_ok=True)
        output_file = os.path.join(output_path, f"stats_{date}.csv")
        city_stats.to_csv(output_file, index=False)
        print(f"结果已保存至: {output_file}")

Processing files: 100%|██████████| 371/371 [16:18<00:00,  2.64s/it]  

结果已保存至: H:\全国城市\统计\六城市\Timing\9\stats_9_25.csv



