In [1]:
import os
import glob
import re
from tqdm import tqdm
import pandas as pd
from easydict import EasyDict as edict

In [2]:
def analyze_city_data(input_folder):
    """Collect per-file statistics for every CSV found under ``input_folder``.

    The folder is searched recursively for ``*.csv`` files. Only the
    ``脱敏ID`` column of each file is loaded; it is used to count distinct
    users and total rows. A file that cannot be processed (unreadable,
    missing the ``脱敏ID`` column, ...) is reported on stdout and skipped,
    so one bad file does not abort the whole scan.

    Args:
        input_folder: Directory to search recursively for CSV files.

    Returns:
        pandas.DataFrame with one row per successfully processed file and
        the columns ``filename``, ``city``, ``file_path``, ``user_count``,
        ``record_count`` and ``file_size_mb``. The columns are always
        present, even when no file was found or every file failed.
    """
    columns = ['filename', 'city', 'file_path',
               'user_count', 'record_count', 'file_size_mb']

    # Sort the matches: glob order depends on the OS/filesystem, and a stable
    # order keeps the resulting table reproducible between runs.
    csv_files = sorted(
        glob.glob(os.path.join(input_folder, "**/*.csv"), recursive=True)
    )

    result_data = []

    for csv_file in tqdm(csv_files, desc="Processing files"):
        try:
            basename = os.path.basename(csv_file)
            # splitext strips only the final extension, so a stem that itself
            # contains a dot is kept whole (split(".")[0] would truncate it).
            city_name = os.path.splitext(basename)[0]

            # Load only the ID column to keep memory usage low.
            city_df = pd.read_csv(csv_file, usecols=['脱敏ID'])

            result_data.append({
                'filename': basename,
                'city': city_name,
                'file_path': csv_file,
                'user_count': city_df['脱敏ID'].nunique(),  # distinct IDs
                'record_count': len(city_df),                # total rows
                'file_size_mb': os.path.getsize(csv_file) / (1024 * 1024),
            })

        except Exception as e:
            # Deliberate best-effort: report the failure and move on.
            print(f"Error processing {csv_file}: {str(e)}")
            continue

    # Passing the columns explicitly keeps the schema stable for an empty
    # result (otherwise the frame would have no columns at all).
    return pd.DataFrame(result_data, columns=columns)

In [None]:
# Nationwide user-data statistics: write one stats CSV per (area, day).
areas = ["福州", "厦门", "泉州"]
dataset = "Timing"
month = 4
# Raw string: in a normal literal "\全" and "\统" are invalid escape sequences,
# which Python warns about and plans to reject. The resulting value is unchanged.
output_folder = r"H:\全国城市\统计"
dates = [f"{month}_{day}" for day in range(3, 10)]  # April 3 - April 9
for area in areas:
    # The destination depends only on the area, so create it once per area
    # instead of once per day.
    output_path = os.path.join(output_folder, area, dataset, str(month))
    os.makedirs(output_path, exist_ok=True)
    for date in dates:
        input_folder = rf"H:\全国城市\{area}\{dataset}\{month}月\Timing_{date}"
        city_stats = analyze_city_data(input_folder)
        if city_stats.empty:
            # A missing/empty input folder (or all files failing) yields no
            # rows; say so instead of silently writing a blank stats file.
            print(f"未找到可用数据，已跳过: {input_folder}")
            continue
        # Save the result for this area/day.
        output_file = os.path.join(output_path, f"stats_{date}.csv")
        city_stats.to_csv(output_file, index=False)
        print(f"结果已保存至: {output_file}")

Processing files:   2%|▏         | 6/371 [00:20<21:15,  3.49s/it]

In [7]:
# Write the current `city_stats` table to its stats CSV, creating the
# destination folder if needed.
# NOTE(review): nothing used here is defined in this cell. `output_folder`,
# `area`, `dataset`, `month`, `date` and `city_stats` must already exist in
# the kernel (the loop cell assigns them), so this fails on a fresh kernel and
# presumably targets whichever area/date the loop last reached. TODO confirm
# whether this cell is still needed.
output_path = os.path.join(output_folder, area, dataset, str(month))
os.makedirs(output_path, exist_ok=True)
output_file = os.path.join(output_path, f"stats_{date}.csv")
city_stats.to_csv(output_file, index=False)

In [4]:
# Show the stats table currently bound to `city_stats` as the cell output.
city_stats

Unnamed: 0,filename,city,file_path,user_count,record_count,file_size_mb
0,七台河市.csv,七台河市,H:\全国城市\福州\Timing\4月\Timing_4_3\七台河市.csv,280,3476,0.256693
1,万宁市.csv,万宁市,H:\全国城市\福州\Timing\4月\Timing_4_3\万宁市.csv,1071,8037,0.597320
2,三亚市.csv,三亚市,H:\全国城市\福州\Timing\4月\Timing_4_3\三亚市.csv,5553,61098,4.541436
3,三明市.csv,三明市,H:\全国城市\福州\Timing\4月\Timing_4_3\三明市.csv,138929,1588931,118.102885
4,三沙市.csv,三沙市,H:\全国城市\福州\Timing\4月\Timing_4_3\三沙市.csv,47,251,0.018682
...,...,...,...,...,...,...
366,黔东南苗族侗族自治州.csv,黔东南苗族侗族自治州,H:\全国城市\福州\Timing\4月\Timing_4_3\黔东南苗族侗族自治州.csv,7790,81207,6.036062
367,黔南布依族苗族自治州.csv,黔南布依族苗族自治州,H:\全国城市\福州\Timing\4月\Timing_4_3\黔南布依族苗族自治州.csv,7265,71981,5.349883
368,黔西南布依族苗族自治州.csv,黔西南布依族苗族自治州,H:\全国城市\福州\Timing\4月\Timing_4_3\黔西南布依族苗族自治州.csv,4007,44935,3.340075
369,齐齐哈尔市.csv,齐齐哈尔市,H:\全国城市\福州\Timing\4月\Timing_4_3\齐齐哈尔市.csv,1800,21225,1.573330
