# Merge CSV Files in Each Event Directory

In [None]:
import os
import pandas as pd
import glob
import gzip
from pathlib import Path
import gc

In [None]:
def discover_event_dirs(data_dir, output_dir):
    """Return the event sub-directories of *data_dir*, sorted by path.

    Args:
        data_dir: Folder expected to hold one sub-directory per event.
        output_dir: Folder that receives the merged files. It is skipped when
            it sits inside *data_dir*, so previously merged output is never
            picked up as an "event" and merged again on a re-run.

    Returns:
        A sorted list of ``Path`` objects; empty when *data_dir* is not an
        existing directory.
    """
    if not data_dir.is_dir():
        return []
    # resolve() so that differently spelled paths to the same folder compare equal.
    excluded = output_dir.resolve()
    return sorted(
        d for d in data_dir.iterdir()
        if d.is_dir() and d.resolve() != excluded
    )


# Input root (one sub-directory per event) and destination for merged files.
data_dir = Path('./timeline')
output_dir = Path('./timeline/combined')

if not data_dir.is_dir():
    # Report a missing input folder explicitly instead of failing inside iterdir().
    print(f"Data directory not found: {data_dir}")

event_dirs = discover_event_dirs(data_dir, output_dir)

print(f"Discover {len(event_dirs)} event directory。")
for event in event_dirs:
    print(event.name)

# Created after discovery; discover_event_dirs() excludes it on later runs.
output_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Columns kept from every source file; all other CSV columns are dropped at read time.
columns_to_use = [
    "userid", "username", "acctdesc", "location", "following", "followers", "totaltweets",
    "usercreatedts", "tweetid", "tweetcreatedts", "retweetcount", "text", "hashtags",
    "language", "coordinates", "favorite_count", "is_retweet",
    "original_tweet_id", "original_tweet_userid", "original_tweet_username",
    "in_reply_to_status_id", "in_reply_to_user_id", "in_reply_to_screen_name",
    "is_quote_status", "quoted_status_id", "quoted_status_userid", "quoted_status_username",
    "extractedts"
]

# Identifier columns are read as text rather than type-inferred. pandas infers
# float64 for an integer column that contains any empty cell, and float64
# cannot represent integers above 2**53 exactly -- Twitter IDs routinely exceed
# that, so inferred IDs would be silently rounded before being written back out.
id_columns = [
    "userid", "tweetid",
    "original_tweet_id", "original_tweet_userid",
    "in_reply_to_status_id", "in_reply_to_user_id",
    "quoted_status_id", "quoted_status_userid",
]


def read_event_csv(csv_file):
    """Read one tweet CSV (plain or gzip-compressed) into a DataFrame.

    Args:
        csv_file: Path to a ``.csv`` or ``.csv.gz`` file.

    Returns:
        A DataFrame restricted to ``columns_to_use``, with the columns listed
        in ``id_columns`` loaded as strings.

    Raises:
        Whatever ``pd.read_csv`` raises (missing file, missing column, parse
        or decode error); the caller decides how to handle it.
    """
    return pd.read_csv(
        csv_file,
        compression='infer',  # picks gzip from the '.gz' extension, none for '.csv'
        encoding='utf-8',
        usecols=columns_to_use,
        dtype={col: str for col in id_columns},
        engine='python',
        quoting=1,  # csv.QUOTE_ALL
    )


for event_dir in event_dirs:
    print(f"\nProcessing event directory: {event_dir.name}")

    # All CSV files (plain and gzip-compressed) in the event directory, in a
    # fixed order so repeated runs concatenate files identically.
    csv_files = sorted(list(event_dir.glob('*.csv')) + list(event_dir.glob('*.csv.gz')))

    if not csv_files:
        print(f"  - No CSV files found in {event_dir.name}")
        continue

    dataframe_collection = []

    for csv_file in csv_files:
        # Best effort per file: one unreadable file must not abort the whole event.
        try:
            df = read_event_csv(csv_file)
        except Exception as e:
            print(f"  - Error reading file {csv_file.name}: {e}")
        else:
            dataframe_collection.append(df)
            print(f"  - Successfully read file: {csv_file.name}, containing {len(df)} tweets.")

    if not dataframe_collection:
        print(f"  - No files were successfully read in {event_dir.name}")
        continue

    # Merge all DataFrames
    try:
        df_combined = pd.concat(dataframe_collection, axis=0, ignore_index=True)
        print(f"  - Combined DataFrame contains {len(df_combined)} tweets.")
    except Exception as e:
        print(f"  - Error merging DataFrames: {e}")
        continue

    # concat() copied the data; drop the per-file frames now so they do not sit
    # in memory alongside the combined frame during sorting and writing.
    dataframe_collection.clear()

    # Convert timestamp fields; unparseable values become NaT instead of raising.
    df_combined['tweetcreatedts'] = pd.to_datetime(df_combined['tweetcreatedts'], errors='coerce')
    df_combined['extractedts'] = pd.to_datetime(df_combined['extractedts'], errors='coerce')

    # Sort by 'tweetcreatedts'. A stable sort keeps rows with equal timestamps
    # in file order, which makes the output reproducible across runs.
    df_combined.sort_values(by='tweetcreatedts', inplace=True, kind='mergesort')

    # Reset index
    df_combined.reset_index(inplace=True, drop=True)

    # Define output file path
    # Save as compressed CSV
    output_file = output_dir / f"{event_dir.name}_merged.csv.gz"
    # Save as uncompressed CSV
    # output_file = output_dir / f"{event_dir.name}_merged.csv"

    try:
        df_combined.to_csv(output_file, index=False, compression='gzip')  # Save with gzip compression
        # df_combined.to_csv(output_file, index=False)  # Save without compression
        print(f"  - Successfully saved merged and sorted data to: {output_file}")
    except Exception as e:
        print(f"  - Error saving merged data to {output_file}: {e}")

    # Free the combined frame before the next event is loaded.
    del df_combined
    gc.collect()

print("\nData processing for all events completed.")