# Handling the calendar files from Inside Airbnb
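> The calendar files are very large, so they are read and filtered in chunks instead of being loaded into memory whole.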

In [1]:
import pandas as pd
import glob
import os
from datetime import datetime

def merge_and_filter_csv(input_path, output_file, start_date, end_date):
    """Merge the CSV files in a directory into one CSV, keeping a date range.

    Every ``*.csv`` file directly inside ``input_path`` is read in chunks, its
    ``date`` column is parsed (unparseable values become NaT and are dropped
    by the range filter), and rows with ``start_date <= date <= end_date`` are
    written to ``output_file``.

    Each call produces a fresh ``output_file``: an existing file at that path
    is overwritten on the first write instead of being appended to, so
    re-running the function does not duplicate rows.

    Args:
        input_path: Directory containing the CSV files to merge.
        output_file: Path of the merged CSV to write. Its parent directory is
            created if it does not exist.
        start_date: Inclusive lower bound, anything ``pd.to_datetime`` accepts.
        end_date: Inclusive upper bound, anything ``pd.to_datetime`` accepts.

    Returns:
        None. Progress and per-file errors are reported with ``print``.

    Note:
        A file that raises while being read is reported and skipped; rows
        already buffered from it before the error are still written.
    """
    # Convert date strings to datetime objects
    start_date = pd.to_datetime(start_date)
    end_date = pd.to_datetime(end_date)

    # Get all CSV files in the input directory, sorted for consistent
    # ordering. The output file itself is excluded so that writing into the
    # input directory never feeds earlier output back in as input.
    output_key = os.path.normcase(os.path.abspath(output_file))
    all_files = sorted(
        path
        for path in glob.glob(os.path.join(input_path, "*.csv"))
        if os.path.normcase(os.path.abspath(path)) != output_key
    )
    print(f"Found {len(all_files)} CSV files")

    chunk_size = 100000  # Adjust this value based on your available memory
    flush_threshold = 1e9  # Flush buffered rows to disk past ~1GB in memory

    chunks = []
    buffered_bytes = 0  # Running total, so the buffer is not re-measured per chunk
    wrote_output = False

    def flush():
        """Write the buffered chunks to the output file and reset the buffer."""
        nonlocal chunks, buffered_bytes, wrote_output
        if not wrote_output:
            out_dir = os.path.dirname(output_file)
            if out_dir:
                os.makedirs(out_dir, exist_ok=True)
        # The first write truncates any stale file and emits the header;
        # later writes append data rows only.
        pd.concat(chunks).to_csv(
            output_file,
            mode='a' if wrote_output else 'w',
            index=False,
            header=not wrote_output,
        )
        wrote_output = True
        chunks = []
        buffered_bytes = 0

    # Process each file
    for filename in all_files:
        print(f"Processing {filename}")

        try:
            # Read the CSV file in chunks
            for chunk in pd.read_csv(filename, chunksize=chunk_size):
                # Convert 'date' column to datetime
                chunk['date'] = pd.to_datetime(chunk['date'], errors='coerce')

                # Filter the chunk based on the date range
                filtered_chunk = chunk[(chunk['date'] >= start_date) & (chunk['date'] <= end_date)]
                if filtered_chunk.empty:
                    continue

                chunks.append(filtered_chunk)
                buffered_bytes += filtered_chunk.memory_usage(deep=True).sum()

                if buffered_bytes > flush_threshold:
                    flush()

        except Exception as e:
            # Best effort: report the failing file and move on to the next one
            print(f"Error processing {filename}: {str(e)}")
            continue

    # Write any remaining chunks to the output file
    if chunks:
        flush()

    if wrote_output:
        print(f"Merged and filtered data saved to {output_file}")
    else:
        print(f"No rows matched the date range; nothing written to {output_file}")

# Example usage: merge the calendar snapshots under the input directory,
# keeping only rows dated within the window below (both ends inclusive).
start_date = '2024-01-01'
end_date = '2024-09-06'
input_path = 'raw_data/calendar (BIG)/'
output_file = 'raw_data/merged/calendar_filtered_plus_plus.csv'

merge_and_filter_csv(
    input_path=input_path,
    output_file=output_file,
    start_date=start_date,
    end_date=end_date,
)

Found 4 CSV files
Processing raw_data/calendar (BIG)\10th_june_calendar.csv
Processing raw_data/calendar (BIG)\12th_dec_calendar.csv
Processing raw_data/calendar (BIG)\16th_march_calendar.csv
Processing raw_data/calendar (BIG)\6th_sept_calendar.csv
Merged and filtered data saved to raw_data/merged/calendar_filtered_plus_plus.csv
