In [3]:
import os
import zipfile
import pandas as pd

# Merge the CSV found inside every .zip archive of `dataset_folder` into a
# single CSV written to `output_folder/output_file`.
dataset_folder = '/content/Dataset'  # folder containing all the .zip files for a particular year
output_folder = 'output'    # folder where the merged file is stored
output_file = 'merged_Jan_2024.csv'

# Create the output folder if it doesn't exist. exist_ok=True avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_folder, exist_ok=True)

# list to store DataFrames
df_list = []

# Iterate over all .zip files in the dataset folder.
# os.listdir() returns entries in arbitrary order, so sort them to make the
# row order of the merged file reproducible from run to run.
for filename in sorted(os.listdir(dataset_folder)):
    # Compare case-insensitively so archives named e.g. "DATA.ZIP" are not skipped.
    if not filename.lower().endswith('.zip'):
        continue

    zip_path = os.path.join(dataset_folder, filename)
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        # Assuming each zip file contains one CSV file: only the first
        # member whose name ends in .csv is read.
        for file in zip_ref.namelist():
            if file.lower().endswith('.csv'):
                # Read the CSV file directly from the zip file (nothing is
                # extracted to disk).
                with zip_ref.open(file) as csv_file:
                    df = pd.read_csv(csv_file)
                    df_list.append(df)
                break  # Assuming only one CSV per zip file
        else:
            # for/else: reached only when the loop finished without `break`,
            # i.e. no member of the archive was a CSV.
            print(f"No CSV file found in {filename}")

# Concatenate all DataFrames
if df_list:
    # ignore_index=True renumbers rows 0..n-1 instead of repeating each
    # source file's own index.
    merged_df = pd.concat(df_list, ignore_index=True)

    # Save the concatenated DataFrame to the output folder
    output_path = os.path.join(output_folder, output_file)
    merged_df.to_csv(output_path, index=False)

    print(f"All CSV files have been merged and saved to {output_path}")
else:
    print("No CSV files were found to merge.")


All CSV files have been merged and saved to output/merged_2024.csv


**Processing:**

The script iterates over each .zip file in the Dataset folder.

It reads the first .csv file inside each zip directly from the archive, without extracting anything to disk.

Each CSV is read into a pandas DataFrame and appended to a list.

All DataFrames in the list are concatenated into a single DataFrame.

The merged DataFrame is saved in the output folder under the name set in `output_file` (here, merged_Jan_2024.csv).