In [22]:
import zipfile
import pandas as pd
import os
import re

def extract_exp_year(zip_file_path, year, output_file, header_written):
    """
    Append the rows of expXX.csv found in a ZIP file to a combined CSV file.

    The archive is scanned for the first member whose name ends with
    ``exp<YY>.csv`` (``YY`` = last two digits of ``year``). That member is
    read in chunks of 100,000 rows; each chunk gets a ``year`` and a
    ``quarter`` column and is appended to ``output_file``.

    Any exception raised while opening or processing the archive is reported
    with ``print`` and swallowed, so one bad ZIP does not stop a batch run.

    Args:
        zip_file_path (str): Path to the ZIP file.
        year (int): The year (e.g., 2000, 2001, ..., 2023).
        output_file (str): Path to the output CSV file (opened in append mode).
        header_written (bool): Flag indicating if the header has been written.

    Returns:
        bool: Updated header_written status. True once at least one chunk has
        been written (by this call or an earlier one); otherwise the value
        that was passed in.
    """
    # 1. Construct the file name for expXX.csv
    year_str = f"{year % 100:02d}"
    exp_file_name = f"exp{year_str}.csv"
    print(f"  Looking for {exp_file_name} in ZIP")

    try:
        with zipfile.ZipFile(zip_file_path, 'r') as zf:
            # 2. Search for the file within the ZIP archive (first match wins)
            exp_file_path_in_zip = None
            for name in zf.namelist():
                print(f"    Checking file: {name}")
                if name.endswith(exp_file_name):
                    exp_file_path_in_zip = name
                    break

            if exp_file_path_in_zip is None:
                print(f"  File not found in ZIP: {exp_file_name}")
                return header_written

            print(f"  Found at {exp_file_path_in_zip}, Extracting and reading: {exp_file_name}")
            # Open the member in a context manager so its handle is released
            # even when parsing or writing fails part-way through.
            with zf.open(exp_file_path_in_zip) as csv_file:
                # 3. Extract quarter from the file name
                # NOTE(review): the pattern needs four letters, two digits and
                # a quarter digit; a name of the form expXX.csv does not look
                # like it can match, so quarter would stay None -- confirm the
                # intended naming.
                base_name = exp_file_path_in_zip.split('/')[-1]
                match = re.search(r"[a-z]{4}\d{2}([1-4])", base_name, re.IGNORECASE)
                if match:
                    quarter = int(match.group(1))
                    print(f"    Extracted quarter: {quarter}")
                else:
                    quarter = None
                    print("    Quarter not found in filename")

                # Use chunksize to process in smaller parts
                for chunk in pd.read_csv(csv_file, low_memory=False, chunksize=100000):
                    chunk['year'] = year
                    chunk['quarter'] = quarter
                    # Append each chunk; only the very first one carries the header
                    chunk.to_csv(output_file, mode='a', header=not header_written, index=False)
                    header_written = True
                    print(f"    Chunk written to {output_file}")
    except Exception as e:
        # Deliberate best-effort: report the problem and let the caller move on.
        print(f"  Error processing {exp_file_name} from ZIP: {e}")
    return header_written



# Driver script: collects expXX.csv from every intrvwXX.zip in folder_path and
# appends the rows to a single CSV created inside output_dir.

# 1. Set the path to your folder containing the ZIP files
folder_path = r"C:\Users\omyue\OneDrive\Desktop\DATA 205\U.S. Bureau of Labor Statistics – Consumer Expenditure Surveys (CE)\interview_zip"

# 2. Get a list of all ZIP files in the folder
# os.listdir returns names in arbitrary order; sort so years are appended in a
# reproducible sequence.
zip_files = sorted(f for f in os.listdir(folder_path) if f.startswith("intrvw") and f.endswith(".zip"))
print(f"ZIP files found: {zip_files}")

# 3. Change the current working directory BEFORE writing anything, so that the
# relative output_file below is created in output_dir and the location
# reported at the end is the real one. folder_path is absolute and unaffected.
output_dir = r"C:\Users\omyue\OneDrive\Desktop\DATA 205\U.S. Bureau of Labor Statistics – Consumer Expenditure Surveys (CE)"
os.chdir(output_dir)
print(f"Current working directory changed to: {os.getcwd()}")

# 4. Define the output file
output_file = "combined_interview_data.csv"
header_written = False
# The data is written in append mode; start from a clean file so a re-run does
# not duplicate rows or write a second header line into the middle of the CSV.
if os.path.exists(output_file):
    os.remove(output_file)

# 5. Loop through each ZIP file
for zip_file in zip_files:
    file_path = os.path.join(folder_path, zip_file)
    print(f"Processing ZIP file: {zip_file}")
    # Extract the two-digit year from the ZIP file name; the whole name must
    # have the form intrvwXX.zip.
    match = re.fullmatch(r"intrvw(\d{2})\.zip", zip_file)
    if match is None:
        print(f"  Skipping ZIP file {zip_file} because it does not match the expected format (intrvwXX.zip).")
        continue

    year_str = match.group(1)
    print(f"  Attempting to convert year_str: {year_str}")
    # Two-digit pivot: 00-49 -> 2000-2049, 50-99 -> 1950-1999.
    two_digit_year = int(year_str)
    year = two_digit_year + 2000 if two_digit_year < 50 else two_digit_year + 1900
    print(f"  Year: {year}")
    header_written = extract_exp_year(file_path, year, output_file, header_written)

# 6. Report the outcome; header_written is only True if some chunk was written.
if header_written:
    print(f"Successfully combined data to {output_file}")
    print(f"The output file is located in: {os.getcwd()}")
else:
    print(f"No data was written to {output_file}")


ZIP files found: ['intrvw13.zip', 'intrvw14.zip', 'intrvw15.zip', 'intrvw16.zip', 'intrvw17.zip', 'intrvw18.zip', 'intrvw19.zip', 'intrvw20.zip', 'intrvw21.zip', 'intrvw22.zip', 'intrvw23.zip']
Processing ZIP file: intrvw13.zip
  Attempting to convert year_str: 13
  Year: 2013
  Looking for exp13.csv in ZIP
    Checking file: intrvw13/fmli131x.csv
    Checking file: intrvw13/fmli132.csv
    Checking file: intrvw13/fmli133.csv
    Checking file: intrvw13/fmli134.csv
    Checking file: intrvw13/fmli141.csv
    Checking file: intrvw13/itbi131x.csv
    Checking file: intrvw13/itbi132.csv
    Checking file: intrvw13/itbi133.csv
    Checking file: intrvw13/itbi134.csv
    Checking file: intrvw13/itbi141.csv
    Checking file: intrvw13/itii131x.csv
    Checking file: intrvw13/itii132.csv
    Checking file: intrvw13/itii133.csv
    Checking file: intrvw13/itii134.csv
    Checking file: intrvw13/itii141.csv
    Checking file: intrvw13/memi131x.csv
    Checking file: intrvw13/memi132.csv
    Che