In [1]:
import os
import pandas as pd
import json

In [2]:
def merge_json_files(data_folder="data", main_file="li_jobs.json"):
    """
    Rebuilds the main JSON file from every other JSON file under the data folder.

    The data folder is walked recursively (the root folder itself and all of its
    subfolders). Every ``*.json`` file except the main file is loaded and its
    records are concatenated; the result then OVERWRITES the main file. Records
    that were previously stored in the main file are not carried over.

    Files that cannot be parsed are reported and skipped so that a single bad
    file does not abort the whole merge.

    Parameters:
    - data_folder (str): The root folder containing the data (default: 'data').
    - main_file (str): The name of the main JSON file to update (default: 'li_jobs.json').

    Returns:
    - None. The merged records are written to ``<data_folder>/<main_file>``.
      If ``data_folder`` does not exist a message is printed and nothing is written.
    """
    main_file_path = os.path.join(data_folder, main_file)

    print(main_file_path)

    # os.walk() silently yields nothing for a missing folder and the final
    # to_json() would then fail with an opaque OSError; report it up front.
    if not os.path.isdir(data_folder):
        print(f"Data folder {data_folder} not found.")
        return

    # Normalised absolute path of the output file, used to make sure the
    # previous merge result is never fed back into the new one.
    main_file_abs = os.path.normcase(os.path.abspath(main_file_path))

    main_data = []

    # Walk through the data folder and its subfolders
    for root, dirs, files in os.walk(data_folder):
        # Sorting in place makes os.walk() descend in a stable order, so the
        # record order (and therefore which duplicate is "first" for any later
        # de-duplication) does not depend on the filesystem.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith(".json"):
                continue

            file_path = os.path.join(root, file)

            # Skip the output file itself. The name check keeps the original
            # behaviour; the path check also covers a main_file that contains
            # a sub-path (e.g. "out/li_jobs.json").
            is_main_file = (
                file == main_file
                or os.path.normcase(os.path.abspath(file_path)) == main_file_abs
            )
            if is_main_file:
                continue

            print(f"Processing file: {file_path}")

            # Load the JSON file
            try:
                # dtype=False / convert_dates=False: keep values exactly as
                # stored. The defaults infer dtypes and parse date-like
                # columns, which rewrites string IDs such as "007" and turns
                # date strings into epoch numbers when written back.
                sub_df = pd.read_json(file_path, dtype=False, convert_dates=False)
                # Append the data to the main list
                main_data.extend(sub_df.to_dict(orient="records"))
            except Exception as e:
                # Best effort: report the unreadable file and keep going.
                print(f"Error reading {file_path}: {e}")

    # Convert back to DataFrame so records with differing keys share one schema
    merged_df = pd.DataFrame(main_data)

    # Save the merged data back to the main file (non-ASCII text kept readable)
    merged_df.to_json(main_file_path, orient="records", indent=2, force_ascii=False)
    print(f"Merged data saved to {main_file_path}. Total records: {len(merged_df)}")

In [3]:
def remove_duplicates_by_poster_id(data_folder="data", main_file="li_jobs.json"):
    """
    Removes duplicate entries in the main JSON file based on the 'Poster Id' column.

    The file is rewritten in place: for every distinct 'Poster Id' only the
    first record (in file order) is kept. Note that pandas treats missing
    values as equal to each other, so records without a 'Poster Id' collapse
    into a single record as well.

    Parameters:
    - data_folder (str): The root folder containing the data (default: 'data').
    - main_file (str): The name of the main JSON file to clean (default: 'li_jobs.json').

    Returns:
    - None. If the main file is missing, or it has no 'Poster Id' column, a
      message is printed and the file is left untouched.
    """
    main_file_path = os.path.join(data_folder, main_file)

    # Check if the main file exists
    if not os.path.exists(main_file_path):
        print(f"Main file {main_file_path} not found.")
        return

    # Load the main file without dtype inference or date parsing so that values
    # (in particular the ID used as the de-duplication key) are compared and
    # written back exactly as stored.
    df = pd.read_json(main_file_path, dtype=False, convert_dates=False)
    initial_count = len(df)
    print(f"Initial number of records: {initial_count}")

    # An empty file (or one produced from differently shaped records) has no
    # key column; drop_duplicates() would raise KeyError, so stop cleanly.
    if "Poster Id" not in df.columns:
        print(f"Column 'Poster Id' not found in {main_file_path}; nothing to deduplicate.")
        return

    # Remove duplicates based on 'Poster Id', keeping the first occurrence
    df_cleaned = df.drop_duplicates(subset="Poster Id", keep="first")
    final_count = len(df_cleaned)
    print(f"Number of records after removing duplicates: {final_count}")
    print(f"Removed {initial_count - final_count} duplicate records.")

    # Save the cleaned data back to the main file. force_ascii=False matches
    # merge_json_files so non-ASCII text is not rewritten as \uXXXX escapes.
    df_cleaned.to_json(main_file_path, orient="records", indent=2, force_ascii=False)
    print(f"Cleaned data saved to {main_file_path}")

In [4]:
# Rebuild data/li_jobs.json (the function defaults) from the other JSON files under data/.
merge_json_files()

data\li_jobs.json
Processing file: data\04-03\jobsDetails_20250304.json
Processing file: data\31-03\jobsDetails_20250331200718.json
Processing file: data\31-03\jobsDetails_20250331202803.json
Processing file: data\31-03\jobsDetails_20250331203727.json
Processing file: data\31-03\jobsDetails_20250331204209.json
Processing file: data\31-03\jobsDetails_20250331205341.json
Processing file: data\31-03\jobsDetails_20250331210404.json
Processing file: data\31-03\jobsDetails_20250331211440.json
Merged data saved to data\li_jobs.json. Total records: 6547


In [5]:
# De-duplicate data/li_jobs.json in place on 'Poster Id'; run after the merge cell above.
remove_duplicates_by_poster_id()

Initial number of records: 6547
Number of records after removing duplicates: 2086
Removed 4461 duplicate records.
Cleaned data saved to data\li_jobs.json
