In [1]:
# Import Libraries
import json
import os
import shutil
import pandas as pd

In [2]:
# Define the base working directory
directory = '/Volumes/Upwork/Upwork/Big_ticket/Data/Raw_data/'

In [3]:
# Define the base working directory
base_directory = os.path.join(directory, 'Professional_Cameras')

# Define the folder paths relative to the base directory
input_folder_path = os.path.join(base_directory, 'raw')
output_folder_path = os.path.join(base_directory, 'output_jsons')

# Create the output directory if it doesn't exist
os.makedirs(output_folder_path, exist_ok=True)

# Initialize counters
total_txt_files_processed = 0
total_json_files_created = 0

# Iterate through all files in the input folder
for file_name in os.listdir(input_folder_path):
    if file_name.endswith('.txt'):  # Process only text files
        total_txt_files_processed += 1
        file_path = os.path.join(input_folder_path, file_name)
        
        # Initialize a list to store JSON objects
        json_objects = []

        # Read and process the file
        with open(file_path, 'r') as file:
            for line in file:
                # Attempt to parse each line as JSON
                try:
                    json_object = json.loads(line)
                    json_objects.append(json_object)
                except json.JSONDecodeError:
                    # Handle or log the error if necessary
                    pass

        # Save each JSON object to a separate file with a global counter
        for json_object in json_objects:
            output_file_path = os.path.join(output_folder_path, f'json_object_{total_json_files_created + 1}.json')
            with open(output_file_path, 'w') as outfile:
                json.dump(json_object, outfile)
            
            # Increment the JSON files counter
            total_json_files_created += 1

# Output the summary of processing
print(f'Total number of .txt files processed: {total_txt_files_processed}')
print(f'Total number of JSON files created: {total_json_files_created}')

Total number of .txt files processed: 2
Total number of JSON files created: 376


In [4]:
# Renaming Files

# Define the output folder path where the JSON files are currently stored
output_folder_path = os.path.join(base_directory, 'output_jsons')

# Define the new folder path where renamed JSON files will be stored
renamed_folder_path = os.path.join(base_directory, 'renamed_jsons')

# Create the new directory if it doesn't exist
os.makedirs(renamed_folder_path, exist_ok=True)

# Counters for renaming files based on identifiers
rename_counts = {
    'asins': 0,
    'ean13': 0,
    'upca': 0,
    'id': 0,
}

# Total files processed
total_files = 0

# Iterate through all files in the output folder
for file_name in os.listdir(output_folder_path):
    if file_name.endswith('.json'):  # Process only JSON files
        file_path = os.path.join(output_folder_path, file_name)
        
        # Read and process the JSON file
        with open(file_path, 'r') as file:
            try:
                json_object = json.load(file)

                # Determine the new file name based on the available identifier
                new_file_name = None

                if 'asins' in json_object and json_object['asins']:
                    new_file_name = json_object['asins'] if isinstance(json_object['asins'], str) else json_object['asins'][0]
                    rename_counts['asins'] += 1
                elif 'ean13' in json_object and json_object['ean13']:
                    new_file_name = json_object['ean13']
                    rename_counts['ean13'] += 1
                elif 'upca' in json_object and json_object['upca']:
                    new_file_name = json_object['upca']
                    rename_counts['upca'] += 1
                elif 'id' in json_object and json_object['id']:
                    new_file_name = json_object['id']
                    rename_counts['id'] += 1

                # Copy and rename the file to the new directory if a new name was determined
                if new_file_name:
                    new_file_path = os.path.join(renamed_folder_path, f'{new_file_name}.json')
                    shutil.copy(file_path, new_file_path)
                    total_files += 1

            except json.JSONDecodeError:
                # Handle or log the error if necessary
                pass

# Print the results
print(f'Total files processed: {total_files}')
print(f'Files renamed with "asins": {rename_counts["asins"]}')
print(f'Files renamed with "ean13": {rename_counts["ean13"]}')
print(f'Files renamed with "upca": {rename_counts["upca"]}')
print(f'Files renamed with "id": {rename_counts["id"]}')
print(f'Files remaining in the original folder: {len(os.listdir(output_folder_path))}')
print(f'Files in the new folder: {len(os.listdir(renamed_folder_path))}')

Total files processed: 376
Files renamed with "asins": 21
Files renamed with "ean13": 306
Files renamed with "upca": 0
Files renamed with "id": 49
Files remaining in the original folder: 376
Files in the new folder: 276


In [5]:
# Check for Pricing information if that is present 

# Define the output folder path where the JSON files are currently stored
output_folder_path = os.path.join(base_directory, 'renamed_jsons')

# List of keys to check in the JSON objects
keys_to_check = [
    'mostRecentPriceAmount',
    'mostRecentPriceAvailability',
    'mostRecentPriceByDomain',
    'mostRecentPriceColor',
    'mostRecentPriceCondition',
    'mostRecentPriceCurrency',
    'mostRecentPriceDomain',
    'mostRecentPriceFirstDateSeen',
    'mostRecentPriceIsSale',
    'mostRecentPriceSourceURL',
    'prices'
]

# Initialize counters
files_with_all_keys = 0
files_with_at_least_one_key = 0
files_with_no_keys = 0
key_counts = {key: 0 for key in keys_to_check}  # Dictionary to count each key's occurrences

# Iterate through all files in the output folder
for file_name in os.listdir(output_folder_path):
    if file_name.endswith('.json'):  # Process only JSON files
        file_path = os.path.join(output_folder_path, file_name)
        
        # Read and process the JSON file
        with open(file_path, 'r') as file:
            try:
                json_object = json.load(file)
                
                # Check if the JSON object has all of the specified keys
                if all(key in json_object for key in keys_to_check):
                    files_with_all_keys += 1
                    # Also count occurrences of each key
                    for key in keys_to_check:
                        key_counts[key] += 1
                else:
                    # Check if at least one key is present
                    found_key = False
                    for key in keys_to_check:
                        if key in json_object:
                            key_counts[key] += 1
                            found_key = True
                    if found_key:
                        files_with_at_least_one_key += 1
                    else:
                        files_with_no_keys += 1

            except json.JSONDecodeError:
                # Handle or log the error if necessary
                pass

# Calculate total files processed
total_files = files_with_all_keys + files_with_at_least_one_key + files_with_no_keys

# Determine the most common key
most_common_key = max(key_counts, key=key_counts.get)
most_common_key_count = key_counts[most_common_key]

# Print the results
print(f'Total files processed: {total_files}')
print(f'Files with all specified keys: {files_with_all_keys}')
print(f'Files with at least one specified key (but not all): {files_with_at_least_one_key}')
print(f'Files with none of the specified keys: {files_with_no_keys}')

# Print the count of files containing each specific key
print('Number of files containing each specified key:')
for key, count in key_counts.items():
    print(f'  {key}: {count}')


Total files processed: 276
Files with all specified keys: 4
Files with at least one specified key (but not all): 272
Files with none of the specified keys: 0
Number of files containing each specified key:
  mostRecentPriceAmount: 178
  mostRecentPriceAvailability: 65
  mostRecentPriceByDomain: 26
  mostRecentPriceColor: 18
  mostRecentPriceCondition: 81
  mostRecentPriceCurrency: 178
  mostRecentPriceDomain: 176
  mostRecentPriceFirstDateSeen: 176
  mostRecentPriceIsSale: 35
  mostRecentPriceSourceURL: 26
  prices: 276


In [6]:
# Define the input and output folder paths relative to the base directory
input_folder_path = os.path.join(base_directory, 'renamed_jsons')

# Define paths for in_stock and out_stock folders
in_stock_folder_path = os.path.join(base_directory, 'in_stock')
out_stock_folder_path = os.path.join(base_directory, 'out_stock')

# Create the in_stock and out_stock directories if they don't exist
os.makedirs(in_stock_folder_path, exist_ok=True)
os.makedirs(out_stock_folder_path, exist_ok=True)

# Initialize counters for in_stock and out_stock files
in_stock_count = 0
out_stock_count = 0

# Iterate through all JSON files in the input folder
for file_name in os.listdir(input_folder_path):
    if file_name.endswith('.json'):  # Process only JSON files
        file_path = os.path.join(input_folder_path, file_name)
        
        # Load the JSON object from the file
        with open(file_path, 'r') as file:
            json_data = json.load(file)
        
        # Check if the key 'mostRecentPriceAvailability' exists
        if 'mostRecentPriceAvailability' in json_data:
            # Get the 'id' value to rename the file
            file_id = json_data.get('id', None)
            
            # Define the new file name and path for in_stock
            new_file_name = f"{file_id}.json" if file_id else file_name
            new_file_path = os.path.join(in_stock_folder_path, new_file_name)
            
            # Save the file in the in_stock folder
            with open(new_file_path, 'w') as new_file:
                json.dump(json_data, new_file)
            
            # Increment the in_stock counter
            in_stock_count += 1
        
        else:
            # Get the 'id' value to rename the file
            file_id = json_data.get('id', None)
            
            # Define the new file name and path for out_stock
            new_file_name = f"{file_id}.json" if file_id else file_name
            new_file_path = os.path.join(out_stock_folder_path, new_file_name)
            
            # Save the file in the out_stock folder
            with open(new_file_path, 'w') as new_file:
                json.dump(json_data, new_file)
            
            # Increment the out_stock counter
            out_stock_count += 1

# Output the totals for in_stock and out_stock files
print(f"Total in_stock files: {in_stock_count}")
print(f"Total out_stock files: {out_stock_count}")
print (f"Total files: {in_stock_count + out_stock_count}")

Total in_stock files: 65
Total out_stock files: 211
Total files: 276


In [7]:
# List jsons with the pricing history for instock products

# Define the output folder path where the JSON files are currently stored
output_folder_path = os.path.join(base_directory, 'in_stock')

# List to store the names of JSON files containing 'mostRecentPriceSourceURL'
files_with_prices = []

# Iterate through all files in the output folder
for file_name in os.listdir(output_folder_path):
    if file_name.endswith('.json'):  # Process only JSON files
        file_path = os.path.join(output_folder_path, file_name)
        
        # Read and process the JSON file
        with open(file_path, 'r') as file:
            try:
                json_object = json.load(file)
                
                # Check if the JSON object has the key 'mostRecentPriceSourceURL'
                if 'prices' in json_object:
                    files_with_prices.append(file_name)

            except json.JSONDecodeError:
                # Handle or log the error if necessary
                pass

# Print the list of files containing 'mostRecentPriceAmount'
print(f'Total files with "no pricing history": {in_stock_count - len(files_with_prices)}')



Total files with "no pricing history": 0


In [9]:
#list of jsons with the ids key for instock products

# Define the output folder path where the JSON files are currently stored
output_folder_path = os.path.join(base_directory, 'in_stock')

# List to store the names of JSON files containing 'id'
files_with_ids = []

# Iterate through all files in the output folder
for file_name in os.listdir(output_folder_path):
    if file_name.endswith('.json'):  # Process only JSON files
        file_path = os.path.join(output_folder_path, file_name)
        
        # Read and process the JSON file
        with open(file_path, 'r') as file:
            try:
                json_object = json.load(file)
                
                # Check if the JSON object has the key 'id'
                if 'id' in json_object:
                    files_with_ids.append(file_name)

            except json.JSONDecodeError:
                # Handle or log the error if necessary
                pass

# Print the list of files containing 'id'
print(f'Total files with "id": {len(files_with_ids)}')

Total files with "id": 65
