In [2]:
import json
import os
from tqdm import tqdm  # Corrected from tqdn to tqdm

def sample_json(input_file, output_file, target_size_gb, filter_key='also_buy'):
    """Copy records with a truthy ``filter_key`` from a JSON Lines file until a size target is hit.

    Each non-blank line of ``input_file`` is parsed as one JSON document. Records
    for which ``record.get(filter_key)`` is truthy are re-serialised with
    ``json.dumps`` and written to ``output_file`` (which is overwritten), one
    record per line.

    Args:
        input_file: Path of the UTF-8 source file, one JSON object per line.
        output_file: Path of the UTF-8 file to create/overwrite.
        target_size_gb: Target size in units of 1024**3 bytes. Reading stops
            after the first processed line at which the running total of
            written bytes is greater than or equal to this target.
        filter_key: Key that must map to a truthy value for a record to be kept.

    Returns:
        None. A one-line summary with the written size is printed.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    target_size_bytes = target_size_gb * 1024 ** 3
    current_size_bytes = 0

    with open(input_file, 'r', encoding='utf-8') as infile, \
            open(output_file, 'w', encoding='utf-8') as outfile:
        for line in tqdm(infile):
            # Blank lines (e.g. a trailing newline at end of file) carry no
            # record and would make json.loads raise.
            if not line.strip():
                continue

            record = json.loads(line)

            if record.get(filter_key):
                serialized = json.dumps(record) + '\n'
                outfile.write(serialized)
                # Count what is actually written, not the input line:
                # json.dumps changes separator spacing and escapes non-ASCII
                # characters, so the two lengths generally differ.
                current_size_bytes += len(serialized.encode('utf-8'))

            if current_size_bytes >= target_size_bytes:
                break

    print(f"Finished sampling. Output size: {current_size_bytes / 1024**3:.2f} GB")



# Draw a 0.5 GiB sample of records that have a non-empty 'also_buy' field.
sample_json(
    input_file='data.json',
    output_file='data_.5.json',
    target_size_gb=0.5,
)



0it [00:00, ?it/s]

451921it [00:21, 20749.64it/s]

Finished sampling. Output size: 0.50 GB





In [3]:
import pandas as pd

filename = 'data_.5.json'

try:
    # Read the sampled file into a DataFrame. lines=True tells pandas the file
    # holds one JSON object per line rather than a single JSON document.
    # NOTE(review): the whole file is loaded even though only head() is shown.
    df = pd.read_json(filename, lines=True)  # Use lines=True if your JSON objects are separated by new lines
    print(df.head())  # Show the first few rows of the DataFrame
except ValueError as e:
    # Only ValueError is handled here; any other exception propagates.
    # When this branch runs, `df` is not assigned by this cell.
    print("Error reading JSON:", e)



                                            category tech1  \
0  [Clothing, Shoes & Jewelry, Men, Clothing, Jea...         
1  [Clothing, Shoes & Jewelry, Women, Accessories...         
2  [Clothing, Shoes & Jewelry, Women, Clothing, L...         
3  [Clothing, Shoes & Jewelry, Women, Clothing, D...         
4  [Clothing, Shoes & Jewelry, Women, Clothing, L...         

                                         description  \
0  [<b>pant size(Unit:inch)</b><br> W30(tag30) Wa...   
1  [Feature <br> -Great quality winter scarf. <br...   
2  [Material : Core-spun fabric silk <br> feature...   
3  [Material : Core-spun fabric silk <br> feature...   
4  [Material : Core-spun fabric silk <br> feature...   

                                                 fit  \
0   class="a-normal a-align-center a-spacing-smal...   
1                                                      
2   class="a-normal a-align-center a-spacing-smal...   
3   class="a-normal a-align-center a-spacing-smal...   
4         

In [4]:
import json

def clean_text(text):
    """Normalise whitespace in a string.

    Leading/trailing whitespace is stripped and every run of whitespace
    (spaces, tabs, newlines, carriage returns, ...) is collapsed to a single
    space. Anything that is not a ``str`` yields ``None``.
    """
    if not isinstance(text, str):
        return None
    # str.split() with no arguments already splits on any whitespace,
    # including '\n' and '\r', and discards empty pieces.
    words = text.split()
    return ' '.join(words)

def preprocess_data(item):
    """Reduce one raw product record to a small, cleaned dictionary.

    Args:
        item: Mapping parsed from one JSON line of the product file.

    Returns:
        dict with the keys ``asin``, ``title``, ``features``, ``description``,
        ``brand`` and ``categories``:

        * ``asin`` is copied unchanged (``None`` when absent).
        * ``title`` and ``brand`` are whitespace-normalised via ``clean_text``
          (``None`` when absent or not a string).
        * ``features`` is the ``feature`` list with each element passed through
          ``clean_text``, or ``None`` when ``feature`` is not a list.
        * ``description`` accepts either a string or a list of strings; list
          parts are cleaned and joined with single spaces. ``None`` when
          nothing usable is present.
        * ``categories`` is the ``categories`` list if present, otherwise the
          ``category`` list, otherwise ``None``.
    """
    raw_features = item.get("feature")
    features = (
        [clean_text(f) for f in raw_features]
        if isinstance(raw_features, list)
        else None
    )

    # The data previewed in this notebook stores `description` as a list of
    # fragments; passing that list to clean_text would always give None.
    raw_description = item.get("description")
    if isinstance(raw_description, list):
        parts = [clean_text(part) for part in raw_description]
        description = ' '.join(p for p in parts if p) or None
    else:
        description = clean_text(raw_description)

    # The previewed data names this column `category`; keep `categories` as
    # the preferred key so records that use it behave as before.
    categories = item.get("categories")
    if not isinstance(categories, list):
        categories = item.get("category")
        if not isinstance(categories, list):
            categories = None

    return {
        "asin": item.get("asin"),
        "title": clean_text(item.get("title")),
        "features": features,
        "description": description,
        "brand": clean_text(item.get("brand")),
        "categories": categories,
    }

def process_batches(input_file, output_file, batch_size=1000):
    """Preprocess a JSON Lines file and write the de-duplicated result.

    Every non-blank line of ``input_file`` is parsed as JSON and passed to
    ``preprocess_data``. A cleaned record is kept only if it has a truthy
    ``asin`` and ``title`` and its ``asin`` has not been seen before in this
    run. Kept records are written to ``output_file`` (overwritten) as one JSON
    object per line with ``ensure_ascii=False``.

    Args:
        input_file: Path of the UTF-8 source file, one JSON object per line.
        output_file: Path of the UTF-8 file to create/overwrite.
        batch_size: Number of kept records to accumulate before they are
            written out.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    def write_records(records, sink):
        # One JSON object per line; non-ASCII characters are written as-is.
        for record in records:
            json.dump(record, sink, ensure_ascii=False)
            sink.write('\n')

    pending = []
    seen_asins = set()  # asin values already accepted, for de-duplication

    with open(input_file, 'r', encoding='utf-8') as infile, \
            open(output_file, 'w', encoding='utf-8') as outfile:
        for line in infile:
            # Blank lines carry no record and would make json.loads raise.
            if not line.strip():
                continue

            cleaned = preprocess_data(json.loads(line))
            asin = cleaned['asin']
            if asin and cleaned['title'] and asin not in seen_asins:
                pending.append(cleaned)
                seen_asins.add(asin)

            if len(pending) >= batch_size:
                write_records(pending, outfile)
                pending = []

        # Write whatever is left after the last full batch.
        write_records(pending, outfile)

# Settings
input_file = 'data_.5.json'
output_file = 'preprocessed_updated_5.json'
batch_size = 1000  # Define the batch size as required

# Execute batch processing
process_batches(input_file, output_file, batch_size)

# Report success only after processing has finished without raising;
# previously this message was printed before the work had even started.
print("Batch preprocessing completed successfully!")


Batch preprocessing completed successfully!
