In [1]:
import ujson as json
import csv
from tqdm import tqdm

def flatten_json(obj):
    """Collapse nested dictionaries in ``obj`` into one flat dictionary.

    Non-dict values are kept under their own key. When a value is itself a
    dict, its parent key is dropped and its entries are merged into the
    result; this is applied recursively, so dicts nested at any depth end up
    flattened rather than being left behind as dict values.

    Keys are never prefixed with their parent key, so if the same key occurs
    more than once the entry met later in iteration order overwrites the
    earlier one.

    Args:
        obj: A mapping exposing ``.items()`` (e.g. a decoded JSON object).

    Returns:
        A new ``dict`` whose values are all non-dict objects.

    Raises:
        AttributeError: If ``obj`` has no ``.items()`` method (for example
            when a JSON line decodes to a list or a scalar).
    """
    flat = {}
    for key, value in obj.items():
        if isinstance(value, dict):
            # Recurse so deeper levels are flattened too; the parent key is
            # intentionally discarded, matching the one-level behaviour.
            flat.update(flatten_json(value))
        else:
            flat[key] = value
    return flat

In [2]:
def extract_headers(input_file):
    """Collect every column name that appears in a JSON-lines file.

    The file is read one line at a time; each line is decoded as JSON,
    flattened with ``flatten_json``, and its keys are added to a running set.

    Lines that cannot be decoded or flattened are skipped (best effort), but
    they are counted and a one-line summary is printed at the end so data
    loss is visible instead of silent. Blank lines are skipped without being
    counted as failures.

    Args:
        input_file: Path of the UTF-8 encoded, one-JSON-object-per-line file.

    Returns:
        A sorted list of the distinct flattened keys found in the file.
    """
    headers = set()
    skipped = 0
    with open(input_file, 'r', encoding='utf-8') as f:
        for line in tqdm(f, desc="Scanning for headers"):
            if not line.strip():
                continue
            try:
                flat = flatten_json(json.loads(line))
            except Exception:
                # Keep going on bad records, but remember how many were lost.
                skipped += 1
                continue
            headers.update(flat.keys())
    if skipped:
        print(f"extract_headers: skipped {skipped} line(s) that could not be parsed or flattened")
    return sorted(headers)


In [3]:
def write_csv(input_file, output_file, headers):
    """Convert a JSON-lines file to CSV using a fixed set of columns.

    Each input line is decoded as JSON, flattened with ``flatten_json`` and
    written as one CSV row. Keys not listed in ``headers`` are dropped
    (``extrasaction='ignore'``); columns missing from a record are left
    empty by ``csv.DictWriter``.

    Records that fail to decode, flatten or serialise are skipped (best
    effort) and counted; a summary is printed when any were skipped. Blank
    lines are ignored. Operating-system level I/O errors are NOT treated as
    per-record problems and are re-raised, so a failing output device cannot
    silently produce a truncated CSV.

    Args:
        input_file: Path of the UTF-8 encoded JSON-lines input file.
        output_file: Path of the CSV file to create (overwritten if present).
        headers: Sequence of column names, used as the CSV header row.

    Raises:
        OSError: If opening, reading or writing either file fails.
    """
    skipped = 0
    with open(input_file, 'r', encoding='utf-8') as infile, \
         open(output_file, 'w', newline='', encoding='utf-8') as outfile:

        writer = csv.DictWriter(outfile, fieldnames=headers, extrasaction='ignore')
        writer.writeheader()

        for line in tqdm(infile, desc="Writing to CSV"):
            if not line.strip():
                continue
            try:
                writer.writerow(flatten_json(json.loads(line)))
            except OSError:
                # Disk full, closed handle, etc. affect every following row;
                # surface the failure instead of skipping the rest silently.
                raise
            except Exception:
                # Bad record: skip it, but keep a tally for the summary.
                skipped += 1
                continue

    if skipped:
        print(f"write_csv: skipped {skipped} line(s) that could not be converted")

In [4]:

# Source file: read line by line, each line decoded as one JSON document.
# NOTE(review): both paths are absolute and specific to one machine; they
# must be edited before this notebook can be re-run elsewhere.
input_file = "/Users/spartan/Downloads/HotelRec.txt"
# Destination CSV; created (or overwritten) by write_csv in a later cell.
output_file = "/Users/spartan/Desktop/Hotel_Recommendation_System/data/raw/HotelDataset.csv"


# First pass over the input: gather the sorted union of all flattened keys,
# which becomes the CSV column list. The recorded run below iterated over
# 50,264,531 lines in roughly five minutes.
headers = extract_headers(input_file)
print(headers)





Scanning for headers: 50264531it [05:15, 159488.02it/s]

['author', 'business service (e.g., internet access)', 'check in / front desk', 'cleanliness', 'date', 'hotel_url', 'location', 'rating', 'rooms', 'service', 'sleep quality', 'text', 'title', 'ur_question.prompt.11', 'userrating.prompt.190', 'userrating.prompt.46', 'userrating.prompt.48', 'value']





In [5]:
# Second pass over the input: write one CSV row per convertible record,
# using the column list produced by extract_headers in the previous cell.
write_csv(input_file, output_file, headers)

# Printed once write_csv returns; it does not by itself report how many
# records were actually written.
print("Conversion complete!")

Writing to CSV: 50264531it [18:32, 45169.82it/s]

Conversion complete!



