This is the best code


In [1]:
import re

def parse_markdown_corrected(md_content):
    """
    Parse the markdown content to extract relevant data, with corrected handling for price conversion.
    """
    # Extracting different sections using regular expressions
    manufacturer = re.search(r'# Manufacturer: (.+)', md_content)
    product = re.search(r'## Product: (.+)', md_content)
    features = re.findall(r'- Feature \d+: (.+)', md_content)
    description = re.search(r'\*\*Description:\*\*\n(.+)', md_content)
    specifications = re.findall(r'- (.+): (.+)', md_content)
    benefits = re.findall(r'- Benefit \d+: (.+)', md_content)
    price = re.search(r'\*\*Price:\*\*\n\$(.+)', md_content)

    # Extracting data and handling missing data
    data = {
        'Manufacturer': manufacturer.group(1) if manufacturer else None,
        'Product': product.group(1) if product else None,
        'Features': ', '.join(features) if features else None,
        'Description': description.group(1).strip() if description else None,
        'Specifications': dict(specifications) if specifications else None,
        'Benefits': ', '.join(benefits) if benefits else None,
        'Price': float(price.group(1).replace(',', '')) if price else None
    }
    return data


In [2]:
import os
import pandas as pd

# Directory containing markdown files
directory = "../data/sample"

# List to store data from each file
all_data = []

# Iterate over each file in the directory
for filename in os.listdir(directory):
    if filename.endswith(".md"):
        file_path = os.path.join(directory, filename)

        # Read the content of the file
        with open(file_path, 'r') as file:
            content = file.read()

        # Parse the content and add to the list
        parsed_data = parse_markdown_corrected(content)
        all_data.append(parsed_data)

# Creating a DataFrame from the list
df = pd.DataFrame(all_data)

In [3]:
# Display rows of the DataFrame
df.head(2)

Unnamed: 0,Manufacturer,Product,Features,Description,Specifications,Benefits,Price
0,XYZ Medical Devices,pcare room-server,"High-resolution display, Touchscreen interface...",This is a description of the pcare room-server...,"{'Feature 1': 'High-resolution display', 'Feat...","Improved patient care, Ease of use",1999.99
1,eVideon,evideon bright light,"High-resolution display, Touchscreen interface...",This is a description of the pcare room-server...,"{'Feature 1': 'High-resolution display', 'Feat...","Improved patient care, Ease of use",1999.99


In [4]:
from datasets import Dataset
import tqdm as notebook_tqdm

# Assuming 'df' is your Pandas DataFrame
hf_dataset = Dataset.from_pandas(df)

In [5]:
# Assuming 'hf_dataset' is your Hugging Face Dataset
first_row = hf_dataset[0]

# Display the first row
print(first_row)

{'Manufacturer': 'XYZ Medical Devices', 'Product': 'pcare room-server', 'Features': 'High-resolution display, Touchscreen interface, Compatibility with medical equipment, Personalized Content Recommendations: Receive tailored content suggestions., Ambient Lighting Control: Adjust room lighting for a relaxing atmosphere., Bedside Control Panel: Conveniently control the system from the bedside., Video Conferencing Capability: Stay connected with loved ones and healthcare providers., Multilingual Support: Enjoy content and interface options in multiple languages.', 'Description': 'This is a description of the pcare room-server product.', 'Specifications': {'Benefit 1': 'Improved patient care', 'Benefit 2': 'Ease of use', 'Dimensions': '10" x 12" x 3"', 'Feature 1': 'High-resolution display', 'Feature 2': 'Touchscreen interface', 'Feature 3': 'Compatibility with medical equipment', 'Feature 4': None, 'Feature 5: Medical Information Display': None, 'Feature 5: Personalized Content Recommend

In [7]:
# Write DataFrame to JSONL
df.to_json('../data/huggingface/patient-experience-solutions.jsonl', orient='records', lines=True)