### Install necessary libraries

In [77]:
# !pip install pandas datasets huggingface_hub pyarrow

__This section loads a dataset from Hugging Face into a pandas DataFrame and adds a column with the restructured (system/prompt) version of each description.__

In [57]:
# Import required libraries
from datasets import load_dataset
import pandas as pd
import json
import re
from typing import Dict, Any, Union, Optional

In [58]:
def load_and_prepare_dataset(
    dataset_name: str,
    json_column: str,
    split: str = 'train'
) -> pd.DataFrame:
    """
    Load one split of a HuggingFace dataset into a pandas DataFrame.

    No parsing is done here; the rows are returned as loaded. The column
    name is only validated so that a typo is reported at load time instead
    of surfacing later as a confusing error during processing.

    Args:
        dataset_name (str): Name of the dataset on HuggingFace (e.g., 'dataset_owner/dataset_name')
        json_column (str): Name of the column that later processing will read;
            it must exist in the requested split
        split (str): Name of the split to convert. Defaults to 'train'.

    Returns:
        pd.DataFrame: DataFrame containing the rows of the requested split

    Raises:
        KeyError: If the split does not exist in the dataset, or if
            json_column is not one of the split's columns
    """
    # Load dataset from HuggingFace (all splits; the one we need is picked below)
    dataset = load_dataset(dataset_name)

    if split not in dataset:
        raise KeyError(
            f"Split {split!r} not found in dataset {dataset_name!r}; "
            f"available splits: {list(dataset)}"
        )
    split_data = dataset[split]

    # Check against the split's schema rather than the DataFrame so that the
    # check also works for a split with zero rows.
    if json_column not in split_data.column_names:
        raise KeyError(
            f"Column {json_column!r} not found in split {split!r}; "
            f"available columns: {split_data.column_names}"
        )

    # Convert to pandas DataFrame
    return pd.DataFrame(split_data)

In [68]:
def extract_system_and_prompt(text: str) -> Dict[str, str]:
    """
    Split a chat-formatted string into its system message and user prompt.

    The first `<<SYS>>...<</SYS>>` section becomes the system message. The
    prompt is whatever remains after every `<<SYS>>` section and the `<s>`,
    `[INST]` and `[/INST]` markers have been removed. Both values are
    whitespace-stripped.

    Args:
        text: Input text, optionally containing a <<SYS>> section

    Returns:
        Dictionary with 'system' and 'prompt' keys; 'system' is an empty
        string when the text has no <<SYS>> section
    """
    # Drop the sequence-start marker before anything else
    cleaned = text.replace('<s>', '').strip()

    # System message: contents of the first <<SYS>> section, if any
    found = re.search(r'<<SYS>>(.*?)<</SYS>>', cleaned, re.DOTALL)
    if found:
        system_text = found.group(1).strip()
    else:
        system_text = ""

    # Prompt: the text with all <<SYS>> sections and instruction markers removed
    remainder = re.sub(r'<<SYS>>.*?<</SYS>>', '', cleaned, flags=re.DOTALL).strip()
    for marker in ('[INST]', '[/INST]'):
        remainder = remainder.replace(marker, '')

    return {"system": system_text, "prompt": remainder.strip()}

In [69]:
def process_column(
    df: pd.DataFrame,
    source_column: str,
    target_column: str = 'parsed_data',
    keys: tuple = ('block_summary', 'detailed_global_summary', 'high_level_global_summary')
) -> pd.DataFrame:
    """
    Restructure a column of prompt dictionaries into system/prompt JSON strings.

    Each cell of the source column is expected to be a dictionary mapping a
    summary name to a chat-formatted string. For every name in `keys` that is
    present in the cell, the string is split with extract_system_and_prompt,
    and the combined result is serialized to an indented JSON string.

    Note: the new column is added to `df` in place; the same DataFrame object
    is returned for convenience.

    Args:
        df: Input DataFrame (modified in place)
        source_column: Name of column containing the source dictionaries
        target_column: Name of column to store the processed JSON strings
        keys: Dictionary keys to extract from each cell; keys missing from a
            cell are skipped

    Returns:
        The input DataFrame with the added column. A cell is None when the
        source value could not be processed (e.g. it is None, or one of the
        selected values is not a string).
    """
    def parse_and_restructure(row_data: Dict) -> Optional[str]:
        """Restructure one cell into system/prompt format, or None on bad input."""
        try:
            result = {
                key: extract_system_and_prompt(row_data[key])
                for key in keys
                if key in row_data
            }
            return json.dumps(result, indent=2)
        except (AttributeError, TypeError):
            # TypeError: the cell is not a container (e.g. None);
            # AttributeError: a selected value is not a string.
            # Either way the row is marked as unparseable instead of aborting.
            return None

    df[target_column] = df[source_column].apply(parse_and_restructure)
    return df

In [70]:
# Configuration
# HuggingFace dataset identifier passed to load_and_prepare_dataset
DATASET_NAME = "GaTech-EIC/MG-Verilog"
# Column holding the raw prompt dictionaries (source column for process_column)
STR_COLUMN = "description"
# Column that will receive the restructured system/prompt JSON strings
PARSED_COLUMN = "parsed_description"

In [71]:
# Load the dataset's 'train' split into a pandas DataFrame; the restructuring happens in a later step
df = load_and_prepare_dataset(DATASET_NAME, STR_COLUMN)

In [72]:
# Split each prompt in STR_COLUMN into system/prompt parts and store the
# result as a JSON string in PARSED_COLUMN
df = process_column(df, STR_COLUMN, PARSED_COLUMN)

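Check the outputs: the first cell below prints the restructured JSON for the first row, and the second prints the original description of the same row for comparison.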

In [75]:
# Show the restructured JSON of the first row (column name taken from the configuration)
print(df[PARSED_COLUMN].iloc[0])

{
  "block_summary": {
    "system": "You only complete chats with syntax correct Verilog code. End the Verilog module code completion with 'endmodule'. Do not include module, input and output definitions.",
    "prompt": "Implement the Verilog module based on the following block level summaries. Assume that signals are positive clock/clk edge triggered unless otherwise stated.\nHere are block level summaries:\n\nblock_0:   This is a Verilog module that implements a data mover for a DMA (Direct Memory Access) controller. It has several inputs and outputs, and it uses a state machine to manage the data transfer between the source and destination.\n\nThe module has several parameters that define the width of various signals, such as ID_WIDTH, S_AXIS_DATA_WIDTH, LENGTH_WIDTH, and BEATS_PER_BURST_WIDTH. These parameters are used to configure the module for a specific application.\n\nThe module has several inputs, including:\n\n* s_axis_aclk: The clock signal for the source axis.\n* s_axis_

In [76]:
# Show the original, unprocessed value of the first row (column name taken from the configuration)
print(df[STR_COLUMN].iloc[0])

{'block_summary': "\n    <s>[INST] <<SYS>>\n    You only complete chats with syntax correct Verilog code. End the Verilog module code completion with 'endmodule'. Do not include module, input and output definitions.\n    <</SYS>>\n\n    Implement the Verilog module based on the following block level summaries. Assume that signals are positive clock/clk edge triggered unless otherwise stated.\nHere are block level summaries:\n\nblock_0:   This is a Verilog module that implements a data mover for a DMA (Direct Memory Access) controller. It has several inputs and outputs, and it uses a state machine to manage the data transfer between the source and destination.\n\nThe module has several parameters that define the width of various signals, such as ID_WIDTH, S_AXIS_DATA_WIDTH, LENGTH_WIDTH, and BEATS_PER_BURST_WIDTH. These parameters are used to configure the module for a specific application.\n\nThe module has several inputs, including:\n\n* s_axis_aclk: The clock signal for the source ax

__This section converts the DataFrame to a Parquet file and uploads it to a Hugging Face dataset repository.__

In [91]:
#Import necessary libraries
from huggingface_hub import HfApi, login
import pyarrow as pa
import pyarrow.parquet as pq
import os

In [85]:
def upload_parquet_to_hf(
    df,
    repo_id: str,
    token: str,
    filename: str = "data.parquet"
) -> None:
    """
    Upload a DataFrame as a parquet file to an existing HuggingFace dataset.

    The parquet file is staged in a temporary directory rather than the
    current working directory, so an existing local file with the same name
    is never overwritten or deleted, and the staged file is removed even
    when the upload fails.

    Args:
        df: pandas DataFrame to upload
        repo_id: Target dataset repository (e.g., 'username/repository-name')
        token: HuggingFace access token, passed to `login`
        filename: Path of the file inside the repository
    """
    import tempfile  # local import: only needed for staging the file

    login(token)
    api = HfApi()

    with tempfile.TemporaryDirectory() as tmp_dir:
        # Only the base name is used locally; the full `filename` is kept as
        # the destination path inside the repository.
        local_path = os.path.join(tmp_dir, os.path.basename(filename))

        # Save DataFrame as parquet
        table = pa.Table.from_pandas(df)
        pq.write_table(table, local_path)

        # Upload to HuggingFace
        api.upload_file(
            path_or_fileobj=local_path,
            path_in_repo=filename,
            repo_id=repo_id,
            repo_type="dataset"
        )
    # The temporary directory (and the staged file) is removed on exit from
    # the `with` block, whether or not the upload succeeded.

In [90]:
#Configuration
# Prefer the HF_TOKEN environment variable so the write token does not have to
# be stored in the notebook; the placeholder is used only when it is unset or empty.
HF_TOKEN = os.environ.get("HF_TOKEN") or "your_write_token"  # Get from HuggingFace
# Target dataset repository in 'owner/name' form
REPO_ID = "username/repository-name"

In [89]:
# Upload the processed DataFrame to the HuggingFace dataset repository as a
# parquet file (default filename "data.parquet")
upload_parquet_to_hf(df=df, repo_id=REPO_ID, token=HF_TOKEN)

data.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]