## Setup

In [None]:
!pip install -qq datasets colorama markdown2 weasyprint PyPDF2

In [None]:
import datasets
from tqdm.auto import tqdm
from datasets import Dataset, Features, Value, ClassLabel, load_dataset
# from vllm import LLM, SamplingParams
from google.colab import userdata
HF_TOKEN=userdata.get('HF_TOKEN')
import json
from colorama import init, Fore, Back, Style
import html
import textwrap
from IPython.display import HTML, display
import numpy as np
import markdown2
import re
import pandas as pd
import os

In [None]:
# Notebook-wide pandas display configuration.

# Show up to 100 columns and 100 rows before pandas elides the middle.
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

# Never truncate the text inside a cell (long conversation strings stay whole).
pd.options.display.max_colwidth = None

# Render floats with three digits after the decimal point.
pd.options.display.float_format = lambda x: '%.3f' % x

# Allow wide tables before pandas wraps them onto several lines.
pd.options.display.width = 1000

# Default precision used when no explicit float formatter applies.
pd.set_option('display.precision', 3)

## Functions

In [None]:
from pathlib import Path
from weasyprint import HTML as weasyHTML
from PyPDF2 import PdfMerger
import tempfile

def html_to_pdf(directory_path, output_filename):
    """Convert every ``*.html`` file in a directory to PDF and merge them.

    Files are processed in the sorted order of their paths. Each one is
    rendered to an intermediate PDF inside a temporary directory, appended to
    a single merger, and the combined document is written to
    ``output_filename``.

    Args:
        directory_path: Directory that is searched (non-recursively) for
            ``*.html`` files.
        output_filename: Path of the merged PDF to write.
    """
    # Create a temporary directory for intermediate PDFs
    with tempfile.TemporaryDirectory() as temp_dir:
        merger = PdfMerger()
        try:
            # Convert each HTML file to PDF and add it to the merger
            for html_file in sorted(Path(directory_path).glob('*.html')):
                temp_pdf = Path(temp_dir) / f"{html_file.stem}_temp.pdf"
                weasyHTML(filename=str(html_file)).write_pdf(str(temp_pdf))
                merger.append(str(temp_pdf))

            # Write the combined PDF
            merger.write(output_filename)
        finally:
            # Close the merger even when rendering or writing fails, and do
            # so before the temporary directory (and its PDFs) is removed.
            merger.close()

In [None]:
class ConversationPrettifier:
    """Render chat conversations as styled HTML, optionally side by side.

    A conversation is a list of turn dictionaries. Each turn is read with the
    keys ``role`` and ``content`` and, for the metadata line, the optional
    keys ``toxic``, ``language`` and ``finish_reason``. Turn content is
    treated as markdown that may contain fenced code blocks and LaTeX math.
    """

    def __init__(self, wrap_width=80):
        """Set up colors, symbols, CSS and the MathJax configuration.

        Args:
            wrap_width: Maximum line width used by ``wrap_text``.
        """
        self.wrap_width = wrap_width

        # Define default color schemes for different roles
        self.role_colors = {
            'user': '#2ECC71',      # Green
            'assistant': '#3498DB',  # Blue
            'system': '#F1C40F'     # Yellow
        }

        # Define symbols for different roles
        self.role_symbols = {
            'user': '👤',
            'assistant': '🤖',
            'system': '⚙️'
        }

        # CSS styles for the output, now including MathJax styling
        self.css = '''
        <style>
            .comparison-container {
                display: flex;
                gap: 20px;
                margin: 20px 0;
                width: 100%;
            }
            .conversation-column {
                flex: 1;
                min-width: 0;  /* Allows proper flex shrinking */
                border-radius: 8px;
                padding: 15px;
            }
            .column-header {
                font-size: 1.2em;
                font-weight: bold;
                margin-bottom: 15px;
                text-align: center;
                padding: 5px;
                border-radius: 4px;
            }
            .conversation-turn {
                margin: 20px 0;
                font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
            }
            .turn-header {
                font-weight: bold;
                margin-bottom: 10px;
                font-size: 1.1em;
            }
            .turn-content {
                margin-left: 20px;
            }
            .metadata {
                color: #666;
                font-size: 0.9em;
            }
            .toxic-warning {
                color: #E74C3C;
            }
            .turn-content pre {
                background-color: #f5f5f5;
                padding: 10px;
                border-radius: 4px;
                margin: 10px 0;
                overflow-x: auto;
            }
            .turn-content code {
                background-color: #f5f5f5;
                padding: 2px 4px;
                border-radius: 3px;
                font-family: Monaco, Menlo, Consolas, "Courier New", monospace;
            }
            .turn-content blockquote {
                border-left: 4px solid #ccc;
                margin: 10px 0;
                padding-left: 10px;
                color: #666;
            }
            .turn-content ul, .turn-content ol {
                margin: 10px 0;
                padding-left: 30px;
            }
            .turn-content p {
                margin: 10px 0;
                line-height: 1.5;
            }
            .turn-content h1, .turn-content h2, .turn-content h3,
            .turn-content h4, .turn-content h5, .turn-content h6 {
                margin: 15px 0 10px 0;
                font-weight: bold;
            }
            .turn-content table {
                border-collapse: collapse;
                margin: 10px 0;
                width: 100%;
            }
            .turn-content th, .turn-content td {
                border: 1px solid #ddd;
                padding: 8px;
                text-align: left;
            }
            .turn-content th {
                background-color: #f5f5f5;
            }
            .shared-prompt {
                background-color: #f8f9fa;
                padding: 15px;
                border-radius: 8px;
                margin: 20px 0;
            }
            /* LaTeX math styling */
            .math-inline {
                display: inline-block;
                margin: 0 0.25em;
            }
            .math-display {
                display: block;
                margin: 1em 0;
                text-align: center;
            }
        </style>
        '''

        # MathJax configuration and script tags
        self.mathjax_config = '''
        <script>
            window.MathJax = {
                tex: {
                    inlineMath: [['$', '$'], ['\\\\(', '\\\\)']],
                    displayMath: [['$$', '$$'], ['\\\\[', '\\\\]']],
                    processEscapes: true,
                    processEnvironments: true
                },
                options: {
                    ignoreHtmlClass: 'tex2jax_ignore',
                    processHtmlClass: 'tex2jax_process'
                }
            };
        </script>
        <script id="MathJax-script" async src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
        '''

    def clean_content(self, content):
        """Clean and prepare content for markdown and LaTeX rendering.

        Args:
            content: Raw turn text, or ``None``.

        Returns:
            str: ``""`` for ``None``; otherwise the text with literal ``\\n``
            sequences turned into newlines, HTML entities unescaped, fenced
            code blocks converted to ``<pre><code>`` and math expressions
            wrapped in HTML containers.
        """
        if content is None:
            return ""

        # NOTE(review): this replaces every backslash + "n" pair, so LaTeX
        # commands that begin with "\n" (e.g. \nabla) are altered as well.
        content = content.replace('\\n', '\n')
        content = html.unescape(content)
        content = self.preserve_code_blocks(content)
        content = self.preserve_latex_math(content)

        return content

    def preserve_latex_math(self, content):
        """Wrap LaTeX math expressions in HTML containers.

        Three notations are recognised, in priority order: display math in
        ``\\[ ... \\]``, display math in ``$$ ... $$`` and inline math in
        ``$ ... $`` (a ``$`` directly next to a digit is not treated as a math
        delimiter, so prices such as ``$5`` are left alone). A match that
        overlaps an expression accepted by an earlier rule is skipped.

        Args:
            content (str): The input text containing LaTeX math expressions.

        Returns:
            str: Text with LaTeX expressions wrapped in appropriate HTML tags.
        """
        # (pattern, wrapper template, is_display), most specific first.
        # NOTE(review): the ``$$`` template emits single-dollar delimiters
        # inside the display container; confirm that this is intended.
        rules = (
            (r'\\\[(.*?)\\\]',
             '<div class="math-display">\\[{math}\\]</div>', True),
            (r'\$\$(.*?)\$\$',
             '<div class="math-display">${math}$</div>', True),
            (r'(?<!\$)\$(?![\d\$])([^$]+?)(?<![\d\$])\$(?!\$)',
             '<span class="math-inline">${math}$</span>', False),
        )

        # Accepted replacements as (start, end, html). All offsets refer to
        # the original string, so they stay valid no matter how long the
        # replacement text is.
        accepted = []

        for pattern, wrapper_template, is_display in rules:
            for match in re.finditer(pattern, content, re.DOTALL):
                start, end = match.span()
                if any(start < taken_end and taken_start < end
                       for taken_start, taken_end, _ in accepted):
                    continue

                math_content = match.group(1)
                if is_display:
                    # Join the lines of display math with single spaces
                    math_content = ' '.join(
                        line.strip() for line in math_content.split('\n'))
                else:
                    math_content = math_content.strip()

                accepted.append(
                    (start, end, wrapper_template.format(math=math_content)))

        # Rebuild the text left to right from untouched slices and wrappers.
        pieces = []
        cursor = 0
        for start, end, replacement in sorted(accepted):
            pieces.append(content[cursor:start])
            pieces.append(replacement)
            cursor = end
        pieces.append(content[cursor:])

        return ''.join(pieces)

    def preserve_code_blocks(self, content):
        """Convert fenced code blocks into escaped ``<pre><code>`` HTML.

        Args:
            content (str): Text that may contain triple-backtick fences.

        Returns:
            str: The text with each fenced block replaced by HTML whose code
            body is HTML-escaped and whose class is ``language-<lang>``.
        """
        def replace_code_block(match):
            code = match.group(2)
            lang = match.group(1) if match.group(1) else ''
            return f'<pre><code class="language-{lang}">{html.escape(code)}</code></pre>'

        content = re.sub(r'```(\w*)\n(.*?)```', replace_code_block, content, flags=re.DOTALL)
        return content

    def format_metadata(self, turn):
        """Format the metadata section of a conversation turn.

        Args:
            turn (dict): A conversation turn.

        Returns:
            str: The truthy ``toxic``, ``language`` and ``finish_reason``
            entries joined with ``" | "``, or ``""`` when none is present.
        """
        metadata = []

        if turn.get('toxic'):
            metadata.append('<span class="toxic-warning">⚠️ Toxic</span>')
        if turn.get('language'):
            metadata.append(f"Lang: {turn['language']}")
        if turn.get('finish_reason'):
            metadata.append(f"Finish: {turn['finish_reason']}")

        return ' | '.join(metadata) if metadata else ''

    def wrap_text(self, text):
        """Wrap text while preserving markdown and LaTeX structure.

        Lines that contain ``$`` or that start with indentation, a list
        marker, a blockquote marker, ``#`` or a code fence are kept as they
        are, as are blank lines. Every other line is wrapped to
        ``self.wrap_width`` without breaking long words.

        Args:
            text (str): Text to wrap.

        Returns:
            str: The wrapped text.
        """
        def should_wrap(line):
            # Don't wrap lines containing LaTeX math
            if '$' in line:
                return False
            return not (line.startswith(('    ', '\t', '- ', '* ', '1. ', '> ', '#', '```')))

        lines = text.split('\n')
        wrapped_lines = []

        for line in lines:
            if should_wrap(line) and len(line.strip()) > 0:
                wrapped = textwrap.fill(line, width=self.wrap_width,
                                      break_long_words=False,
                                      replace_whitespace=False)
                wrapped_lines.append(wrapped)
            else:
                wrapped_lines.append(line)

        return '\n'.join(wrapped_lines)

    def format_conversation(self, conversation_data, color_scheme=None):
        """Format a single conversation with optional custom colors.

        Args:
            conversation_data: Iterable of turn dictionaries.
            color_scheme (dict | None): Optional role -> color overrides.
                Roles missing from it fall back to ``self.role_colors`` and
                then to ``'#666666'``.

        Returns:
            str: Concatenated HTML for all turns.
        """
        html_parts = []

        for turn in conversation_data:
            role = turn.get('role', 'unknown')
            default_color = self.role_colors.get(role, '#666666')
            color = color_scheme.get(role, default_color) if color_scheme else default_color
            symbol = self.role_symbols.get(role, '❓')

            metadata = self.format_metadata(turn)
            header_html = f'''
            <div class="conversation-turn">
                <div class="turn-header" style="color: {color}">
                    {symbol} {role.upper()}
                    {f'<span class="metadata">({metadata})</span>' if metadata else ''}
                </div>
            '''

            content = self.clean_content(turn.get('content', ''))
            if content:
                wrapped_content = self.wrap_text(content)
                html_content = markdown2.markdown(wrapped_content,
                    extras=['fenced-code-blocks', 'tables', 'break-on-newline'])

                content_html = f'''
                <div class="turn-content">
                    {html_content}
                </div>
            </div>
                '''
                html_parts.append(header_html + content_html)
            else:
                # No content: just close the turn container opened above
                html_parts.append(header_html + '</div>')

        return ''.join(html_parts)

    def compare_conversations(self, conv1, conv2, title1="Response 1", title2="Response 2",
                            color_scheme1=None, color_scheme2=None,
                            background1="#f0f7ff", background2="#fff0f0"):
        """Compare two conversations side by side with custom colors and titles.

        When both conversations start with a ``user`` turn that has identical
        content, that turn is rendered once above the columns and removed
        from both sides.

        Args:
            conv1, conv2: Conversations as lists of turn dictionaries, or
                JSON strings that decode to such lists.
            title1, title2: Column headings.
            color_scheme1, color_scheme2: Optional role -> color overrides.
            background1, background2: CSS background colors of the columns.

        Returns:
            IPython.display.HTML: The rendered comparison.
        """
        if isinstance(conv1, str):
            conv1 = json.loads(conv1)
        if isinstance(conv2, str):
            conv2 = json.loads(conv2)

        # Find shared prompt (user messages) if any
        shared_prompt_html = ""
        if conv1 and conv2 and conv1[0]['role'] == 'user' and conv2[0]['role'] == 'user':
            if conv1[0]['content'] == conv2[0]['content']:
                prompt_content = self.clean_content(conv1[0]['content'])
                prompt_html = markdown2.markdown(prompt_content,
                    extras=['fenced-code-blocks', 'tables', 'break-on-newline'])
                shared_prompt_html = f'''
                <div class="shared-prompt">
                    <strong>Shared Prompt:</strong>
                    <div class="turn-content">
                        {prompt_html}
                    </div>
                </div>
                '''
                # Remove shared prompt from both conversations
                conv1 = conv1[1:]
                conv2 = conv2[1:]

        html_content = f'''
        {self.css}
        {self.mathjax_config}
        {shared_prompt_html}
        <div class="comparison-container">
            <div class="conversation-column" style="background-color: {background1}">
                <div class="column-header" style="background-color: {background1}">
                    {title1}
                </div>
                {self.format_conversation(conv1, color_scheme1)}
            </div>
            <div class="conversation-column" style="background-color: {background2}">
                <div class="column-header" style="background-color: {background2}">
                    {title2}
                </div>
                {self.format_conversation(conv2, color_scheme2)}
            </div>
        </div>
        '''

        return HTML(html_content)

    def save_html(self, html_object, filename):
        """Save the HTML output to a file.

        Args:
            html_object: Object whose ``data`` attribute holds the HTML
                string (as returned by ``compare_conversations``).
            filename: Destination path; written as UTF-8.
        """
        # Extract HTML string from IPython.display.HTML object
        html_string = html_object.data

        # Add HTML5 doctype and basic structure
        full_html = f"""<!DOCTYPE html>
        <html lang="en">
        <head>
            <meta charset="UTF-8">
            <meta name="viewport" content="width=device-width, initial-scale=1.0">
            <title>Conversation Comparison</title>
        </head>
        <body>
        {html_string}
        </body>
        </html>"""

        # Save to file
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(full_html)

In [None]:
# Example usage with comparison
# Both conversations are JSON strings; compare_conversations decodes string
# arguments with json.loads. The "\\n" sequences below are JSON newline escapes.
conversation1 = '''[
    {
        "role": "user",
        "content": "What is markdown?",
        "toxic": false
    },
    {
        "role": "assistant",
        "content": "# Markdown Overview\\n\\nMarkdown is a lightweight markup language.",
        "finish_reason": "stop"
    }
]'''

# Same user prompt as conversation1, so it is shown once as the shared prompt.
conversation2 = '''[
    {
        "role": "user",
        "content": "What is markdown?",
        "toxic": false
    },
    {
        "role": "assistant",
        "content": "Markdown is a text-to-HTML conversion tool.\\n\\n```markdown\\n# Example\\n**Bold** and *italic*\\n```",
        "finish_reason": "stop"
    }
]'''

# This prettifier instance is reused by the later comparison cells.
prettifier = ConversationPrettifier(wrap_width=80)
display(prettifier.compare_conversations(
    conversation1,
    conversation2,
    title1="First Response",
    title2="Second Response",
    color_scheme1={'assistant': '#1a5f7a'},
    color_scheme2={'assistant': '#7a1a1a'},
    background1="#e6f3ff",
    background2="#ffe6e6"
))

In [None]:
def is_unique(example):
    """Return True only the first time an example's opening message is seen.

    Reads and updates the module-level ``seen_contents`` set, so the result
    depends on the examples checked before this one. Examples without a first
    message are rejected.
    """
    first_message = get_first_message_content(example)
    if first_message is None or first_message in seen_contents:
        return False
    seen_contents.add(first_message)
    return True

def get_first_message_content(example):
    """Return the content of the first message in ``example['conversation']``.

    Returns ``None`` when the conversation is missing or empty.
    """
    conversation = example['conversation']
    if not conversation:
        return None
    return conversation[0]['content']

def strip_begin_of_text(l):
    """Remove every ``<|begin_of_text|>`` marker from each message's content.

    The message dictionaries are modified in place and the same sequence
    object is returned.
    """
    marker = "<|begin_of_text|>"
    for message in l:
        message['content'] = message['content'].replace(marker, "")
    return l

def convert_conversation_format(messages):
    """Wrap each message in a ``{"role": "user", "content": ...}`` dictionary.

    Returns a new list with one dictionary per input message, in order.
    """
    return [{"role": "user", "content": message} for message in messages]

## Preprocessing for HF WildChat

In [None]:
# Names of the two datasets being compared. They are used to build the
# "penfever/<name>" ids passed to load_dataset and as the column titles of
# the rendered comparisons.
ds1_name = "allenai_WildChat-1M-Full-meta-llama_Llama-3.3-70B-Instruct"
ds2_name = "allenai_WildChat-1M-Full-Qwen_Qwen2.5-72B-Instruct"

In [None]:
# Load the first dataset's train split.
ds1 = load_dataset(f"penfever/{ds1_name}", split='train')
# Create a set of unique first message contents
# (is_unique reads and updates this global while filtering).
seen_contents = set()
# Filter the dataset to keep only rows with unique first messages
ds1 = ds1.filter(is_unique)
# Load the second dataset and keep only rows whose conversation_hash is
# listed in the CSV of hashes.
ds2 = load_dataset(f"penfever/{ds2_name}", split='train')
eh_df = pd.read_csv("/content/english_only_conv_hash.csv")
hash_set = set(eh_df['conversation_hash'])
ds2 = ds2.filter(lambda example : example["conversation_hash"] in hash_set)
# Hashes present in both datasets and in the CSV list.
indices = set(ds1['conversation_hash']).intersection(ds2['conversation_hash'])
indices = indices.intersection(hash_set)
print(len(indices))
# Restrict both datasets to the shared hashes.
ds1 = ds1.filter(lambda example : example["conversation_hash"] in indices)
ds2 = ds2.filter(lambda example : example["conversation_hash"] in indices)

Resolving data files:   0%|          | 0/37 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/37 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/36 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/18 [00:00<?, ?it/s]

207592


In [None]:
# Save the filtered ds1 and ds2 to local disk (paths relative to the working directory).
ds1.save_to_disk("ds1.hf")
ds2.save_to_disk("ds2.hf")

Saving the dataset (0/8 shards):   0%|          | 0/207592 [00:00<?, ? examples/s]

Saving the dataset (0/4 shards):   0%|          | 0/209980 [00:00<?, ? examples/s]

In [None]:
!rsync -avP /content/ds1.hf /content/drive/MyDrive

sending incremental file list
ds1.hf/
ds1.hf/data-00000-of-00008.arrow
    463,644,416 100%  114.25MB/s    0:00:03 (xfr#1, to-chk=9/11)
ds1.hf/data-00001-of-00008.arrow
    722,088,872 100%   68.82MB/s    0:00:10 (xfr#2, to-chk=8/11)
ds1.hf/data-00002-of-00008.arrow
    865,336,272 100%   58.98MB/s    0:00:13 (xfr#3, to-chk=7/11)
ds1.hf/data-00003-of-00008.arrow
  1,078,309,952 100%   69.81MB/s    0:00:14 (xfr#4, to-chk=6/11)
ds1.hf/data-00004-of-00008.arrow
    936,913,648 100%   66.95MB/s    0:00:13 (xfr#5, to-chk=5/11)
ds1.hf/data-00005-of-00008.arrow
    860,940,064 100%   57.45MB/s    0:00:14 (xfr#6, to-chk=4/11)
ds1.hf/data-00006-of-00008.arrow
  1,030,363,408 100%   59.44MB/s    0:00:16 (xfr#7, to-chk=3/11)
ds1.hf/data-00007-of-00008.arrow
    955,136,464 100%   75.28MB/s    0:00:12 (xfr#8, to-chk=2/11)
ds1.hf/dataset_info.json
         16,645 100%   16.34kB/s    0:00:00 (xfr#9, to-chk=1/11)
ds1.hf/state.json
            663 100%    0.65kB/s    0:00:00 (xfr#10, to-chk=0/11)

sen

In [None]:
!rsync -avP /content/ds2.hf /content/drive/MyDrive

sending incremental file list
ds2.hf/
ds2.hf/data-00000-of-00004.arrow
    383,151,568 100%   70.39MB/s    0:00:05 (xfr#1, to-chk=5/7)
ds2.hf/data-00001-of-00004.arrow
    554,193,632 100%   50.62MB/s    0:00:10 (xfr#2, to-chk=4/7)
ds2.hf/data-00002-of-00004.arrow
    462,695,264 100%   47.49MB/s    0:00:09 (xfr#3, to-chk=3/7)
ds2.hf/data-00003-of-00004.arrow
    518,638,040 100%   63.48MB/s    0:00:07 (xfr#4, to-chk=2/7)
ds2.hf/dataset_info.json
          6,558 100%   11.12kB/s    0:00:00 (xfr#5, to-chk=1/7)
ds2.hf/state.json
            427 100%    0.72kB/s    0:00:00 (xfr#6, to-chk=0/7)

sent 1,919,154,416 bytes  received 134 bytes  60,925,541.27 bytes/sec
total size is 1,918,685,489  speedup is 1.00


### Post-Processing with Converted DataFrames

In [None]:
# Reload ds1 from the dataset directory stored under /content/drive/MyDrive.
ds1 = datasets.load_from_disk("/content/drive/MyDrive/ds1.hf")

In [None]:
# Convert only the two columns needed for the comparison to pandas.
pd_df = ds1.select_columns(["conversation", "conversation_hash"]).to_pandas()

del ds1

# Sort by hash so both frames share the same row order.
selected_ds1 = pd_df.sort_values('conversation_hash')

# Drop the unsorted frame as well (mirrors the ds2 cell) so that
# gc.collect() below can actually reclaim it.
del pd_df

import gc

gc.collect()

0

In [None]:
# selected_ds1.to_parquet('/content/drive/MyDrive/ds1_sorted.parquet', engine='pyarrow')


In [None]:
# Reload ds2 from the dataset directory stored under /content/drive/MyDrive.
ds2 = datasets.load_from_disk("/content/drive/MyDrive/ds2.hf")


In [None]:


# Convert ds2 to pandas with all of its columns (no select_columns here).
pd_df = ds2.to_pandas()

del ds2

# Sort by hash to match the ordering used for selected_ds1.
selected_ds2 = pd_df.sort_values('conversation_hash')

# Release the unsorted frame before collecting garbage.
del pd_df

# NOTE(review): relies on `gc` having been imported by an earlier cell.
gc.collect()

0

In [None]:
# selected_ds2.to_parquet('/content/drive/MyDrive/ds2_sorted.parquet', engine='pyarrow')

In [None]:
# Remove "<|begin_of_text|>" markers from every message of the first dataset's
# conversations (strip_begin_of_text edits the message dicts in place).
selected_ds1['conversation'] = selected_ds1['conversation'].apply(strip_begin_of_text)

In [None]:
# Join the two frames on conversation_hash; column names that occur in both
# frames (e.g. 'conversation') get the suffixes _1 and _2.
merged_df = pd.merge(selected_ds1, selected_ds2, on='conversation_hash', suffixes=('_1', '_2'))

## Generate Model Comparison HTMLs from HuggingFace WildChat Repos

In [None]:
# Fix the seed so the same rows are sampled on every run.
np.random.seed(42)
# Draw 100 distinct row positions. They are used with merged_df.iloc, so the
# population must be the rows of merged_df rather than those of selected_ds1.
subset_indices = np.random.choice(len(merged_df), size=100, replace=False)

In [None]:
# Write the sampled rows to subset.csv without the index column.
merged_df.iloc[subset_indices].to_csv("subset.csv", index=False)

In [None]:
# Preview the first sampled row inline. `.tolist()` converts each stored
# conversation into a plain Python list before it is rendered.
display(prettifier.compare_conversations(
    merged_df['conversation_1'].iloc[subset_indices[0]].tolist(),
    merged_df['conversation_2'].iloc[subset_indices[0]].tolist(),
    title1=ds1_name,
    title2=ds2_name,
    color_scheme1={'assistant': '#1a5f7a'},
    color_scheme2={'assistant': '#7a1a1a'},
    background1="#e6f3ff",
    background2="#ffe6e6"
))

In [None]:
# Render the row at position 1834 of merged_df and save it as a standalone page.
# NOTE(review): the output file name says "12" while the row position is 1834 —
# confirm which one is intended.
comparison_output = prettifier.compare_conversations(
    merged_df['conversation_1'].iloc[1834].tolist(),
    merged_df['conversation_2'].iloc[1834].tolist(),
    title1=ds1_name,
    title2=ds2_name,
    color_scheme1={'assistant': '#1a5f7a'},
    color_scheme2={'assistant': '#7a1a1a'},
    background1="#e6f3ff",
    background2="#ffe6e6"
)
prettifier.save_html(comparison_output, "comparison_12.html")

### Save Several Conversations

In [None]:
# Sample 100 distinct row positions reproducibly. The positions index
# merged_df.iloc below, so they are drawn from the rows of merged_df.
np.random.seed(42)
subset_indices = np.random.choice(len(merged_df), size=100, replace=False)
base_path = "/content/drive/MyDrive/llama3.3-qwen2.5-comparison-v2"
os.makedirs(base_path, exist_ok=True)

# Render every sampled row and save it as comparison_<row position>.html.
for row_position in subset_indices:
    comparison_output = prettifier.compare_conversations(
        merged_df['conversation_1'].iloc[row_position].tolist(),
        merged_df['conversation_2'].iloc[row_position].tolist(),
        title1=ds1_name,
        title2=ds2_name,
        color_scheme1={'assistant': '#1a5f7a'},
        color_scheme2={'assistant': '#7a1a1a'},
        background1="#e6f3ff",
        background2="#ffe6e6"
    )
    save_path = os.path.join(base_path, f"comparison_{row_position}.html")
    prettifier.save_html(comparison_output, save_path)

In [None]:

 # Merge the HTML pages found in the given directory into a single PDF.
 # NOTE(review): this reads from ".../llama3.3-qwen2.5-comparison", while the
 # batch-save cell writes to ".../llama3.3-qwen2.5-comparison-v2" — confirm
 # which directory is intended.
 html_to_pdf("/content/drive/MyDrive/llama3.3-qwen2.5-comparison", "/content/drive/MyDrive/llama3.3-qwen2.5-comparison.pdf")


## Preprocessing for MT Bench

In [None]:
# df = pd.read_json("/content/drive/MyDrive/data/annotations_seed0_configs.json")
# df.sample(5)
# Load the two models' output files and the question file; all three are JSON Lines.
df1 = pd.read_json("/content/drive/MyDrive/data/mtbench/model_hf_model_args_pretrained=penfever__Llama-3-8B-WildChat-500k-Llama-3.3-70B,dtype=bfloat16.jsonl", lines=True)
df2 = pd.read_json("/content/drive/MyDrive/data/mtbench/model_hf_model_args_pretrained=penfever__Llama-3-8B-WildChat-500k-qwen2-72b-lc,dtype=bfloat16.jsonl", lines=True)
dfp = pd.read_json("question.jsonl", lines=True)

In [None]:
# Turn each question's list of prompts into a list of user-role message dicts.
dfp["turns_conv"] = dfp["turns"].apply(convert_conversation_format)

In [None]:
dfp.sample(2)

Unnamed: 0,question_id,category,turns,reference,turns_conv
22,103,reasoning,"[Thomas is very healthy, but he has to go to the hospital every day. What could be the reasons?, Can you explain why the above question is interesting?]","[Thomas may work at a hospital., ]","[{'role': 'user', 'content': 'Thomas is very healthy, but he has to go to the hospital every day. What could be the reasons?'}, {'role': 'user', 'content': 'Can you explain why the above question is interesting?'}]"
32,113,math,"[In a survey conducted at a local high school, preferences for a new school color were measured: 58% of students liked the color blue, 45% preferred green, and 22% liked both colors. If we randomly pick a student from the school, what's the probability that they would like neither blue nor green?, If we select a student liked green, what's the probability that he or she would dislike both colors?]","[19%, 0%]","[{'role': 'user', 'content': 'In a survey conducted at a local high school, preferences for a new school color were measured: 58% of students liked the color blue, 45% preferred green, and 22% liked both colors. If we randomly pick a student from the school, what's the probability that they would like neither blue nor green?'}, {'role': 'user', 'content': 'If we select a student liked green, what's the probability that he or she would dislike both colors?'}]"


In [None]:
def choices_to_conv(sample):
    """Convert the turns of the first choice into assistant-role messages.

    Reads ``sample[0]['turns']`` and returns one
    ``{"role": "assistant", "content": turn}`` dictionary per turn, in order.
    """
    return [
        {"role": "assistant", "content": turn}
        for turn in sample[0]['turns']
    ]

# Assistant-role messages for the first model's answers.
df1["conv"] = df1["choices"].apply(choices_to_conv)

In [None]:
# Assistant-role messages for the second model's answers.
df2["conv"] = df2["choices"].apply(choices_to_conv)

In [None]:
# Keep only the join key and the message columns, then attach the user
# prompts to the first model's answers by question_id.
df1_ss = df1[["question_id","conv"]]
dfp_ss = dfp[["question_id","turns_conv"]]
merged_df = pd.merge(df1_ss, dfp_ss, on='question_id')

In [None]:
# Add the second model's answers. Both frames carry a 'conv' column, and the
# combine_* helpers read the results as 'conv_x' (first model) and 'conv_y'
# (second model).
df2_ss = df2[["question_id","conv"]]
merged_df = pd.merge(merged_df, df2_ss, on='question_id')

In [None]:
# Row-wise helpers meant to be used with DataFrame.apply(axis=1)
def combine_user_and_model_x(row):
    """Interleave the user turns with the ``conv_x`` model turns of a row.

    Returns a flat list ``[user_0, model_0, user_1, model_1, ...]``; pairing
    stops at the shorter of the two sequences.
    """
    return [
        turn
        for turn_pair in zip(row['turns_conv'], row['conv_x'])
        for turn in turn_pair
    ]

def combine_user_and_model_y(row):
    """Interleave the user turns with the ``conv_y`` model turns of a row.

    Returns a flat list ``[user_0, model_0, user_1, model_1, ...]``; pairing
    stops at the shorter of the two sequences.
    """
    return [
        turn
        for turn_pair in zip(row['turns_conv'], row['conv_y'])
        for turn in turn_pair
    ]

In [None]:


# Apply the function to each row of the DataFrame
# conversation_1 interleaves the prompts with conv_x, conversation_2 with conv_y.

merged_df['conversation_1'] = merged_df.apply(combine_user_and_model_x, axis=1)
merged_df['conversation_2'] = merged_df.apply(combine_user_and_model_y, axis=1)

## Generate Model Comparison HTMLs for MT Bench

In [None]:
# Sample 10 distinct row positions of merged_df reproducibly.
np.random.seed(42)
subset_indices = np.random.choice(len(merged_df), size=10, replace=False)
# Reuse ds1_name / ds2_name as the column titles for the two models.
ds1_name = "Llama-3-8B-WildChat-500k-Llama-3.3-70B"
ds2_name = "Llama-3-8B-WildChat-500k-qwen2-72b-lc"

In [None]:
# For a comparison
# Render the last of the 10 sampled rows (position 9 in subset_indices) and
# save it as a standalone HTML page. The conversations are passed as stored,
# without the .tolist() used in the WildChat cells.
comparison_output = prettifier.compare_conversations(
    merged_df['conversation_1'].iloc[subset_indices[9]],
    merged_df['conversation_2'].iloc[subset_indices[9]],
    title1=ds1_name,
    title2=ds2_name,
    color_scheme1={'assistant': '#1a5f7a'},
    color_scheme2={'assistant': '#7a1a1a'},
    background1="#e6f3ff",
    background2="#ffe6e6"
)
prettifier.save_html(comparison_output, "mtbench_comparison_10.html")