## Setup

In [None]:
!pip install -qq markdown2

In [None]:
import json
import html
import re
from pathlib import Path
import pandas as pd
import markdown2
import os
# Notebook-wide pandas display configuration, applied in one pass.
_DISPLAY_OPTIONS = {
    # Show up to 100 columns / rows before pandas elides the middle.
    'display.max_columns': 100,
    'display.max_rows': 100,
    # None disables truncation of long cell contents (prompts, judgments).
    'display.max_colwidth': None,
    # Render floats with three decimals.
    'display.float_format': lambda value: '%.3f' % value,
    # Allow wide tables before wrapping onto a new line.
    'display.width': 1000,
    'display.precision': 3,
}
for _option_name, _option_value in _DISPLAY_OPTIONS.items():
    pd.set_option(_option_name, _option_value)

In [None]:
def convert_jsonl_to_html(input_file, output_dir):
    """
    Convert JSONL formatted conversation data to styled HTML files.

    Every non-blank line of ``input_file`` is parsed as one JSON object and
    written to ``<output_dir>/conversation_<line number>.html``. The record's
    ``user_prompt`` is split into user / reference / assistant turns; the
    ``judgment`` and ``score`` fields are appended below the conversation.
    All record-derived text is HTML-escaped. Lines that cannot be parsed or
    rendered are reported on stdout and skipped.

    Args:
        input_file (str): Path to input JSONL file
        output_dir (str): Directory to save HTML output files
    """
    # Create output directory if it doesn't exist
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # CSS styles for different components
    css_styles = """
    <style>
        body { font-family: Arial, sans-serif; max-width: 800px; margin: 20px auto; padding: 0 20px; }
        .conversation { border: 1px solid #ddd; padding: 20px; margin: 20px 0; border-radius: 5px; }
        .prompt { background-color: #f0f7ff; padding: 15px; margin: 10px 0; border-left: 4px solid #0066cc; }
        .assistant { background-color: #f5f5f5; padding: 15px; margin: 10px 0; border-left: 4px solid #4CAF50; }
        .reference { background-color: #e8f5e9; padding: 15px; margin: 10px 0; border-left: 4px solid #2e7d32; }
        .judgment { background-color: #fff3e0; padding: 15px; margin: 10px 0; border-left: 4px solid #ff9800; }
        .score { font-weight: bold; color: #d32f2f; }
        .icon { margin-right: 10px; font-weight: bold; }
        .section { margin-bottom: 20px; }
        h2 { color: #333; }
        .metadata { color: #666; font-size: 0.9em; margin-bottom: 15px; }
    </style>
    """

    # Section delimiters inside the judge prompt. The closing '>' of the
    # start marker is optional so prompts matched by the previous, more
    # lenient pattern are still accepted.
    reference_section = re.compile(
        r'<\|The Start of Reference Answer\|>?(.*?)<\|The End of Reference Answer\|>',
        re.DOTALL,
    )
    assistant_section = re.compile(
        r'<\|The Start of Assistant A\'s Conversation with User\|>?(.*?)'
        r'<\|The End of Assistant A\'s Conversation with User\|>',
        re.DOTALL,
    )

    # part type -> icon shown in front of the message
    icons = {'prompt': '👤', 'assistant': '🤖', 'reference': '📚'}

    def split_turns(section_text, answer_label, answer_type):
        """Split one section on '### ' headers into (part type, text) pairs."""
        turns = []
        for chunk in section_text.split('### '):
            if chunk.startswith('User:'):
                turns.append(('prompt', chunk[len('User:'):].strip()))
            elif chunk.startswith(answer_label):
                # Slice by the label's real length so no part of the label
                # (e.g. its trailing ':') leaks into the content.
                turns.append((answer_type, chunk[len(answer_label):].strip()))
        return turns

    def extract_conversation_parts(text):
        """Extract different parts of the conversation from the text."""
        if not isinstance(text, str):
            return []
        parts = []
        ref_match = reference_section.search(text)
        if ref_match:
            parts.extend(split_turns(ref_match.group(1), 'Reference answer:', 'reference'))
        asst_match = assistant_section.search(text)
        if asst_match:
            parts.extend(split_turns(asst_match.group(1), 'Assistant A:', 'assistant'))
        return parts

    def escape(value):
        """HTML-escape any JSON value; None becomes an empty string."""
        return html.escape('' if value is None else str(value))

    # Process each line in the JSONL file
    with open(input_file, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, 1):
            # Blank lines (e.g. a trailing newline) are not records.
            if not line.strip():
                continue
            try:
                data = json.loads(line)

                # Extract metadata
                metadata_html = f"""
                <div class="metadata">
                    <strong>Question ID:</strong> {escape(data.get('question_id', 'N/A'))} |
                    <strong>Model:</strong> {escape(data.get('model', 'N/A'))}
                </div>
                """

                # Convert conversation parts to HTML
                conversation_parts = extract_conversation_parts(data.get('user_prompt', ''))
                conversation_html = "".join(
                    f'<div class="{part_type}"><span class="icon">{icons[part_type]}</span>'
                    f'{html.escape(content)}</div>'
                    for part_type, content in conversation_parts
                )

                # Add judgment and score
                judgment_html = (
                    '<div class="judgment"><span class="icon">⚖️</span>'
                    f'{escape(data.get("judgment", ""))}</div>'
                )
                score_html = (
                    '<p>Score: <span class="score">'
                    f'{escape(data.get("score", "N/A"))}</span></p>'
                )

                # Combine all components
                html_content = f"""
                <!DOCTYPE html>
                <html>
                <head>
                    <meta charset="UTF-8">
                    <title>Conversation {line_number}</title>
                    {css_styles}
                </head>
                <body>
                    <h2>Conversation {line_number}</h2>
                    <div class="conversation">
                        {metadata_html}
                        <div class="section">{conversation_html}</div>
                        <div class="section">{judgment_html}</div>
                        <div class="section">{score_html}</div>
                    </div>
                </body>
                </html>
                """
                # Write to output file
                output_file = output_path / f'conversation_{line_number}.html'
                with open(output_file, 'w', encoding='utf-8') as out_f:
                    out_f.write(html_content)

            except json.JSONDecodeError as e:
                print(f"Error parsing line {line_number}: {e}")
            except Exception as e:
                # Best-effort batch conversion: report the bad record and move on.
                print(f"Error processing line {line_number}: {e}")
                print(f"Line content: {line[:200]}...")  # Print first 200 chars for debugging

    print(f"Processing complete. HTML files have been saved to {output_dir}")

In [None]:
# !rm -rf /content/output_html
# convert_jsonl_to_html("/content/qwen2.5-72b-500k.jsonl", "/content/qwen2.5-72b-500k-resps")

In [None]:
# convert_jsonl_to_html("/content/llama3.3-70b-500k.jsonl", "/content/llama3.3-70b-500k-resps")

In [None]:
def create_html_report(csv_path, output_path='output'):
    """
    Render every row of a judgments CSV as a standalone HTML page.

    The CSV must provide the columns ``question_id``, ``question``,
    ``answer``, ``judgment`` and ``score``. Each of the four text fields is
    converted from Markdown to HTML with ``markdown2`` and the page is written
    to ``<output_path>/<question_id>.html``; rows sharing a ``question_id``
    overwrite each other. Missing cells (NaN) are rendered as empty text
    rather than the literal string "nan".

    Args:
        csv_path (str): Path to the CSV file to read.
        output_path (str): Directory that receives the HTML files; created
            (including parents) when it does not exist.
    """
    # Read the CSV file
    df = pd.read_csv(csv_path)

    # HTML template with styling
    html_template = '''
    <!DOCTYPE html>
    <html>
    <head>
        <meta charset="UTF-8">
        <title>Conversation Report</title>
        <style>
            body {{
                font-family: Arial, sans-serif;
                max-width: 800px;
                margin: 20px auto;
                padding: 20px;
                background-color: #f5f5f5;
            }}
            .conversation-container {{
                background-color: white;
                padding: 20px;
                border-radius: 10px;
                box-shadow: 0 2px 4px rgba(0,0,0,0.1);
            }}
            .message {{
                margin-bottom: 20px;
                padding: 15px;
                border-radius: 8px;
            }}
            .question {{
                background-color: #e3f2fd;
                border-left: 5px solid #2196f3;
            }}
            .answer {{
                background-color: #f3e5f5;
                border-left: 5px solid #9c27b0;
            }}
            .judgment {{
                background-color: #fff3e0;
                border-left: 5px solid #ff9800;
            }}
            .icon {{
                font-size: 1.2em;
                margin-right: 10px;
            }}
            .score {{
                font-size: 1.2em;
                font-weight: bold;
                text-align: center;
                margin-top: 20px;
                padding: 10px;
                background-color: #e8f5e9;
                border-radius: 5px;
            }}
            pre {{
                background-color: #f8f9fa;
                padding: 10px;
                border-radius: 4px;
                overflow-x: auto;
            }}
            code {{
                font-family: monospace;
            }}
        </style>
    </head>
    <body>
        <div class="conversation-container">
            {content}
        </div>
    </body>
    </html>
    '''
    output_path = Path(output_path)
    output_path.mkdir(parents=True, exist_ok=True)

    def render_markdown(value):
        """Convert one CSV cell to HTML, treating a missing cell as empty."""
        text = '' if pd.isna(value) else str(value).strip()
        return markdown2.markdown(text)

    # Process each row
    for _, row in df.iterrows():
        question_id = str(row['question_id'])
        # Initialize content
        content = []
        # Convert markdown to HTML for question and answer
        question_html = render_markdown(row['question'])
        answer_html = render_markdown(row['answer'])
        judgment_html = render_markdown(row['judgment'])
        score_html = render_markdown(row['score'])

        # Add question with user icon
        content.append(f'''
            <div class="message question">
                <span class="icon">👤</span>
                {question_html}
            </div>
        ''')

        # Add answer with assistant icon
        content.append(f'''
            <div class="message answer">
                <span class="icon">🤖</span>
                {answer_html}
            </div>
        ''')

        # Add judgment with judge icon
        content.append(f'''
            <div class="message judgment">
                <span class="icon">⚖️</span>
                {judgment_html}
            </div>
        ''')

        # Add final score
        content.append(f'''
            <div class="score">
                Final Score: {score_html}
            </div>
        ''')

        # Combine all content
        full_content = "\n".join(content)

        # Generate final HTML. str.format only parses the template, so braces
        # inside the rendered content are inserted verbatim.
        final_html = html_template.format(content=full_content)
        output_complete_path = output_path / f'{question_id}.html'
        # Write to file
        with open(output_complete_path, 'w', encoding='utf-8') as f:
            f.write(final_html)

    print(f"HTML reports generated successfully at {output_path}")

In [None]:
from bs4 import BeautifulSoup
from collections import Counter
import re

def count_html_elements(html_content):
    """
    Count the frequency of styling and structural elements in HTML content.

    Args:
        html_content (str): HTML markup, parsed with BeautifulSoup's
            'html.parser'.

    Returns:
        dict: three plain dictionaries of counts:
            'tags'              -- occurrences of each tracked tag name,
            'attributes'        -- occurrences of the 'style', 'class' and
                                   'id' attributes, plus 'stylesheet_link'
                                   for every ``rel`` containing "stylesheet",
            'inline_properties' -- CSS property names declared in inline
                                   ``style`` attributes.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Initialize counters
    html_tags = Counter()
    style_attributes = Counter()
    inline_properties = Counter()

    # List of tags to track
    tracked_tags = {
        # Styling tags
        'style', 'link', 'font', 'center', 'div', 'span',
        'strong', 'em', 'i', 'b', 'u', 'strike',

        # Headers
        'h1', 'h2', 'h3', 'h4', 'h5', 'h6',

        # Lists
        'ul', 'ol', 'li', 'dl', 'dt', 'dd',

        # Table elements
        'table', 'tr', 'td', 'th', 'thead', 'tbody', 'tfoot',

        # Other structural elements
        'nav', 'header', 'footer', 'main', 'article', 'section',
        'aside', 'p', 'blockquote', 'figure', 'figcaption'
    }
    styling_attribute_names = {'style', 'class', 'id'}

    def declared_properties(style_text):
        """Return the property names of a 'name: value; ...' declaration list."""
        names = []
        for declaration in style_text.split(';'):
            # Only the text before the FIRST colon is the property name, so
            # colons inside values (e.g. url(http://...)) are not counted.
            name, colon, _value = declaration.partition(':')
            name = name.strip()
            if colon and re.fullmatch(r'[\w-]+', name):
                names.append(name)
        return names

    for tag in soup.find_all():
        # Count tags
        if tag.name in tracked_tags:
            html_tags[tag.name] += 1

        # Count style-related attributes
        for attr, value in tag.attrs.items():
            if attr in styling_attribute_names:
                style_attributes[attr] += 1
            elif attr == 'rel':
                # 'rel' may arrive as a list of tokens or as a raw string;
                # accept "stylesheet" anywhere among the tokens.
                tokens = value.split() if isinstance(value, str) else list(value or [])
                if any(str(token).lower() == 'stylesheet' for token in tokens):
                    style_attributes['stylesheet_link'] += 1

        # Count inline style properties
        inline_style = tag.attrs.get('style')
        if isinstance(inline_style, str):
            inline_properties.update(declared_properties(inline_style))

    return {
        'tags': dict(html_tags),
        'attributes': dict(style_attributes),
        'inline_properties': dict(inline_properties)
    }

# Example usage
if __name__ == "__main__":
    # Small hand-written page exercising tags, attributes and inline styles.
    sample_html = """
    <html>
        <head>
            <style>body { color: blue; }</style>
            <link rel="stylesheet" href="style.css">
        </head>
        <body>
            <header>
                <h1>Main Title</h1>
                <nav>
                    <ul>
                        <li>Home</li>
                        <li>About</li>
                        <li>Contact</li>
                    </ul>
                </nav>
            </header>
            <main>
                <article>
                    <h2>Article Title</h2>
                    <p>Some text here</p>
                    <ul>
                        <li>First item</li>
                        <li>Second item</li>
                    </ul>
                </article>
                <aside>
                    <h3>Sidebar</h3>
                    <div class="container" style="color: red; font-size: 14px;">
                        <span class="text">Hello</span>
                        <strong>Bold</strong>
                        <em>Emphasis</em>
                    </div>
                </aside>
            </main>
        </body>
    </html>
    """

    results = count_html_elements(sample_html)

    def _print_frequencies(title, pairs):
        """Print a heading followed by one 'name: count' line per pair."""
        print(title)
        for label, total in pairs:
            print(f"{label}: {total}")

    # Tags: most frequent first, ties broken alphabetically.
    _print_frequencies(
        "\nHTML Tags Frequency:",
        sorted(results['tags'].items(), key=lambda item: (-item[1], item[0])),
    )
    # Attributes and inline properties keep their first-seen order.
    _print_frequencies("\nStyling Attributes Frequency:", results['attributes'].items())
    _print_frequencies("\nInline Style Properties Frequency:", results['inline_properties'].items())


HTML Tags Frequency:
li: 5
ul: 2
article: 1
aside: 1
div: 1
em: 1
h1: 1
h2: 1
h3: 1
header: 1
link: 1
main: 1
nav: 1
p: 1
span: 1
strong: 1
style: 1

Styling Attributes Frequency:
stylesheet_link: 1
class: 2
style: 1

Inline Style Properties Frequency:
color: 1
font-size: 1


In [None]:
def extract_question(s):
    """Return the text between the '[Question]' header and the start-of-answer
    marker of a judge prompt, or "" when the markers are not found."""
    found = re.search(
        r'(?<=\n\n\[Question\]\n).*?(?=\n\n\[The Start of Assistant\'s Answer\]\n)',
        s,
        re.DOTALL,
    )
    return found.group() if found else ""

def extract_answer(s):
    """Return the assistant's answer embedded in a judge prompt.

    The answer is the text between the "[The Start of Assistant's Answer]"
    and "[The End of Assistant's Answer]" markers. Returns "" when the
    markers are not found, so callers always receive a string.
    """
    pattern = re.compile(r'(?<=\n\n\[The Start of Assistant\'s Answer\]\n).*?(?=\n\[The End of Assistant\'s Answer\])', re.DOTALL)
    match = pattern.search(s)
    if match:
        return match.group()
    return ""

## Choose Models

In [None]:
# Load the judgment file; lines=True reads one JSON record per line.
df_jdg = pd.read_json("/content/gpt-4_single.jsonl", lines=True)
# Display the distinct values of the "model" column to choose from below.
df_jdg["model"].unique()

array(['model_hf_model_args_pretrained=meta-llama__Llama-3.1-8B-Instruct,dtype=bfloat16',
       'model_hf_model_args_pretrained=penfever__Llama-3-8B-WildChat-250k-qwen2-7b,dtype=bfloat16',
       'model_hf_model_args_pretrained=penfever__Llama-3-8B-WildChat-250k-qwen2-72b,dtype=bfloat16',
       'model_hf_model_args_pretrained=penfever__Llama-3-8B-WildChat-250k-Llama-3.3-70B,dtype=bfloat16',
       'model_hf_model_args_pretrained=penfever__Llama-3-8B-WildChat-250k-Llama-3.1-8B,dtype=bfloat16',
       'model_hf_model_args_pretrained=penfever__Llama-3-8B-WildChat-250k-8B-5blend,dtype=bfloat16',
       'model_hf_model_args_pretrained=penfever__Llama-3-8B-WildChat-250k-8B-2blend,dtype=bfloat16',
       'model_hf_model_args_pretrained=penfever__Llama-3-8B-WildChat-250k-70B-5blend,dtype=bfloat16',
       'model_hf_model_args_pretrained=penfever__Llama-3-8B-WildChat-250k-70B-2blend,dtype=bfloat16',
       'model_hf_model_args_pretrained=penfever__Llama-3-8B-WildChat-500k-qwen2-7b,dtype=bfloa

In [None]:
# Select the judgment rows of each model of interest. `.copy()` detaches each
# selection from df_jdg so later column assignments and renames act on an
# independent frame and do not raise SettingWithCopyWarning.
df1 = df_jdg[df_jdg["model"] == "model_hf_model_args_pretrained=penfever__Llama-3-8B-WildChat-500k-qwen2-72b-lc,dtype=bfloat16"].copy()
df2 = df_jdg[df_jdg["model"] == "model_hf_model_args_pretrained=penfever__Llama-3-8B-WildChat-500k-Llama-3.3-70B,dtype=bfloat16"].copy()
df3 = df_jdg[df_jdg["model"] == "model_hf_model_args_pretrained=meta-llama__Llama-3.3-70B-Instruct,dtype=bfloat16,parallelize=True"].copy()
df4 = df_jdg[df_jdg["model"] == "model_hf_model_args_pretrained=Qwen__Qwen2.5-72B-Instruct,dtype=auto,parallelize=True,device_map=auto,max_length=4096"].copy()

# Write one CSV per model: one row per question_id, with the question and
# answer pulled out of the judge prompt.
for df, model_name in zip([df1, df2, df3, df4], ["l3-8b-wc500k-q25-72b-lc","l3-8b-wc500k-l33-70b", "l33-70b", "q25-72b"]):
    # assign() builds a new frame instead of writing into a slice.
    df = df.drop_duplicates(subset=['question_id']).assign(
        question=lambda frame: frame['user_prompt'].apply(extract_question),
        answer=lambda frame: frame['user_prompt'].apply(extract_answer),
    )
    df[["question_id", "question", "answer", "judgment", "score"]].to_csv(f"{model_name}.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['question'] = df['user_prompt'].apply(extract_question)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['answer'] = df['user_prompt'].apply(extract_answer)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['question'] = df['user_prompt'].apply(extract_question)
A value is trying to be set on a

## Generate HTML Reports

In [None]:
# Write one HTML report per CSV row into the Drive folder given as second argument.
create_html_report("/content/qwen2.5-72b-500k.csv", "/content/drive/MyDrive/qwen2.5-72b-500k-resps")

In [None]:
import numpy as np

# Aggregate tag / attribute / inline-property counts over every report in the folder.
dest_path = '/content/drive/MyDrive/qwen2.5-72b-500k-resps'
qwen_25_500k_master_counter = Counter()
lengths = []
for filename in os.listdir(dest_path):
    if not filename.endswith('.html'):
        continue
    complete_path = os.path.join(dest_path, filename)
    with open(complete_path, 'r', encoding='utf-8') as file:
        html_content = file.read()
    lengths.append(len(html_content))
    results = count_html_elements(html_content)
    # NOTE(review): all three groups share one counter, so the <style> tag and
    # the style="" attribute are both tallied under the key 'style'.
    for group in ('tags', 'attributes', 'inline_properties'):
        qwen_25_500k_master_counter.update(results[group])

print(qwen_25_500k_master_counter)
print("average html length")
print(np.mean(lengths))

Counter({'p': 1237, 'li': 881, 'strong': 808, 'class': 640, 'div': 401, 'ul': 260, 'span': 240, 'h3': 149, 'style': 82, 'ol': 77, 'em': 60, 'h4': 33, 'h1': 20, 'id': 3})
average html length
6585.3


In [None]:
# Write one HTML report per CSV row into the Drive folder given as second argument.
create_html_report("/content/llama3.3-70b-500k.csv", "/content/drive/MyDrive/llama3.3-70b-500k-resps")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
                <span class="icon">🤖</span>
                <p>The sound of chainsaws and the smell of gasoline fill the air as those deforesters approach me. I feel a deep sense of unease and a growing sense of dread. My ancient branches, which have witnessed countless seasons and have cradled countless creatures, begin to tremble with anxiety.</p>

<p>As the first chainsaw bites into my trunk, I feel a searing pain, like a thousand tiny knives tearing at my flesh. My heart, which has beat steadily for over a century, begins to pound in my chest. I try to remember the countless moments of serenity and peace I've experienced, but the pain and the fear of being destroyed override all those memories.</p>

<p>I feel a deep connection to the forest, to the animals that live within it, and to the people who have visited me over the years. I've been a silent witness to so many joys and sorrows, and the thought of being torn fro

In [None]:
import numpy as np

# Aggregate tag / attribute / inline-property counts over every report in the folder.
dest_path = '/content/drive/MyDrive/llama3.3-70b-500k-resps'
llama_33_500k_master_counter = Counter()
lengths = []
for filename in os.listdir(dest_path):
    if not filename.endswith('.html'):
        continue
    complete_path = os.path.join(dest_path, filename)
    with open(complete_path, 'r', encoding='utf-8') as file:
        html_content = file.read()
    lengths.append(len(html_content))
    results = count_html_elements(html_content)
    # NOTE(review): all three groups share one counter, so the <style> tag and
    # the style="" attribute are both tallied under the key 'style'.
    for group in ('tags', 'attributes', 'inline_properties'):
        llama_33_500k_master_counter.update(results[group])

print(llama_33_500k_master_counter)
print("average html length")
print(np.mean(lengths))

Counter({'p': 1210, 'class': 640, 'li': 558, 'div': 400, 'strong': 331, 'span': 240, 'ul': 109, 'style': 82, 'ol': 75, 'em': 34, 'h3': 32, 'h1': 27, 'h2': 13, 'h4': 3, 'id': 2})
average html length
6206.85


In [None]:
# Counter subtraction keeps only positive results: keys counted more often in
# the Qwen-500k reports than in the Llama-500k reports, with the surplus.
qwen_25_500k_master_counter - llama_33_500k_master_counter

Counter({'div': 1,
         'p': 27,
         'ol': 2,
         'li': 323,
         'strong': 477,
         'h3': 117,
         'h4': 30,
         'ul': 151,
         'em': 26,
         'id': 1})

In [None]:
# Counter subtraction keeps only positive results: keys counted more often in
# the Llama-500k reports than in the Qwen-500k reports, with the surplus.
llama_33_500k_master_counter - qwen_25_500k_master_counter

Counter({'h1': 7, 'h2': 13})

In [None]:
# Write one HTML report per CSV row into the Drive folder given as second argument.
create_html_report("/content/llama3.3-70b.csv", "/content/drive/MyDrive/llama3.3-70b-resps")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
                Final Score: <p>9</p>

            </div>
        

            <div class="message question">
                <span class="icon">👤</span>
                <p>Please read the paragraph below and count how many times the words "Amazon", "river", and "you" appear. Please present the results in the format of "word, number of appearances" with each word on a separate line. Sort the lines in order of the number of appearances.
The Amazon, a mesmerizing expanse of nature's wonders, is home to the legendary Amazon River. Flowing through awe-inspiring landscapes like the Amazon rainforest, the river weaves its way through Brazil, Colombia, and Peru, giving life to countless creatures. From the mighty jaguars prowling the Amazon jungle to the vibrant macaws soaring above the canopy, this remarkable region teems with biodiversity. Deep within the river's currents, magnificent pink river dolphins gracefully glide alon

In [None]:
import numpy as np

# Aggregate tag / attribute / inline-property counts over every report in the folder.
dest_path = '/content/drive/MyDrive/llama3.3-70b-resps'
master_counter = Counter()
lengths = []
for filename in os.listdir(dest_path):
    if not filename.endswith('.html'):
        continue
    complete_path = os.path.join(dest_path, filename)
    with open(complete_path, 'r', encoding='utf-8') as file:
        html_content = file.read()
    lengths.append(len(html_content))
    results = count_html_elements(html_content)
    # NOTE(review): all three groups share one counter, so the <style> tag and
    # the style="" attribute are both tallied under the key 'style'.
    for group in ('tags', 'attributes', 'inline_properties'):
        master_counter.update(results[group])

print(master_counter)
print("average html length")
print(np.mean(lengths))

Counter({'p': 1222, 'class': 640, 'li': 627, 'strong': 406, 'div': 401, 'span': 240, 'ul': 116, 'style': 82, 'ol': 76, 'em': 38, 'h2': 38, 'h1': 30, 'h3': 22, 'h4': 4, 'id': 2})
average html length
6411.0


In [None]:
# Write one HTML report per CSV row into the Drive folder given as second argument.
create_html_report("/content/q25-72b.csv", "/content/drive/MyDrive/qwen-2.5-72b-resps")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
<p>Quantum entanglement is a phenomenon where pairs or groups of particles become interconnected in such a way that the state of one particle cannot be described independently of the state of the others, even when the particles are separated by large distances. This interconnection persists regardless of the distance between the particles.</p>

<p>For example, consider two entangled qubits ( A ) and ( B ). If the combined state of the system is:
[ |\psi\rangle<em>{AB} = \frac{1}{\sqrt{2}} (|0\rangle</em>A |1\rangle<em>B + |1\rangle</em>A |0\rangle_B) ]
this means that if qubit ( A ) is measured and found to be in the state ( |0\rangle ), then qubit ( B ) will instantaneously be found in the state ( |1\rangle ), and vice versa. The measurement of one qubit instantly determines the state of the other, no matter how far apart they are.</p>

<h3>Relationship Between Superposition and Quantum Entanglement</h3>

<p>The relation

In [None]:
import numpy as np

# Aggregate tag / attribute / inline-property counts over every report in the folder.
dest_path = '/content/drive/MyDrive/qwen-2.5-72b-resps'
master_counter = Counter()
lengths = []
for filename in os.listdir(dest_path):
    if not filename.endswith('.html'):
        continue
    complete_path = os.path.join(dest_path, filename)
    with open(complete_path, 'r', encoding='utf-8') as file:
        html_content = file.read()
    lengths.append(len(html_content))
    results = count_html_elements(html_content)
    # NOTE(review): all three groups share one counter, so the <style> tag and
    # the style="" attribute are both tallied under the key 'style'.
    for group in ('tags', 'attributes', 'inline_properties'):
        master_counter.update(results[group])

print(master_counter)
print("average html length")
print(np.mean(lengths))

Counter({'p': 1217, 'li': 865, 'strong': 741, 'class': 640, 'div': 401, 'span': 240, 'ul': 230, 'h3': 135, 'ol': 83, 'style': 82, 'em': 53, 'h4': 26, 'h1': 24, 'id': 2})
average html length
6492.4375


In [None]:
# !rsync -avP /content/qwen2.5-72b-500k-resps/ /content/drive/MyDrive/qwen2.5-72b-500k-resps/
# !rsync -avP /content/llama3.3-70b-500k-resps/ /content/drive/MyDrive/llama3.3-70b-500k-resps/

## Comparative Analysis

In [None]:
# Give each fine-tuned model's judgment/score columns a distinct name and keep
# one row per question_id. rename() returns a new frame here (no inplace=True),
# which avoids SettingWithCopyWarning when df1/df2 are selections of df_jdg.
df1 = df1.rename(columns={'judgment': 'judgment_qwen', "score": "score_qwen"}).drop_duplicates(subset=['question_id'])
df2 = df2.rename(columns={'judgment': 'judgment_llama', "score": "score_llama"}).drop_duplicates(subset=['question_id'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1.rename(columns={'judgment': 'judgment_qwen', "score": "score_qwen"}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2.rename(columns={'judgment': 'judgment_llama', "score": "score_llama"}, inplace=True)


In [None]:
# Give each 70B-class model's judgment/score columns a distinct name and keep
# one row per question_id. rename() returns a new frame here (no inplace=True),
# which avoids SettingWithCopyWarning when df3/df4 are selections of df_jdg.
df3 = df3.rename(columns={'judgment': 'judgment_llama_70b', "score": "score_llama_70b"}).drop_duplicates(subset=['question_id'])
df4 = df4.rename(columns={'judgment': 'judgment_qwen_72b', "score": "score_qwen_72b"}).drop_duplicates(subset=['question_id'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3.rename(columns={'judgment': 'judgment_llama_70b', "score": "score_llama_70b"}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df4.rename(columns={'judgment': 'judgment_qwen_72b', "score": "score_qwen_72b"}, inplace=True)


In [None]:
# Join the two fine-tuned models' judgments on question_id (default inner join).
merged_df = df1[["judgment_qwen", "score_qwen", "question_id"]].merge(
    df2[["judgment_llama", "score_llama", "question_id"]],
    on='question_id',
)
len(merged_df)

80

In [None]:
# Add the Llama-3.3-70B judgments, again matching rows on question_id.
merged_df2 = merged_df.merge(
    df3[["judgment_llama_70b", "score_llama_70b", "question_id"]],
    on='question_id',
)
len(merged_df2)

80

In [None]:
# Add the Qwen2.5-72B judgments, again matching rows on question_id.
merged_df3 = merged_df2.merge(
    df4[["judgment_qwen_72b", "score_qwen_72b", "question_id"]],
    on='question_id',
)
len(merged_df3)

80

In [None]:
# Persist the four-model merge; index=False keeps the row index out of the file.
merged_df3.to_csv("all_judgments_merged.csv", index=False)

## Analysis of Judgments

In [None]:
# Reload the saved merge; this rebinds merged_df, replacing the two-model
# frame assigned to that name earlier in the notebook.
merged_df = pd.read_csv("all_judgments_merged.csv")

In [None]:
# Print each model's mean judge score, rounded to three decimals.
for score_name in ("score_llama", "score_qwen", "score_llama_70b", "score_qwen_72b"):
    column_mean = merged_df[score_name].mean()
    print(score_name)
    print(round(column_mean, 3))

score_llama
6.888
score_qwen
7.075
score_llama_70b
7.875
score_qwen_72b
8.162


In [None]:
# Number of questions where Qwen2.5-72B scored strictly below Llama-3.3-70B.
int((merged_df["score_qwen_72b"] < merged_df["score_llama_70b"]).sum())

17

In [None]:
# Number of questions with score_qwen_72b > score_llama > score_qwen.
int(
    (
        (merged_df["score_qwen_72b"] > merged_df["score_llama"])
        & (merged_df["score_llama"] > merged_df["score_qwen"])
    ).sum()
)

6

In [None]:
# Questions where Llama-3.3-70B beats the Qwen-distilled model.
# NOTE(review): this reads merged_df2 (built before the CSV round-trip) while
# the second filter reads the reloaded merged_df — confirm that is intended.
l70b_wins = merged_df2["score_llama_70b"] > merged_df2["score_qwen"]
l70b_better = merged_df2.loc[l70b_wins]
question_ids = l70b_better["question_id"].tolist()

# Questions where the Qwen-distilled model beats the Llama-distilled model.
qwen_wins = merged_df["score_qwen"] > merged_df["score_llama"]
qwen_better = merged_df.loc[qwen_wins]
qwen_question_ids = qwen_better["question_id"].tolist()

# Rows satisfying both conditions.
in_both = l70b_better["question_id"].isin(qwen_question_ids)
flipped = l70b_better.loc[in_both]

In [None]:
# Save the selected rows; the DataFrame index is written as the first column.
flipped.to_csv("flipped.csv")

In [None]:
# Mean score_llama over the questions where score_llama > score_qwen.
merged_df.loc[merged_df["score_llama"] > merged_df["score_qwen"], "score_llama"].mean()

7.277777777777778

In [None]:
# Mean score_qwen over the questions where score_llama > score_qwen.
merged_df.loc[merged_df["score_llama"] > merged_df["score_qwen"], "score_qwen"].mean()

4.5

In [None]:
# Export the questions where score_llama < score_qwen.
merged_df.loc[lambda frame: frame["score_llama"] < frame["score_qwen"]].to_csv("qwen_better.csv")

## Save HTMLs to PDF

In [None]:
!pip install -qq weasyprint PyPDF2

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/300.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.6/300.0 kB[0m [31m3.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m300.0/300.0 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m41.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m52.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m850.6/850.6 kB[0m [31m39.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from pathlib import Path
from weasyprint import HTML
from PyPDF2 import PdfMerger
import tempfile

def html_to_pdf(directory_path, output_filename):
    """
    Convert every ``*.html`` file in a directory to PDF and merge the results.

    Files are merged in natural order of their names, so ``81.html`` comes
    before ``100.html``. Each page is rendered with WeasyPrint into a
    temporary directory and appended to a single PyPDF2 merger, which is
    always closed, even when a conversion fails.

    Args:
        directory_path (str): Directory containing the HTML files.
        output_filename (str): Path of the combined PDF to write.
    """
    def natural_key(path):
        # re.split with a capturing group alternates text / digit-run tokens,
        # so odd positions are always digit runs and compare as integers.
        tokens = re.split(r'(\d+)', path.name)
        key = [int(tok) if i % 2 else tok.lower() for i, tok in enumerate(tokens)]
        return key, path.name

    # Get all HTML files in the directory
    html_files = sorted(Path(directory_path).glob('*.html'), key=natural_key)

    # Create a temporary directory for intermediate PDFs
    with tempfile.TemporaryDirectory() as temp_dir:
        # Create a PDF merger object
        merger = PdfMerger()
        try:
            # Convert each HTML file to PDF and add to merger
            for html_file in html_files:
                # Generate temporary PDF filename
                temp_pdf = Path(temp_dir) / f"{html_file.stem}_temp.pdf"

                # Convert HTML to PDF
                HTML(filename=str(html_file)).write_pdf(str(temp_pdf))

                # Add to merger
                merger.append(str(temp_pdf))

            # Write the combined PDF while the intermediate files still exist
            merger.write(output_filename)
        finally:
            # Release the merger's resources before the temporary directory
            # is removed.
            merger.close()

In [None]:
 # Merge all HTML reports in the folder into one PDF stored next to it.
 html_to_pdf("/content/drive/MyDrive/llama3.3-70b-resps", "/content/drive/MyDrive/llama3.3-70b-resps.pdf")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
DEBUG:fontTools.ttLib.ttFont:Decompiling 'GDEF' table
DEBUG:fontTools.subset.timer:Took 0.010s to subset 'GDEF'
INFO:fontTools.subset:GDEF subsetted
INFO:fontTools.subset:name subsetting not needed
DEBUG:fontTools.subset.timer:Took 0.001s to subset 'glyf'
INFO:fontTools.subset:glyf subsetted
DEBUG:fontTools.subset.timer:Took 0.000s to subset GlyphOrder
DEBUG:fontTools.subset.timer:Took 0.000s to prune 'head'
INFO:fontTools.subset:head pruned
DEBUG:fontTools.ttLib.ttFont:Reading 'OS/2' table from disk
DEBUG:fontTools.ttLib.ttFont:Decompiling 'OS/2' table
INFO:fontTools.subset:OS/2 Unicode ranges pruned: [0, 1]
INFO:fontTools.subset:OS/2 CodePage ranges pruned: [0]
DEBUG:fontTools.subset.timer:Took 0.000s to prune 'glyf'
INFO:fontTools.subset:glyf pruned
DEBUG:fontTools.subset.timer:Took 0.000s to prune 'GDEF'
INFO:fontTools.subset:GDEF pruned
DEBUG:fontTools.ttLib.ttFont:Reading 'name' table from disk
DEBUG:fontTools.ttLib

In [None]:
# Remaining report folders; each is merged into "<folder>.pdf".
sl = [
    "/content/drive/MyDrive/qwen-2.5-72b-resps",
    "/content/drive/MyDrive/llama3.3-70b-500k-resps",
    "/content/drive/MyDrive/qwen2.5-72b-500k-resps",
]

for path in sl:
    html_to_pdf(path, f"{path}.pdf")



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
DEBUG:fontTools.ttLib.ttFont:Decompiling 'GDEF' table
DEBUG:fontTools.subset.timer:Took 0.004s to subset 'GDEF'
INFO:fontTools.subset:GDEF subsetted
INFO:fontTools.subset:name subsetting not needed
DEBUG:fontTools.subset.timer:Took 0.002s to subset 'glyf'
INFO:fontTools.subset:glyf subsetted
DEBUG:fontTools.subset.timer:Took 0.000s to subset GlyphOrder
DEBUG:fontTools.subset.timer:Took 0.000s to prune 'head'
INFO:fontTools.subset:head pruned
DEBUG:fontTools.ttLib.ttFont:Reading 'OS/2' table from disk
DEBUG:fontTools.ttLib.ttFont:Decompiling 'OS/2' table
INFO:fontTools.subset:OS/2 Unicode ranges pruned: [0, 1, 45]
INFO:fontTools.subset:OS/2 CodePage ranges pruned: [0]
DEBUG:fontTools.subset.timer:Took 0.001s to prune 'glyf'
INFO:fontTools.subset:glyf pruned
DEBUG:fontTools.subset.timer:Took 0.000s to prune 'GDEF'
INFO:fontTools.subset:GDEF pruned
DEBUG:fontTools.ttLib.ttFont:Reading 'name' table from disk
DEBUG:fontTools.t