In [31]:
import os, json
import pandas as pd
import random
import numpy as np
from collections import defaultdict


In [23]:
# Load the combined sample; the file is JSON Lines (one record per line).
data = pd.read_json("sample_sfu_combined.jsonl", lines=True)
print(len(data))

# keep=False flags every copy of a repeated sentence_text, not just the later
# ones, so all members of a duplicate group are shown together.
duplicates = data.loc[data.duplicated(subset='sentence_text', keep=False)]
duplicates

600


Unnamed: 0,sentence_text,label


## Prepare new files

In [29]:
# Seed Python's global RNG: prepare_session_files() shuffles with random.shuffle,
# so the split is only repeatable when this runs before it with no other
# random.* calls in between.
random.seed(42)

In [34]:
def load_sentences(file_path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        file_path: Path to a UTF-8 encoded file holding one JSON value per line.

    Returns:
        list: The decoded JSON values, in file order. Blank or
        whitespace-only lines are skipped.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON. The
            message is prefixed with the file path and 1-based line number.
        OSError: If the file cannot be opened.
    """
    sentences = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            # A trailing newline or stray empty line is not a record;
            # json.loads("") would raise, so skip it.
            if not line.strip():
                continue
            try:
                sentences.append(json.loads(line))
            except json.JSONDecodeError as exc:
                # Re-raise the same exception type, adding where the bad record is.
                raise json.JSONDecodeError(
                    f"{file_path}, line {line_number}: {exc.msg}", exc.doc, exc.pos
                ) from exc
    return sentences

def prepare_session_files(input_file, output_prefix="session", n_sessions=3,
                          overlap_per_session=20, unique_per_session=180,
                          seed=None):
    """Shuffle the input sentences and split them into per-session JSONL files.

    The shuffled sentences are cut into an overlap pool (double annotation)
    and a unique pool (single annotation). Each session receives its own
    slice of both pools, so no sentence is assigned to more than one session.
    One file named ``{output_prefix}_{k}.jsonl`` is written per session.

    Args:
        input_file: Path of the JSON Lines file to split.
        output_prefix: Prefix of the output file names.
        n_sessions: Number of sessions to create.
        overlap_per_session: Overlap-pool sentences given to each session.
        unique_per_session: Unique-pool sentences given to each session.
        seed: If given, a private ``random.Random(seed)`` drives the shuffles,
            so the split does not depend on global RNG state. If ``None``,
            the global ``random`` module is used.

    Returns:
        list[dict]: One dict per session with keys ``session_id`` (1-based),
        ``sentences``, ``overlap_count``, ``unique_count`` and ``total_count``.

    Raises:
        ValueError: If the size arguments are invalid, or the input does not
            contain exactly
            ``n_sessions * (overlap_per_session + unique_per_session)`` records.
    """
    if n_sessions < 1 or overlap_per_session < 0 or unique_per_session < 0:
        raise ValueError(
            "n_sessions must be >= 1 and the per-session counts must be >= 0"
        )

    sentences = load_sentences(input_file)

    expected_total = n_sessions * (overlap_per_session + unique_per_session)
    if len(sentences) != expected_total:
        raise ValueError(f"Expected {expected_total} sentences, got {len(sentences)}")

    # The random module and a Random instance both expose shuffle(), so one
    # code path serves the seeded and the global-state case.
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(sentences)

    overlap_total = n_sessions * overlap_per_session
    set_b = sentences[:overlap_total]  # overlap pool (double annotation)
    set_c = sentences[overlap_total:]  # unique pool (single annotation)

    sessions = []

    for session_index in range(n_sessions):
        overlap_start = session_index * overlap_per_session
        overlap_sentences = set_b[overlap_start:overlap_start + overlap_per_session]

        unique_start = session_index * unique_per_session
        unique_sentences = set_c[unique_start:unique_start + unique_per_session]

        # Mix both kinds so their position in the file does not reveal
        # which sentences are the overlap ones.
        session_sentences = overlap_sentences + unique_sentences
        rng.shuffle(session_sentences)

        sessions.append({
            'session_id': session_index + 1,
            'sentences': session_sentences,
            'overlap_count': len(overlap_sentences),
            'unique_count': len(unique_sentences),
            'total_count': len(session_sentences)
        })

        with open(f"{output_prefix}_{session_index + 1}.jsonl", 'w', encoding='utf-8') as f:
            for sentence in session_sentences:
                json.dump(sentence, f, ensure_ascii=False)
                f.write('\n')

    return sessions

def sanity_check(sessions):
    """Print a report on a session split and verify it against the design.

    The expected design is 3 sessions of 200 sentences each (20 overlap +
    180 unique), 600 distinct sentences in total. Every check mark in the
    report, including the "Design verification" section, is computed from
    ``sessions``; a violated property is printed with a cross.

    Args:
        sessions: Iterable of dicts with keys ``session_id``, ``sentences``
            (JSON-serialisable items), ``overlap_count``, ``unique_count``
            and ``total_count``.

    Returns:
        None. The report is written to stdout.
    """
    # Design constants the split is checked against.
    expected_session_size = 200
    expected_overlap_per_session = 20
    expected_unique_per_session = 180
    expected_total = 600
    expected_overlap_total = 60  # 20 per session * 3 sessions
    expected_unique_total = 540  # 180 per session * 3 sessions

    def mark(ok):
        return "✓" if ok else "✗"

    print("=== SANITY CHECK ===")

    all_sentences = set()
    distinct_per_session_sum = 0
    total_overlap_sentences = 0
    total_unique_sentences = 0
    session_sizes = []

    for session in sessions:
        session_id = session['session_id']
        # Serialise with sorted keys so equal records compare equal as strings.
        sentences = [json.dumps(s, sort_keys=True) for s in session['sentences']]
        distinct_here = set(sentences)

        all_sentences |= distinct_here
        distinct_per_session_sum += len(distinct_here)
        session_sizes.append(len(sentences))

        total_overlap_sentences += session['overlap_count']
        total_unique_sentences += session['unique_count']

        print(f"Session {session_id}:")
        print(f"  Total sentences: {len(sentences)}")
        print(f"  Overlap sentences (for intra-session sharing): {session['overlap_count']}")
        print(f"  Unique sentences (for intra-session distribution): {session['unique_count']}")

    print("\nOverall:")
    print(f"  Total unique sentences across all sessions: {len(all_sentences)}")
    print(f"  Total overlap sentences across sessions: {total_overlap_sentences}")
    print(f"  Total unique sentences across sessions: {total_unique_sentences}")

    total_ok = len(all_sentences) == expected_total
    overlap_total_ok = total_overlap_sentences == expected_overlap_total
    unique_total_ok = total_unique_sentences == expected_unique_total

    print("\nValidation:")
    print("  All sentences are unique across sessions: ✓" if total_ok else "  Sentence uniqueness: ✗")
    print(f"  Total overlap sentences: {total_overlap_sentences}/{expected_overlap_total} {mark(overlap_total_ok)}")
    print(f"  Total unique sentences: {total_unique_sentences}/{expected_unique_total} {mark(unique_total_ok)}")
    print(f"  Total sentences: {len(all_sentences)}/{expected_total} {mark(total_ok)}")

    print("\nPer-session breakdown:")
    for session in sessions:
        session_id = session['session_id']
        print(f"  Session {session_id}: {session['total_count']} sentences ({session['overlap_count']} intra-session overlap + {session['unique_count']} intra-session unique)")

    # Derive each design property from the data instead of asserting it blindly.
    has_sessions = len(session_sizes) > 0
    sizes_ok = has_sessions and all(size == expected_session_size for size in session_sizes)
    overlap_ok = has_sessions and all(
        session['overlap_count'] == expected_overlap_per_session for session in sessions
    )
    unique_ok = has_sessions and all(
        session['unique_count'] == expected_unique_per_session for session in sessions
    )
    # A sentence shared by two sessions is counted once in the union but once
    # per session in the sum, so the two numbers differ exactly when one leaks.
    no_cross_session_repeat = distinct_per_session_sum == len(all_sentences)
    workload_ok = overlap_total_ok and unique_total_ok

    print("\nDesign verification:")
    print(f"  {mark(sizes_ok)} Each session has exactly {expected_session_size} sentences")
    print(f"  {mark(overlap_ok)} Within each session: {expected_overlap_per_session} sentences will be shared by both annotators")
    print(f"  {mark(unique_ok)} Within each session: {expected_unique_per_session} sentences will be split between annotators ({expected_unique_per_session // 2} each)")
    print(f"  {mark(no_cross_session_repeat)} No sentences appear in multiple sessions")
    print(f"  {mark(workload_ok)} Total annotation workload: {expected_overlap_total} sentences × 2 annotators + {expected_unique_total} sentences × 1 annotator = {expected_overlap_total * 2 + expected_unique_total} annotations")

if __name__ == "__main__":
    input_file = "sample_sfu_combined.jsonl"

    # Seed immediately before splitting so re-running this cell alone always
    # yields the same split, whatever random.* calls ran earlier in the kernel.
    random.seed(42)
    sessions = prepare_session_files(input_file)
    sanity_check(sessions)

    # Build the file list from the sessions actually returned, using the
    # default "session" output prefix, rather than hard-coding three names.
    created_files = ", ".join(f"session_{s['session_id']}.jsonl" for s in sessions)
    print(f"\nFiles created: {created_files}")
    print("Each file contains 200 sentences for that session's annotators to share.")

=== SANITY CHECK ===
Session 1:
  Total sentences: 200
  Overlap sentences (for intra-session sharing): 20
  Unique sentences (for intra-session distribution): 180
Session 2:
  Total sentences: 200
  Overlap sentences (for intra-session sharing): 20
  Unique sentences (for intra-session distribution): 180
Session 3:
  Total sentences: 200
  Overlap sentences (for intra-session sharing): 20
  Unique sentences (for intra-session distribution): 180

Overall:
  Total unique sentences across all sessions: 600
  Total overlap sentences across sessions: 60
  Total unique sentences across sessions: 540

Validation:
  All sentences are unique across sessions: ✓
  Total overlap sentences: 60/60 ✓
  Total unique sentences: 540/540 ✓
  Total sentences: 600/600 ✓

Per-session breakdown:
  Session 1: 200 sentences (20 intra-session overlap + 180 intra-session unique)
  Session 2: 200 sentences (20 intra-session overlap + 180 intra-session unique)
  Session 3: 200 sentences (20 intra-session overlap 