In [4]:
import os
import json
from pathlib import Path
from typing import Optional, Any, Iterator, Tuple, List
import GatenlpUtils
import re
import urllib.parse

 ### Load all Gate documents

In [27]:
corpus = GatenlpUtils.loadCorpus()


Loaded input/updated/annotated\dev\CASE OF ALTAY v. TURKEY (No. 2).xml into corpus
Loaded input/updated/annotated\dev\CASE OF BELYAYEV AND OTHERS v. UKRAINE.xml into corpus
Loaded input/updated/annotated\dev\CASE OF BIGUN v. UKRAINE.xml into corpus
Loaded input/updated/annotated\test\CASE OF CABUCAK v. GERMANY.xml into corpus
Loaded input/updated/annotated\test\CASE OF CAN v. TURKEY.xml into corpus
Loaded input/updated/annotated\test\CASE OF CRISTIAN CATALIN UNGUREANU v. ROMANIA.xml into corpus
Loaded input/updated/annotated\train\CASE OF DOKTOROV v. BULGARIA.xml into corpus
Loaded input/updated/annotated\train\CASE OF EGILL EINARSSON v. ICELAND (No. 2).xml into corpus
Loaded input/updated/annotated\train\CASE OF HOINESS v. NORWAY.xml into corpus
Loaded input/updated/annotated\train\CASE OF KOSAITE - CYPIENE AND OTHERS v. LITHUANIA.xml into corpus
Loaded input/updated/annotated\train\CASE OF LOZOVYYE v. RUSSIA.xml into corpus
Loaded input/updated/annotated\train\CASE OF M.T. v. UKRAINE

 ### Functions

In [6]:
class DocumentManager:
    def __init__(self, folder_path: str):
        self.folder_path = Path(folder_path)
        self.loaded_documents = {}
        self.exclude_files = ['llm_evaluation_results.json']
        self.exclude_prefixes = ['pipeline_results_']
    
    def load_all_documents(self):
        """Load all document files"""
        self.loaded_documents = {}
        
        for json_file in self.folder_path.glob("*.json"):
            filename = json_file.name
            
            if self._should_exclude(filename):
                continue
                
            try:
                with json_file.open('r', encoding='utf-8') as file:
                    self.loaded_documents[filename] = json.load(file)
                print(f"Loaded: {filename}")
            except Exception as e:
                print(f"Error loading {filename}: {e}")
        
        return self.loaded_documents
    
    def get_document(self, filename: str) -> Optional[Any]:
        """Get a specific document by filename"""
        
        # First check if it's already loaded
        if filename in self.loaded_documents:
            return self.loaded_documents[filename]
        
        # If not loaded, try to load it directly
        return self._load_single_document(filename)
    
    def _load_single_document(self, filename: str) -> Optional[Any]:
        """Load a single document file"""
        
        if self._should_exclude(filename):
            print(f"File '{filename}' is excluded from loading")
            return None
        
        file_path = self.folder_path / filename
        
        if not file_path.exists():
            print(f"File not found: {filename}")
            return None
        
        try:
            with file_path.open('r', encoding='utf-8') as file:
                data = json.load(file)
            
            # Cache the loaded document
            self.loaded_documents[filename] = data
            print(f"Loaded: {filename}")
            return data
            
        except Exception as e:
            print(f"Error loading {filename}: {e}")
            return None
    
    def _should_exclude(self, filename: str) -> bool:
        """Check if file should be excluded"""
        if filename in self.exclude_files:
            return True
        
        for prefix in self.exclude_prefixes:
            if filename.startswith(prefix):
                return True
        
        return False
    
    def list_available_documents(self):
        """List all available document files"""
        available = []
        
        for json_file in self.folder_path.glob("*.json"):
            filename = json_file.name
            if not self._should_exclude(filename):
                available.append(filename)
        
        return sorted(available)
    
    def is_loaded(self, filename: str) -> bool:
        """Check if a document is already loaded"""
        return filename in self.loaded_documents
    
    # Iterator methods
    def __iter__(self) -> Iterator[Tuple[str, Any]]:
        """Make DocumentManager iterable - returns (filename, content) tuples"""
        return iter(self.loaded_documents.items())
    
    def __len__(self) -> int:
        """Return number of loaded documents"""
        return len(self.loaded_documents)
    
    def __contains__(self, filename: str) -> bool:
        """Check if filename is in loaded documents"""
        return filename in self.loaded_documents
    
    def __getitem__(self, filename: str) -> Any:
        """Allow bracket notation access"""
        return self.loaded_documents[filename]
    
    def keys(self):
        """Return document filenames"""
        return self.loaded_documents.keys()
    
    def values(self):
        """Return document contents"""
        return self.loaded_documents.values()
    
    def items(self):
        """Return (filename, content) pairs"""
        return self.loaded_documents.items()
    
    def iter_filenames(self) -> Iterator[str]:
        """Iterate over just the filenames"""
        return iter(self.loaded_documents.keys())
    
    def iter_documents(self) -> Iterator[Any]:
        """Iterate over just the document contents"""
        return iter(self.loaded_documents.values())


"""
# Example usage
doc_manager = DocumentManager("output/pipeline_results_20250808_145025")

# Option 1: Load all documents first, then access
doc_manager.load_all_documents()
specific_doc = doc_manager.get_document("documentname.json")

# Option 2: Load specific document on demand
specific_doc = doc_manager.get_document("another_document.json")

# List available documents
available_docs = doc_manager.list_available_documents()
print("Available documents:", available_docs)

# Check if a document is loaded
print("Is loaded:", doc_manager.is_loaded("documentname.json"))

# Example usage
doc_manager = DocumentManager("pipeline_results_20250808_145025")
doc_manager.load_all_documents()

# Method 1: Direct iteration (returns filename, content tuples)
print("=== Method 1: Direct iteration ===")
for filename, content in doc_manager:
    print(f"Processing: {filename}")
    print(f"Content type: {type(content)}")
    if isinstance(content, dict):
        print(f"Keys: {list(content.keys())}")
    print("-" * 50)

# Method 2: Iterate over filenames only
print("\n=== Method 2: Filenames only ===")
for filename in doc_manager.keys():
    print(f"Document: {filename}")

# Method 3: Iterate over contents only
print("\n=== Method 3: Contents only ===")
for i, content in enumerate(doc_manager.values(), 1):
    print(f"Document {i}: {type(content)}")

# Method 4: Using items() method
print("\n=== Method 4: Using items() ===")
for filename, content in doc_manager.items():
    print(f"File: {filename}, Size: {len(str(content))}")

# Method 5: Check membership and length
print(f"\nTotal documents: {len(doc_manager)}")
print(f"Contains 'documentname.json': {'documentname.json' in doc_manager}")

# Method 6: Bracket notation access
if 'documentname.json' in doc_manager:
    doc = doc_manager['documentname.json']
    print(f"Accessed document via bracket notation: {type(doc)}")
"""


'\n# Example usage\ndoc_manager = DocumentManager("output/pipeline_results_20250808_145025")\n\n# Option 1: Load all documents first, then access\ndoc_manager.load_all_documents()\nspecific_doc = doc_manager.get_document("documentname.json")\n\n# Option 2: Load specific document on demand\nspecific_doc = doc_manager.get_document("another_document.json")\n\n# List available documents\navailable_docs = doc_manager.list_available_documents()\nprint("Available documents:", available_docs)\n\n# Check if a document is loaded\nprint("Is loaded:", doc_manager.is_loaded("documentname.json"))\n\n# Example usage\ndoc_manager = DocumentManager("pipeline_results_20250808_145025")\ndoc_manager.load_all_documents()\n\n# Method 1: Direct iteration (returns filename, content tuples)\nprint("=== Method 1: Direct iteration ===")\nfor filename, content in doc_manager:\n    print(f"Processing: {filename}")\n    print(f"Content type: {type(content)}")\n    if isinstance(content, dict):\n        print(f"Keys

In [7]:
def analyze_events_by_model(json_data):
    """Analyze events across all models with statistics"""
    
    model_stats = {}
    all_events = []
    
    for annotation in json_data.get("annotations", []):
        model_name = annotation.get("model_name", "Unknown")
        events = annotation.get("events", [])
        
        # Collect statistics
        model_stats[model_name] = {
            'event_count': len(events),
            'runtime_seconds': annotation.get('model_runtime_seconds', 0),
            'processed_at': annotation.get('processed_at', 'Unknown'),
            'validation_attempts': annotation.get('validation_attempts', 0)
        }
        
        # Process events
        for event in events:
            event_data = {
                'model': model_name,
                'event': event.get('event', ''),
                'who': event.get('event_who', ''),
                'when': event.get('event_when', ''),
                'what': event.get('event_what', ''),
                'type': event.get('event_type', ''),
                'source_length': len(event.get('source_text', ''))
            }
            all_events.append(event_data)
    
    # Print statistics
    print("=== Model Statistics ===")
    for model, stats in model_stats.items():
        print(f"\nModel: {model}")
        print(f"  Events: {stats['event_count']}")
        print(f"  Runtime: {stats['runtime_seconds']:.2f}s")
        print(f"  Validation attempts: {stats['validation_attempts']}")
    
    # Print event type distribution
    event_types = {}
    for event in all_events:
        event_type = event['type'] or 'Unknown'
        event_types[event_type] = event_types.get(event_type, 0) + 1
    
    print(f"\n=== Event Type Distribution ===")
    for event_type, count in sorted(event_types.items()):
        print(f"{event_type}: {count}")
    
    return model_stats, all_events


In [8]:
def extract_document_name_flexible(document_path):
    """More flexible extraction that handles path variations"""
    
    # Decode URL encoding
    decoded_path = urllib.parse.unquote(document_path)
    
    # Pattern breakdown:
    # ^file:/.*?/CASE\s+OF\s+ - matches file:/ + any path + "CASE OF "
    # (.+?) - captures the document name (non-greedy)
    # (?:\.[^.]+)?$ - optional file extension at the end
    pattern = r'^file:/.*?/CASE\s+OF\s+(.+?)(?:\.[^.]+)?$'
    
    match = re.search(pattern, decoded_path, re.IGNORECASE)
    if match:
        return match.group(1).strip()
    
    return None


In [9]:
def get_gate_document_by_name_from_corpus(corpus, spec_doc):
    for i, doc in enumerate(corpus):
        gate_name = doc.features.get("gate.SourceURL", f"Document {i+1}")
        gate_name = extract_document_name_flexible(gate_name)
        spec_name = extract_document_name_flexible(spec_doc.get("Document", ""))
        if gate_name == spec_name:
            print(f"Matched document GATE Corpus: {gate_name}, Result: {spec_name}")
            return doc, gate_name
    return None



In [10]:
def gate_contains_source_text(gate_text, source_text):
    """Basic case-sensitive containment check"""
    return source_text in gate_text


In [21]:
def find_text_position(gate_text, source_text):
        """Find the start and end positions of source_text within full_text. Only finds the first match!"""
        # Try exact match first
        start = gate_text.find(source_text)
        if start != -1:
            return (start, start + len(source_text))
        
        # Try with normalized whitespace
        normalized_source = ' '.join(source_text.split())
        normalized_full = ' '.join(gate_text.split())

        start = normalized_full.find(normalized_source)
        if start != -1:
            # Convert back to original text positions
            words_before = len(normalized_full[:start].split())
            original_words = gate_text.split()
            
            if words_before <= len(original_words):
                # Find start in original text
                start_pos = 0
                for i in range(words_before):
                    start_pos = gate_text.find(original_words[i], start_pos) + len(original_words[i])

                # Find the actual start of the matching text
                remaining_text = gate_text[start_pos:]
                actual_start = remaining_text.find(source_text.strip())
                if actual_start != -1:
                    return (start_pos + actual_start, start_pos + actual_start + len(source_text.strip()))
        
        return None


In [12]:
def find_all_text_positions(gate_text: str, source_text: str) -> List[Tuple[int, int]]:
    """
    Simplified version that finds all text positions using different matching strategies. Returns all positions with boundaries.
    """
    all_positions = []
    
    # Strategy 1: Exact matches
    start = 0
    while True:
        pos = gate_text.find(source_text, start)
        if pos == -1:
            break
        all_positions.append((pos, pos + len(source_text)))
        start = pos + 1
    
    # # Strategy 2: Case-insensitive matches (only if no exact matches)
    # if not all_positions:
    #     start = 0
    #     text_lower = gate_text.lower()
    #     source_lower = source_text.lower()
    #     while True:
    #         pos = text_lower.find(source_lower, start)
    #         if pos == -1:
    #             break
    #         all_positions.append((pos, pos + len(source_text)))
    #         start = pos + 1
    
    # # Strategy 3: Regex word boundary matches
    # if not all_positions:
    #     import re
    #     pattern = r'\b' + re.escape(source_text) + r'\b'
    #     for match in re.finditer(pattern, gate_text, re.IGNORECASE):
    #         all_positions.append((match.start(), match.end()))
    
    # # Strategy 4: Normalized whitespace (only if still no matches)
    # if not all_positions:
    #     all_positions.extend(_find_with_normalized_whitespace(gate_text, source_text))
    
    # # Remove duplicates and sort
    unique_positions = list(set(all_positions))
    unique_positions.sort()
    
    return unique_positions


In [13]:
def is_within_boundaries(source_boundaries, text_boundaries):
    """
    Check if any text boundary is completely within source boundaries.
    
    Args:
        source_boundaries: tuple (start, end) - e.g., (1, 10)
        text_boundaries: list of tuples [(start, end), ...] - e.g., [(2, 5), (8, 12)]
    
    Returns:
        tuple: (bool, list) - (True if any boundary is within, list of boundaries that are within)
    """
    source_start, source_end = source_boundaries
    within_boundaries = []
    
    for text_start, text_end in text_boundaries:
        # Check if text boundary is completely within source boundary
        if source_start <= text_start and text_end <= source_end:
            within_boundaries.append((text_start, text_end))
    
    return bool(within_boundaries), within_boundaries


In [14]:
def check_boundary_relationships(source_boundaries, text_boundaries):
    """
    Check various types of relationships between boundaries.
    
    Returns:
        dict with different relationship types
    """
    source_start, source_end = source_boundaries
    
    results = {
        'completely_within': [],      # Text boundary completely inside source
        'completely_contains': [],    # Text boundary completely contains source
        'overlaps': [],              # Any kind of overlap
        'starts_within': [],         # Text starts within source but extends beyond
        'ends_within': [],           # Text ends within source but starts before
        'no_overlap': []             # No overlap at all
    }
    
    for i, (text_start, text_end) in enumerate(text_boundaries):
        boundary_info = {
            'index': i,
            'boundary': (text_start, text_end)
        }
        
        # Check if text boundary is completely within source
        if source_start <= text_start and text_end <= source_end:
            results['completely_within'].append(boundary_info)
        
        # Check if text boundary completely contains source
        elif text_start <= source_start and source_end <= text_end:
            results['completely_contains'].append(boundary_info)
        
        # Check if text starts within source but extends beyond
        elif source_start <= text_start < source_end < text_end:
            results['starts_within'].append(boundary_info)
        
        # Check if text ends within source but starts before
        elif text_start < source_start < text_end <= source_end:
            results['ends_within'].append(boundary_info)
        
        # Check for any overlap
        elif not (text_end <= source_start or text_start >= source_end):
            results['overlaps'].append(boundary_info)
        
        # No overlap
        else:
            results['no_overlap'].append(boundary_info)
    
    return results

"""
# Example usage
source_boundaries = (5, 15)
text_boundaries = [(1, 3), (3, 8), (7, 12), (10, 20), (6, 9), (18, 25)]

relationships = check_boundary_relationships(source_boundaries, text_boundaries)

for relationship_type, boundaries in relationships.items():
    if boundaries:
        print(f"{relationship_type}: {[b['boundary'] for b in boundaries]}")
"""


'\n# Example usage\nsource_boundaries = (5, 15)\ntext_boundaries = [(1, 3), (3, 8), (7, 12), (10, 20), (6, 9), (18, 25)]\n\nrelationships = check_boundary_relationships(source_boundaries, text_boundaries)\n\nfor relationship_type, boundaries in relationships.items():\n    if boundaries:\n        print(f"{relationship_type}: {[b[\'boundary\'] for b in boundaries]}")\n'

 ### All documents

In [15]:
doc_manager = DocumentManager("output/pipeline_results_20250808_145025")
output_folder = f"{doc_manager.folder_path}/annotated_gate_documents"


 ### Specific document

In [16]:
doc_result = doc_manager.get_document("ALTAY v. TURKEY (No. 2).json")
doc_gate, gate_name = get_gate_document_by_name_from_corpus(corpus, doc_result)


Loaded: ALTAY v. TURKEY (No. 2).json
Matched document GATE Corpus: ALTAY v. TURKEY (No. 2), Result: ALTAY v. TURKEY (No. 2)


In [17]:
analyze_events_by_model(doc_result)


=== Model Statistics ===

Model: gemma3:1b
  Events: 24
  Runtime: 45.81s
  Validation attempts: 1

Model: gemma3:4b
  Events: 25
  Runtime: 66.23s
  Validation attempts: 1

Model: gemma3:12b
  Events: 17
  Runtime: 62.84s
  Validation attempts: 1

Model: llama3.3:latest
  Events: 45
  Runtime: 7929.43s
  Validation attempts: 1

Model: deepseek-r1:8b
  Events: 14
  Runtime: 74.94s
  Validation attempts: 1

Model: mistral:latest
  Events: 53
  Runtime: 133.51s
  Validation attempts: 1

Model: incept5/llama3.1-claude:latest
  Events: 27
  Runtime: 83.36s
  Validation attempts: 1

Model: chevalblanc/claude-3-haiku:latest
  Events: 25
  Runtime: 169.05s
  Validation attempts: 1

Model: llama4:16x17b
  Events: 3
  Runtime: 544.53s
  Validation attempts: 1

Model: mixtral:8x7b
  Events: 17
  Runtime: 2594.74s
  Validation attempts: 1

Model: dolphin3:8b
  Events: 45
  Runtime: 102.32s
  Validation attempts: 1

Model: dolphin-mixtral:8x7b
  Events: 23
  Runtime: 3129.23s
  Validation attempts

({'gemma3:1b': {'event_count': 24,
   'runtime_seconds': 45.810359954833984,
   'processed_at': '2025-08-13T23:45:35.440081',
   'validation_attempts': 1},
  'gemma3:4b': {'event_count': 25,
   'runtime_seconds': 66.233327627182,
   'processed_at': '2025-08-13T23:46:41.673480',
   'validation_attempts': 1},
  'gemma3:12b': {'event_count': 17,
   'runtime_seconds': 62.83549976348877,
   'processed_at': '2025-08-13T23:47:44.509051',
   'validation_attempts': 1},
  'llama3.3:latest': {'event_count': 45,
   'runtime_seconds': 7929.431170225143,
   'processed_at': '2025-08-14T01:59:53.941840',
   'validation_attempts': 1},
  'deepseek-r1:8b': {'event_count': 14,
   'runtime_seconds': 74.93521213531494,
   'processed_at': '2025-08-14T02:01:08.877130',
   'validation_attempts': 1},
  'mistral:latest': {'event_count': 53,
   'runtime_seconds': 133.50513553619385,
   'processed_at': '2025-08-14T02:03:22.382346',
   'validation_attempts': 1},
  'incept5/llama3.1-claude:latest': {'event_count': 2

In [18]:
print(find_text_position(doc_gate.text, "application"))
print(find_all_text_positions(doc_gate.text, "the prison administration"))


(774, 785)
[(3622, 3647), (3921, 3946), (4313, 4338), (5049, 5074), (22450, 22475), (31148, 31173), (45603, 45628), (60705, 60730), (62874, 62899)]


In [24]:
for annotation in doc_result.get("annotations", []):
    model_name = annotation.get("model_name", "Unknown")
    events = annotation.get("events", [])
    
    print(f"\n=== Events for model: {model_name} ===")

    # create annotation set for model (and remove duplicates/existing ones)
    annset = doc_gate.annset(model_name)
    annset.clear() 

    if model_name == "gemma3:12b" or model_name == "llama3.3:latest":
        for event in events:
            # print(f"\nEvent:")
            # print(f"  Source text: {event.get('source_text', '')}")
            # print(f"  Who: {event.get('event_who', '')}")
            # print(f"  When: {event.get('event_when', '')}")
            # print(f"  What: {event.get('event_what', '')}")
            # print(f"  Type: {event.get('event_type', '')}")
            #print(f"  Source length: {len(event.get('source_text', ''))}")
            #print(f"  Full Event Data: {json.dumps(event, indent=2)}")

            event_text = event.get('event', '')
            event_what = event.get('event_what', '')
            event_who = event.get('event_who', '')
            event_when = event.get('event_when', '')
            event_type = event.get('event_type', '')
            source_text = event.get('source_text', '')
            source_contained = gate_contains_source_text(doc_gate.text, source_text)

            if event_who == "the prison administration":
                print("he!")
            print(f"***************************************************")
            if source_contained:
                print(f"  Source text found in GATE document: {source_text}")
                # find boundaries of source text
                source_text_boundaries = find_text_position(doc_gate.text, source_text)
                print(f"  Source text boundaries: {source_text_boundaries}")
                # Event
                if event_text != "":
                    event_boundaries = find_all_text_positions(doc_gate.text, event_text)
                    print(f"Event: {event_text}")
                    print(f"Event boundaries: {event_boundaries}")
                    is_within, boundaries_within = is_within_boundaries(source_text_boundaries, event_boundaries)
                    if is_within:
                        print(f"  Event boundaries are within source boundaries. Adding annotation with boundaries: {boundaries_within}")
                        features = {"type": event_type}
                        annset.add(boundaries_within[0][0], boundaries_within[0][1], "Event", features)
                    else:
                        print(f"  Event boundaries are NOT within source boundaries.")
                # What
                if event_what != "":
                    what_boundaries = find_all_text_positions(doc_gate.text, event_what)
                    print(f"What: {event_what}")
                    print(f"What boundaries: {what_boundaries}")
                    is_within, boundaries_within = is_within_boundaries(source_text_boundaries, what_boundaries)
                    if is_within:
                        print(f"  What boundaries are within source boundaries. Adding annotation with boundaries: {boundaries_within}")
                        features = {}
                        annset.add(boundaries_within[0][0], boundaries_within[0][1], "What", features)
                    else:
                        print(f"  What boundaries are NOT within source boundaries.")
                # When
                if event_when != "":
                    when_boundaries = find_all_text_positions(doc_gate.text, event_when)
                    print(f"When: {event_when}")
                    print(f"When boundaries: {when_boundaries}")
                    is_within, boundaries_within = is_within_boundaries(source_text_boundaries, when_boundaries)
                    if is_within:
                        print(f"  When boundaries are within source boundaries. Adding annotation with boundaries: {boundaries_within}")
                        features = {}
                        annset.add(boundaries_within[0][0], boundaries_within[0][1], "When", features)
                    else:
                        print(f"  When boundaries are NOT within source boundaries.")
                # Who
                if event_who != "":
                    who_boundaries = find_all_text_positions(doc_gate.text, event_who)
                    print(f"Who: {event_who}")
                    print(f"Who boundaries: {who_boundaries}")
                    is_within, boundaries_within = is_within_boundaries(source_text_boundaries, who_boundaries)
                    if is_within:
                        print(f"  Who boundaries are within source boundaries. Adding annotation with boundaries: {boundaries_within}")
                        features = {}
                        annset.add(boundaries_within[0][0], boundaries_within[0][1], "Who", features)
                    else:
                        print(f"  Who boundaries are NOT within source boundaries.")
            else:
                print(f"  Source text NOT found in GATE document: {source_text}")

    # Create output directory if it doesn't exist
    output_folder_path = Path(output_folder)
    output_folder_path.mkdir(parents=True, exist_ok=True)
    # Save document in bdocjs format
    output_filename = f"{gate_name}.bdocjs"
    output_path = os.path.join(output_folder, output_filename)

    doc_gate.save(output_path, fmt="bdocjs")
    print(f"✓ Saved: {gate_name}.bdocjs")




=== Events for model: gemma3:1b ===
✓ Saved: ALTAY v. TURKEY (No. 2).bdocjs

=== Events for model: gemma3:4b ===
✓ Saved: ALTAY v. TURKEY (No. 2).bdocjs

=== Events for model: gemma3:12b ===
***************************************************
  Source text found in GATE document: The case originated in an application (no. 11236/09) against the Republic of Turkey lodged with the Court under Article 34 of the Convention for the Protection of Human Rights and Fundamental Freedoms (“the Convention”) by a Turkish national, Mr Mehmet Aytunç Altay (“the applicant”), on 17 February 2006.
  Source text boundaries: (748, 1053)
Event: lodged an application
Event boundaries: [(6811, 6832), (8014, 8035), (8908, 8929)]
  Event boundaries are NOT within source boundaries.
What: application
What boundaries: [(774, 785), (1640, 1651), (1986, 1997), (2154, 2165), (2190, 2201), (2786, 2797), (5022, 5033), (6821, 6832), (6985, 6996), (7201, 7212), (7443, 7454), (7784, 7795), (8024, 8035), (8160, 8171), (

In [None]:
from gatenlp.visualization import CorpusViewer
viewer = CorpusViewer(corpus)
viewer.show()

HBox(children=(Button(icon='arrow-left', layout=Layout(width='5em'), style=ButtonStyle()), IntSlider(value=0, …