In [58]:
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Inject the highlighter's stylesheet and script into the notebook front end.
# Each file is read inside a `with` block so its handle is closed right away.
for asset_path in ('visualization/highlight.css', 'visualization/highlight.js'):
    with open(asset_path) as asset_file:
        display(HTML(asset_file.read()))

import visualization
from termcolor import colored

from collections import defaultdict
import numpy as np
import spacy
# Module-level spaCy pipeline; `show_result` calls `nlp(text)` to tokenize articles.
# NOTE(review): the 'en' shortcut name is presumably only resolvable on spaCy v2
# installs — confirm the installed version before re-running on a fresh kernel.
nlp = spacy.load('en')

In [20]:
def load_result(file):
    """Load span annotations from a tab-separated file.

    Each non-blank line must have exactly three tab-separated fields:
    ``article_id``, ``span_start``, ``span_end``.

    Args:
        file: Path of the file to read (decoded as UTF-8).

    Returns:
        dict mapping each article id (str) to a list of ``[start, end]``
        integer pairs, in the order they appear in the file.

    Raises:
        ValueError: If a non-blank line does not have exactly three fields,
            or if a span boundary is not an integer.
    """
    result = {}
    with open(file, "r", encoding="utf-8") as f:
        for line in f:
            # A blank line (typically the trailing one at end of file) used to
            # crash the three-way unpack below; skip it instead.
            if not line.strip():
                continue
            article_id, spl, spr = line.rstrip('\r\n').split('\t')
            result.setdefault(article_id, []).append([int(spl), int(spr)])
    return result


def show_result(result, articles_id, articles_content, task):
    """Render the spans of every article in ``result`` on top of its tokens.

    Args:
        result: Mapping from article id to an iterable of spans; the first two
            items of each span are compared against ``token.idx`` values.
        articles_id: Article ids, aligned position-by-position with
            ``articles_content``.
        articles_content: Article texts, passed to the module-level ``nlp``.
        task: Forwarded unchanged to ``visualization.render`` as ``task``.

    Articles are processed in sorted id order. An article with no spans is
    tokenized but neither printed nor rendered.
    """
    content_by_id = dict(zip(articles_id, articles_content))
    for article_id in sorted(result):
        doc = nlp(content_by_id[article_id])
        token_starts = np.array([tok.idx for tok in doc])
        tokens = [tok.text for tok in doc]

        spans = []
        for char_span in result[article_id]:
            char_span = list(char_span)
            # First token whose offset is at or after the span start.
            first_token = np.where(token_starts >= char_span[0])[0][0]
            # Last token whose offset is strictly before the span end.
            last_token = np.where(token_starts < char_span[1])[0][-1]
            spans.append([first_token, last_token])

        if not spans:
            continue
        print(colored(article_id, 'red'))
        visualization.render(tokens, [spans], task=task)

In [17]:
import pandas as pd
# Preview the first 100 rows of the CSV, using the file's first column as the
# DataFrame index.
# NOTE(review): the path is absolute and machine-specific, so this cell will not
# run on another machine as written.
russian_df = pd.read_csv(r'D:\dev\code\jaifp\parrot\data\russian-media-outlets-20220721.csv', nrows=100, index_col=0)
russian_df

Unnamed: 0,headline,publication_date,article_content,article_url,source
0,Ukraine First Former Soviet State in World Cup...,2006-06-21,Thousands of fans in Kiev braved the elements ...,,rt
1,Rosneft begins final step in going public,2006-06-26,The Russian energy company Rosneft says it has...,,rt
2,Arcelor agrees takeover by Mittal to create st...,2006-06-26,Steelmaker Arcelor has yielded to Indian suito...,,rt
3,Israel’s missing soldier: Palestinians meet on...,2006-06-26,Palestinian President Mahmoud Abbas and Hamas ...,,rt
4,Rosneft roadshow fires investor interest,2006-06-27,"Russian energy firm Rosneft, heading for the c...",,rt
...,...,...,...,...,...
95,Civilian death toll nears 400 in Middle East c...,2006-07-24,As Israeli ground troops pushed further into L...,,rt
96,Saddam Hussein in hospital,2006-07-24,Former Iraqi President Saddam Hussein is in ho...,,rt
97,Lebanon aid arrives by sea,2006-07-24,Sea-borne aid supplies have reached Lebanon. F...,,rt
98,Ukraine politics: deadline approches,2006-07-24,The deadline for Ukraine’s parliament to form ...,,rt


In [16]:
# Read the raw prediction file into `content` as a single string.
# The encoding is explicit so the text decodes the same way as in
# `predictions_to_span`, which opens this same file as UTF-8; without it the
# platform default is used and non-ASCII tokens may be mis-decoded.
with open(r'D:\dev\code\jaifp\semeval2020_task11\models\si_roberta_crf\test_predictions.txt', 'r', encoding='utf-8') as f:
    content = f.read()

['-DOCSTART-\tO',
 'Thousands\tO',
 'of\tO',
 'fans\tO',
 'in\tO',
 'Kiev\tO',
 'braved\tO',
 'the\tO',
 'elements\tO',
 'to\tO',
 'watch\tO',
 'the\tO',
 'game\tO',
 '–\tO',
 'holding\tO',
 'their\tO',
 'collective\tO',
 'breath\tO',
 'before\tO',
 'the\tO',
 'goals\tO',
 'started\tO',
 'rolling\tO',
 'in\tO',
 '.\tO',
 'Passion\tB-PROP',
 'and\tO',
 'joy\tO',
 'met\tO',
 'each\tO',
 'of\tO',
 'the\tO',
 'four\tO',
 'goals\tO',
 ',\tO',
 'and\tO',
 'specially\tO',
 'the\tO',
 'final\tO',
 'whistle\tO',
 '.\tO',
 'The\tO',
 'fact\tO',
 'that\tO',
 'apart\tO',
 'from\tO',
 'Russia\tO',
 ',\tO',
 'Ukraine\tO',
 'was\tO',
 'the\tO',
 'first\tO',
 'former\tO',
 'Soviet\tO',
 'republic\tO',
 'to\tO',
 'qualify\tO',
 'for\tO',
 'the\tO',
 'World\tO',
 'Cup\tO',
 'added\tO',
 'to\tO',
 'the\tO',
 'wild\tB-PROP',
 'celebrations\tO',
 '.\tO',
 '-DOCSTART-\tO',
 'The\tO',
 'Russian\tO',
 'energy\tO',
 'company\tO',
 'Rosneft\tO',
 'says\tO',
 'it\tO',
 'has\tO',
 'already\tO',
 'received\tO',
 '

In [54]:
from typing import List


def predictions_to_span(path=r'D:\dev\code\jaifp\semeval2020_task11\models\si_roberta_crf\test_predictions.txt'):
    """Convert a token-per-line prediction file into per-document tokens and spans.

    Every non-blank line is expected to look like ``token<TAB>label``. A line
    starting with ``-DOCSTART-`` opens a new document and is not a token itself.
    A span starts at a token labelled ``B-PROP`` and extends over the
    ``I-PROP`` tokens that immediately follow it. Whitespace-only lines are
    ignored. An ``I-PROP`` token with no open span is ignored.

    Args:
        path: Path of the prediction file (decoded as UTF-8).

    Returns:
        A pair ``(all_contents, all_spans)`` of parallel lists with one entry
        per document: ``all_contents[d]`` is the list of token strings of
        document ``d`` and ``all_spans[d]`` is a list of ``(start, end)`` token
        index tuples, both ends inclusive and counted from 0 within the
        document.

    Raises:
        Exception: If a non-blank line contains no tab separator.
    """
    def is_document_start(_line):
        return _line.startswith("-DOCSTART-")

    def parse_line(_line):
        split_line = _line.split('\t')
        if len(split_line) < 2:
            raise Exception("Unexpected format")
        return split_line[0], split_line[1].strip()

    def continues_span(_next_line):
        # The label lives in the second column, so the next line must be
        # parsed; testing the raw line would compare against the token text.
        if _next_line is None or is_document_start(_next_line):
            return False
        return parse_line(_next_line)[1].startswith('I-PROP')

    with open(path, 'r', encoding='utf-8') as f:
        # Lines keep their trailing newline, so blank lines are never falsy;
        # filter them here rather than letting parse_line reject them.
        lines = [file_line for file_line in f if file_line.strip()]

    all_contents = []
    all_spans = []

    current_contents = []
    current_spans = []
    span_start = None  # token index of the currently open span, if any

    for i, line in enumerate(lines):
        next_line = lines[i + 1] if i + 1 < len(lines) else None
        if is_document_start(line):
            continue

        token, prediction = parse_line(line)
        token_index = len(current_contents)

        if prediction.startswith('B-PROP'):
            span_start = token_index

        if span_start is not None and not continues_span(next_line):
            # The end index is the last token of the span (inclusive).
            current_spans.append((span_start, token_index))
            span_start = None

        current_contents.append(token)

        # The document ends at end of file or right before the next marker.
        if next_line is None or is_document_start(next_line):
            all_contents.append(current_contents)
            all_spans.append(current_spans)
            current_contents = []
            current_spans = []

    return all_contents, all_spans

In [55]:
# Parse the prediction file at the function's default path. `tokens` and `spans`
# are parallel lists with one entry per document.
tokens, spans = predictions_to_span()
tokens

[['Thousands',
  'of',
  'fans',
  'in',
  'Kiev',
  'braved',
  'the',
  'elements',
  'to',
  'watch',
  'the',
  'game',
  '–',
  'holding',
  'their',
  'collective',
  'breath',
  'before',
  'the',
  'goals',
  'started',
  'rolling',
  'in',
  '.',
  'Passion',
  'and',
  'joy',
  'met',
  'each',
  'of',
  'the',
  'four',
  'goals',
  ',',
  'and',
  'specially',
  'the',
  'final',
  'whistle',
  '.',
  'The',
  'fact',
  'that',
  'apart',
  'from',
  'Russia',
  ',',
  'Ukraine',
  'was',
  'the',
  'first',
  'former',
  'Soviet',
  'republic',
  'to',
  'qualify',
  'for',
  'the',
  'World',
  'Cup',
  'added',
  'to',
  'the',
  'wild',
  'celebrations',
  '.'],
 ['The',
  'Russian',
  'energy',
  'company',
  'Rosneft',
  'says',
  'it',
  'has',
  'already',
  'received',
  'applications',
  'for',
  'a',
  'sizeable',
  'portion',
  'of',
  'shares',
  ',',
  'with',
  'major',
  'market',
  'players',
  'applying',
  'for',
  'stock',
  'priced',
  'between',
  '$',

In [60]:
# Render each document that has at least one predicted span; documents with an
# empty span list are skipped.
for t, s in zip(tokens, spans):
    if s:
        visualization.render(t, [s], task='SI')