# Dataset exploration

In this notebook we calculate and plot metrics of the annotated documents, with the objective of describing and summarizing the resulting dataset.

In [1]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import pandas
import os
import sys

In [2]:
# Plot styling for the notebook: seaborn's 'white' style and 'colorblind' palette.
import seaborn as sns
sns.set_style('white')
sns.set_palette('colorblind')

In [3]:
from collections import Counter

In [4]:
# `imp` was deprecated in Python 3.4 and removed in Python 3.12;
# `importlib.reload` is the supported replacement with the same call signature.
from importlib import reload
import read_annotations
# Reload so that edits made to the read_annotations module are picked up
# without restarting the kernel (trailing `;` suppresses the module repr).
reload(read_annotations);

In [5]:
# Load the annotations through the project's read_annotations helpers.
# `filenames` must expose `.items()`; `documents` is consumed below as a mapping
# of annotator -> iterable of document objects (see the `.values()` / `.items()`
# iterations in the following cells).
# NOTE(review): the exact key/value types of `filenames` are not visible here — confirm
# against read_annotations.get_filenames.
filenames = read_annotations.get_filenames()
documents = read_annotations.read_annotations(filenames.items())

In [7]:
# Empty metrics table: one row per annotated document (keyed by the document's
# identifier, across all annotators) and one column per metric. The cells start
# out unset and are populated per document by get_statistics.
statistics = pandas.DataFrame(
    columns=[
        'Tokens',
        'Labeled Words',
        'Claims',
        'Premises',
        'Major claims',
        'Components',
        'Relations',
    ],
    index=[
        doc.identifier
        for docs_of_annotator in documents.values()
        for doc in docs_of_annotator
    ],
)

def get_statistics(document):
    """Fill the row of the module-level `statistics` frame for one document.

    The row is selected by `document.identifier`, which must already be present
    in the index of `statistics`. The following columns are written:

    - 'Tokens': number of entries in the word list.
    - 'Labeled Words': number of labels different from `document.default_label`.
    - 'Claims' / 'Major claims' / 'Premises': number of entries in the label list
      equal to 'claim' / 'major-claim' / 'premise' (these are counts over the
      label list, not over components; presumably one label per word — confirm
      against `get_word_label_list`).
    - 'Components': `len(document.annotated_components)`.
    - 'Relations': `len(document.annotated_relations)`.

    Args:
        document: object exposing `identifier`, `default_label`,
            `get_word_label_list()`, `annotated_components` and
            `annotated_relations`.

    Returns:
        None. The function works by mutating `statistics` in place.
    """
    # Avoid shadowing the builtin `id`.
    doc_id = document.identifier
    words, labels = document.get_word_label_list()
    counts = Counter(labels)

    row = {
        'Tokens': len(words),
        'Labeled Words': sum(
            1 for label in labels if label != document.default_label),
        'Claims': counts['claim'],
        'Major claims': counts['major-claim'],
        'Premises': counts['premise'],
        'Components': len(document.annotated_components),
        'Relations': len(document.annotated_relations),
    }

    # Assign with a single `.loc[row, column]` indexer. The chained form
    # `statistics.loc[doc_id][column] = value` writes into an intermediate
    # Series, so the update is lost whenever that Series is a copy (and always
    # under pandas Copy-on-Write).
    for column, value in row.items():
        statistics.loc[doc_id, column] = value

In [8]:
# Populate the `statistics` table: visit every document of every annotator and
# let get_statistics write that document's row. The annotator key itself is not
# used inside the loop body.
for annotator, annotator_documents in documents.items():
    for document in annotator_documents:
        get_statistics(document)

In [9]:
# Display the per-document metrics table (one row per case/annotator pair).
statistics

Unnamed: 0,Tokens,Labeled Words,Claims,Premises,Major claims,Components,Relations
Case: ALKASI v. TURKEY - Ann: L,2811,1488,495,821,172,56,43
Case: ALKASI v. TURKEY - Ann: C,2811,1394,345,955,94,42,34
Case: D.J. v. CROATIA - Ann: M,6135,2801,1788,701,312,90,34
Case: BARCZA AND OTHERS v HUNGARY - Ann: M,4318,2058,762,1127,169,75,56
Case: ALKASI v. TURKEY - Ann: M,2811,1277,363,810,104,51,43
