# Dataset exploration

In this notebook we calculate and plot metrics of the annotated documents, with the objective of describing and summarizing the resulting dataset 

In [1]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import pandas
import os
import sys

In [2]:
import seaborn as sns
sns.set_style('white')
sns.set_palette('colorblind')

In [3]:
from collections import Counter

In [32]:
from imp import reload
import read_annotations
reload(read_annotations);

In [33]:
ANNOTATORS = {
    'C': {'dirname': 'C'}, 'L': {'dirname': 'L'}, 'M': {'dirname': 'M'}, 'S': {'dirname': 'S'}
}
ANNOTATION_DIR = os.path.join(os.path.expanduser('~'), 'am/data/echr/annotation/')
documents = read_annotations.get_all_documents(ANNOTATION_DIR, ANNOTATORS)

In [34]:
documents

defaultdict(list,
            {'C': [Case: TALMANE v. LATVIA - Ann: C,
              Case: ALKASI v. TURKEY - Ann: C],
             'L': [Case: B.S. v. SPAIN - Ann: L,
              Case: ALKASI v. TURKEY - Ann: L,
              Case: PERUS v. SLOVENIA - Ann: L],
             'M': [Case: BARCZA AND OTHERS v HUNGARY - Ann: M,
              Case: D.J. v. CROATIA - Ann: M,
              Case: TALMANE v. LATVIA - Ann: M,
              Case: EGITIM VE BILIM EMEKCILERI SENDIKASI v. TURKEY - Ann: M,
              Case: PERUS v. SLOVENIA - Ann: M,
              Case: ALKASI v. TURKEY - Ann: M],
             'S': [Case: PERUS v. SLOVENIA - Ann: S,
              Case: ALKASI v. TURKEY - Ann: S]})

In [70]:
stats_columns = ['Tokens', 'Labeled Tokens', 'Claims', 'Premises', 'Major claims',
                 'Components', 'Relations']
statistics = pandas.DataFrame(
    index=[document.identifier 
           for annotator_documents in documents.values() for document in annotator_documents],
    columns=['Annotator'] + stats_columns)

def get_statistics(document, annotator):
    id = document.identifier
    words, labels = document.get_word_label_list()
    statistics.loc[id]['Annotator'] = annotator
    statistics.loc[id]['Tokens'] = len(words)
    statistics.loc[id]['Labeled Tokens'] = len(
        [label for label in labels if label != document.default_label])
    counts = Counter(labels)
    statistics.loc[id]['Claims'] = counts['claim']
    statistics.loc[id]['Major claims'] = counts['major-claim']
    statistics.loc[id]['Premises'] = counts['premise']
    statistics.loc[id]['Components'] = len(document.annotated_components)
    statistics.loc[id]['Relations'] = len(document.annotated_relations)

In [71]:
for annotator, annotator_documents in documents.items():
    for document in annotator_documents:
        get_statistics(document, annotator)
statistics[stats_columns] = statistics[stats_columns].astype(int)

In [72]:
statistics

Unnamed: 0,Annotator,Tokens,Labeled Tokens,Claims,Premises,Major claims,Components,Relations
Case: TALMANE v. LATVIA - Ann: C,C,2245,1046,241,749,56,34,23
Case: ALKASI v. TURKEY - Ann: C,C,2732,1330,361,879,90,51,43
Case: B.S. v. SPAIN - Ann: L,L,7098,2594,1059,1303,232,89,80
Case: ALKASI v. TURKEY - Ann: L,L,2732,1579,609,868,102,57,48
Case: PERUS v. SLOVENIA - Ann: L,L,3593,1443,394,1001,48,66,48
Case: PERUS v. SLOVENIA - Ann: S,S,3593,1969,583,1321,65,74,49
Case: ALKASI v. TURKEY - Ann: S,S,2732,1352,436,811,105,58,38
Case: BARCZA AND OTHERS v HUNGARY - Ann: M,M,4252,1998,758,1073,167,75,61
Case: D.J. v. CROATIA - Ann: M,M,5994,4349,2156,1847,346,145,124
Case: TALMANE v. LATVIA - Ann: M,M,2245,1244,476,740,28,54,42


In [75]:
statistics.sum()

Annotator         CCLLLSSMMMMMM
Tokens                    46045
Labeled Tokens            22009
Claims                     8385
Premises                  12242
Major claims               1382
Components                  814
Relations                   647
dtype: object

In [74]:
statistics.describe()

Unnamed: 0,Tokens,Labeled Tokens,Claims,Premises,Major claims,Components,Relations
count,13.0,13.0,13.0,13.0,13.0,13.0,13.0
mean,3541.923077,1693.0,645.0,941.692308,106.307692,62.615385,49.769231
std,1478.614355,975.182034,513.366016,414.535962,94.135881,31.990984,28.563245
min,2245.0,173.0,128.0,45.0,0.0,6.0,3.0
25%,2732.0,1330.0,394.0,791.0,48.0,51.0,41.0
50%,2732.0,1443.0,476.0,868.0,90.0,57.0,47.0
75%,3593.0,1969.0,730.0,1073.0,105.0,74.0,49.0
max,7098.0,4349.0,2156.0,1847.0,346.0,145.0,124.0


In [73]:
statistics.groupby('Annotator').describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Claims,Components,Labeled Tokens,Major claims,Premises,Relations,Tokens
Annotator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
C,count,2.0,2.0,2.0,2.0,2.0,2.0,2.0
C,mean,301.0,42.5,1188.0,73.0,814.0,33.0,2488.5
C,std,84.852814,12.020815,200.818326,24.041631,91.923882,14.142136,344.361002
C,min,241.0,34.0,1046.0,56.0,749.0,23.0,2245.0
C,25%,271.0,38.25,1117.0,64.5,781.5,28.0,2366.75
C,50%,301.0,42.5,1188.0,73.0,814.0,33.0,2488.5
C,75%,331.0,46.75,1259.0,81.5,846.5,38.0,2610.25
C,max,361.0,51.0,1330.0,90.0,879.0,43.0,2732.0
L,count,3.0,3.0,3.0,3.0,3.0,3.0,3.0
L,mean,687.333333,70.666667,1872.0,127.333333,1057.333333,58.666667,4474.333333
