In [200]:
import json

data_file = "data_en/sentences_done.json"

with open(data_file, 'rt') as fh:
    data = json.load(fh)
    
print('total sentences:', len(data))

sents_done = [sent for sent in data if sent['excluded'] is False]
print('sentences with annotations:', len(sents_done))


total sentences: 563
sentences with annotations: 338


## Basic Functions

In [245]:
from collections import Counter
from collections import defaultdict


def get_annotator_freq(sents):
    annotator_freq = Counter()
    for sent in sents:
        annotator_freq.update([anno['annotator'] for anno in sent['annotations']])
    return annotator_freq
            

def filter_annotators_by_freq(sents, min_freq=-1, max_freq=-1):
    annotator_freq = get_annotator_freq(sents)
    for sent in sents:
        for anno in sent['annotations']:
            freq = annotator_freq[anno['annotator']]
            if freq < min_freq:
                continue
            elif max_freq > 0 and freq > max_freq:
                continue
            else:
                yield anno


def get_scale_rating(anno, scale):
    if anno['unanswerable']:
        return 'UN'
    elif anno['no_impact']:
        return 0
    else:
        return int(anno[scale])




## Analysing Unanswerable Sentences

In [243]:
scales = [
    'emotional_scale', 'style_scale', 'reflection_scale', 'narrative_scale',
    'surprise_scale', 'attention_scale', 'negative_scale', 'humor_scale'    
]


for sent in data:
    if sent['excluded'] is True:
        continue
    print_sent = False
    for anno in sent['annotations']:
        if anno['unanswerable']:
            print_sent = True
    if not print_sent:
        continue
    print('Sentences:', sent['text'].strip())
    for anno in sent['annotations']:
        if anno['unanswerable']:
            print('\t', anno['annotator'], 'unanswerable')
        elif anno['no_impact']:
            print('\t', anno['annotator'], 'no_impact')
        else:
            print('\t', anno['annotator'], [anno[scale] for scale in scales])


Sentences: It's a feel good book, so that lessens as time goes on.
	 aeq969 unanswerable
	 nuz535 [4, 0, 0, 0, 0, 0, 1, 0]
	 sdu514 [3, 1, 0, 3, 0, 3, 2, 0]
Sentences: then it just spiralled out of control.
	 aeq969 unanswerable
	 nuz535 [2, 0, 0, 0, 4, 0, 2, 0]
	 sdu514 [2, 0, 0, 2, 3, 0, 2, 0]
Sentences: And when Walter wants to help, he throws himself into it, heart, mind, time and bank account.
	 npo717 unanswerable
	 ydv934 ['0', '0', '3', '0', '0', '1', '0', '0']
	 upx953 no_impact
Sentences: A detective of paranormal happenings.
	 npo717 unanswerable
	 ydv934 ['0', '0', '1', '0', '1', '0', '0', '0']
	 upx953 no_impact
Sentences: And then you'll give me the '2 months later' crap next chapter wherein he's instantly recovered from a brain tumor.
	 ifw202 ['0', '0', '0', '0', '2', '3', '3', '0']
	 yaq494 unanswerable
	 nhf461 ['0', '0', '1', '0', '0', '1', '2', '0']
Sentences: Well it takes others a little longer.
	 ifw202 ['2', '3', '3', '2', '2', '2', '3', '2']
	 yaq494 ['0', '0',

Above each sentence with at least one annotation of `unanswerable` is shown, together with the annotations by the annotators.

#### Observations

- For some sentences, all or most annotators agree that the sentence is uninterpretable, therefore, that the questions are unanswerable. 
    - Of these sentences, some are indeed hard to interpret. That is, it could be that the sentence expresses some form of reading impact, but you probably need to see the preceding sentences in the review to figure that out. So you can't say there is no impact, but neither can you say that there is. Some examples:
        - _also check marriage and birth certificates after reading joelle's diaries \\ no follow up with boyfriend \\ possibility of fingerprints on finger bandage \\ buy a horse?_
        - _Yes, I already know this and live it._
        - _I acknowledge my own place in a broken system and my duty to help give others the tools to educate themselves and others through reliable sources._
        - _:)_
    - Others seems to be descriptive sentences, i.e. describing some aspect of the story. Here the unanswerable annotation seem to express that there is no impact. Some examples:
        - _She responds with "Okay daddy._
        - _She goes to Omega Point and Kenji suddenly appears_
        
#### Findings

The `unanswerable` annotations seem to be a mixed bag of interpretations, with examples that are genuinely uninterpretable, others where annotators seem to signal _no impact_ instead. However, it is difficult to determine if an annotator intended `no_impact` or actually considered the sentence to be uninterpretable. 



## Rating Tendencies of Individual Annotators

In [237]:
annotator_freq = get_annotator_freq(sents_done)

anno_freq = defaultdict(Counter)

for sent in sents_done:
    for anno in sent['annotations']:
        if anno['unanswerable']:
            anno_freq[anno['annotator']].update(['unanswerable'])
        elif anno['no_impact']:
            anno_freq[anno['annotator']].update(['no_impact'])
        else:
            anno_freq[anno['annotator']].update([int(anno[scale]) for scale in scales])
            
print('Total: total number of sentences annotated')
print('ANS: number of sentences answered')
print('UNANS: number of sentences that could not be answered')
print('NO_IM: number of sentences labelled as having no impact')
print()
print(f'Annotator    Total ANS UNANS NO_IM\t {" ".join([f"{score: >3}" for score in range(0,5)])}')
print('------------------------------------------------------------')
for annotator, freq in annotator_freq.most_common():
    scores = [anno_freq[annotator][rating] for rating in range(0,5)]
    score_string = ' '.join([f"{score: >3}" for score in scores])
    unanswerable = anno_freq[annotator]['unanswerable']
    answerable = annotator_freq[annotator] - unanswerable
    answer_string = f"{answerable: >6}{unanswerable: >6}{anno_freq[annotator]['no_impact']: >6}"
    print(f'{annotator: <12}', f"{annotator_freq[annotator]: >2}", answer_string, '\t', score_string)
    
    
    

Total: total number of sentences annotated
ANS: number of sentences answered
UNANS: number of sentences that could not be answered
NO_IM: number of sentences labelled as having no impact

Annotator    Total ANS UNANS NO_IM	   0   1   2   3   4
------------------------------------------------------------
cru076       80     74     6    32 	 256  25  14   4  37
ovk109       59     59     0    26 	 195  12  16  28  13
hgx670       50     46     4    19 	 164   2  10  20  20
lbp656       50     50     0    17 	 189  20  22  22  11
war861       47     47     0     9 	 242   2   9   2  49
sdu514       40     40     0     5 	 184   7  28  31  30
naz523       40     40     0    13 	 161   8  18   9  20
vlh718       38     17    21     0 	 102   0   0   0  34
nrb271       30     28     2    16 	  63   4  10  12   7
uiq372       30     30     0     6 	 121  24  11  21  15
eac355       30     27     3    10 	 102   9  12   5   8
nhf461       20     12     8     0 	  54  13  18   6   5
der817     

#### Observations

- Annotators tend to do complete batches of 10 sentences, most do one batch, some do multiple. Few annotators do only part of a batch.
- The annotators who do more than 10 sentences, use `no_impact` more often than `unanswerable`. The `no_impact` option translates to a score of 0 for all impact scales.
    - Some annotators seem to use the `unanswerable` option for both uninterpretable sentences and sentences that express no reading impact. E.g. annotators _vlh718_ and _nhf461_ never use `no_impact` and only use `unanswerable`.
    - Vice versa, some seem to only use `no_impact`, e.g. _ovk109_ and _lbp656_. It is possible that they feel comfortable interpreting the review author's intention without any further context. Given that `no_impact` is more common than `unanswerable`, it is also possible that annotators who only use `no_impact` only encountered sentences they felt confident about interpreting.
- There are big differences in rating behaviour when annotators do assign ratings. 
    - Some tend to rate 'all or nothing' with the majority of ratings being 0, 3 or 4, e.g. _war861_, _hgx670_, _vlh718_ and _bct067_.
    - Others tend to avoid very high ratings, e.g. _nhf461_, _dpn657_ and _vbp813_ rarely use score 4.
    - Yet others tend to pick central values, e.g. _ifw202_ and _gry278_ rarely score 0 or 4, and 2 more often than 1 and 3. 
    
These observations together suggest that individual annotator differ strongly in how they interpreted the answer options and rating scale. This further suggests that either annotator's responses need to debiased or that annotators need to be instructed and trained before starting the actual annotation task.

## Rating Tendencies for Individual Impact Categories

In [271]:
annotator_freq = get_annotator_freq(sents_done)

impact_freq = defaultdict(Counter)

for sent in sents_done:
    for anno in sent['annotations']:
        if anno['unanswerable']:
            anno_freq[anno['annotator']].update(['unanswerable'])
        elif anno['no_impact']:
            for scale in scales:
                impact_freq[scale].update([0])
        else:
            for scale in scales:
                impact_freq[scale].update([int(anno[scale])])
            
print('Total: total number of sentences annotated')
print('ANS: number of sentences answered')
print('UNANS: number of sentences that could not be answered')
print('NO_IM: number of sentences labelled as having no impact')
print()
print(f'Annotator\t    Total\t {" ".join([f"{score: >3}" for score in range(0,5)])}')
print('------------------------------------------------------------')
for scale in scales:
    scores = [impact_freq[scale][rating] for rating in range(0,5)]
    score_string = ' '.join([f"{score: >3}" for score in scores])
    print(f'{scale: <22}{sum(scores)}\t', score_string)
    
    
    

Total: total number of sentences annotated
ANS: number of sentences answered
UNANS: number of sentences that could not be answered
NO_IM: number of sentences labelled as having no impact

Annotator	    Total	   0   1   2   3   4
------------------------------------------------------------
emotional_scale       955	 439  67 112 119 218
style_scale           955	 709  59  73  58  56
reflection_scale      955	 658  95  77  68  57
narrative_scale       955	 596  40  86  95 138
surprise_scale        955	 719  56  84  58  38
attention_scale       955	 637  57  77 100  84
negative_scale        955	 681  66  79  62  67
humor_scale           955	 847  43  46   9  10


#### Observations

- There are substantial differences in the rating distributions of the different impact categories.
    - The _emotional impact_ category has 516 non-zero ratings.
    - The _humor_ category has 108 non-zero ratings.

- For some categories, annotators tend to choose high ratings (signalling the category is clearly present), such as _emotional impact_ and _narrative feeling_, while for others they tend to choose low ratings (signalling the category is vaguely present), such as _reflection_ and especially _humor_. 
    - For emotional impact, annotators find impact expressed in more than half of the sentences, and when they find it, it is mostly clear (rating 4 is by far the most frequent, and ratings 3 and 4 together make up 65% of all non-zero ratings.
    - The _humor_ category is less frequently present in sentences, but also least clearly present. 

#### Conclusions

- For the more frequent categories, the number of annotated sentences might be enough to reliably establish inter-annotator agreement and agreement between human raters and algorithmic impact models.
- For the infrequent categories, it is highly likely that more sentences need to be annotated to reliably establish agreement. 


Of course, reliably establishing agreement does not mean that the agreement will be high. The actual agreement is analysed below. 

## Qualtitative check of answers per scale

Below is code to quickly check the ratings per sentence for a given impact scale:

In [247]:
scale = 'humor_scale'
scale = 'negative_scale'
scale = 'attention_scale'

for sent in data:
    if sent['excluded'] is True:
        continue
    ratings = [get_scale_rating(anno, scale) for anno in sent['annotations']]
    print(ratings, sent['text'])
    print()

[3, 0, 0, 0] Only three children survived, and Taylor discovers that this trio, plus a Cadet and a Townie, developed an epic friendship that was the foundation of the many mysteries in her life and identity, as well as of the war games.

[0, 0, 0] It should be noted that this is more of a collection of morality tales than historical accounts.

['UN', 0, 3] It's a feel good book, so that lessens as time goes on. 
 

[0, 0, 0] It doesn't get much better than that!

[4, 0, 0] I really loved Anna.

[0, 0, 2] I think it was much better than The Lost Symbol and if you enjoyed the Da Vinci Code you'll enjoy this book.

[4, 4, 4] Just as addicting Perfect Chemistry, I once again stayed up later than I should've to finish it.

['UN', 0, 0] then it just spiralled out of control. 
 

[2, 3, 3] Her prose is delightfully witty and charming, but the shifting barrage of letters, plays and journal entries never lets us get too close to the main characters. 
 

[2, 3, 2] I struggled a little with the c

## Agreement and number of annotations

For many annotation tasks, there is a learning curve and people make increasingly consistent judgements as they make more judgements. This could mean that annotators who make only a handful of annotations, contribute less consistent annotations, therefore contribute more to disagreement than annotators who contribute many annotations. 

One way to investigate this is to filter annotations by removing those contributed by annotators who make fewer than 10 or 20 annotations. A consequence of leaving out annotations is that for some sentences, there will be fewer than three judgements. One solution for that is to only focus on sentences that have at least three annotations by annotators with many judgements.

### Inter-Annotator Agreement Measure

Above we saw that some impact categories rarely occur, which creates a big class imbalance. That is, the annotators indicate that the vast majority of sentences express no _humor impact_. Therefore, it is inevitable that they agree on a large number of sentences where no _humor impact_ is expressed. The inter-rater agreement measure $r_{wg}$ will therefore reflect high agreement, while it is possible that annotator sitll largely disagree about which sentences _do_ express _humor impact_. A measure that better handles these types of class or category imbalances is **Fleiss' Kappa** (a version of Cohen's Kappa for cases with more than 2 annotators).

In [78]:
from statsmodels.stats.inter_rater import fleiss_kappa
import numpy as np

ratings = [[7,0], [7,0]]
fleiss_kappa(ratings, method='unif')

1.0

In [274]:
def get_scale_ratings(sents, scale, binarise: bool = False, binarise_min_rating: int = 2):
    num_cats = 5 if not binarise else 2
    all_ratings = []
    for sent_index, sent in enumerate(sents):
        ratings = []
        for anno in sent['annotations']:
            rating = get_scale_rating(anno, scale)
            if not isinstance(rating, int):
                continue
            if binarise:
                rating = 0 if rating < binarise_min_rating else 1
            ratings.append(rating)
        if len(ratings) < 3:
            continue
        elif len(ratings) > 3:
            ratings = ratings[:3]
        sent_ratings = np.zeros(num_cats)
        for rating in ratings:
            sent_ratings[rating] += 1
        all_ratings.append(sent_ratings)
    return all_ratings


def filter_ratings_by_annotator_freq(sents, min_freq=-1, max_freq=-1):
    annotator_freq = get_annotator_freq(sents)
    if max_freq < min_freq:
        max_freq = 100000000
    filtered_sents = []
    for sent in sents:
        annos = [anno for anno in sent['annotations'] if min_freq <= annotator_freq[anno['annotator']] <= max_freq]
        if len(annos) >= 3:
            filtered_sents.append({'annotations': annos})
    return filtered_sents


def analyse_binarisation(sents, scale):
    print('Scale:', scale)
    ratings = get_scale_ratings(sents, scale, binarise=False)
    print('\tNum sents with 3 ratings:', len(ratings))
    print("\tFleiss' Kappa non-binarised:", fleiss_kappa(ratings))
    ratings = get_scale_ratings(sents, scale, binarise=True, binarise_min_rating=1)
    print("\tFleiss' Kappa binarsed >=1:", fleiss_kappa(ratings))
    ratings = get_scale_ratings(sents, scale, binarise=True, binarise_min_rating=2)
    print("\tFleiss' Kappa binarsed >=2:", fleiss_kappa(ratings))
    ratings = get_scale_ratings(sents, scale, binarise=True, binarise_min_rating=3)
    print("\tFleiss' Kappa binarsed >=3:", fleiss_kappa(ratings))


def analyse_annotator_frequency(sents, scale, binarise: bool = False, binarise_min_rating: int = 1):
    print('Scale:', scale)
    ratings_all = get_scale_ratings(sents, scale, binarise=binarise, binarise_min_rating=binarise_min_rating)
    print('\tNum sents with 3 ratings, all annotators:', len(ratings_all))
    min_freq=10
    filtered_sents = filter_ratings_by_annotator_freq(sents, min_freq=min_freq)
    ratings_10 = get_scale_ratings(filtered_sents, scale, binarise=binarise, binarise_min_rating=binarise_min_rating)
    print(f'\tNum sents with 3 ratings, annotators >{min_freq}:', len(ratings_10))
    min_freq=11
    filtered_sents = filter_ratings_by_annotator_freq(sents, min_freq=min_freq)
    ratings_11 = get_scale_ratings(filtered_sents, scale, binarise=binarise, binarise_min_rating=binarise_min_rating)
    print(f'\tNum sents with 3 ratings, annotators >{min_freq}:', len(ratings_11))
    print("\tFleiss' Kappa all annotators:", fleiss_kappa(ratings_all))
    print(f"\tFleiss' Kappa annotators >10:", fleiss_kappa(ratings_10))
    print(f"\tFleiss' Kappa annotators >11:", fleiss_kappa(ratings_11))

scale = 'humor_scale'
analyse_binarisation(sents_done, scale)

scale = 'attention_scale'
analyse_binarisation(sents_done, scale)



Scale: humor_scale
	Num sents with 3 ratings: 266
	Fleiss' Kappa non-binarised: 0.12442149981009688
	Fleiss' Kappa binarsed >=1: 0.1902289806903242
	Fleiss' Kappa binarsed >=2: 0.2496043668723198
	Fleiss' Kappa binarsed >=3: 0.40693196405648024
Scale: attention_scale
	Num sents with 3 ratings: 266
	Fleiss' Kappa non-binarised: 0.19020005798782258
	Fleiss' Kappa binarsed >=1: 0.272935277031841
	Fleiss' Kappa binarsed >=2: 0.2847126742375327
	Fleiss' Kappa binarsed >=3: 0.28299115984735035


### Impact of Number of Annotated Sentences

For many annotation tasks, there are learning effects, in that annotators develop their understanding of the task and how to interpret the various categories they need to annotate. It is possible that as annotators annotate more sentences, they become more consistent in their annotations, and converge to more similar interpretations of the annotation categories.


Below we look at the agreement among annotators in different subsets of the annotators. We compare the agreement among three groups:

- **All**: All annotators: This includes annotators who stopped after one or a handful of sentences.
- **10**: Annotators who annotated at least a full batch of 10 sentences: these annotators perserved through at least a full batch, so have had more chance of coming across multiple sentences that express a certain impact category, but in different ways or with a different degree.
- **>10**: Annotators who annotated at more than a full batch of 10 sentences, were willing to do more, so may have perceived this as a task they understood and enjoyed enough to continue with a new batch. The frequency distribution at the top of this notebook shows that annotators who did more than 10 sentences all did at least 18 sentences.

### Impact of Binarisation

We also look at the affect on agreement of using the full rating scale versus different binarisations, where the scale is reduced to either zero or one. For instance:

- **>=1**: a binarisation of $r_{b} = 1 \leftarrow r >=1$ where any rating above zero is mapped to $1$, as it is a signal that a category is present, or 
- **>=3**: a binarisation of $r_{b} = 1 \leftarrow r >=3$ where any rating of 3 or 4 is mapped to $1$. This binarisation means only when the annotator thought the impact category is clearly present, is it counted as presence of that category. 



In [187]:
scales = [
    'emotional_scale', 'style_scale', 'reflection_scale', 'narrative_scale',
    'surprise_scale', 'attention_scale', 'negative_scale', 'humor_scale'    
]

for scale in scales:
    analyse_binarisation(sents_done, scale)

Scale: emotional_scale
	Num sents with 3 ratings: 266
	Fleiss' Kappa non-binarised: 0.3356974305191401
	Fleiss' Kappa binarsed >=1: 0.5688705942886545
	Fleiss' Kappa binarsed >=2: 0.6089611499220988
	Fleiss' Kappa binarsed >=3: 0.5684552073440963
Scale: style_scale
	Num sents with 3 ratings: 266
	Fleiss' Kappa non-binarised: 0.18391330573871112
	Fleiss' Kappa binarsed >=1: 0.2868237899398928
	Fleiss' Kappa binarsed >=2: 0.29841070479687004
	Fleiss' Kappa binarsed >=3: 0.36458755325836417
Scale: reflection_scale
	Num sents with 3 ratings: 266
	Fleiss' Kappa non-binarised: 0.0910033334781772
	Fleiss' Kappa binarsed >=1: 0.18803418803418803
	Fleiss' Kappa binarsed >=2: 0.12918660287081338
	Fleiss' Kappa binarsed >=3: 0.0752808376049735
Scale: narrative_scale
	Num sents with 3 ratings: 266
	Fleiss' Kappa non-binarised: 0.23324424935812277
	Fleiss' Kappa binarsed >=1: 0.39665634674922606
	Fleiss' Kappa binarsed >=2: 0.33957115009746597
	Fleiss' Kappa binarsed >=3: 0.361477403661503
Scale: s

In [188]:
scales = [
    'emotional_scale', 'style_scale', 'reflection_scale', 'narrative_scale',
    'surprise_scale', 'attention_scale', 'negative_scale', 'humor_scale'    
]

min_freq=10
filtered_sents = filter_ratings_by_annotator_freq(sents_done, min_freq=min_freq)
print(f'sentences with annotations by raters with at least {min_freq} sentences:', len(filtered_sents))

for scale in scales:
    analyse_binarisation(filtered_sents, scale)

sentences with annotations by raters with at least 10 sentences: 267
Scale: emotional_scale
	Num sents with 3 ratings: 215
	Fleiss' Kappa non-binarised: 0.31898119466815883
	Fleiss' Kappa binarsed >=1: 0.5629123233611703
	Fleiss' Kappa binarsed >=2: 0.6019286403085822
	Fleiss' Kappa binarsed >=3: 0.5718149931633076
Scale: style_scale
	Num sents with 3 ratings: 215
	Fleiss' Kappa non-binarised: 0.1879508727941095
	Fleiss' Kappa binarsed >=1: 0.28111068036441195
	Fleiss' Kappa binarsed >=2: 0.30118167109672955
	Fleiss' Kappa binarsed >=3: 0.349579831932773
Scale: reflection_scale
	Num sents with 3 ratings: 215
	Fleiss' Kappa non-binarised: 0.0783242540547233
	Fleiss' Kappa binarsed >=1: 0.15318288269107957
	Fleiss' Kappa binarsed >=2: 0.10031971914866906
	Fleiss' Kappa binarsed >=3: 0.09016279715104945
Scale: narrative_scale
	Num sents with 3 ratings: 215
	Fleiss' Kappa non-binarised: 0.2316517428845537
	Fleiss' Kappa binarsed >=1: 0.3786991062562062
	Fleiss' Kappa binarsed >=2: 0.361873

In [189]:
scales = [
    'emotional_scale', 'style_scale', 'reflection_scale', 'narrative_scale',
    'surprise_scale', 'attention_scale', 'negative_scale', 'humor_scale'    
]

min_freq=11
filtered_sents = filter_ratings_by_annotator_freq(sents_done, min_freq=min_freq)
print(f'sentences with annotations by raters with at least {min_freq} sentences:', len(filtered_sents))

for scale in scales:
    analyse_binarisation(filtered_sents, scale)

sentences with annotations by raters with at least 11 sentences: 61
Scale: emotional_scale
	Num sents with 3 ratings: 50
	Fleiss' Kappa non-binarised: 0.376592036466014
	Fleiss' Kappa binarsed >=1: 0.6532716927453769
	Fleiss' Kappa binarsed >=2: 0.6771879483500718
	Fleiss' Kappa binarsed >=3: 0.6368038740920097
Scale: style_scale
	Num sents with 3 ratings: 50
	Fleiss' Kappa non-binarised: 0.26547743966421816
	Fleiss' Kappa binarsed >=1: 0.3355481727574753
	Fleiss' Kappa binarsed >=2: 0.431818181818182
	Fleiss' Kappa binarsed >=3: 0.44852941176470656
Scale: reflection_scale
	Num sents with 3 ratings: 50
	Fleiss' Kappa non-binarised: 0.16219839142091194
	Fleiss' Kappa binarsed >=1: 0.37033582089552225
	Fleiss' Kappa binarsed >=2: 0.3262212240314427
	Fleiss' Kappa binarsed >=3: 0.20774647887324027
Scale: narrative_scale
	Num sents with 3 ratings: 50
	Fleiss' Kappa non-binarised: 0.40662373505059773
	Fleiss' Kappa binarsed >=1: 0.6237458193979932
	Fleiss' Kappa binarsed >=2: 0.646226415094

In [190]:
scales = [
    'emotional_scale', 'style_scale', 'reflection_scale', 'narrative_scale',
    'surprise_scale', 'attention_scale', 'negative_scale', 'humor_scale'    
]

for scale in scales:
    analyse_annotator_frequency(sents_done, scale, binarise=True, binarise_min_rating=1)


Scale: emotional_scale
	Num sents with 3 ratings, all annotators: 266
	Num sents with 3 ratings, annotators >10: 215
	Num sents with 3 ratings, annotators >11: 50
	Fleiss' Kappa all annotators: 0.5688705942886545
	Fleiss' Kappa annotators >10: 0.5629123233611703
	Fleiss' Kappa annotators >11: 0.6532716927453769
Scale: style_scale
	Num sents with 3 ratings, all annotators: 266
	Num sents with 3 ratings, annotators >10: 215
	Num sents with 3 ratings, annotators >11: 50
	Fleiss' Kappa all annotators: 0.2868237899398928
	Fleiss' Kappa annotators >10: 0.28111068036441195
	Fleiss' Kappa annotators >11: 0.3355481727574753
Scale: reflection_scale
	Num sents with 3 ratings, all annotators: 266
	Num sents with 3 ratings, annotators >10: 215
	Num sents with 3 ratings, annotators >11: 50
	Fleiss' Kappa all annotators: 0.18803418803418803
	Fleiss' Kappa annotators >10: 0.15318288269107957
	Fleiss' Kappa annotators >11: 0.37033582089552225
Scale: narrative_scale
	Num sents with 3 ratings, all annota

#### Observations

The binarisation has, almost by definition, a positive impact on agreement. 

## Disagreeing annotators

Some annotators disagree more often with other annotators than other annotators. Check how often and by how much annotators disagree.

Purpose: Can we identify people who either interpreted impact categories very differently from others, or give random answers or purposefully wrong answers?

In [256]:
from collections import defaultdict, Counter
from itertools import combinations

from scripts_en.human_rater_analysis import get_rater_scores
from scripts_en.config import config


agreed = defaultdict(Counter)

for sent in data:
    if sent['excluded'] is True:
        continue
    ratings = []
    for anno in sent['annotations']:
        for anno1, anno2 in combinations(sent['annotations'], 2):
            rating1 = get_scale_rating(anno1, scale)
            rating2 = get_scale_rating(anno2, scale)
            if (rating1 == "UN" and rating2 != "UN") or (rating1 != "UN" and rating2 == "UN"):
                diff = -1
            elif rating1 == "UN" and rating2 == "UN":
                diff = 0
            else:
                diff = abs(rating1 - rating2)
            agreed[anno1['annotator']].update([diff])
            agreed[anno2['annotator']].update([diff])
                
        scores = get_rater_scores(sent, scale, config)
    print([get_scale_rating(anno, scale) for anno in sent['annotations']], [anno['annotator'] for anno in sent['annotations']])


[3, 0, 0, 0] ['vbp813', 'aeq969', 'sdu514', 'nuz535']
[0, 0, 0] ['aeq969', 'nuz535', 'sdu514']
['UN', 0, 3] ['aeq969', 'nuz535', 'sdu514']
[0, 0, 0] ['aeq969', 'sdu514', 'nuz535']
[4, 0, 0] ['aeq969', 'sdu514', 'nuz535']
[0, 0, 2] ['aeq969', 'nuz535', 'sdu514']
[4, 4, 4] ['aeq969', 'nuz535', 'sdu514']
['UN', 0, 0] ['aeq969', 'nuz535', 'sdu514']
[2, 3, 3] ['npo717', 'ydv934', 'upx953']
[2, 3, 2] ['ydv934', 'ebf417', 'ifw202']
['UN', 1, 0] ['npo717', 'ydv934', 'upx953']
[2, 3, 3] ['npo717', 'ydv934', 'upx953']
[3, 0, 4] ['npo717', 'ydv934', 'ebf417']
[0, 3, 3] ['ydv934', 'npo717', 'ifw202']
['UN', 0, 0] ['npo717', 'ydv934', 'upx953']
[0, 0, 4] ['npo717', 'ydv934', 'ebf417']
[3, 4, 0] ['ifw202', 'yaq494', 'nhf461']
[3, 2, 3] ['ifw202', 'yaq494', 'nhf461']
[4, 3, 2] ['ifw202', 'yaq494', 'nhf461']
[3, 4, 1] ['ifw202', 'yaq494', 'nhf461']
[3, 'UN', 1] ['ifw202', 'yaq494', 'nhf461']
[4, 4, 2] ['ifw202', 'yaq494', 'nhf461']
[2, 3, 'UN'] ['ifw202', 'yaq494', 'nhf461']
[3, 3, 'UN'] ['ifw202', 'y

In [262]:
for rater in agreed:
    score_string = f"{' '.join([f'{agreed[rater][rating]: >4}' for rating in range(0, 5)])}"
    print(rater, score_string)

vbp813   12   24    0   18    0
aeq969   35    6    3    4    6
sdu514  133   12   40   36   16
nuz535   39    4   15   11    3
npo717    6   15    0    6    3
ydv934   24   16    4   19    6
upx953    9    9    0    0    0
ebf417    0    9    0    0    9
ifw202   15   18   12    6    0
yaq494   12   21    9    3    3
nhf461   33   18   24    6    3
gry278   27    3   15    3    0
nrb271   84   18   27    6    0
nqe273   12   12    9    6    0
ezi755   36    9    0    3    0
der817   60   12   12    0    0
gjx882   34   11   12    8    0
riq511    6    0    0    0    0
dpn657   26   43   29   32    4
guv635    8   32   31   12    8
qdg805   42   40   35   18    4
hhq065   15   14    9    7    0
cru076  318   39   33   15   24
naz523  189   18    9    9    9
sxd639    6    0   15   12   12
ukp255   51    6    0    3    0
wlv622  108    3    3    0    6
sdz396   21    3   15   12    6
xlr671    9    9   27    6    3
jot226    3    0    3    0    0
ehp730    3    0    3    0    0
ntu036  

#### Observations

Some annotators who seem to mostly disagree strongly with others are:
- sxd639
- pkm714
- mjd186
- whs510
- mlh640

This could be a signal that they misinterpreted the impact categories, or the rating scale.

It could also signal that they purposefully gave random or wrong answers.

## Misunderstanding Annotators

Some annotators may have thought that the default rating is the middle one ($rating=2$ in this case), such that even a rating of 1 is a signal that an impact category is not clearly present. That is, perhaps they thought that 0 is absolute certainty that some category is present, while 1 leaves some doubt,, so they are inclined to always rate all categories higher than 0.

In [265]:
import copy

non_zeros = Counter()

for sent in data:
    if sent['excluded'] is True:
        continue
    sent = copy.deepcopy(sent)
    for anno in sent['annotations']:
        ratings = [get_scale_rating(anno, field) for field in anno if '_scale' in field]
        digit_ratings = [rating for rating in ratings if isinstance(rating, int)]
        non_zero_ratings = [rating for rating in digit_ratings if rating > 0]
        #print(len(scores))
        if len(non_zero_ratings) == 8:
            print(anno['annotator'], non_zero_ratings)
            non_zeros.update([anno['annotator']])

for rater, freq in non_zeros.most_common():
    print(rater, freq)

ifw202 [2, 2, 3, 1, 4, 1, 3, 1]
ifw202 [2, 3, 3, 3, 2, 1, 3, 1]
ifw202 [2, 2, 3, 3, 2, 2, 2, 2]
ifw202 [2, 3, 3, 2, 3, 1, 2, 3]
ifw202 [2, 3, 2, 3, 2, 2, 2, 3]
ifw202 [3, 2, 3, 2, 2, 2, 2, 2]
yaq494 [2, 4, 4, 4, 4, 4, 4, 2]
gry278 [3, 2, 2, 3, 3, 1, 2, 2]
gry278 [1, 1, 2, 2, 1, 1, 1, 3]
dpn657 [2, 1, 2, 2, 1, 1, 3, 1]
dpn657 [4, 2, 3, 2, 2, 2, 2, 2]
dpn657 [2, 3, 4, 3, 2, 1, 2, 1]
pkm714 [1, 3, 3, 1, 1, 1, 2, 2]
pkm714 [3, 4, 3, 1, 3, 1, 2, 2]
pkm714 [2, 2, 3, 1, 4, 1, 2, 4]
eoh682 [3, 2, 3, 1, 3, 1, 2, 2]
jge523 [2, 2, 3, 2, 2, 2, 2, 2]
eoh682 [3, 1, 3, 1, 1, 1, 3, 1]
dpn657 [2, 2, 2, 1, 2, 1, 2, 2]
mjd186 [2, 3, 3, 1, 2, 2, 3, 3]
whs510 [1, 3, 4, 1, 1, 1, 1, 3]
mlh640 [4, 3, 1, 2, 2, 2, 2, 2]
pde699 [2, 2, 2, 2, 2, 2, 2, 2]
pde699 [3, 3, 4, 3, 3, 3, 3, 4]
pde699 [2, 2, 3, 2, 2, 1, 1, 2]
pde699 [1, 1, 1, 1, 2, 2, 1, 1]
whs510 [3, 4, 4, 1, 3, 1, 1, 1]
pgu653 [1, 1, 3, 1, 4, 1, 1, 3]
dpn657 [4, 2, 3, 1, 3, 1, 1, 3]
rte579 [3, 4, 1, 3, 3, 1, 1, 2]
rte579 [3, 3, 2, 2, 2, 2, 2, 1]
rte579 [

In [48]:
def get_annotator_anno(sent, annotator):
    for anno in sent['annotations']:
        if anno['annotator'] == annotator:
            return anno
    
def has_annotator(sent, annotator):
    return annotator in [anno['annotator'] for anno in sent['annotations']]

def filter_sents_by_annotator(sents, annotator):
    return [sent for sent in sents if has_annotator(sent, annotator)]

annotator = 'ifw202'
sents = filter_sents_by_annotator(data, annotator)
len(sents)
for sent in sorted(sents, key = lambda sent: get_annotator_anno(sent, annotator)['created']):
    anno = get_annotator_anno(sent, annotator)
    scores = [anno[field] for field in anno if '_scale' in field and int(anno[field])]
    print(sent['text'])
    print(anno['created'], anno['modified'] if 'modified' in anno else None)
    print(scores)

To say this read takes you on a voyage is not giving it enough credit.
2020-10-12T19:28:38.583004+00:00 None
['2', '2', '3', '1', '4', '1', '3', '1']
I struggled a little with the connection between them... and I don't really understand how some of the secondary characters actually fit. 
 
2020-10-12T19:30:04.539341+00:00 None
['2', '2', '2', '3', '3', '1', '3']
And then you'll give me the '2 months later' crap next chapter wherein he's instantly recovered from a brain tumor.
2020-10-12T19:31:03.258800+00:00 None
['2', '3', '3']
The techniques are all clearly laid out and there are a good variety of projects to choose from.
2020-10-12T19:32:47.698929+00:00 None
['2', '2', '3', '3', '2', '2', '2', '2']
And then there was Jacob O'Connor.
2020-10-12T19:33:34.769477+00:00 None
['3', '2', '3', '2', '2', '2', '2', '2']
Well it takes others a little longer.
2020-10-12T19:35:10.229683+00:00 None
['2', '3', '2', '3', '2', '2', '2', '3']
This was a wonderful thid book for this series and I'm so 

For annotator `ifw202` the switch between moving all buttons and moving only a few seems unrelated to the order in which they annotated the sentences. It does not look like an initial misunderstanding that got cleared up after a few sentences.

## Lemmatizing sentences



In [19]:
import spacy
from spacy import displacy


spacy.__version__

'3.0.6'

In [2]:
nlp = spacy.load("en_core_web_trf")

In [20]:
for sent in data[225:]:
    doc = nlp(sent['text'])
    break
    
for chunk in doc.noun_chunks:
    print(chunk)
    
    
displacy.render(doc, style='dep')

Only three children
Taylor
this trio
a Cadet
a Townie
an epic friendship
the foundation
the many mysteries
her life
identity
the war games


In [21]:
for ent in doc.ents:
    print(ent.text, ent.start, ent.label_)
    
for token in doc:
    print(token.text, token.pos_, token.lemma_)

Only three 0 CARDINAL
Taylor 6 PERSON
Cadet 14 PRODUCT
Townie 17 PRODUCT
Only ADV only
three NUM three
children NOUN child
survived VERB survive
, PUNCT ,
and CCONJ and
Taylor PROPN Taylor
discovers VERB discover
that SCONJ that
this DET this
trio NOUN trio
, PUNCT ,
plus CCONJ plus
a DET a
Cadet PROPN Cadet
and CCONJ and
a DET a
Townie PROPN Townie
, PUNCT ,
developed VERB develop
an DET an
epic ADJ epic
friendship NOUN friendship
that DET that
was VERB be
the DET the
foundation NOUN foundation
of ADP of
the DET the
many ADJ many
mysteries NOUN mystery
in ADP in
her PRON her
life NOUN life
and CCONJ and
identity NOUN identity
, PUNCT ,
as ADV as
well ADV well
as ADP as
of ADP of
the DET the
war NOUN war
games NOUN game
. PUNCT .
