# Corpus Stats

In [1]:
# Reload imported modules automatically before code runs, so edits to the
# project source are picked up without restarting the kernel.

%load_ext autoreload
%autoreload 2

# Switch to top of curiam directory for easier paths.
# NOTE: the path is relative, so re-running this cell in the same kernel
# moves up two more directories.
%cd ../../


/home/mkranzlein/michael/dev/curiam


In [67]:
import statistics

from collections import Counter

from curiam import categories
from curiam.inception import tsv_processing

from pathlib import Path

## Data stats

### Table: total number of opinions and number of majority, concurring, and dissenting opinions

In [3]:
# Maps each author to a Counter of opinion type -> number of opinions.
opinion_counts = {}

# These are plain text files whose names have been standardized to
# "<docket number>_<opinion type>_<author>.txt".
# TODO: Fix filenames in INCEpTION so exported filenames can be used directly
opinions_with_docket_numbers_dir = Path(
    "data/full_scale/processed/inception_files_with_docket_numbers/")

for opinion_path in opinions_with_docket_numbers_dir.glob("*.txt"):
    # First 2 pieces make up docket number (e.g. 17_834), which we don't need.
    # Splitting the stem (rather than the name) leaves the file extension out
    # of the author piece without depending on the extension's length.
    _, _, opinion_type, author = opinion_path.stem.split("_")
    # Reformat authors for displaying in table: upper-case the first letter
    # only and leave the rest of the name untouched.
    author = author[0].upper() + author[1:]
    opinion_counts.setdefault(author, Counter())[opinion_type] += 1

authors = sorted(opinion_counts.keys())
total = 0
majority_total = 0
concurrence_total = 0
dissent_total = 0
table_output = ""
for author in authors:
    # Calculate row sums and increment column totals. A Counter returns 0 for
    # an opinion type the author never wrote, so no key checks are needed.
    # Only these three opinion types contribute to the table.
    majority = opinion_counts[author]["ootc"]
    majority_total += majority
    concurrence = opinion_counts[author]["concurrence"]
    concurrence_total += concurrence
    dissent = opinion_counts[author]["dissent"]
    dissent_total += dissent
    # Total for this particular justice
    justice_total = (majority + concurrence + dissent)
    total += justice_total
    # One LaTeX table row per author, terminated by "\\" and a newline.
    table_output += f"{author} & {majority} & {concurrence} & {dissent} & {justice_total}\\\\\n"
# Final row holds the column totals; its author cell is left empty.
table_output += "\\bottomrule\n"
table_output += f"& {majority_total} & {concurrence_total} & {dissent_total} & {total}\\\\"
output_path = Path("results/tables/opinion_characteristics.txt")
# Create the output directory on first use so the write below cannot fail
# with FileNotFoundError on a fresh checkout.
output_path.parent.mkdir(parents=True, exist_ok=True)
with output_path.open("w", encoding="utf-8") as f:
    f.write(table_output)
print(f"Table saved to {output_path.as_posix()}")

Table saved to results/tables/opinion_characteristics.txt


In [97]:
opinions_dir = Path("data/full_scale/annotated")

# `opinions` is a list of opinions; each opinion is a list of sentences and
# each sentence is a list of tokens.
# E.g. opinions[0][0][0] is the 0-th token of the 0-th sentence of the
# 0-th opinion. Files are processed in filename order.
tsv_paths = sorted(opinions_dir.glob("*.tsv"), key=lambda tsv_path: tsv_path.name)
opinions = list(map(tsv_processing.process_opinion_file, tsv_paths))

# Sanity check on the number of opinion files that were loaded.
assert len(opinions) == 41



### How many tokens are there?

In [107]:
# Each sentence is a list of tokens, so the corpus token count is the sum of
# all sentence lengths.
token_total = sum(len(sentence)
                  for opinion in opinions
                  for sentence in opinion)
token_total

179690

### How many sentences are there?

In [99]:
# Each opinion is a list of sentences, so add up the opinion lengths.
sum(map(len, opinions))

7068

## Annotation Stats

In [100]:
# Gather the annotations of every sentence in the corpus (one entry per
# sentence), then keep only the sentences that have at least one annotation;
# sentences without any show up as empty lists.
per_sentence_annotations = (tsv_processing.get_sentence_annotations(sentence)
                            for opinion in opinions
                            for sentence in opinion)
all_annotations = [spans for spans in per_sentence_annotations if len(spans) > 0]

# Annotations for the first sentence that has any annotations
all_annotations[0]

[['Metalinguistic Cue', 8, 8],
 ['Definition', 9, 32],
 ['Direct Quote', 24, 27],
 ['Focal Term', 25, 26]]

### How many annotations?

In [105]:
# Each entry of all_annotations lists the annotations of one sentence.
sum(map(len, all_annotations))

9820

### How many sentences with at least one annotation?

In [106]:
# Sentences without annotations were filtered out of all_annotations when it
# was built, so its length is the number of annotated sentences.
len(all_annotations)

4447

### What percentage of tokens are covered by at least one annotation?

In [109]:
def get_token_coverage(sentence, annotation_column=2):
    """Count the tokens in ``sentence`` that carry at least one category.

    Args:
        sentence: Iterable of tokens. Each token is indexable and holds, at
            position ``annotation_column``, a mapping whose "categories"
            entry is a sized collection.
        annotation_column: Index within a token of the annotation mapping.

    Returns:
        The number of tokens whose "categories" collection is non-empty.
    """
    covered = 0
    for token in sentence:
        if len(token[annotation_column]["categories"]) > 0:
            covered += 1
    return covered

# Accumulate the number of annotated tokens sentence by sentence.
coverage = 0
for opinion in opinions:
    for sentence in opinion:
        coverage += get_token_coverage(sentence, 2)

print(f"Tokens annotated with at least one category: {coverage} ({(coverage/token_total) * 100:.2f}%)")


Tokens annotated with at least one category: 68718 (38.24%)


### Table: number of annotations for each category and their average length



In [60]:
# Maps each category to {"count": number of spans, "lengths": span lengths}.
category_freqs_and_lens = {}
for span_group in all_annotations:
    for label, first_index, last_index in span_group:
        stats = category_freqs_and_lens.setdefault(label, {"count": 0, "lengths": [] })
        stats["count"] += 1
        # Both endpoints are inclusive, hence the + 1.
        stats["lengths"].append((last_index - first_index) + 1)


In [95]:
# Print one LaTeX table row per category, in the order given by
# categories.ORDERED_CATEGORIES: frequency, then mean span length with the
# sample standard deviation in parentheses.
for category in categories.ORDERED_CATEGORIES:
    category_stats = category_freqs_and_lens[category]
    frequency = category_stats["count"]
    span_lengths = category_stats["lengths"]
    mean = statistics.mean(span_lengths)
    st_dev = statistics.stdev(span_lengths)
    print(f"{category} & {frequency} & {mean:.1f}{'{'} ({st_dev:.1f}){'}'}\\\\")

Focal Term & 1043 & 2.5{ (1.8)}\\
Definition & 273 & 12.2{ (9.4)}\\
Metalinguistic Cue & 1784 & 1.3{ (0.7)}\\
Direct Quote & 2577 & 10.9{ (10.1)}\\
Language Source & 74 & 10.0{ (4.3)}\\
Legal Source & 3706 & 8.6{ (8.2)}\\
Named Interpretive Rule & 51 & 5.1{ (7.1)}\\
Example Use & 115 & 23.5{ (12.5)}\\
Appeal to Meaning & 196 & 27.8{ (13.0)}\\


In [110]:
### Scatter plot opinion length vs coverage percentage