## Statistics about the scraped corpus

For each level and for the corpus overall:
* Number of sentences
* Number of tokens
* Number of types (distinct tokens)

In [1]:
from utils import read_corpus
import nltk
import pprint

In [2]:
# Load the scraped corpus. Later cells iterate it with .items() as a mapping of
# reading level -> list of texts, and read each text's "content" entry.
corpus = read_corpus()

In [3]:
# Per-text statistics and vocabulary, grouped by level.
#   stat_dict[level]         -> one {"num_sents", "num_tokens", "num_types"} dict per text
#   tokens_types_dict[level] -> one {"tokens", "types"} dict per text
stat_dict = {}
tokens_types_dict = {}
for level, list_of_texts in corpus.items():
    level_stats = []
    level_vocab = []
    for text in list_of_texts:
        content = text["content"]
        sentence_count = len(nltk.tokenize.sent_tokenize(content))
        word_tokens = nltk.wordpunct_tokenize(content)
        # A type is a distinct token string; no case folding is applied here.
        word_types = set(word_tokens)
        level_stats.append(
            dict(
                num_sents=sentence_count,
                num_tokens=len(word_tokens),
                num_types=len(word_types),
            )
        )
        level_vocab.append(dict(tokens=word_tokens, types=word_types))
    stat_dict[level] = level_stats
    tokens_types_dict[level] = level_vocab

In [4]:
# Preview: print the statistics of the first five texts of every level,
# followed by a blank line between levels.
for level, text_stats in stat_dict.items():
    print(level)
    for entry in text_stats[:5]:
        print(entry)
    print()

A1
{'num_sents': 6, 'num_tokens': 134, 'num_types': 83}
{'num_sents': 7, 'num_tokens': 257, 'num_types': 133}
{'num_sents': 6, 'num_tokens': 206, 'num_types': 118}
{'num_sents': 4, 'num_tokens': 103, 'num_types': 67}
{'num_sents': 155, 'num_tokens': 1631, 'num_types': 526}

A2
{'num_sents': 35, 'num_tokens': 2153, 'num_types': 757}
{'num_sents': 42, 'num_tokens': 2588, 'num_types': 887}
{'num_sents': 44, 'num_tokens': 2733, 'num_types': 875}
{'num_sents': 75, 'num_tokens': 2974, 'num_types': 949}
{'num_sents': 40, 'num_tokens': 1940, 'num_types': 665}

B2
{'num_sents': 67, 'num_tokens': 1951, 'num_types': 564}
{'num_sents': 60, 'num_tokens': 3139, 'num_types': 843}
{'num_sents': 33, 'num_tokens': 3163, 'num_types': 766}
{'num_sents': 15, 'num_tokens': 242, 'num_types': 136}
{'num_sents': 164, 'num_tokens': 5147, 'num_types': 1437}

B1
{'num_sents': 108, 'num_tokens': 1915, 'num_types': 629}
{'num_sents': 62, 'num_tokens': 1797, 'num_types': 590}
{'num_sents': 21, 'num_tokens': 1269, 'n

## Aggregate Statistics

In [5]:
# Aggregate the per-text statistics in stat_dict into corpus-wide extremes and
# totals, plus per-level sentence/token totals (breakdown_by_level).
#
# The minima come from min() instead of a hard-coded 10000 starting value: with
# that sentinel, a corpus in which every document exceeded 10000 sentences,
# tokens or types would have reported 10000 as its minimum.
all_stats = [stats for stat_list in stat_dict.values() for stats in stat_list]
sent_counts = [stats["num_sents"] for stats in all_stats]
token_counts = [stats["num_tokens"] for stats in all_stats]
type_counts = [stats["num_types"] for stats in all_stats]

# Smallest per-document counts; None when the corpus holds no documents.
least_sents = min(sent_counts, default=None)
least_tokens = min(token_counts, default=None)
least_types = min(type_counts, default=None)

# Largest per-document counts; 0 when the corpus holds no documents.
most_sents = max(sent_counts, default=0)
most_tokens = max(token_counts, default=0)
most_types = max(type_counts, default=0)

# Corpus-wide totals over all documents of all levels.
total_sents = sum(sent_counts)
total_tokens = sum(token_counts)

# Per-level totals. Per-level type totals are filled in further down from the
# type sets, since a type shared by several texts must only be counted once.
breakdown_by_level = {
    level: {
        "total_sents": sum(stats["num_sents"] for stats in stat_list),
        "total_tokens": sum(stats["num_tokens"] for stats in stat_list),
    }
    for level, stat_list in stat_dict.items()
}

# Vocabulary sizes: union the per-text type sets within each level and across
# the whole corpus, so a type occurring in several texts is counted once.
corpus_vocab = set()
for level, tt_list in tokens_types_dict.items():
    level_vocab = set().union(*(tt["types"] for tt in tt_list))
    corpus_vocab.update(level_vocab)
    breakdown_by_level[level]["total_types"] = len(level_vocab)
total_types = len(corpus_vocab)

# Report, part 1: number of texts per level and in the corpus as a whole.
total_texts = 0
for level_name, level_texts in corpus.items():
    n_texts = len(level_texts)
    print(f"Number of {level_name}-level texts: {n_texts}")
    total_texts += n_texts
print(f"Total number of texts in the corpus: {total_texts}")

# Report, part 2: per-document extremes across the whole corpus.
print(f"\nMost and least sentences in any document in the corpus:\n\tMost: {most_sents},\tLeast: {least_sents}")
print(f"Most and least tokens in any document in the corpus:\n\tMost: {most_tokens},\tLeast: {least_tokens}")
print(f"Most and least types in any document in the corpus:\n\tMost: {most_types},\tLeast: {least_types}")

# Report, part 3: totals per reading level (pprint sorts the level keys).
print("\nBreakdown of total sentences, tokens and types by reading level:")
pprint.pprint(breakdown_by_level)

# Report, part 4: corpus-wide totals.
print(f"\nTotal number of sentences in corpus: {total_sents}")
print(f"Total number of tokens in corpus: {total_tokens}")
print(f"Total number of types in corpus: {total_types}")

Number of A1-level texts: 94
Number of A2-level texts: 62
Number of B2-level texts: 110
Number of B1-level texts: 42
Total number of texts in the corpus: 308

Most and least sentences in any document in the corpus:
	Most: 554,	Least: 1
Most and least tokens in any document in the corpus:
	Most: 5470,	Least: 12
Most and least types in any document in the corpus:
	Most: 1696,	Least: 10

Breakdown of total sentences, tokens and types by reading level:
{'A1': {'total_sents': 7258, 'total_tokens': 74484, 'total_types': 6559},
 'A2': {'total_sents': 2805, 'total_tokens': 53883, 'total_types': 8241},
 'B1': {'total_sents': 1837, 'total_tokens': 40892, 'total_types': 7548},
 'B2': {'total_sents': 3183, 'total_tokens': 79013, 'total_types': 13787}}

Total number of sentences in corpus: 15083
Total number of tokens in corpus: 248272
Total number of types in corpus: 23751
