In [1]:
import os
import re
import nltk
from nltk import word_tokenize , sent_tokenize , pos_tag
from tdmh import *
from os.path import join
import stanza

## Create a list of all the files in the corpus

In [4]:
# Directory that holds the corpus files, relative to the notebook.
# It is called `corpus_dir` rather than `dir` so that the built-in dir()
# stays usable in the rest of the notebook.
corpus_dir = 'Corpus'

# os.listdir() returns the entries in arbitrary order, so they are sorted to
# make every run process the texts in the same order. Only regular files are
# kept, because a sub-directory cannot be opened and read as a text.
texts = [
    os.path.join(corpus_dir, entry)
    for entry in sorted(os.listdir(corpus_dir))
    if os.path.isfile(os.path.join(corpus_dir, entry))
]



## Create text analysis functions

In [5]:
# Fetch the default Stanza model package for Dutch ('nl'); the log output of
# this cell shows the models being downloaded to the local stanza_resources
# directory.
stanza.download('nl')      
# Build the Dutch pipeline once and keep it in `nlp`, so it can be called on
# pieces of text later on. With the default package, the log output lists the
# processors tokenize, pos, lemma, depparse and ner.
nlp = stanza.Pipeline('nl')

def get_title(path):
    """Derive a title from the file name of a text.

    The directory part of `path` is discarded. If the file name ends in
    '.txt', that extension is removed, and any remaining dots and commas are
    stripped as well (commas would otherwise add extra columns when the title
    is written to a comma-separated file). File names without a '.txt'
    extension are returned unchanged.

    Parameters:
        path: path to a file, e.g. 'Corpus/My_text.txt'.

    Returns:
        The title as a string, e.g. 'My_text'.
    """
    extension = '.txt'
    title = os.path.basename(path)
    # Test for the complete extension, including the dot, and cut it off the
    # END of the name. A name such as 'notes_txt' is left alone, and an
    # earlier '.txt' inside the name does not truncate the title.
    if title.endswith(extension):
        # Remove txt extension
        title = title[ :-len(extension) ]
        # remove commas and dots
        title = re.sub( r'[.,]' , '' , title )
    return title

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.3.0.json:   0%|   …

2022-04-05 04:04:53 INFO: Downloading default packages for language: nl (Dutch)...


Downloading https://huggingface.co/stanfordnlp/stanza-nl/resolve/v1.3.0/models/default.zip:   0%|          | 0…

2022-04-05 04:06:06 INFO: Finished downloading models and saved to /Users/verhaarpaf/stanza_resources.
2022-04-05 04:06:06 INFO: Loading these models for language: nl (Dutch):
| Processor | Package |
-----------------------
| tokenize  | alpino  |
| pos       | alpino  |
| lemma     | alpino  |
| depparse  | alpino  |
| ner       | conll02 |

2022-04-05 04:06:06 INFO: Use device: cpu
2022-04-05 04:06:06 INFO: Loading: tokenize
2022-04-05 04:06:06 INFO: Loading: pos
2022-04-05 04:06:06 INFO: Loading: lemma
2022-04-05 04:06:06 INFO: Loading: depparse
2022-04-05 04:06:06 INFO: Loading: ner
2022-04-05 04:06:08 INFO: Done loading processors!


## Create data for all the texts in the corpus

In [7]:
# For every text in `texts`, write one row to data.csv containing: the title,
# the number of tokens, the number of sentences, the type-token ratio (TTR)
# and, for each tag in `pos_tags`, the share of tokens carrying that tag.

pos_tags = ['ADJ' , 'ADV' , 'VERB' , 'NOUN']

# The type-token ratio is calculated over (at most) the first `ttr_cap`
# tokens of a text. The value is the same for every text, so it is set once.
ttr_cap = 3000

# The `with` statement makes sure data.csv is closed, also when the analysis
# of one of the texts raises an exception halfway through.
with open( 'data.csv' , 'w' , encoding = 'utf-8' ) as out:

    ## Header of the CSV file
    out.write( ','.join( ['title' , 'tokens' , 'sentences' , 'ttr'] + pos_tags ) + '\n' )

    for text in texts:

        data = dict()
        print( f'Analysing {text} ...')

        ## Get the title, based on the filename
        title = get_title( text )

        ## read the full text; the file is closed again right after reading
        with open( text , encoding = 'utf-8' ) as fh:
            full_text = fh.read()

        ## count the number of sentences
        # NOTE(review): sent_tokenize is called without a language argument,
        # while the pipeline in `nlp` is Dutch -- confirm this is intended.
        sentences = sent_tokenize(full_text)
        data['nr_sentences'] = len(sentences)

        # dictionary to count the POS tags
        freq_pos = dict()

        # dictionary holding the types found in the first ttr_cap tokens
        freq_ttr = dict()

        # token count is initialised at 0
        data['nr_tokens'] = 0

        for s in sentences:

            # Tokens and POS tags are both taken from the Stanza analysis.
            # NOTE(review): every word Stanza returns is counted as a token,
            # presumably including punctuation marks -- confirm this is intended.
            doc = nlp(s)

            for sent in doc.sentences:
                for word in sent.words:
                    # Lower-case a copy, so that the Stanza object is not modified
                    token = word.text.lower()

                    # count the tokens
                    data['nr_tokens'] += 1

                    # place tokens in dictionary freq_ttr
                    # only if the word count is less than or equal to ttr_cap
                    # The nr of items in the dictionary eventually equals the nr of types
                    if data['nr_tokens'] <= ttr_cap:
                        freq_ttr[ token ] = freq_ttr.get( token , 0 ) + 1

                    ## Count frequencies of all the POS tags
                    # This has to happen inside the loop over the words:
                    # one level up, only the last word of each sentence is counted.
                    freq_pos[ word.upos ] = freq_pos.get( word.upos , 0 ) + 1

        for t in pos_tags:
            data[t] = freq_pos.get(t,0)

        # Calculate TTR: number of items in the freq_ttr dictionary divided by
        # the number of tokens that were actually placed in it. This equals
        # ttr_cap, except for texts shorter than ttr_cap tokens. An empty text
        # gets a TTR of 0.0 instead of a division by zero.
        ttr_sample = min( data['nr_tokens'] , ttr_cap )
        data['ttr'] = len( freq_ttr ) / ttr_sample if ttr_sample else 0.0

        # write the results to a CSV file
        out.write( f"{title},{data['nr_tokens']},{data['nr_sentences']},{data['ttr']}" )
        for t in pos_tags:
            # Relative frequency of the POS tag; 0.0 for a text without tokens
            share = data[t] / data['nr_tokens'] if data['nr_tokens'] else 0.0
            out.write( f",{share}" )
        out.write('\n')

print('Done!')

Analysing Corpus/Kronijk_der_stad_Purmerende.txt ...
Analysing Corpus/Chronijk_van_Maestricht_tot_1719.txt ...
Done!
