## Import necessary packages and NLTK resources

Install `stanza` if necessary: remove the hash from the line below before you run the cell.

In [1]:
#pip install -U stanza

In [2]:
import os
import re
import nltk
from nltk import word_tokenize , sent_tokenize , pos_tag
from tdmh import *
from os.path import join
import stanza

In [3]:
import ssl

# Presumably a workaround for certificate errors during nltk.download().
# NOTE(review): this swaps the default HTTPS context factory on the ssl
# module itself, so it applies to the whole process, not only to NLTK.
if hasattr(ssl, '_create_unverified_context'):
    ssl._create_default_https_context = ssl._create_unverified_context

# NLTK resources fetched by this notebook, in download order.
nltk_resources = (
    'punkt',
    'averaged_perceptron_tagger',
    'tagsets',
    'wordnet',
    'sentiwordnet',
)
download_results = [nltk.download(resource) for resource in nltk_resources]

# Show the status of the last download as the value of the cell.
download_results[-1]

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/verhaarpaf/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/verhaarpaf/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to
[nltk_data]     /Users/verhaarpaf/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/verhaarpaf/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     /Users/verhaarpaf/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!


True

## Create a list of all the files in the Corpus

The code below assumes that all the text files are in subfolders underneath 'Corpus', e.g. 'Corpus/Ricardo_Reis/RR_Sob_a_leve_tutela.txt' or 'Corpus/Fernando_Pessoa/FP_Autopsicografia.txt'. 

The list `files()` contains the full paths to these texts. 

In [4]:
# `texts` receives every file found in the subfolders of the corpus,
# `files` only those whose path ends in 'txt'.
texts = []
dir = r'Corpus'
files = []

for subfolder in os.listdir(dir):
    # dir + subdirectory
    path = os.path.join(dir, subfolder)
    if not os.path.isdir(path):
        # Entries directly under the corpus folder that are not folders are skipped
        continue

    for name in os.listdir(path):
        # dir + subdirectory + file
        file_path = join(path, name)
        texts.append(file_path)
        is_text_file = re.search(r'txt$', file_path)
        if is_text_file:
            print(file_path)
            files.append(file_path)

Corpus/Ricardo_Reis/RR_Sob_a_leve_tutela.txt
Corpus/Ricardo_Reis/RR_Uns_com_os_olhos_postos_no_passado.txt
Corpus/Ricardo_Reis/RR_Prefiro_rosas_meu_amor_à_pátria.txt
Corpus/Álvaro_de_Campos/AdA_Tabacaria.txt
Corpus/Álvaro_de_Campos/AdA_Ode_Triunfal.txt
Corpus/Alberto_Caeiro/AC_V_Guardador_de_Rebanhos.txt
Corpus/Alberto_Caeiro/AC_IX_Guardador_de_Rebanhos.txt
Corpus/Fernando_Pessoa/FP_Quando_era_criança.txt
Corpus/Fernando_Pessoa/FP_Autopsicografia.txt
Corpus/Fernando_Pessoa/FP_Isto.txt
Corpus/Fernando_Pessoa/FP_Quando_as_crianças_brincam.txt
Corpus/Fernando_Pessoa/FP_Gato_que_brincas_na_rua.txt


## Create a Stanza NLP object to analyse the Portuguese texts

We use the Portuguese ('pt') model. 

In [5]:
# Download the Portuguese models, then build the pipeline object that the
# later cells call as nlp(text).
language = 'pt'
stanza.download(language)
nlp = stanza.Pipeline(language)

def get_title(path):
    """Return a title for the text file at `path`.

    The title is the file name without its directory. When the file name
    ends in '.txt', that extension is removed and all remaining commas and
    dots are deleted, so the title can be written to a comma-separated
    file as a single field. Any other file name is returned unchanged.

    Parameters:
        path: path to a file, e.g. 'Corpus/Fernando_Pessoa/FP_Isto.txt'.

    Returns:
        The title as a string, e.g. 'FP_Isto'.
    """
    extension = '.txt'
    title = os.path.basename(path)
    # Test for the real '.txt' suffix: a name that merely ends in the
    # letters 'txt' has no extension to strip.
    if title.endswith(extension):
        # Cut the suffix by length so that an earlier '.txt' inside the
        # name does not truncate the title.
        title = title[ :-len(extension) ]
        # remove commas and dots
        title = re.sub( r'[.,]' , '' , title )
    return title

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.3.0.json:   0%|   …

2022-04-10 14:45:11 INFO: Downloading default packages for language: pt (Portuguese)...
2022-04-10 14:45:11 INFO: File exists: /Users/verhaarpaf/stanza_resources/pt/default.zip.
2022-04-10 14:45:13 INFO: Finished downloading models and saved to /Users/verhaarpaf/stanza_resources.
2022-04-10 14:45:13 INFO: Loading these models for language: pt (Portuguese):
| Processor | Package |
-----------------------
| tokenize  | bosque  |
| mwt       | bosque  |
| pos       | bosque  |
| lemma     | bosque  |
| depparse  | bosque  |

2022-04-10 14:45:13 INFO: Use device: cpu
2022-04-10 14:45:13 INFO: Loading: tokenize
2022-04-10 14:45:13 INFO: Loading: mwt
2022-04-10 14:45:13 INFO: Loading: pos
2022-04-10 14:45:13 INFO: Loading: lemma
2022-04-10 14:45:13 INFO: Loading: depparse
2022-04-10 14:45:13 INFO: Done loading processors!


## Create the data for each poem

Using the `files` list that was created earlier, we create the data for each poem in turn.


In [8]:
from tdmh import *
import re
from nltk.tokenize import word_tokenize

# Universal POS tags whose relative frequencies are exported, in the same
# order as the last four columns of the CSV header.
pos_tags = ['ADJ' , 'ADV' , 'VERB' , 'NOUN']

# Labels used when printing the words that carry a given tag.
printed_tags = [ ('ADJ' , 'adjectives') , ('ADV' , 'adverbs') ]

# The with-statement closes data.csv even when the analysis of one of the
# poems raises an exception.
with open( 'data.csv' , 'w' , encoding='utf-8' ) as out:
    out.write('title,heteronym,tokens,types,adjectives,adverbs,verbs,nouns\n')

    for text in files:
        freq_vocabulary = dict()
        freq_pos = dict()
        tagged_words = dict()

        # we extract the title from the full path
        # using the get_title() function
        title = get_title(text)
        print( f'Analysing {title} ... ')

        # We also extract the heteronym from the full path: it is the name
        # of the folder that directly contains the poem. Taking the parent
        # folder (instead of a fixed position in the path) keeps this
        # correct when the corpus folder itself sits in a longer path.
        path = os.path.normpath(text)
        heteronym = os.path.basename(os.path.dirname(path)).strip()
        print( f'Heteronym: {heteronym}')

        with open( text , encoding = 'utf-8') as poem:
            full_text = poem.read()

        # nlp() splits the text into sentences and words;
        # every word carries its POS tag in word.upos
        doc = nlp(full_text)
        nr_sentences = len(doc.sentences)
        nr_tokens = 0

        for sent in doc.sentences:
            for word in sent.words:
                nr_tokens += 1
                # Lower-case a copy; the word object itself is left untouched
                token = word.text.lower()
                # A type that occurs with several tags keeps the last tag seen
                tagged_words[token] = word.upos
                freq_pos[ word.upos ] = freq_pos.get( word.upos , 0 ) + 1
                freq_vocabulary[token] = freq_vocabulary.get( token , 0 ) + 1

        words = list(freq_vocabulary.keys())
        print(f'The poem has {nr_tokens} words in total:')
        print(f'The poem has {len(words)} unique words (types):')
        print(words)

        # The printed number counts tokens, while the list below it shows
        # each type only once, so the list can be shorter than the number.
        for tag , label in printed_tags:
            print(f"There are {freq_pos.get(tag, 0)} {label}:")
            for token in tagged_words:
                if tagged_words[token] == tag:
                    print( token )

        print('\n\n')

        # Relative frequency of each exported tag; an empty poem yields 0.0
        # instead of a division by zero.
        ratios = [
            freq_pos.get(tag, 0) / nr_tokens if nr_tokens else 0.0
            for tag in pos_tags
        ]

        # Join all fields in one place so that every row has exactly as
        # many columns as the header.
        fields = [ title , heteronym , nr_tokens , len(words) ] + ratios
        out.write( ','.join( str(field) for field in fields ) + '\n' )


print('Done!')

Analysing RR_Sob_a_leve_tutela ... 
Heteronym: Ricardo_Reis
The poem has 68 words in total:
The poem has 44 unique words (types):
['sob', 'a', 'leve', 'tutela', 'de', 'deuses', 'descuidosos', ',', 'quero', 'gastar', 'as', 'concedidas', 'horas', 'esta', 'fadada', 'vida', '.', 'nada', 'podendo', 'contra', 'o', 'ser', 'que', 'me', 'fizeram', 'desejo', 'menos', 'haja', 'fado', 'dado', 'paz', 'por', 'destino', 'verdade', 'não', 'mais', ';', 'os', 'dão', 'e', 'nem', 'talvez', 'saibam', 'qual']
There are 3 adjectives:
leve
descuidosos
fadada
There are 5 adverbs:
nada
não
mais
talvez



Analysing RR_Uns_com_os_olhos_postos_no_passado ... 
Heteronym: Ricardo_Reis
The poem has 98 words in total:
The poem has 57 unique words (types):
['uns', ',', 'com', 'os', 'olhos', 'postos', 'em', 'o', 'passado', 'vêem', 'que', 'não', ';', 'outros', 'fitos', 'mesmos', 'futuro', 'pode', 'ver', 'se', '.', 'porque', 'tão', 'longe', 'ir', 'pôr', 'está', 'perto', '—', 'a', 'segurança', 'nossa', '?', 'este', 'é', 'd

The poem has 2349 words in total:
The poem has 823 unique words (types):
['ode', 'triunfal', 'a', 'dolorosa', 'luz', 'de', 'as', 'grandes', 'lâmpadas', 'eléctricas', 'fábrica', 'tenho', 'febre', 'e', 'escrevo', '.', 'rangendo', 'os', 'dentes', ',', 'fera', 'para', 'beleza', 'isto', 'totalmente', 'desconhecida', 'antigos', 'ó', 'rodas', 'engrenagens', 'r-rrrrrr', 'rr', 'eterno', '!', 'forte', 'espasmo', 'retido', 'maquinismos', 'em', 'fúria', 'fora', 'dentro', 'mim', 'por', 'todos', 'meus', 'nervos', 'dissecados', 'todas', 'papilas', 'tudo', 'com', 'que', 'eu', 'sinto', 'lábios', 'secos', 'ruídos', 'modernos', 'vos', 'ouvir', 'demasiadamente', 'perto', 'arde', 'me', 'cabeça', 'querer', 'cantar', 'um', 'excesso', 'expressão', 'minhas', 'sensações', 'contemporâneo', 'vós', 'máquinas', 'olhando', 'motores', 'como', 'uma', 'natureza', 'tropical', '—', 'trópicos', 'humanos', 'ferro', 'fogo', 'força', 'canto', 'o', 'presente', 'também', 'passado', 'futuro', 'porque', 'é', 'todo', 'há', 'platã

The poem has 701 words in total:
The poem has 199 unique words (types):
['v', 'há', 'metafísica', 'bastante', 'em', 'não', 'pensar', 'nada', '.', 'o', 'que', 'penso', 'eu', 'de', 'mundo', '?', 'sei', 'lá', '!', 'se', 'adoecesse', 'pensaria', 'isso', 'ideia', 'tenho', 'as', 'coisas', 'opinião', 'sobre', 'causas', 'e', 'os', 'efeitos', 'meditado', 'deus', 'a', 'alma', 'criação', 'para', 'mim', 'é', 'fechar', 'olhos', 'correr', 'cortinas', 'minha', 'janela', '(', 'mas', 'ela', 'tem', ')', 'mistério', 'único', 'haver', 'quem', 'pense', 'está', 'sol', 'fecha', ',', 'começa', 'saber', 'muitas', 'cheias', 'calor', 'abre', 'vê', 'já', 'pode', 'porque', 'luz', 'vale', 'mais', 'pensamentos', 'todos', 'filósofos', 'poetas', 'sabe', 'faz', 'por', 'erra', 'comum', 'boa', 'têm', 'aquelas', 'árvores', 'serem', 'verdes', 'copadas', 'terem', 'ramos', 'dar', 'fruto', 'sua', 'hora', 'nos', 'nós', 'sabemos', 'elas', 'melhor', 'vivem', 'nem', 'sabem', '«', 'constituição', 'íntima', '»', 'sentido', 'íntimo'