In [None]:
import os
import re
import nltk
from nltk import word_tokenize , sent_tokenize , pos_tag
from tdmh import *
from os.path import join
import stanza

## Download the texts from arquivopessoa.net

In [None]:
# Folder that holds the downloaded corpus; create it when it is not there yet.
dir = 'Corpus'
os.makedirs( dir , exist_ok = True )

In [None]:
import os
import requests
from bs4 import BeautifulSoup
import re

# Seconds to wait for the server before giving up on a single page, so that
# one unresponsive request cannot stall the whole download.
REQUEST_TIMEOUT = 30

# Request the pages textos/0 ... textos/4999 one by one and save every poem
# found as Corpus/<author>/<title>.txt
for i in range(0,5000):

    url = f'http://arquivopessoa.net/textos/{i}'
    print(url)

    # A network failure on one page should not abort the remaining downloads
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.exceptions.RequestException as error:
        print(f'Skipping {url}: {error}')
        continue

    # A Response object is falsy when the HTTP status signals an error
    if not response:
        continue

    response.encoding = 'utf-8'
    soup = BeautifulSoup( response.text , "lxml")

    author = soup.find_all('div', {'class': 'autor'} )
    title = soup.find_all('h1', {'class': 'titulo-texto'} )
    poem = soup.find_all('div', {'class': 'texto-poesia'} )

    # Pages without an author, a title or a poem cannot be stored; indexing
    # the empty result lists would raise an IndexError, so skip these pages.
    if not ( author and title and poem ):
        continue

    # One sub-directory per author; whitespace in the name becomes '_'
    dir_name = re.sub( r'\s' , '_' , author[0].text )
    path = os.path.join('Corpus' , dir_name)
    os.makedirs( path , exist_ok = True )

    # Build the file name from the title: whitespace becomes '_', and
    # slashes, commas and dots are dropped.
    file_name = re.sub( r'\s' , '_' , title[0].text.strip() )
    file_name = re.sub( r'[/,.]' , '' , file_name )

    # The with-statement closes the file even when writing fails
    with open( f'{os.path.join( path , file_name )}.txt' , 'w' , encoding = 'utf-8') as out:
        out.write( poem[0].text )

## Create a list of all the files in the corpus

In [None]:
# Collect the path of every file that sits in a sub-folder of the corpus folder.
dir = 'Corpus'
texts = []

for entry in os.listdir(dir):
    # corpus folder + sub-folder; anything that is not a folder is ignored
    path = os.path.join(dir, entry)
    if not os.path.isdir(path):
        continue

    for file in os.listdir(path):
        # corpus folder + sub-folder + file name
        file_path = os.path.join(path, file)
        print( file_path )
        texts.append( file_path )


## Create text analysis functions

In [None]:
# Download the stanza models for language code 'pt' and build the pipeline
# object; the analysis loop later calls it as nlp(<sentence>).
stanza.download('pt')       # 'pt' requests the Portuguese models (not the English ones)
nlp = stanza.Pipeline('pt')

def get_title(path):
    """Derive the title of a text from the name of the file that stores it.

    Parameters
    ----------
    path : str
        Path to a file. Only its last component (the base name) is used.

    Returns
    -------
    str
        The base name of ``path``. When the name ends in ``.txt``, that
        extension is removed together with any commas and dots that remain
        in the name. Any other base name is returned unchanged.
    """
    title = os.path.basename(path)
    # Test for the literal '.txt' suffix. A pattern such as r'txt$' also
    # matches names that merely end in the letters "txt" (without a dot), for
    # which looking up '.txt' afterwards fails.
    if title.endswith('.txt'):
        # Remove txt extension (only the suffix, not an earlier '.txt')
        title = title[ :-len('.txt') ]
        # remove commas and dots
        title = re.sub( r'[.,]' , '' , title )
    return title

## Create data for all the texts in the corpus

In [None]:

    


#print(*[f'word: {word.text}\tupos: {word.upos}\txpos: {word.xpos}\tfeats: {word.feats if word.feats else "_"}' for sent in doc.sentences for word in sent.words], sep='\n')


In [None]:
# Analyse every file listed in `texts` and write one row per text to data.csv:
# title, heteronym, token count, sentence count, type-token ratio, followed by
# the relative frequency of each tag in `pos_tags`.

# POS tags for which a relative frequency is reported
pos_tags = ['ADJ' , 'ADV' , 'VERB' , 'NOUN']

# Only the first `ttr_cap` tokens of a text contribute to the type-token ratio
ttr_cap = 3000

# The with-statement closes data.csv even when the analysis of a text fails
with open( 'data.csv' , 'w' , encoding = 'utf-8' ) as out:

    ## Header of the CSV file
    out.write('title,heteronym,tokens,sentences,ttr')
    for t in pos_tags:
        out.write(f',{t}')
    out.write('\n')

    for text in texts:

        data = dict()
        print( f'Analysing {text} ...')

        # The heteronym is the name of the folder that directly contains the
        # file. It must be derived from `text` itself: a `path` variable left
        # over from an earlier cell would give every row the same value.
        heteronym_dir = os.path.dirname( os.path.normpath(text) )
        data['heteronym'] = os.path.basename( heteronym_dir ).strip()

        ## Get the title, based on the filename
        title = get_title( text )

        ## read the full text (the file is closed again right after reading)
        with open( text, encoding = 'utf-8') as fh:
            full_text = fh.read()

        ## count the number of sentences
        # NOTE(review): sent_tokenize is called with its default language
        # setting; confirm whether language='portuguese' is wanted here.
        sentences = sent_tokenize(full_text)
        data['nr_sentences'] = len(sentences)

        # dictionary to count the POS tags
        freq_pos = dict()

        # dictionary of the distinct (lower-cased) tokens among the first
        # ttr_cap tokens; its size is the number of types
        freq_ttr = dict()

        # token count is initialised at 0
        data['nr_tokens'] = 0

        for s in sentences:
            # NOTE(review): tokens are counted exactly as the stanza pipeline
            # returns them. An NLTK word list with punctuation removed used to
            # be built here but was never used, so it has been dropped.
            doc = nlp(s)

            for sent in doc.sentences:
                for word in sent.words:
                    token = word.text.lower()

                    # count the tokens
                    data['nr_tokens'] += 1

                    # register the token as a type, but only while the token
                    # count has not yet exceeded ttr_cap
                    if data['nr_tokens'] <= ttr_cap:
                        freq_ttr[ token ] = freq_ttr.get( token , 0 ) + 1

                    ## Count the POS tag of every word; this has to happen
                    ## inside the word loop, otherwise only the last word of
                    ## each sentence is counted
                    freq_pos[ word.upos ] = freq_pos.get( word.upos , 0 ) + 1

        for t in pos_tags:
            data[t] = freq_pos.get(t,0)

        # Calculate TTR: number of types divided by the number of tokens that
        # were inspected, i.e. at most ttr_cap. Dividing by ttr_cap itself
        # would understate the ratio of every text shorter than ttr_cap tokens.
        ttr_tokens = min( data['nr_tokens'] , ttr_cap )
        data['ttr'] = len( freq_ttr ) / ttr_tokens if ttr_tokens else 0.0

        # write the results to the CSV file
        out.write( f"{title},{data['heteronym']},{data['nr_tokens']},{data['nr_sentences']},{data['ttr']}" )
        for t in pos_tags:
            # a text without any tokens gets 0.0 instead of a division by zero
            relative_freq = data[t] / data['nr_tokens'] if data['nr_tokens'] else 0.0
            out.write( f",{relative_freq}"  )
        out.write('\n')

print('Done!')