In [25]:
import os
import re
import nltk
from nltk import word_tokenize , sent_tokenize , pos_tag
from tdmh import *
import requests

## Acquire the texts

In [26]:
## create subdirectories in Corpus

if not os.path.exists('Corpus'):
    os.mkdir('Corpus')
    
path1 = os.path.join('Corpus' , 'Austen')
if not os.path.exists( path1 ):
    os.mkdir( path1 )  
    
path2 = os.path.join('Corpus' , 'Scott')
if not os.path.exists( path2 ):
    os.mkdir( path2 )  

In [27]:
## Download files from Gutenberg Metadata

import pandas as pd
import re
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

github = 'https://raw.githubusercontent.com/peterverhaar/introduction_to_dh/main/'
url_csv = github + 'gutenberg_metadata.csv'

response = requests.get(url_csv)

if response:
    response.encoding = 'utf-8' 
    with open( 'gutenberg_metadata.csv' , 'w' , encoding = 'utf-8' ) as f:
        f.write(response.text)

md = pd.read_csv( 'gutenberg_metadata.csv')


def download_files(df , dir_name ):

    for index,row in df.iterrows():
        url = row['url']
        title = row['title']
        title = re.sub( r'\s+' , '_' , title)
        title = re.sub( r'[/]' , '_' , title)
        title = re.sub( r'\n' , '_' , title)
        print( f"Downloading {row['author']}, {title} ... " )
        

        response = requests.get(url)

        if response:
            response.encoding = 'utf-8' 
            full_text = response.text
            full_text = remove_pg_boilerplate(full_text)
            
            path = os.path.join( dir_name , f'{title}.txt' )
            print(path)
            out = open( path , 'w' , encoding = 'utf-8' )
            out.write( full_text )
            out.close()
            
md = md[ md['language'] == 'en'  ]       
            
md_scott = md[ md['author'] == 'Scott Walter'  ]
dir_name = os.path.join( 'Corpus' , 'Scott' )
download_files( md_scott , dir_name )

md_austen = md[ md['author'] == 'Austen Jane'  ]
dir_name = os.path.join( 'Corpus' , 'Austen' )
download_files( md_austen , dir_name )

Downloading Scott Walter, Ivanhoe:_A_Romance ... 
Corpus/Scott/Ivanhoe:_A_Romance.txt
Downloading Scott Walter, The_Bride_of_Lammermoor ... 
Corpus/Scott/The_Bride_of_Lammermoor.txt
Downloading Scott Walter, The_Talisman ... 
Corpus/Scott/The_Talisman.txt
Downloading Scott Walter, The_Black_Dwarf ... 
Corpus/Scott/The_Black_Dwarf.txt
Downloading Scott Walter, A_Legend_of_Montrose ... 


KeyboardInterrupt: 

## Create a list of all the files in the corpus

In [28]:

texts = []

# path1 = os.path.join( 'Corpus' , 'Scott' )
# path2 = os.path.join( 'Corpus' , 'Scott' )
subcorpora = ['Scott','Austen']


author_dir = dict()

for dir in subcorpora:
    path = os.path.join( 'Corpus' , dir )
    for file in os.listdir(path):
        if re.search( r'txt$' , file ):
            path = os.path.join( 'Corpus' , dir , file )
            author_dir[path] = dir
            texts.append(path)



## create text analysis functions

In [29]:


def get_title(path):
    title = os.path.basename(path)
    if re.search( r'txt$' , title ):
        # Remove txt extension
        title = title[ :title.index('.txt') ]
        # remove commas and dots
        title = re.sub( r'[.,]' , '' , title )
    return title

## create data for all  the texts in the corpus

In [30]:
out = open( 'data.csv' , 'w' , encoding = 'utf-8' )

pos_tags = ['JJ' , 'MD' , 'JJR' , 'JJS' , 'VBD']

## Header of the CSV file
out.write('title,author,tokens,sentences,ttr')

for t in pos_tags:
    out.write(f',{t}')
out.write('\n')

for text in texts:
    
    data = dict()
    print( f'Analysing {text} ...')
    
    ## Get the title, based on the filename
    title = get_title( text )
    
    ## read the full text
    fh = open( text, encoding = 'utf-8')
    full_text = fh.read()
    
    ## count the number of sentences
    sentences = sent_tokenize(full_text)
    data['nr_sentences'] = len(sentences)
    
    # dictionary to count the POS tags
    freq_pos = dict()    
    
    # variables for the calculation of type-token ratio
    ttr_cap = 3000
    freq_ttr = dict()
 
    # token count is initalised at 0
    data['nr_tokens'] = 0

    for s in sentences:
        words = word_tokenize(s)
        words = remove_punctuation(words)
        
        tags = pos_tag(words)
        # Each tag consists of two values: 
        # [0]: the word and [1] the POS tag
        for word_tag in tags:
            word = word_tag[0]
            tag = word_tag[1]
            
            # count the tokens
            data['nr_tokens'] += 1
            
            # place tokens in dictionary freq_ttr
            # only if the word count is less than ttr_cap
            # The nr of items in the dictionary eventually equals the nr of types
            if data['nr_tokens'] <= ttr_cap:
                freq_ttr[ word ] = freq_ttr.get( word , 0 ) + 1
                
            ## Count frequencies of all the POS tags
            freq_pos[ tag ] = freq_pos.get( tag ,0) +1
            
    for t in pos_tags:
        data[t] = freq_pos.get(t,0)
                
    # Calculate TTR: number of items in freq_ttr dictyionary
    # divided by ttr_cap
    data['ttr'] = len( freq_ttr ) / ttr_cap
    
    # write the results to a CSV file    
    out.write( f"{title},{author_dir[text]},{data['nr_tokens']},{data['nr_sentences']},{data['ttr']}" )
    for t in pos_tags:
        out.write( f",{data[t] / data['nr_tokens'] }"  )
    out.write('\n')
    
out.close()
print('Done!')

Analysing Corpus/Scott/Redgauntlet:_A_Tale_Of_The_Eighteenth_Century.txt ...
Analysing Corpus/Scott/Guy_Mannering;_or_The_Astrologer_—_Complete.txt ...
Analysing Corpus/Scott/Kenilworth.txt ...
Analysing Corpus/Scott/The_Heart_of_Mid-Lothian_Complete.txt ...
Analysing Corpus/Scott/Old_Mortality_Complete.txt ...
Analysing Corpus/Scott/Rob_Roy_—_Volume_01.txt ...
Analysing Corpus/Scott/Minstrelsy_of_the_Scottish_Border_Volume_3_(of_3)\r\nConsisting_of_Historical_and_Romantic_Ballads_Collected_in_the_Southern_Counties_of_Scotland;_with_a_Few_of_Modern_Date_Founded_Upon_Local_Tradition.txt ...
Analysing Corpus/Scott/Life_of_Napoleon_Bonaparte_Volume_II..txt ...
Analysing Corpus/Scott/The_Heart_of_Mid-Lothian_Volume_2.txt ...
Analysing Corpus/Scott/The_Bride_of_Lammermoor.txt ...
Analysing Corpus/Scott/Life_of_Napoleon_Bonaparte_Volume_III..txt ...
Analysing Corpus/Scott/The_Monastery.txt ...
Analysing Corpus/Scott/Rob_Roy_—_Volume_02.txt ...
Analysing Corpus/Scott/A_Legend_of_Montrose.txt 

ZeroDivisionError: division by zero