## Counting Words

In [1]:
import matplotlib.pyplot as plt

In [2]:
text = "This is my test text. We're keeping text short to keep things manageable."

def count_words(text):
    
    """Counts the number of times a word can appear in a text """

    text = text.lower()
    skips = [".",",",";",":","'",'"']
    for ch in skips:
        text = text.replace(ch,"") 
    
    word_counts = {}
    for word in text.split(" "):
        if word in word_counts:
            word_counts[word] += 1
        else:
            word_counts[word] = 1

    return word_counts

count_words(text)

{'this': 1,
 'is': 1,
 'my': 1,
 'test': 1,
 'text': 2,
 'were': 1,
 'keeping': 1,
 'short': 1,
 'to': 1,
 'keep': 1,
 'things': 1,
 'manageable': 1}

In [3]:
from collections import Counter

def count_words_fast(text):
    
    """Counts the number of times a word can appear in a text """

    text = text.lower()
    skips = [".",",",";",":","'",'"']
    for ch in skips:
        text = text.replace(ch,"") 
    
    word_counts = Counter(text.split(" "))
    return word_counts

count_words_fast(text)

# count_words_fast(text) is count_words(text)

Counter({'this': 1,
         'is': 1,
         'my': 1,
         'test': 1,
         'text': 2,
         'were': 1,
         'keeping': 1,
         'short': 1,
         'to': 1,
         'keep': 1,
         'things': 1,
         'manageable': 1})

## Reading in a Book

In [4]:
def read_book(title_path):
    """Opens a book and replaces '\n' and '\r' in the text"""
    with open(title_path,"r",encoding='utf8') as current_file:
        text = current_file.read()
        text = text.replace("\n","").replace("\r","")
    return text

text = read_book("./books/Books_EngFr/English/shakespeare/Romeo and Juliet.txt")
#len(text)
ind = text.find("What's in a name?")

sample_text = text[ind:ind + 1000]
sample_text

"What's in a name? That which we call a rose    By any other name would smell as sweet.    So Romeo would, were he not Romeo call'd,    Retain that dear perfection which he owes    Without that title. Romeo, doff thy name;    And for that name, which is no part of thee,    Take all myself.  Rom. I take thee at thy word.    Call me but love, and I'll be new baptiz'd;    Henceforth I never will be Romeo.  Jul. What man art thou that, thus bescreen'd in night,    So stumblest on my counsel?  Rom. By a name    I know not how to tell thee who I am.    My name, dear saint, is hateful to myself,    Because it is an enemy to thee.    Had I it written, I would tear the word.  Jul. My ears have yet not drunk a hundred words    Of that tongue's utterance, yet I know the sound.    Art thou not Romeo, and a Montague?  Rom. Neither, fair saint, if either thee dislike.  Jul. How cam'st thou hither, tell me, and wherefore?    The orchard walls are high and hard to climb,    And the place death, consid

## Computing word Freq Stats

In [5]:
def word_stats(word_counts):
    """Return number of unique words and word freq"""
    num_unique = len(word_counts) #cantidad de palabras totales (solo cuenta una por más que se repitan)
    counts = word_counts.values() # cantidad de palabras totales (por más que se repiten,
                                  # es como para saber el contenido total del texto)
    return (num_unique,counts)

word_counts = count_words(text)

In [6]:
en_text = read_book("./books/Books_EngFr/English/shakespeare/Romeo and Juliet.txt")
en_word_counts = count_words(en_text)
text_english = word_stats(en_word_counts)

de_text = read_book("./books/Books_GerPort/German/shakespeare/Romeo und Julia.txt")
de_word_counts = count_words(de_text)
text_deutsch = word_stats(de_word_counts)

In [None]:
# text = "This is my test text. We're keeping text short to keep things manageable."
word_counts = count_words(text)
word_stats(word_counts)

## Reading multiple files

In [11]:
import os
import pandas as pd

book_dir = './books'
stats = pd.DataFrame(columns = ("Language", "Author", "Title", "Length", "Unique words"))
title_num = 1

for folder in os.listdir(book_dir):
    for language in os.listdir(book_dir + '/' + folder):
        for author in os.listdir(book_dir + '/' + folder + '/' + language ):
            for title in os.listdir(book_dir + '/' + folder + '/' + language + '/' + author):
                inputfile = book_dir + '/' + folder + '/' + language + '/' + author + '/' + title
                text = read_book(inputfile)
                (num_unique , counts) = word_stats(count_words(text))
                stats.loc[title_num] = language,author, title, sum(counts), num_unique
                title_num += 1


NotADirectoryError: [Errno 20] Not a directory: './books/Books_EngFr/English/.DS_Store'