# Miracle in the Andes by Nando Parrado (2006)

## Load the data

In [2]:
# Read the whole book into memory as one string.  The encoding is passed
# explicitly because the text contains non-ASCII characters (for example the
# em dash in the opening sentence) and the platform default encoding is not
# UTF-8 everywhere.  NOTE(review): assumes the file is saved as UTF-8.
with open("miracle_in_the_andes.txt", "r", encoding="utf-8") as file:
    book = file.read()
# Preview the first 100 characters.
book[:100]

'Chapter 1\n\nBefore\n\n\nIT WAS FRIDAY, the thirteenth of October. We joked about that—flying over the An'

## How many chapters

### With string methods

In [3]:
# str.count counts every non-overlapping occurrence of the substring
# "Chapter" anywhere in the text, not only in chapter headings.
book.count("Chapter")

11

This result is not accurate: the book has 10 chapters, but `count` also picks up the word "Chapter" where it is not a chapter heading. Use regular expressions (regex) instead.

### With regex

In [4]:
import re

In [5]:
# "[0-9]" matches exactly one digit, so at most one digit after "Chapter "
# ends up in each match.
pattern = re.compile("Chapter [0-9]")
pattern.findall(book)

['Chapter 1',
 'Chapter 2',
 'Chapter 3',
 'Chapter 4',
 'Chapter 5',
 'Chapter 6',
 'Chapter 7',
 'Chapter 8',
 'Chapter 9',
 'Chapter 1']

Since "Chapter 10" shows up only as "Chapter 1", we need to adapt the pattern so that it also recognizes multi-digit numbers. To do that, add + after [0-9] in the pattern we are searching for: + means "one or more" of the preceding item, so [0-9]+ matches one or more digits. For lowercase letters, the corresponding class is [a-z].

In [9]:
# The "+" after the character class means "one or more digits", so a
# two-digit chapter number is matched in full.
pattern = re.compile("Chapter [0-9]+")
findings = pattern.findall(book)
print(findings)
len(findings)

['Chapter 1', 'Chapter 2', 'Chapter 3', 'Chapter 4', 'Chapter 5', 'Chapter 6', 'Chapter 7', 'Chapter 8', 'Chapter 9', 'Chapter 10']


10

## Find sentences containing the word "love"

In [19]:
# A "sentence" here starts at a capital letter and runs up to the next period.
# "love" must have at least one non-letter on each side, so it is not matched
# inside longer words such as "glove" or "beloved".
# The closing period is escaped: outside a character class an unescaped "."
# matches any character, not just a full stop.
pattern = re.compile(r"[A-Z][^.]*[^a-zA-Z]+love[^a-zA-Z]+[^.]*\.")
findings = pattern.findall(book)
print(findings[0:2])

['As a young man, of course, I could not put these things into words, but I knew, and my teammates knew, that there was something special about the game, and under the guidance of the Christian Brothers we developed a passionate love for the sport that shaped our friendships and our lives.', 'Guido and I grew up together, playing soccer and sharing a love of motorcycles, cars, and auto racing.']


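Use * to indicate 0 or more. Use + to indicate 1 or more. Use [^.] to indicate any character except ".". Use [a-zA-Z] to indicate any letter, capital or not, and [^a-zA-Z] to indicate any character that is not a letter. Use [^.]* followed by a period to match any run of non-period characters up to the closing period; note that outside a character class a bare . matches any character, so a literal period is written \. in the pattern. Use [A-Z]{1} to indicate exactly one capital letter (the {1} is optional, because a character class on its own already matches exactly one character).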

## What are the most used words

The split() method is not a good option here: it splits only on whitespace, so a word at the end of a sentence would keep its full stop (and a word followed by a comma would keep the comma) and would be counted as a different word.

In [22]:
# Lower-case the text first so that "The" and "the" become the same word,
# then collect every run of letters.
pattern = re.compile("[a-zA-Z]+")
findings = pattern.findall(book.lower())
print(findings[:5])

['chapter', 'before', 'it', 'was', 'friday']


In [25]:
# Tally how often each word occurs; a word seen for the first time starts
# from 0, so keys appear in order of first occurrence.
d = {}
for word in findings:
    d[word] = d.get(word, 0) + 1
d

{'chapter': 11,
 'before': 93,
 'it': 800,
 'was': 1430,
 'friday': 1,
 'the': 5346,
 'thirteenth': 1,
 'of': 2060,
 'october': 8,
 'we': 1226,
 'joked': 3,
 'about': 134,
 'that': 1001,
 'flying': 15,
 'over': 101,
 'andes': 79,
 'on': 576,
 'such': 57,
 'an': 205,
 'unlucky': 1,
 'day': 88,
 'but': 679,
 'young': 34,
 'men': 23,
 'make': 88,
 'those': 74,
 'kinds': 3,
 'jokes': 3,
 'so': 333,
 'easily': 8,
 'our': 496,
 'flight': 26,
 'had': 941,
 'originated': 1,
 'one': 249,
 'earlier': 9,
 'in': 1419,
 'montevideo': 27,
 'my': 1169,
 'hometown': 1,
 'its': 53,
 'destination': 2,
 'santiago': 21,
 'chile': 26,
 'a': 1566,
 'chartered': 2,
 'fairchild': 75,
 'twin': 1,
 'engine': 5,
 'turboprop': 1,
 'carrying': 5,
 'rugby': 49,
 'team': 50,
 'old': 51,
 'christians': 19,
 'club': 5,
 'to': 2400,
 'play': 19,
 'exhibition': 2,
 'match': 14,
 'against': 42,
 'top': 26,
 'chilean': 20,
 'squad': 4,
 'there': 269,
 'were': 523,
 'forty': 5,
 'five': 21,
 'people': 30,
 'aboard': 1,
 'i

In [28]:
# Pair each count with its word, count first, so that sorting the tuples
# orders them by frequency (ties fall back to the word itself).
d_list = list(zip(d.values(), d.keys()))
sorted(d_list, reverse=True)

[(5346, 'the'),
 (2795, 'and'),
 (2729, 'i'),
 (2400, 'to'),
 (2060, 'of'),
 (1566, 'a'),
 (1430, 'was'),
 (1419, 'in'),
 (1226, 'we'),
 (1169, 'my'),
 (1001, 'that'),
 (946, 'he'),
 (941, 'had'),
 (800, 'it'),
 (705, 'for'),
 (700, 'as'),
 (679, 'but'),
 (632, 'with'),
 (617, 'me'),
 (576, 'on'),
 (575, 'would'),
 (530, 'his'),
 (523, 'were'),
 (519, 'us'),
 (496, 'our'),
 (481, 'at'),
 (452, 'from'),
 (420, 's'),
 (381, 'you'),
 (373, 'all'),
 (360, 'they'),
 (350, 'him'),
 (333, 'so'),
 (320, 'this'),
 (316, 'be'),
 (292, 'said'),
 (290, 't'),
 (284, 'roberto'),
 (282, 'not'),
 (275, 'them'),
 (274, 'is'),
 (273, 'when'),
 (269, 'there'),
 (260, 'her'),
 (257, 'by'),
 (252, 'have'),
 (252, 'could'),
 (251, 'no'),
 (249, 'one'),
 (229, 'into'),
 (228, 'or'),
 (227, 'their'),
 (227, 'snow'),
 (209, 'more'),
 (205, 'an'),
 (201, 'what'),
 (198, 'now'),
 (198, 'if'),
 (195, 'then'),
 (183, 'mountain'),
 (182, 'time'),
 (181, 'she'),
 (181, 'are'),
 (177, 'been'),
 (176, 'will'),
 (174, 

## Extract the paragraphs where "love" was used

In [60]:
# A paragraph is treated as a run of characters without a line break; keep
# the runs that contain "love" with at least one character on either side.
pattern = re.compile("[^\n]+love[^\n]+")
findings = pattern.findall(book)
print(findings[:2])

['To me, this is the essence of rugby. No other sport gives you such an intense sense of selflessness and unified purpose. I believe this is why rugby players all over the world feel such a passion for the game and such a feeling of brotherhood. As a young man, of course, I could not put these things into words, but I knew, and my teammates knew, that there was something special about the game, and under the guidance of the Christian Brothers we developed a passionate love for the sport that shaped our friendships and our lives. For eight years we played our hearts out for the Christian Brothers—a brotherhood of young boys with Latin names, playing a game with deep Anglo roots under Uruguay’s sunny skies, and proudly wearing the bright green shamrock on our uniforms. The game became so much a part of our lives, in fact, that when we graduated from Stella Maris at the age of sixteen, many of us could not bear the thought that our playing days were over. Our salvation came in the form of

## Extract the chapter titles

In [43]:
# Step 1: grab each heading block, i.e. "Chapter N", a blank line, the title
# line and the blank line that follows it.
pattern = re.compile("Chapter [0-9]+[\n]{2}[^\n]+[\n]{2}")
findings = pattern.findall(book)
print(findings)

# Step 2: split every heading block into its words (digits and line breaks
# are dropped because only runs of letters are kept).
pattern2 = re.compile("[a-zA-Z]+")
findings2 = [pattern2.findall(heading) for heading in findings]
print(findings2)

['Chapter 1\n\nBefore\n\n', 'Chapter 2\n\nEverything Precious\n\n', 'Chapter 3\n\nA Promise\n\n', 'Chapter 4\n\nBreathe Once More\n\n', 'Chapter 5\n\nAbandoned\n\n', 'Chapter 6\n\nTomb\n\n', 'Chapter 7\n\nEast\n\n', 'Chapter 8\n\nThe Opposite of Death\n\n', 'Chapter 9\n\nI See a Man\n\n', 'Chapter 10\n\nAfter\n\n']
[['Chapter', 'Before'], ['Chapter', 'Everything', 'Precious'], ['Chapter', 'A', 'Promise'], ['Chapter', 'Breathe', 'Once', 'More'], ['Chapter', 'Abandoned'], ['Chapter', 'Tomb'], ['Chapter', 'East'], ['Chapter', 'The', 'Opposite', 'of', 'Death'], ['Chapter', 'I', 'See', 'a', 'Man'], ['Chapter', 'After']]


In [61]:
# Drop the leading "Chapter" from every word list and glue the remaining
# words back together into the title.
chapters = [" ".join(words[1:]) for words in findings2]
print(chapters)

['Before', 'Everything Precious', 'A Promise', 'Breathe Once More', 'Abandoned', 'Tomb', 'East', 'The Opposite of Death', 'I See a Man', 'After']


### better version:

In [63]:
# Anchor the match on the "Chapter N" heading so that only the line that
# follows a heading is treated as a title; without the anchor any line made
# of letters and spaces in front of a blank line would be picked up as well.
# The parentheses form a capture group: findall returns only the text
# matched inside them, i.e. the title without the heading and line breaks.
pattern = re.compile("Chapter [0-9]+\n\n([^\n]+)\n\n")
findings = pattern.findall(book)
print(findings)

['Before', 'Everything Precious', 'A Promise', 'Breathe Once More', 'Abandoned', 'Tomb', 'East', 'The Opposite of Death', 'I See a Man', 'After']


Here, the parentheses () form a capture group: findall returns only what is matched inside them, so the \n\n will not be returned.

## Function that finds the number of occurrences of any word

In [55]:
def occurences(word, text=None):
    """Return how many times *word* occurs in a text, ignoring case.

    Words are runs of ASCII letters in the lower-cased text, so
    punctuation and digits never belong to a word.

    Args:
        word: The word to look for; upper and lower case are treated alike.
        text: The text to search.  When omitted, the module-level ``book``
            is searched.

    Returns:
        The number of occurrences as an ``int``, or a message string when
        the word does not occur at all.
    """
    if text is None:
        text = book
    # The text is lower-cased before it is split into words, so the query
    # has to be lower-cased too; otherwise capitalised input never matches.
    count = re.findall("[a-zA-Z]+", text.lower()).count(word.lower())
    if count:
        return count
    return f"The word {word} is not in the book."

In [56]:
# Look up a very common word.
occurences("and")

2795

In [57]:
# Lower-case query: the function counts words in the lower-cased text.
occurences("roberto")

284

In [58]:
# A word that never occurs yields a message string rather than 0.
occurences("hate")

'The word hate is not in the book.'