## Opening the book

In [1]:
with open("miracle_in_the_andes.txt", "r", encoding="utf-8") as file:
    book = file.read()

## The most used words (non-articles)

In [2]:
import re
pattern = re.compile("[a-zA-Z]+")
finding = re.findall(pattern, book.lower())
len(finding)

86798

In [3]:
finding[:10]

['chapter',
 'before',
 'it',
 'was',
 'friday',
 'the',
 'thirteenth',
 'of',
 'october',
 'we']

#### constructing dictionary

In [4]:
d = {}
for word in finding:
    if word in d.keys():
        d[word] = d[word] + 1
    else:
        d[word] = 1

#### Making dictionary with number before word, then sorting

In [5]:
d_list = [(value, key) for (key, value) in d.items()]
d_list = sorted(d_list, reverse = True)
sorted(d_list, reverse = True)[:15]

[(5346, 'the'),
 (2795, 'and'),
 (2729, 'i'),
 (2400, 'to'),
 (2060, 'of'),
 (1566, 'a'),
 (1430, 'was'),
 (1419, 'in'),
 (1226, 'we'),
 (1169, 'my'),
 (1001, 'that'),
 (946, 'he'),
 (941, 'had'),
 (800, 'it'),
 (705, 'for')]

## installing nltk and import

In [6]:
from platform import python_version
python_version()

'3.12.2'

In [7]:
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Nenad\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### seting up english stopwords (that mean nothing on it own)

In [10]:
english_stopwords = stopwords.words("english")

In [12]:
# english_stopwords

#### Filtering out stopwords from all words count in book

In [13]:
filtered_words = []
for count, word in d_list:
    if word not in english_stopwords:
        filtered_words.append((word, count))
filtered_words[:20]

[('would', 575),
 ('us', 519),
 ('said', 292),
 ('roberto', 284),
 ('could', 252),
 ('one', 249),
 ('snow', 227),
 ('mountain', 183),
 ('time', 182),
 ('like', 165),
 ('way', 164),
 ('life', 161),
 ('knew', 155),
 ('mountains', 147),
 ('fuselage', 140),
 ('still', 137),
 ('felt', 127),
 ('father', 127),
 ('others', 126),
 ('found', 126)]

## Sentiment analysis, most positive/negative chapter

In [30]:
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon');

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Nenad\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [34]:
analyzer = SentimentIntensityAnalyzer()

In [35]:
analyzer
nltk.sentiment.vader.SentimentIntensityAnalyzer

nltk.sentiment.vader.SentimentIntensityAnalyzer

In [36]:
scores = analyzer.polarity_scores("Hey, look how beautiful the trees are.  I love them.")

In [37]:
scores

{'neg': 0.0, 'neu': 0.464, 'pos': 0.536, 'compound': 0.8442}

In [38]:
if scores["pos"] > scores["neg"]:
    print("It is a positive text")
else:
    print("It is a negative text")

It is a positive text


## Chapters sentiment analysis

In [40]:
import re
pattern = re.compile("Chapter [0-9]+")
chapters = re.split(pattern, book)

In [43]:
chapters = chapters[1:]

In [47]:
for nr, chapter in enumerate(chapters):
    scores = analyzer.polarity_scores(chapter)
    print(nr, scores)

0 {'neg': 0.061, 'neu': 0.779, 'pos': 0.16, 'compound': 1.0}
1 {'neg': 0.12, 'neu': 0.726, 'pos': 0.154, 'compound': 0.9991}
2 {'neg': 0.145, 'neu': 0.751, 'pos': 0.105, 'compound': -0.9999}
3 {'neg': 0.141, 'neu': 0.721, 'pos': 0.138, 'compound': -0.9963}
4 {'neg': 0.118, 'neu': 0.742, 'pos': 0.141, 'compound': 0.9997}
5 {'neg': 0.124, 'neu': 0.761, 'pos': 0.115, 'compound': -0.9979}
6 {'neg': 0.136, 'neu': 0.761, 'pos': 0.103, 'compound': -0.9999}
7 {'neg': 0.12, 'neu': 0.786, 'pos': 0.094, 'compound': -0.9998}
8 {'neg': 0.097, 'neu': 0.824, 'pos': 0.079, 'compound': -0.9996}
9 {'neg': 0.086, 'neu': 0.733, 'pos': 0.181, 'compound': 1.0}
