# Natural Language Processing of Books Using Python
The book analyzed in this notebook is *Miracle in the Andes*.

## Load the book

In [2]:
import pandas as pd

In [4]:
# Read the whole book into a single string:
#   file.read()      -> entire content as one string (used here)
#   file.readlines() -> list of strings
with open("miracle_in_the_andes.txt", "r", encoding="utf-8") as file:
    book = file.read()

## How many chapters?

### With strings method

In [8]:
# str.count() counts every occurrence of the substring, wherever it appears in the text
book.count("Chapter")

11

- The book actually has only 10 chapters, but the string method counts 11, because the word "Chapter" also appears inside a paragraph. That is why we need a regex to match a more specific pattern.

### With Regex

In [9]:
import re

In [10]:
# "[0-9]" matches exactly one digit after "Chapter ".
pattern = re.compile("Chapter [0-9]")
pattern.findall(book)

['Chapter 1',
 'Chapter 2',
 'Chapter 3',
 'Chapter 4',
 'Chapter 5',
 'Chapter 6',
 'Chapter 7',
 'Chapter 8',
 'Chapter 9',
 'Chapter 1']

There are 10 chapters, but Chapter 10 is shown as "Chapter 1". This happens because `"Chapter [0-9]"` matches only a single digit, so we need to add `+` to match one or more digits.

In [11]:
# "[0-9]+" matches one or more digits, so multi-digit chapter numbers are captured whole.
pattern = re.compile("Chapter [0-9]+")
findings = pattern.findall(book)
findings

['Chapter 1',
 'Chapter 2',
 'Chapter 3',
 'Chapter 4',
 'Chapter 5',
 'Chapter 6',
 'Chapter 7',
 'Chapter 8',
 'Chapter 9',
 'Chapter 10']

In [12]:
# Number of chapter headings matched by the pattern
len(findings)

10

The result is now correct: there are 10 chapters in this book.

In [13]:
# Check for headings written with a lowercase word instead of a number
# (i.e. "Chapter " followed by one or more lowercase letters).
pattern = re.compile("Chapter [a-z]+")
findings = pattern.findall(book)
len(findings)

0

## Which are the sentences where "love" was used?

In [15]:
# A bare literal pattern returns one "love" string per occurrence, with no context.
pattern = re.compile("love")
findings = pattern.findall(book)
findings[:5]

['love', 'love', 'love', 'love', 'love']

In [16]:
# [a-zA-Z]* matches zero or more letters, so this captures " love " together
# with the letters directly before it and the letters directly after it.
pattern = re.compile("[a-zA-Z]* love [a-zA-Z]*")
findings = pattern.findall(book)
findings[:5]

['passionate love for',
 'a love of',
 'to love the',
 'in love with',
 'the love and']

In [17]:
# [^.]*    = any run of characters other than a period (line breaks included)
# " love " = the word with a single space on each side
# \.       = the literal period that closes the sentence; an unescaped "."
#            would match any character, not only a period
pattern = re.compile(r"[^.]* love [^.]*\.")
findings = pattern.findall(book)
findings[:5]

[' As a young man, of course, I could not put these things into words, but I knew, and my teammates knew, that there was something special about the game, and under the guidance of the Christian Brothers we developed a passionate love for the sport that shaped our friendships and our lives.',
 ' Guido and I grew up together, playing soccer and sharing a love of motorcycles, cars, and auto racing.',
 ' Under the guidance of the Christian Brothers, both of us grew to love the game of rugby with a consuming passion.',
 ' That rowdiness came to an abrupt end for Guido in 1969, when he met and fell in love with the beautiful daughter of a Chilean diplomat.',
 ' I believe he had a great hunger for the love and comforts of a family that was happy and whole.']

In [18]:
# Number of sentences matched by the " love " pattern
len(findings)

49

- With the negation pattern we can select everything before and after "love" except a period (.). Its weakness is that forms such as "love," are not detected, because `[^.]* love ` requires a space right after the word.
- The solution is to not put literal spaces in the pattern.

In [19]:
# [^.]*      = any run of characters other than a period
# [^a-zA-Z]+ = one or more NON-letters (space, comma, quote, line break, ...)
#              on each side of "love", so "love," is matched while "loved"
#              or "glove" is not
# \.         = the literal period ending the sentence; an unescaped "." would
#              match any character
pattern = re.compile(r"[^.]*[^a-zA-Z]+love[^a-zA-Z]+[^.]*\.")
findings = pattern.findall(book)
len(findings)

67

The number of findings is higher than with the previous pattern (`"[^.]* love [^.]*."`).

In [20]:
# Same sentence pattern, but the match must begin at an uppercase letter, so the
# result no longer starts with the whitespace that follows the previous period.
# A character class already matches exactly one character, so "{1}" is not needed.
# The closing period is escaped (\.) because an unescaped "." matches any character.
pattern = re.compile(r"[A-Z][^.]*[^a-zA-Z]+love[^a-zA-Z]+[^.]*\.")
findings = pattern.findall(book)
len(findings)

67

In [21]:
# Show the first matched sentence
findings[:1]

['As a young man, of course, I could not put these things into words, but I knew, and my teammates knew, that there was something special about the game, and under the guidance of the Christian Brothers we developed a passionate love for the sport that shaped our friendships and our lives.']

## Most common words

In [23]:
# A character class without a quantifier matches one letter at a time.
pattern = re.compile("[a-zA-Z]")
findings = pattern.findall(book)
findings[:5]

['C', 'h', 'a', 'p', 't']

In [25]:
# With "+" each run of consecutive letters becomes one token (a word).
# The text is lowercased first, so "The" and "the" produce the same token.
pattern = re.compile("[a-zA-Z]+")
findings = pattern.findall(book.lower())
findings[:5]

['chapter', 'before', 'it', 'was', 'friday']

In [26]:
# Total number of word tokens found in the lowercased text
len(findings)

86798

In [27]:
# Tally how often each word occurs: d maps word -> number of occurrences.
# A word seen for the first time starts from 0 before being incremented.
d = {}
for word in findings:
    d[word] = d.get(word, 0) + 1

In [86]:
# Build (count, word) pairs so that tuple comparison ranks by count first;
# (word, count) pairs would be ordered alphabetically instead.
d_list = [(value, key) for key, value in d.items()]
d_list.sort(reverse=True)
d_list[:4]

[(5346, 'the'), (2795, 'and'), (2729, 'i'), (2400, 'to')]

- The **most common word** is **'the'**, with a frequency of 5346.
- However, 'the' is a **stop word** in Natural Language Processing.
- Stop words are usually ignored, because we are interested in the **meaningful words**.

In [87]:
# Counter-example: with (key, value) pairs the tuples compare by word first,
# so sorting orders them by word instead of by frequency.
# Only the first four entries of the dict are sorted here.
# A separate name is used so the frequency-sorted `d_list` is not overwritten;
# later cells still expect it to hold (count, word) pairs.
d_items = list(d.items())
sorted(d_items[:4], reverse=True)

[('was', 1430), ('it', 800), ('chapter', 11), ('before', 93)]

The pairs are sorted alphabetically by word (here in reverse order), not by frequency.

## Extract the paragraphs where "love" was used

In [31]:
# [^\n]+ matches a run of anything except a line break, so each match stays
# within the single line of text that surrounds an occurrence of "love".
pattern = re.compile("[^\n]+love[^\n]+")
findings = pattern.findall(book)
findings[:1]

['To me, this is the essence of rugby. No other sport gives you such an intense sense of selflessness and unified purpose. I believe this is why rugby players all over the world feel such a passion for the game and such a feeling of brotherhood. As a young man, of course, I could not put these things into words, but I knew, and my teammates knew, that there was something special about the game, and under the guidance of the Christian Brothers we developed a passionate love for the sport that shaped our friendships and our lives. For eight years we played our hearts out for the Christian Brothers—a brotherhood of young boys with Latin names, playing a game with deep Anglo roots under Uruguay’s sunny skies, and proudly wearing the bright green shamrock on our uniforms. The game became so much a part of our lives, in fact, that when we graduated from Stella Maris at the age of sixteen, many of us could not bear the thought that our playing days were over. Our salvation came in the form of

## Extract the chapter titles

### Method 1

In [32]:
# In the text file each title is followed by two line breaks.
# Without a capture group, findall() returns the whole match: the run of
# letters immediately before the blank line plus the two line breaks.
pattern = re.compile("[a-zA-Z]+\n\n")
findings = pattern.findall(book)

# Remove the trailing line breaks. str.strip()/rstrip() take a *set* of
# characters rather than a substring, so a single "\n" is all that is needed.
findings = [item.rstrip("\n") for item in findings]
findings[:5]

['Before', 'Precious', 'Promise', 'More', 'Abandoned']

### Method 2

In [34]:
# With a capture group, findall() returns only the group's text, so the two
# trailing line breaks are required for a match but left out of the result.
pattern = re.compile("([a-zA-Z]+)\n\n")
findings = pattern.findall(book)
findings[:5]

['Before', 'Precious', 'Promise', 'More', 'Abandoned']

## Function that finds the occurrence of any word

In [46]:
def find(w, text=None):
    """Count the exact, case-sensitive occurrences of a word in a text.

    The text is split into runs of ASCII letters (``[a-zA-Z]+``) and the
    tokens equal to ``w`` are counted. No lowercasing is applied, so
    ``"The"`` and ``"the"`` are counted separately.

    Parameters
    ----------
    w : str
        Word to look up.
    text : str, optional
        Text to search. When omitted, the notebook-level ``book`` string
        is used, which keeps existing ``find(word)`` calls working.

    Returns
    -------
    int or str
        The number of occurrences when the word is present; otherwise the
        message ``'The book does not contain the word "<w>"'``.
    """
    if text is None:
        text = book

    # Count the matching tokens directly instead of building a full
    # frequency dictionary and hiding a failed lookup behind a bare except.
    occurrences = re.findall("[a-zA-Z]+", text).count(w)
    if occurrences:
        return occurrences
    return f'The book does not contain the word "{w}"'

['Before', 'Precious', 'Promise', 'More', 'Abandoned', 'Tomb', 'East', 'Death', 'Man', 'After']


### Call the function

In [47]:
# find() does not lowercase the text, so only the exact spelling "the" is counted
find('the')

5013

In [51]:
# A word that is not found yields the explanatory message instead of a count
find('hate')

'The book does not contain the word "hate"'

## The most used words (non-articles)

In [53]:
# Show the version of the Python interpreter running this notebook
from platform import python_version
python_version()

'3.11.4'

In [55]:
#library nltk
import nltk

In [58]:
# Load NLTK's list of English stop words
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be downloaded beforehand — confirm
from nltk.corpus import stopwords
english_stopwords = stopwords.words("english")

In [88]:
# Peek at the first five stop words
english_stopwords[:5]

['i', 'me', 'my', 'myself', 'we']

In [63]:
# First five entries of d_list, before any stop-word filtering
d_list[:5]

[(5346, 'the'), (2795, 'and'), (2729, 'i'), (2400, 'to'), (2060, 'of')]

In [64]:
# Keep only the (count, word) pairs whose word is not an English stop word.
# The stop-word list is converted to a set once, so each membership test is a
# hash lookup instead of a scan through the whole list. The order of d_list
# is preserved.
stopword_set = set(english_stopwords)
filtered_words = [(count, word) for count, word in d_list if word not in stopword_set]

In [89]:
# First five entries that remain after removing stop words
filtered_words[:5]

[(575, 'would'), (519, 'us'), (292, 'said'), (284, 'roberto'), (252, 'could')]

The most used word that is not a stop word is 'would', with 575 occurrences.

## Sentiment Analysis : What is the most positive and the most negative chapter?

### Example

In [69]:
from nltk.sentiment import SentimentIntensityAnalyzer

In [70]:
# Instantiate the analyzer once; the cells below reuse this object
analyzer = SentimentIntensityAnalyzer()

In [71]:
# dir() lists the attributes and methods available on the analyzer object
dir(analyzer)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_amplify_ep',
 '_amplify_qm',
 '_but_check',
 '_idioms_check',
 '_least_check',
 '_never_check',
 '_punctuation_emphasis',
 '_sift_sentiment_scores',
 'constants',
 'lexicon',
 'lexicon_file',
 'make_lex_dict',
 'polarity_scores',
 'score_valence',
 'sentiment_valence']

In [72]:
# polarity_scores() returns a dict with 'neg', 'neu', 'pos' and 'compound' entries;
# this example uses a sentence with positive wording
analyzer.polarity_scores("Hey, look how fun the movie is. I love the main characters")

{'neg': 0.0, 'neu': 0.545, 'pos': 0.455, 'compound': 0.8176}

In [74]:
# Same call on a sentence with negative wording; the result is kept in `scores`
# so the next cell can compare its 'pos' and 'neg' entries
scores = analyzer.polarity_scores("Hey, look how bad the movie is. I hate the main characters")
scores

{'neg': 0.444, 'neu': 0.556, 'pos': 0.0, 'compound': -0.802}

- neg = negativity ==> range (0 to 1)
- neu = neutrality ==> range (0 to 1)
- pos = positivity ==> range (0 to 1)
- compound = compound coefficient ==> range (-1 to 1)
- compound >0 ==> more positivity
- compound <0 ==> more negativity

In [75]:
# Label the text by comparing the positive and negative proportions;
# a tie falls into the negative branch.
is_positive = scores['pos'] > scores['neg']
print("It is a positive text" if is_positive else "It is a negative text")

It is a negative text


### Chapters sentiment analysis

In [76]:
# Score the entire book as a single text
analyzer.polarity_scores(book)

{'neg': 0.116, 'neu': 0.76, 'pos': 0.125, 'compound': 1.0}

In [77]:
# Pattern matching the chapter headings: "Chapter " followed by one or more digits
pattern = re.compile("Chapter [0-9]+")
pattern

re.compile(r'Chapter [0-9]+', re.UNICODE)

In [78]:
# Cut the book at every "Chapter N" heading; the pattern has no capture group,
# so the headings themselves are not part of the resulting pieces.
chapters = pattern.split(book)

In [80]:
# The split also returns the text before the first heading (an empty string, ''),
# so the list holds one element more than there are chapters.
# That extra element needs to be removed.
len(chapters)

11

The book has 10 chapters, but the split also returns an empty string (`''`) for the text before the first heading, so the total is 11.

In [81]:
# re.split() puts whatever precedes the first "Chapter N" heading at index 0
# (an empty string for this book), so that element is dropped.
# The list is rebuilt from `book` rather than sliced in place, so running this
# cell more than once cannot discard a real chapter.
chapters = re.split(pattern, book)[1:]

In [83]:
# Score each chapter separately; `nr` from enumerate() is 0-based, so 1 is
# added when printing to get the chapter number.
for nr, chapter in enumerate(chapters):
    scores = analyzer.polarity_scores(chapter)
    print(nr + 1, scores)

1 {'neg': 0.061, 'neu': 0.779, 'pos': 0.16, 'compound': 1.0}
2 {'neg': 0.12, 'neu': 0.726, 'pos': 0.154, 'compound': 0.9991}
3 {'neg': 0.145, 'neu': 0.751, 'pos': 0.105, 'compound': -0.9999}
4 {'neg': 0.141, 'neu': 0.721, 'pos': 0.138, 'compound': -0.9963}
5 {'neg': 0.118, 'neu': 0.742, 'pos': 0.141, 'compound': 0.9997}
6 {'neg': 0.124, 'neu': 0.761, 'pos': 0.115, 'compound': -0.9979}
7 {'neg': 0.136, 'neu': 0.761, 'pos': 0.103, 'compound': -0.9999}
8 {'neg': 0.12, 'neu': 0.786, 'pos': 0.094, 'compound': -0.9998}
9 {'neg': 0.097, 'neu': 0.824, 'pos': 0.079, 'compound': -0.9996}
10 {'neg': 0.086, 'neu': 0.733, 'pos': 0.181, 'compound': 1.0}
