In [24]:
import re
import pandas as pd
from collections import Counter

# Read text

In [25]:
with open('11-0.txt', 'r', encoding='utf-8') as f:
  text = f.read()

In [26]:
pattern = r'(CHAPTER\s+[IVXLCDM]+\.\s*\n.*?\n\n)'
chapters = re.split(pattern, text)

chapter_list = []
chapter_titles = []

In [27]:
for i in range(1, len(chapters), 2):
  chapter_header = chapters[i].strip()
  chapter_content = chapters[i+1].strip() if i+1 < len(chapters) else ""

  chapter_list.append(chapter_content)
  chapter_titles.append(chapter_header)

cleaned_titles= [title.replace('\n', ' ') for title in chapter_titles]

In [28]:
df = pd.DataFrame({
  'Chapter_Title': cleaned_titles,
  'Content': chapter_list,
})
df

Unnamed: 0,Chapter_Title,Content
0,CHAPTER I. Down the Rabbit-Hole,Alice was beginning to get very tired of sitti...
1,CHAPTER II. The Pool of Tears,“Curiouser and curiouser!” cried Alice (she wa...
2,CHAPTER III. A Caucus-Race and a Long Tale,They were indeed a queer-looking party that as...
3,CHAPTER IV. The Rabbit Sends in a Little Bill,"It was the White Rabbit, trotting slowly back ..."
4,CHAPTER V. Advice from a Caterpillar,The Caterpillar and Alice looked at each other...
5,CHAPTER VI. Pig and Pepper,For a minute or two she stood looking at the h...
6,CHAPTER VII. A Mad Tea-Party,There was a table set out under a tree in fron...
7,CHAPTER VIII. The Queen’s Croquet-Ground,A large rose-tree stood near the entrance of t...
8,CHAPTER IX. The Mock Turtle’s Story,“You can’t think how glad I am to see you agai...
9,CHAPTER X. The Lobster Quadrille,"The Mock Turtle sighed deeply, and drew the ba..."


# 10 most important words besides Alice in each chapter.

In [29]:
content_list = df.Content.str.lower()

In [30]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('wordnet')

stop_words = stopwords.words("english")
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\manis\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\manis\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\manis\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [31]:
stop_words.append("wa")

↑↑↑Here, "was" will be processed as "wa", therefore the word will be added to the stop word list.↑↑↑

In [32]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [33]:
punct = string.punctuation + "’" + "“" + "”"+ "‘"
def top_10_words(contents):
  contents = word_tokenize(contents)
  contents = [content for content in contents if content not in punct]
  contents = [lemmatizer.lemmatize(content) for content in contents]
  contents = [content for content in contents if content not in stop_words]
  word_counts = Counter(contents)
  del word_counts['alice']
  return word_counts.most_common(10)

In [34]:
chapter_stats = {}

for index, row in df.iterrows():
  match = re.match(r'CHAPTER\s+[IVXLCDM]+\.', row['Chapter_Title'])
  chapter_stats[match.group()] = top_10_words(content_list[index])

In [35]:
from tabulate import tabulate

table_data = []
for chapter, words in chapter_stats.items():
  for rank, (word, count) in enumerate(words, 1):
    table_data.append([chapter, rank, word, count])

print(tabulate(table_data, headers=['Chapter', 'Rank', 'Word', 'Count'], tablefmt='grid', maxcolwidths=[12, 6, 15, 8]))

+--------------+--------+-------------+---------+
| Chapter      |   Rank | Word        |   Count |
| CHAPTER I.   |      1 | little      |      15 |
+--------------+--------+-------------+---------+
| CHAPTER I.   |      2 | way         |      11 |
+--------------+--------+-------------+---------+
| CHAPTER I.   |      3 | like        |      11 |
+--------------+--------+-------------+---------+
| CHAPTER I.   |      4 | see         |      10 |
+--------------+--------+-------------+---------+
| CHAPTER I.   |      5 | think       |       9 |
+--------------+--------+-------------+---------+
| CHAPTER I.   |      6 | door        |       9 |
+--------------+--------+-------------+---------+
| CHAPTER I.   |      7 | thought     |       8 |
+--------------+--------+-------------+---------+
| CHAPTER I.   |      8 | could       |       8 |
+--------------+--------+-------------+---------+
| CHAPTER I.   |      9 | time        |       8 |
+--------------+--------+-------------+---------+


My title (I haven't read this book before)
1. The Way Behind the Door
2. Dear Little Mouse
3. Mouse and Bird
4. The Rabbit's Words
5. Thinking Time
6. Footman
7. Dormouse Hatter
8. Cat,  King, and Queen
9. Duchess
10. Beautiful Lobster Soup
11. Witness
12. The Jury and The Little White Queen



# Find the Top 10 most used verbs in sentences with Alice.

In [36]:
nltk.download('averaged_perceptron_tagger_eng')
stemmer = PorterStemmer()

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\manis\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [37]:
sentences = nltk.sent_tokenize(text)
alice_sentences = [s for s in sentences if re.search(r'\balice\b', s, re.IGNORECASE)]

In [38]:
verbs = []

for sentences in alice_sentences:
  words = word_tokenize(sentences)
  words = [word for word in words if word not in punct]
  words = [stemmer.stem(word) for word in words]
  words = [word for word in words if word not in stop_words]
  pos_tags = nltk.pos_tag(words)

  for word, tag in pos_tags:
    if tag.startswith('VB'):
      verbs.append(word)

verbs_counts = Counter(verbs)
verbs_counts.most_common(10)

[('said', 257),
 ('thought', 50),
 ('go', 44),
 ('went', 41),
 ('say', 39),
 ('get', 28),
 ('see', 28),
 ('know', 27),
 ('got', 26),
 ('come', 26)]

Alice usually talks, thinks, and then goes somewhere else.