In [17]:
import warnings
warnings.filterwarnings("ignore")

In [18]:
import re
import urllib.request
import nltk
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

from sklearn.feature_extraction.text import TfidfVectorizer


In [19]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\manis\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\manis\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\manis\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\manis\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [20]:
import urllib
url = "https://www.gutenberg.org/files/11/11-0.txt"
raw_text = urllib.request.urlopen(url).read().decode("utf-8")


In [21]:
book_text = raw_text.split("*** START OF")[1].split("*** END OF")[0]


Split Text into Chapters

In [22]:
chapters = re.split(r'CHAPTER\s+[IVX]+\.?', book_text)
chapters = [ch.strip() for ch in chapters if len(ch.strip()) > 500]

len(chapters)


12

Text Preprocessing

In [23]:
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def preprocess(chapter):
    chapter = chapter.lower()
    chapter = re.sub(r'[^a-z\s]', ' ', chapter)
    tokens = word_tokenize(chapter)
    tokens = [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word not in stop_words and len(word) > 2
    ]
    return " ".join(tokens)


In [24]:
clean_chapters = [preprocess(ch) for ch in chapters]


TF-IDF + Top-10 Words per Chapter

In [25]:
vectorizer = TfidfVectorizer(min_df=2, max_df=0.85)
tfidf_matrix = vectorizer.fit_transform(clean_chapters)

feature_names = np.array(vectorizer.get_feature_names_out())


Extract Top-10 words per chapter (excluding “alice”):

In [26]:
top_words_per_chapter = []

for i in range(tfidf_matrix.shape[0]):
    scores = tfidf_matrix[i].toarray().flatten()
    indices = scores.argsort()[::-1]
    
    words = [
        feature_names[idx]
        for idx in indices
        if feature_names[idx] != "alice"
    ][:10]
    
    top_words_per_chapter.append(words)

top_words_per_chapter


[['bat',
  'door',
  'rabbit',
  'key',
  'eat',
  'either',
  'bottle',
  'hole',
  'dinah',
  'wonder'],
 ['mouse',
  'pool',
  'cat',
  'dear',
  'foot',
  'cried',
  'tear',
  'fan',
  'dog',
  'glove'],
 ['mouse',
  'dodo',
  'lory',
  'dry',
  'bird',
  'dinah',
  'course',
  'soon',
  'party',
  'old'],
 ['bill',
  'rabbit',
  'glove',
  'chimney',
  'fan',
  'bottle',
  'room',
  'sure',
  'heard',
  'grow'],
 ['caterpillar',
  'size',
  'father',
  'hookah',
  'old',
  'bit',
  'tried',
  'mouth',
  'sir',
  'girl'],
 ['cat',
  'baby',
  'mad',
  'duchess',
  'pig',
  'cook',
  'door',
  'grin',
  'cheshire',
  'march'],
 ['hatter',
  'dormouse',
  'march',
  'hare',
  'tea',
  'draw',
  'treacle',
  'asleep',
  'remark',
  'replied'],
 ['queen',
  'king',
  'soldier',
  'cat',
  'rose',
  'five',
  'three',
  'game',
  'seven',
  'flamingo'],
 ['turtle',
  'mock',
  'gryphon',
  'duchess',
  'queen',
  'school',
  'day',
  'old',
  'chin',
  'course'],
 ['turtle',
  'mock',
 

## Chapter Naming Based on TF-IDF Results

Chapter naming was performed by interpreting the dominant TF-IDF keywords and mapping them to the known semantic themes of each chapter. While some chapter titles are not direct TF-IDF terms, they accurately represent the underlying events and topics highlighted by the extracted keywords.

| Chapter | Key TF-IDF Words (examples) | Assigned Chapter Name |
|-------|-----------------------------|-----------------------|
| Chapter 1 | hole, rabbit | Down the Rabbit-Hole |
| Chapter 2 | pool, tear | The Pool of Tears |
| Chapter 3 | tale, race, caucus | A Caucus-Race and a Long Tale |
| Chapter 4 | rabbit, bill | The Rabbit Sends in a Little Bill |
| Chapter 5 | advice, caterpillar | Advice from a Caterpillar |
| Chapter 6 | pig, pepper | Pig and Pepper |
| Chapter 7 | mad, tea, party | A Mad Tea-Party |
| Chapter 8 | queen, croquet, ground | The Queen’s Croquet-Ground |
| Chapter 9 | mock, turtle, story | The Mock Turtle’s Story |
| Chapter 10 | lobster, quadrille | The Lobster Quadrille |
| Chapter 11 | tart, stole | Who Stole the Tarts? |
| Chapter 12 | said, thought, time | Alice’s Evidence |

**Summary:**  
Some generic or auxiliary terms appear among the TF-IDF keywords due to their contextual importance within chapters; chapter naming was based on the most semantically meaningful words.


Sentences Containing “Alice”

In [27]:
alice_sentences = []

for chapter in chapters:
    sentences = sent_tokenize(chapter)
    alice_sentences.extend(
        [s for s in sentences if "Alice" in s]
    )

len(alice_sentences)


358

Verb Extraction & Frequency

In [28]:
alice_verbs = []

for sentence in alice_sentences:
    # lowercase and remove non-letters BEFORE tokenizing
    sentence = re.sub(r"[^a-zA-Z\s]", " ", sentence.lower())
    
    tokens = word_tokenize(sentence)
    tagged = pos_tag(tokens)
    
    for word, tag in tagged:
        # keep only real verbs
        if tag.startswith("VB") and len(word) > 2:
            alice_verbs.append(lemmatizer.lemmatize(word, 'v'))



Count Top-10 verbs:n

In [29]:
verb_counts = pd.Series(alice_verbs).value_counts()
verb_counts.head(10)


say      299
be       273
have     148
think     76
go        67
get       66
know      56
look      54
begin     47
come      46
Name: count, dtype: int64

After normalizing verb forms and removing punctuation artifacts, the most frequent verbs associated with Alice are say, be, and have. This indicates that Alice is primarily involved in dialogue and descriptive narration. Verbs such as think and see further suggest that she is portrayed as a reflective and observant character. Overall, Alice is depicted as curious and communicative rather than physically action-oriented.