# Down the Rabbit hole
Text analysis of "Alice's Adventures in Wonderland" and "Through the Looking Glass". Alice's Adventures in Wonderland is an 1865 novel written by English author Charles Lutwidge Dodgson under the pseudonym Lewis Carroll.


<img src ="../Down-the-rabbit-hole/image.png"/> 

In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk import FreqDist
import matplotlib.pyplot as plt

# Get chapters

In [2]:
with open("../Down-the-rabbit-hole/11-0.txt", encoding='utf8') as f:
    file= f.read()

In [3]:
raw = file.split("CHAPTER")[1:]


In [4]:
raw = pd.DataFrame({"chapter": raw})
raw.head()

Unnamed: 0,chapter
0,I. Down the Rabbit-Hole\n\nAlice was beginnin...
1,II. The Pool of Tears\n\n‘Curiouser and curio...
2,III. A Caucus-Race and a Long Tale\n\nThey we...
3,IV. The Rabbit Sends in a Little Bill\n\nIt w...
4,V. Advice from a Caterpillar\n\nThe Caterpill...


In [6]:
chapters = raw.copy()

# Get the chapter title, number, text
chapters['Chapter_Title'] = chapters.chapter.str.split("\n").str[0]
chapters['Chapter_Num'] = chapters.Chapter_Title.str.split(".").str[0]
chapters['Title'] = chapters.Chapter_Title.str.split(".").str[1]
chapters

Unnamed: 0,chapter,Chapter_Title,Chapter_Num,Title
0,I. Down the Rabbit-Hole\n\nAlice was beginnin...,I. Down the Rabbit-Hole,I,Down the Rabbit-Hole
1,II. The Pool of Tears\n\n‘Curiouser and curio...,II. The Pool of Tears,II,The Pool of Tears
2,III. A Caucus-Race and a Long Tale\n\nThey we...,III. A Caucus-Race and a Long Tale,III,A Caucus-Race and a Long Tale
3,IV. The Rabbit Sends in a Little Bill\n\nIt w...,IV. The Rabbit Sends in a Little Bill,IV,The Rabbit Sends in a Little Bill
4,V. Advice from a Caterpillar\n\nThe Caterpill...,V. Advice from a Caterpillar,V,Advice from a Caterpillar
5,VI. Pig and Pepper\n\nFor a minute or two she...,VI. Pig and Pepper,VI,Pig and Pepper
6,VII. A Mad Tea-Party\n\nThere was a table set...,VII. A Mad Tea-Party,VII,A Mad Tea-Party
7,VIII. The Queen’s Croquet-Ground\n\nA large r...,VIII. The Queen’s Croquet-Ground,VIII,The Queen’s Croquet-Ground
8,IX. The Mock Turtle’s Story\n\n‘You can’t thi...,IX. The Mock Turtle’s Story,IX,The Mock Turtle’s Story
9,X. The Lobster Quadrille\n\nThe Mock Turtle s...,X. The Lobster Quadrille,X,The Lobster Quadrille


In [7]:
def remove_Chapter_Title(text):
    length = len(text.split("\n")[0]) + 2
    return text[length:]

In [8]:
chapters['chapter'] = chapters.chapter.map(remove_Chapter_Title)
chapters.drop('Chapter_Title', inplace=True, axis=1)
chapters

Unnamed: 0,chapter,Chapter_Num,Title
0,Alice was beginning to get very tired of sitti...,I,Down the Rabbit-Hole
1,‘Curiouser and curiouser!’ cried Alice (she wa...,II,The Pool of Tears
2,They were indeed a queer-looking party that as...,III,A Caucus-Race and a Long Tale
3,"It was the White Rabbit, trotting slowly back ...",IV,The Rabbit Sends in a Little Bill
4,The Caterpillar and Alice looked at each other...,V,Advice from a Caterpillar
5,For a minute or two she stood looking at the h...,VI,Pig and Pepper
6,There was a table set out under a tree in fron...,VII,A Mad Tea-Party
7,A large rose-tree stood near the entrance of t...,VIII,The Queen’s Croquet-Ground
8,‘You can’t think how glad I am to see you agai...,IX,The Mock Turtle’s Story
9,"The Mock Turtle sighed deeply, and drew the ba...",X,The Lobster Quadrille


In [9]:
# Unnest function
def unnest_tokens(dataframe, column, tokenizer, new_column='word'): 
    """
    Applies a tokenizer to a column and then unnest the dataframe
    
    dataframe: dataframe with text
    column: name of the column with the text
    new_column: what you want the column of words to be called
    tokenizer: a function to be used to tokenize the text
    """ 
    new_column_num = new_column + "_num"
    df = ( dataframe[column]
              .apply(tokenizer)
              .apply(pd.Series)
              .stack()
              .reset_index(level=1)
              .rename(columns={0: new_column, "level_1":new_column_num})
              .join(dataframe.drop(column, axis=1), how='left')
              .reset_index(drop=True)
         )
    df[new_column_num] = pd.to_numeric(df[new_column_num])
    df[new_column_num] = df[new_column_num] + 1
    return df

# Get Paragraphs

In [10]:
def paragraph_tokenizer(text):
    return text.split("\n\n")
paragraphs = unnest_tokens(chapters, "chapter", paragraph_tokenizer, "paragraph")
paragraphs.head()

Unnamed: 0,paragraph_num,paragraph,Chapter_Num,Title
0,1,Alice was beginning to get very tired of sitti...,I,Down the Rabbit-Hole
1,2,So she was considering in her own mind (as wel...,I,Down the Rabbit-Hole
2,3,There was nothing so VERY remarkable in that; ...,I,Down the Rabbit-Hole
3,4,"In another moment down went Alice after it, ne...",I,Down the Rabbit-Hole
4,5,The rabbit-hole went straight on like a tunnel...,I,Down the Rabbit-Hole


In [11]:

paragraphs.paragraph[0]

'Alice was beginning to get very tired of sitting by her sister on the\nbank, and of having nothing to do: once or twice she had peeped into the\nbook her sister was reading, but it had no pictures or conversations in\nit, ‘and what is the use of a book,’ thought Alice ‘without pictures or\nconversations?’'

In [46]:
# Remove linebreaks within a paragraph
paragraphs['paragraph'] = paragraphs.paragraph.str.replace("\n", " ")
paragraphs['paragraph'] = paragraphs.paragraph.str.replace(",", " ")
paragraphs['paragraph'] = paragraphs.paragraph.str.replace("!", " ")
paragraphs['paragraph'] = paragraphs.paragraph.str.replace(";", " ")

In [47]:

paragraphs.paragraph[0]

'Alice was beginning to get very tired of sitting by her sister on the bank  and of having nothing to do: once or twice she had peeped into the book her sister was reading  but it had no pictures or conversations in it  ‘and what is the use of a book ’ thought Alice ‘without pictures or conversations?’'

In [48]:
#get sentences
import spacy
nlp = spacy.load('en')

# Get Sentences

In [15]:
def sentence_tokenizer(text):
    doc = nlp(text)
    sentences = [sent.string.strip() for sent in doc.sents]
    return sentences

In [49]:
sentences = unnest_tokens(paragraphs, "paragraph", sentence_tokenizer, "sentence")
sentences.head()

Unnamed: 0,sentence_num,sentence,paragraph_num,Chapter_Num,Title
0,1,Alice was beginning to get very tired of sitti...,1,I,Down the Rabbit-Hole
1,1,So she was considering in her own mind (as wel...,2,I,Down the Rabbit-Hole
2,1,There was nothing so VERY remarkable in that ...,3,I,Down the Rabbit-Hole
3,2,Oh dear,3,I,Down the Rabbit-Hole
4,3,Oh dear,3,I,Down the Rabbit-Hole


In [50]:

sentences.shape

(2013, 5)

In [51]:
sentences.sentence[0]

'Alice was beginning to get very tired of sitting by her sister on the bank  and of having nothing to do: once or twice she had peeped into the book her sister was reading  but it had no pictures or conversations in it  ‘and what is the use of a book ’ thought Alice ‘without pictures or conversations?’'

In [52]:
test = sentences.sentence[0]
doc = nlp(test)

# Get Sentences

In [53]:

def word_tokenizer(text):
    doc = nlp(text)
    words = [[token.text, token.lemma_, token.pos_, token.is_stop] for token in doc]
    return words

In [54]:
words = unnest_tokens(sentences, "sentence", word_tokenizer, "word")


In [55]:

words.head()

Unnamed: 0,word_num,word,sentence_num,paragraph_num,Chapter_Num,Title
0,1,"[Alice, alice, PROPN, False]",1,1,I,Down the Rabbit-Hole
1,2,"[was, be, VERB, True]",1,1,I,Down the Rabbit-Hole
2,3,"[beginning, begin, VERB, False]",1,1,I,Down the Rabbit-Hole
3,4,"[to, to, PART, True]",1,1,I,Down the Rabbit-Hole
4,5,"[get, get, VERB, True]",1,1,I,Down the Rabbit-Hole


In [56]:
words[['word', 'lemma', 'POS', "stop"]] = pd.DataFrame(words.word.values.tolist())

In [57]:
words.tail()

Unnamed: 0,word_num,word,sentence_num,paragraph_num,Chapter_Num,Title,lemma,POS,stop
32724,115,summer,1,71,XII,Alice’s Evidence,summer,NOUN,False
32725,116,days,1,71,XII,Alice’s Evidence,day,NOUN,False
32726,117,.,1,71,XII,Alice’s Evidence,.,PUNCT,False
32727,1,THE,1,72,XII,Alice’s Evidence,the,DET,False
32728,2,END,1,72,XII,Alice’s Evidence,end,NOUN,False


# Finding Puns


In [26]:
from metaphone import doublemetaphone


In [28]:
def metaphone(name):
        return doublemetaphone(name)[0]

In [58]:
words['word'] = words['word'].astype(str)
words['meta_word'] = words['word'].apply(metaphone)
words.tail()

Unnamed: 0,word_num,word,sentence_num,paragraph_num,Chapter_Num,Title,lemma,POS,stop,meta_word
32724,115,summer,1,71,XII,Alice’s Evidence,summer,NOUN,False,SMR
32725,116,days,1,71,XII,Alice’s Evidence,day,NOUN,False,TS
32726,117,.,1,71,XII,Alice’s Evidence,.,PUNCT,False,
32727,1,THE,1,72,XII,Alice’s Evidence,the,DET,False,0
32728,2,END,1,72,XII,Alice’s Evidence,end,NOUN,False,ANT


In [59]:
# To find homophones in same sentence 
dfpun = pd.DataFrame()
pun= 0
for t in range(0,32341,1):
    i = t
    j = t+1
    while True:
        if((words.loc[i]['meta_word'] == words.loc[j]['meta_word'] and words.loc[i]['word'] != words.loc[j]['word'])):
            punny=words.loc[i]
            dfpun = dfpun.append(punny)
            break
        else:
            j=j+1
        if(j > i+10 or j == 32342):
            break
        
dfpun

Unnamed: 0,Chapter_Num,POS,Title,lemma,meta_word,paragraph_num,sentence_num,stop,word,word_num
15,I,SPACE,Down the Rabbit-Hole,,,1.0,1.0,0.0,,16.0
20,I,PART,Down the Rabbit-Hole,to,T,1.0,1.0,1.0,to,21.0
45,I,PRON,Down the Rabbit-Hole,-PRON-,AT,1.0,1.0,1.0,it,46.0
46,I,SPACE,Down the Rabbit-Hole,,,1.0,1.0,0.0,,47.0
47,I,PRON,Down the Rabbit-Hole,‘,,1.0,1.0,0.0,‘,48.0
50,I,VERB,Down the Rabbit-Hole,be,AS,1.0,1.0,1.0,is,51.0
56,I,PUNCT,Down the Rabbit-Hole,',,1.0,1.0,0.0,’,57.0
66,I,VERB,Down the Rabbit-Hole,be,AS,2.0,1.0,1.0,was,3.0
68,I,ADP,Down the Rabbit-Hole,in,AN,2.0,1.0,1.0,in,5.0
72,I,PUNCT,Down the Rabbit-Hole,(,,2.0,1.0,0.0,(,9.0


In [60]:
dfpun= dfpun.reset_index(drop=True)

In [61]:
dfpun

Unnamed: 0,Chapter_Num,POS,Title,lemma,meta_word,paragraph_num,sentence_num,stop,word,word_num
0,I,SPACE,Down the Rabbit-Hole,,,1.0,1.0,0.0,,16.0
1,I,PART,Down the Rabbit-Hole,to,T,1.0,1.0,1.0,to,21.0
2,I,PRON,Down the Rabbit-Hole,-PRON-,AT,1.0,1.0,1.0,it,46.0
3,I,SPACE,Down the Rabbit-Hole,,,1.0,1.0,0.0,,47.0
4,I,PRON,Down the Rabbit-Hole,‘,,1.0,1.0,0.0,‘,48.0
5,I,VERB,Down the Rabbit-Hole,be,AS,1.0,1.0,1.0,is,51.0
6,I,PUNCT,Down the Rabbit-Hole,',,1.0,1.0,0.0,’,57.0
7,I,VERB,Down the Rabbit-Hole,be,AS,2.0,1.0,1.0,was,3.0
8,I,ADP,Down the Rabbit-Hole,in,AN,2.0,1.0,1.0,in,5.0
9,I,PUNCT,Down the Rabbit-Hole,(,,2.0,1.0,0.0,(,9.0


In [62]:
dfpunnew = dfpun.drop(dfpun[dfpun.POS == "PUNCT"].index)
dfpunnew = dfpunnew.drop(dfpunnew[dfpunnew.POS == "SPACE"].index)
dfpunnew = dfpunnew.drop(dfpunnew[dfpunnew.POS == "DET"].index)
dfpunnew = dfpunnew.drop(dfpunnew[dfpunnew.POS == "CCONJ"].index)
dfpunnew = dfpunnew.drop(dfpunnew[dfpunnew.POS == "NUM"].index)
dfpunnew = dfpunnew.drop(dfpunnew[dfpunnew.POS == "PRON"].index)
dfpunnew = dfpunnew.drop(dfpunnew[dfpunnew.POS == "ADP"].index)
dfpunnew = dfpunnew.drop(dfpunnew[dfpunnew.POS == "PART"].index)
dfpunnew = dfpunnew.drop(dfpunnew[dfpunnew.stop == True].index)
dfpunnew = dfpunnew.drop(dfpunnew[dfpunnew.lemma == "-PRON-"].index)
dfpunnew

Unnamed: 0,Chapter_Num,POS,Title,lemma,meta_word,paragraph_num,sentence_num,stop,word,word_num
13,I,NOUN,Down the Rabbit-Hole,eye,AS,2.0,1.0,0.0,eyes,58.0
16,I,NOUN,Down the Rabbit-Hole,way,A,3.0,1.0,0.0,way,21.0
19,I,INTJ,Down the Rabbit-Hole,oh,A,3.0,2.0,0.0,Oh,1.0
20,I,INTJ,Down the Rabbit-Hole,oh,A,3.0,3.0,0.0,Oh,1.0
23,I,VERB,Down the Rabbit-Hole,think,0T,3.0,6.0,0.0,thought,4.0
26,I,VERB,Down the Rabbit-Hole,ought,AT,3.0,6.0,0.0,ought,15.0
40,I,VERB,Down the Rabbit-Hole,go,ANT,5.0,1.0,0.0,went,5.0
45,I,VERB,Down the Rabbit-Hole,go,ANT,6.0,1.0,0.0,went,22.0
68,I,ADV,Down the Rabbit-Hole,why,A,7.0,2.0,0.0,Why,1.0
89,I,ADV,Down the Rabbit-Hole,very,FR,8.0,5.0,0.0,VERY,42.0


In [63]:
dfpunnew.groupby('Title')["word"].count()

Title
 A Caucus-Race and a Long Tale        33
 A Mad Tea-Party                      50
 Advice from a Caterpillar            31
 Alice’s Evidence                     23
 Down the Rabbit-Hole                 28
 Pig and Pepper                       39
 The Lobster Quadrille                56
 The Mock Turtle’s Story              38
 The Pool of Tears                    45
 The Queen’s Croquet-Ground           42
 The Rabbit Sends in a Little Bill    54
 Who Stole the Tarts?                 24
Name: word, dtype: int64

# Finding Character mentions

In [35]:
df2=pd.DataFrame({'Alice_count' : words[words['word'] == 'Alice'].groupby( [ "Title"] ).size()}).reset_index()

df2

Unnamed: 0,Title,Alice_count
0,A Caucus-Race and a Long Tale,23
1,A Mad Tea-Party,51
2,Advice from a Caterpillar,35
3,Alice’s Evidence,22
4,Down the Rabbit-Hole,28
5,Pig and Pepper,43
6,The Lobster Quadrille,29
7,The Mock Turtle’s Story,52
8,The Pool of Tears,24
9,The Queen’s Croquet-Ground,39


In [72]:
df3=pd.DataFrame({'Hatter_count' : words[words['word'] == 'Hatter'].groupby( [ "Title"] ).size()}).reset_index()
df4=pd.DataFrame({'WhiteRabbit_count' : words[words['word'] == 'Rabbit'].groupby( [ "Title"] ).size()}).reset_index()
df5=pd.DataFrame({'CheshireCat_count' : words[words['word'] == 'Cheshire'].groupby( [ "Title"] ).size()}).reset_index()
df6=pd.DataFrame({'Caterpillar_count' : words[words['word'] == 'Caterpillar'].groupby( [ "Title"] ).size()}).reset_index()
df7=pd.DataFrame({'RedQueen_count' : words[words['word'] == 'Queen'].groupby( [ "Title"] ).size()}).reset_index()
df8=pd.DataFrame({'King_count' : words[words['word'] == 'King'].groupby( [ "Title"] ).size()}).reset_index()
df9=pd.DataFrame({'Duchess_count' : words[words['word'] == 'Duchess'].groupby( [ "Title"] ).size()}).reset_index()
df10=pd.DataFrame({'Gryphon_count' : words[words['word'] == 'Gryphon'].groupby( [ "Title"] ).size()}).reset_index()
df11=pd.DataFrame({'Hare_count' : words[words['word'] == 'Hare'].groupby( [ "Title"] ).size()}).reset_index()
df12=pd.DataFrame({'Dormouse_count' : words[words['word'] == 'Dormouse'].groupby( [ "Title"] ).size()}).reset_index()
df13=pd.DataFrame({'Sister_count' : words[words['word'] == 'sister'].groupby( [ "Title"] ).size()}).reset_index()
#pd.merge(df2,df3,df4,df5,df6,df7,df8,df9,df10,df11,df12,df13 how='outer')
df14=pd.DataFrame({'Turtle_count' : words[words['word'] == 'Turtle'].groupby( [ "Title"] ).size()}).reset_index()
df15=pd.DataFrame({'Knave_count' : words[words['word'] == 'Knave'].groupby( [ "Title"] ).size()}).reset_index()



df11

Unnamed: 0,Title,Hare_count
0,A Mad Tea-Party,21
1,Alice’s Evidence,1
2,Pig and Pepper,4
3,Who Stole the Tarts?,5


In [73]:
dfX = pd.merge(df6, df7, how='outer')
dfY= pd.merge(df2, df3, how='outer')
dfZ= pd.merge(df4, df5, how='outer')
dfW = pd.merge(df8, df9, how='outer')
dfU =pd.merge(df10, df11, how='outer')
dfV= pd.merge(df12, df13, how='outer')
dfR= pd.merge(dfY, dfX, how='outer')
dfS= pd.merge(dfZ, dfW, how='outer')
dfT= pd.merge(dfU, dfV, how='outer')
dfB= pd.merge(dfR, dfS, how='outer')
dfA=pd.merge(df14, df15, how='outer')
dfC= pd.merge(dfA, dfT, how='outer')
Character_Mentions= pd.merge(dfB, dfC, how='outer')
Character_Mentions.fillna(0)
Character_Mentions.to_csv("charactermentions.csv")



# Finding made up words

In [64]:
#removing punctuation
wordnew = words.drop(words[words.POS == "PUNCT"].index)
wordnew = wordnew.drop(wordnew[words.POS == "SPACE"].index)
wordnew = wordnew.drop(wordnew[words.POS == "DET"].index)
wordnew = wordnew.drop(wordnew[words.POS == "CCONJ"].index)
wordnew = wordnew.drop(wordnew[words.POS == "NUM"].index)
wordnew = wordnew.drop(wordnew[words.POS == "PRON"].index)
wordnew = wordnew.drop(wordnew[words.POS == "ADP"].index)
wordnew = wordnew.drop(wordnew[words.POS == "PART"].index)
wordnew = wordnew.drop(wordnew[words.stop == True].index)
wordnew.tail()

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  """
  
  import sys
  
  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,word_num,word,sentence_num,paragraph_num,Chapter_Num,Title,lemma,POS,stop,meta_word
32719,110,life,1,71,XII,Alice’s Evidence,life,NOUN,False,LF
32723,114,happy,1,71,XII,Alice’s Evidence,happy,ADJ,False,HP
32724,115,summer,1,71,XII,Alice’s Evidence,summer,NOUN,False,SMR
32725,116,days,1,71,XII,Alice’s Evidence,day,NOUN,False,TS
32728,2,END,1,72,XII,Alice’s Evidence,end,NOUN,False,ANT


In [65]:
gb =wordnew.groupby('POS')
for name, group, in gb:
       print(name, len(group))

        
words.groupby('Chapter_Num')['paragraph_num'].max()
#words['word'].value_counts()

ADJ 1269
ADV 1037
INTJ 117
NOUN 3460
PROPN 1438
VERB 4124
X 15


Chapter_Num
 I        30
 II       26
 III      48
 IV       42
 IX       92
 V        78
 VI       81
 VII     105
 VIII     71
 X        85
 XI       74
 XII      72
Name: paragraph_num, dtype: int64

In [67]:
import enchant

ModuleNotFoundError: No module named 'enchant'