# Down the Rabbit hole
Text analysis of Lewis Carroll's (a.k.a. Charles Dodgson's) "Alice in Wonderland" and "Through the Looking-Glass".


<img src ="../Down-the-rabbit-hole/image.png"/> 

In [1]:
import numpy as np
import pandas as pd


In [46]:
# Read the entire book into a single string (decoded as UTF-8); the path is relative.
with open("../Down-the-rabbit-hole/11-0.txt", encoding='utf8') as f:
    file= f.read()

In [47]:
# Split on the literal marker "CHAPTER". Element 0 is whatever precedes the
# first marker, so it is discarded; every remaining piece starts with the
# rest of its heading line (e.g. " I. Down the Rabbit-Hole").
raw = file.split("CHAPTER")[1:]


In [48]:
# One row per chapter chunk. The frame is built directly from `file` instead of
# from the previous value of `raw`, so this cell can be re-run safely: it never
# tries to wrap an already-built DataFrame inside another DataFrame.
raw = pd.DataFrame({"chapter": file.split("CHAPTER")[1:]})
raw.head()

Unnamed: 0,chapter
0,I. Down the Rabbit-Hole\n\nAlice was beginnin...
1,II. The Pool of Tears\n\n‘Curiouser and curio...
2,III. A Caucus-Race and a Long Tale\n\nThey we...
3,IV. The Rabbit Sends in a Little Bill\n\nIt w...
4,V. Advice from a Caterpillar\n\nThe Caterpill...


In [49]:
chapters = raw.copy()

# Get the chapter title, number, text.
# Each chunk begins with its heading line. Because the text was split on the
# bare word "CHAPTER", that line starts with a blank (" I. Down the ..."), so
# every extracted piece is stripped of surrounding whitespace.
chapters['Chapter_Title'] = chapters.chapter.str.split("\n").str[0].str.strip()

# Split on the FIRST period only: the part before it is the numeral, the part
# after it is the full title (which may itself contain periods).
heading_parts = chapters.Chapter_Title.str.split(".", n=1)
chapters['Chapter_Num'] = heading_parts.str[0].str.strip()
chapters['Title'] = heading_parts.str[1].str.strip()
chapters

Unnamed: 0,chapter,Chapter_Title,Chapter_Num,Title
0,I. Down the Rabbit-Hole\n\nAlice was beginnin...,I. Down the Rabbit-Hole,I,Down the Rabbit-Hole
1,II. The Pool of Tears\n\n‘Curiouser and curio...,II. The Pool of Tears,II,The Pool of Tears
2,III. A Caucus-Race and a Long Tale\n\nThey we...,III. A Caucus-Race and a Long Tale,III,A Caucus-Race and a Long Tale
3,IV. The Rabbit Sends in a Little Bill\n\nIt w...,IV. The Rabbit Sends in a Little Bill,IV,The Rabbit Sends in a Little Bill
4,V. Advice from a Caterpillar\n\nThe Caterpill...,V. Advice from a Caterpillar,V,Advice from a Caterpillar
5,VI. Pig and Pepper\n\nFor a minute or two she...,VI. Pig and Pepper,VI,Pig and Pepper
6,VII. A Mad Tea-Party\n\nThere was a table set...,VII. A Mad Tea-Party,VII,A Mad Tea-Party
7,VIII. The Queen’s Croquet-Ground\n\nA large r...,VIII. The Queen’s Croquet-Ground,VIII,The Queen’s Croquet-Ground
8,IX. The Mock Turtle’s Story\n\n‘You can’t thi...,IX. The Mock Turtle’s Story,IX,The Mock Turtle’s Story
9,X. The Lobster Quadrille\n\nThe Mock Turtle s...,X. The Lobster Quadrille,X,The Lobster Quadrille


In [50]:
def remove_Chapter_Title(text):
    """
    Return `text` without its first line (the chapter heading).

    text: one chapter chunk whose first line is the heading

    Everything up to and including the first newline is removed, together with
    any further line breaks that directly follow it, so the result starts at
    the first character of the body no matter how many blank lines separate
    heading and body. A chunk that contains no newline at all yields "".
    """
    _heading, _newline, body = text.partition("\n")
    # Drop the blank line(s) between heading and body without touching any
    # indentation the body itself may start with.
    return body.lstrip("\r\n")

In [51]:
# The presence of `Chapter_Title` marks chapters whose heading line has not
# been removed yet. Guarding on it makes the cell safe to re-run: without the
# guard a second run would strip the first body paragraph of every chapter and
# then fail when dropping a column that no longer exists.
if 'Chapter_Title' in chapters.columns:
    chapters['chapter'] = chapters.chapter.map(remove_Chapter_Title)
    chapters = chapters.drop('Chapter_Title', axis=1)
chapters

Unnamed: 0,chapter,Chapter_Num,Title
0,Alice was beginning to get very tired of sitti...,I,Down the Rabbit-Hole
1,‘Curiouser and curiouser!’ cried Alice (she wa...,II,The Pool of Tears
2,They were indeed a queer-looking party that as...,III,A Caucus-Race and a Long Tale
3,"It was the White Rabbit, trotting slowly back ...",IV,The Rabbit Sends in a Little Bill
4,The Caterpillar and Alice looked at each other...,V,Advice from a Caterpillar
5,For a minute or two she stood looking at the h...,VI,Pig and Pepper
6,There was a table set out under a tree in fron...,VII,A Mad Tea-Party
7,A large rose-tree stood near the entrance of t...,VIII,The Queen’s Croquet-Ground
8,‘You can’t think how glad I am to see you agai...,IX,The Mock Turtle’s Story
9,"The Mock Turtle sighed deeply, and drew the ba...",X,The Lobster Quadrille


In [28]:
# Unnest function
def unnest_tokens(dataframe, column, tokenizer, new_column='word'):
    """
    Tokenize a text column and return a frame with one row per token.

    dataframe: dataframe with text
    column: name of the column with the text
    tokenizer: a function that takes one text value and returns a sequence
               of tokens (tokens may be strings or any other objects)
    new_column: what you want the column of tokens to be called

    Returns a new dataframe with a fresh 0..n-1 index whose columns are, in
    order: `<new_column>_num` (1-based position of the token within its source
    row), `<new_column>` (the token), then every column of `dataframe` except
    `column`, repeated for each token of the row. Rows whose tokenizer output
    is empty contribute no rows. Every token the tokenizer returns is kept,
    including None/NaN values.
    """
    new_column_num = new_column + "_num"

    # Flatten by hand instead of padding ragged token lists into a wide frame
    # and relying on NaN-dropping to remove the padding again.
    row_positions = []   # positional row in `dataframe` each token came from
    token_numbers = []   # 1-based position of the token inside that row
    tokens = []
    for row_position, row_tokens in enumerate(dataframe[column].apply(tokenizer)):
        for token_number, token in enumerate(row_tokens, start=1):
            row_positions.append(row_position)
            token_numbers.append(token_number)
            tokens.append(token)

    unnested = pd.DataFrame(
        {
            new_column_num: pd.Series(token_numbers, dtype="int64"),
            # object dtype keeps list-like tokens intact as single cells
            new_column: pd.Series(tokens, dtype=object),
        },
        columns=[new_column_num, new_column],
    )

    # Repeat the remaining columns by position, so a non-unique index on
    # `dataframe` cannot multiply rows the way a label-based join would.
    carried = (
        dataframe.drop(column, axis=1)
                 .iloc[row_positions]
                 .reset_index(drop=True)
    )
    return unnested.join(carried, how='left')

In [52]:
def paragraph_tokenizer(text):
    """Cut `text` into paragraphs at every blank line (two consecutive newlines)."""
    blank_line = "\n\n"
    pieces = text.split(blank_line)
    return pieces
# First unnest level: one row per paragraph. paragraph_num is the 1-based
# position of the paragraph within its chapter row.
paragraphs = unnest_tokens(chapters, "chapter", paragraph_tokenizer, "paragraph")
paragraphs.head()

Unnamed: 0,paragraph_num,paragraph,Chapter_Num,Title
0,1,Alice was beginning to get very tired of sitti...,I,Down the Rabbit-Hole
1,2,So she was considering in her own mind (as wel...,I,Down the Rabbit-Hole
2,3,There was nothing so VERY remarkable in that; ...,I,Down the Rabbit-Hole
3,4,"In another moment down went Alice after it, ne...",I,Down the Rabbit-Hole
4,5,The rabbit-hole went straight on like a tunnel...,I,Down the Rabbit-Hole


In [53]:

# Show the first paragraph in full: the line wraps of the source file are still embedded as "\n".
paragraphs.paragraph[0]

'Alice was beginning to get very tired of sitting by her sister on the\nbank, and of having nothing to do: once or twice she had peeped into the\nbook her sister was reading, but it had no pictures or conversations in\nit, ‘and what is the use of a book,’ thought Alice ‘without pictures or\nconversations?’'

In [54]:
# Remove linebreaks within a paragraph: paragraphs were already separated on
# blank lines, so every newline still present is replaced by a single space.
paragraphs['paragraph'] = paragraphs.paragraph.str.replace("\n", " ")

In [33]:

# Show the first paragraph again to confirm the embedded newlines are gone.
paragraphs.paragraph[0]

'Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, ‘and what is the use of a book,’ thought Alice ‘without pictures or conversations?’'

In [55]:
#get sentences
import spacy

# 'en' is the legacy shortcut-link name. spaCy v3 removed shortcut links and
# raises OSError for it, so fall back to the explicit package name there.
# Trying 'en' first keeps loading exactly the same model on older installs.
try:
    nlp = spacy.load('en')
except OSError:
    nlp = spacy.load('en_core_web_sm')

In [56]:
def sentence_tokenizer(text):
    """
    Split `text` into sentences with the module-level spaCy pipeline `nlp`.

    text: the text to segment

    Returns a list of sentence strings with surrounding whitespace stripped.
    """
    doc = nlp(text)
    # Span.text is available in every spaCy release, whereas Span.string was
    # removed in spaCy v3; after .strip() both give the same string.
    sentences = [sent.text.strip() for sent in doc.sents]
    return sentences

In [57]:
# Second unnest level: one row per sentence. sentence_num is the 1-based
# position of the sentence within its paragraph row.
sentences = unnest_tokens(paragraphs, "paragraph", sentence_tokenizer, "sentence")
sentences.head()

Unnamed: 0,sentence_num,sentence,paragraph_num,Chapter_Num,Title
0,1,Alice was beginning to get very tired of sitti...,1,I,Down the Rabbit-Hole
1,1,So she was considering in her own mind (as wel...,2,I,Down the Rabbit-Hole
2,1,There was nothing so VERY remarkable in that; ...,3,I,Down the Rabbit-Hole
3,2,Oh dear!,3,I,Down the Rabbit-Hole
4,3,Oh dear!,3,I,Down the Rabbit-Hole


In [58]:

# Sanity check: (number of sentence rows, number of columns).
sentences.shape

(1680, 5)

In [59]:
# Show the first sentence row in full.
sentences.sentence[0]

'Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, ‘and what is the use of a book,’ thought Alice ‘without pictures or conversations?’'

In [60]:
# Scratch check: run the first sentence through the spaCy pipeline by hand.
# NOTE(review): `test` and `doc` are not used again in the cells visible here;
# confirm nothing later depends on them before removing this cell.
test = sentences.sentence[0]
doc = nlp(test)

In [61]:

def word_tokenizer(text):
    """
    Run `text` through the module-level `nlp` pipeline and describe each token.

    Returns a list holding one 4-item list per token, in the order
    [text, lemma_, pos_, is_stop].
    """
    records = []
    for tok in nlp(text):
        records.append([tok.text, tok.lemma_, tok.pos_, tok.is_stop])
    return records

In [62]:
# Third unnest level: one row per token. At this point each `word` cell still
# holds the whole record list returned by word_tokenizer.
words = unnest_tokens(sentences, "sentence", word_tokenizer, "word")


In [63]:

# Preview the first token rows.
words.head()

Unnamed: 0,word_num,word,sentence_num,paragraph_num,Chapter_Num,Title
0,1,"[Alice, alice, PROPN, False]",1,1,I,Down the Rabbit-Hole
1,2,"[was, be, VERB, True]",1,1,I,Down the Rabbit-Hole
2,3,"[beginning, begin, VERB, False]",1,1,I,Down the Rabbit-Hole
3,4,"[to, to, PART, True]",1,1,I,Down the Rabbit-Hole
4,5,"[get, get, VERB, True]",1,1,I,Down the Rabbit-Hole


In [64]:
# Spread each [text, lemma, pos, is_stop] record held in `word` over four
# columns. Guarded on `lemma` so that re-running the cell is a no-op: once
# `word` holds plain strings the records can no longer be expanded.
if 'lemma' not in words.columns:
    token_fields = pd.DataFrame(
        words.word.values.tolist(),
        columns=['word', 'lemma', 'POS', 'stop'],
        index=words.index,  # align explicitly with the rows of `words`
    )
    for field_name in ['word', 'lemma', 'POS', 'stop']:
        words[field_name] = token_fields[field_name]

In [65]:
# Check the last token rows, now with the separate lemma / POS / stop columns.
words.tail()

Unnamed: 0,word_num,word,sentence_num,paragraph_num,Chapter_Num,Title,lemma,POS,stop
32338,115,summer,1,71,XII,Alice’s Evidence,summer,NOUN,False
32339,116,days,1,71,XII,Alice’s Evidence,day,NOUN,False
32340,117,.,1,71,XII,Alice’s Evidence,.,PUNCT,False
32341,1,THE,1,72,XII,Alice’s Evidence,the,DET,False
32342,2,END,1,72,XII,Alice’s Evidence,end,NOUN,False
