Task 4:
1.	Download Alice in Wonderland by Lewis Carroll from Project Gutenberg's website http://www.gutenberg.org/files/11/11-0.txt
2.	Perform any necessary preprocessing on the text, including converting to lower case, removing stop words, numbers / non-alphabetic characters, lemmatization.
3.	Find Top 10 most important (for example, in terms of TF-IDF metric) words from each chapter in the text (not "Alice"); how would you name each chapter according to the identified tokens?
4.	Find the Top 10 most used verbs in sentences with Alice. What does Alice do most often?

# Part 1

In [54]:
import requests
import re
import pandas as pd

In [55]:
# Downloading and saving the text from the website
source = requests.get('http://www.gutenberg.org/files/11/11-0.txt')
text = source.content

# Part 2. Preprocessing

In [56]:
type(text)

bytes

In [57]:
text



In [58]:
# Decoding from bytes object to a string
text = text.decode('UTF-8')
text



In [59]:
# Removing all irrelevant characters
text = re.sub(r'[\t\n\r*]', ' ', text)
text = re.sub(r'\[.*?\]', '', text) # objects in brackets []
text = re.sub(r'[_\\]', '', text)

# Removing irrelevant text
text = re.sub(r'^.*?CHAPTER XII.   Alice’s Evidence', '', text) # deleting site information and the contents from the start
text = re.sub(r'END OF THE PROJECT GUTENBERG EBOOK.*?$', '', text) # deleting technical information after the end

In [60]:
# Removing odd spaces

text = re.sub(r' +', ' ', text)

In [61]:
#Deviding into chapters
chapters = re.findall(r'CHAPTER \w+', text)
text_dict = {}

In [62]:
#Deviding into chapters
chapters = re.findall(r'CHAPTER \w+', text) # saving all the chapters into an array
text_dict = {}

for chapter in chapters[:-1]:
    text_dict[chapter] = re.findall(r'{}\.+.*?CHAPTER'.format(chapter), text) #parsing from one chapter start to the next chapter start
    text_dict[chapter] = re.sub(r'CHAPTER \w+\.', '', text_dict[chapter][0]) #deleting the CHAPTER sign at the beginning
    text_dict[chapter] = re.sub(r'CHAPTER', '', text_dict[chapter]) #deleting the CHAPTER sign at the end

#parsing the last chapter from the beginning of the chapter to the end
text_dict[chapters[-1]] = re.findall(r'{}\.+.*?$'.format(chapters[-1]), text)
text_dict[chapters[-1]] = re.sub(r'CHAPTER \w+\.', '', text_dict[chapters[-1]][0])
text_dict[chapters[-1]] = re.sub(r'THE END.*?$', '', text_dict[chapters[-1]])

text_dict

{'CHAPTER I': ' Down the Rabbit-Hole Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, “and what is the use of a book,” thought Alice “without pictures or conversations?” So she was considering in her own mind (as well as she could, for the hot day made her feel very sleepy and stupid), whether the pleasure of making a daisy-chain would be worth the trouble of getting up and picking the daisies, when suddenly a White Rabbit with pink eyes ran close by her. There was nothing so very remarkable in that; nor did Alice think it so very much out of the way to hear the Rabbit say to itself, “Oh dear! Oh dear! I shall be late!” (when she thought it over afterwards, it occurred to her that she ought to have wondered at this, but at the time it all seemed quite natural); but when the Rabbit actually took a watch out of its waist

In [63]:
from nltk.tokenize import WhitespaceTokenizer
tokens_dict = {}

for chapter in chapters:
    #removing all symbols except letters and digits
    tokens_dict[chapter] = re.sub(r'[\.,!\?"—\(\);“”:]', ' ', text_dict[chapter]) #except apostrophe (')
    #removing the odd spaces
    tokens_dict[chapter] = re.sub(r' +', ' ', tokens_dict[chapter])
    tokens_dict[chapter] = re.sub(r'^ ', '', tokens_dict[chapter])
    #splitting into tokens
    tokens_dict[chapter] = WhitespaceTokenizer().tokenize(tokens_dict[chapter])

In [64]:
# Lower case

for chapter in chapters:
    tokens_dict[chapter] = [token.lower() for token in tokens_dict[chapter]]

In [65]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [66]:
# Lemmatization

import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
for chapter in chapters:
    tokens_dict[chapter] = [lemmatizer.lemmatize(token) for token in tokens_dict[chapter]]


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [67]:
# Removing stop words

from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = stopwords.words('english')
# Adding some other stopwords that were not included in a downloaded array
stop_words.append('would')
stop_words.append('shall')
stop_words.append('whether')

for chapter in chapters:
    tokens_dict[chapter] = [token for token in tokens_dict[chapter] if token not in stop_words]
    

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Part 3

In [68]:
# Introducing tf-idf
from math import log
docs = tokens_dict.values()

def tf(t, d):
    n_t = d.count(t)
    res = n_t/len(d)
    return res

def idf(t, D):
    counter = 0
    for doc in D:
        if t in doc:
            counter+=1
    res = log(len(D)/counter)
    return res

def tf_idf(t, d, D):
    res = tf(t, d) * idf(t, D)
    return res

In [69]:
def top_10(doc):
    words = set(doc)
    words_rate = {}
    for word in words:
        words_rate[word] = tf_idf(word, doc, docs)
    df = pd.DataFrame.from_dict(words_rate, columns = ['tf_idf'], orient='index')
    df = df.sort_values(by='tf_idf', ascending=False)
    return df.head(10)

In [70]:
# Showing the tops of all the chapters

for chapter in chapters:
    print('Top-10 in ', chapter)
    print('\n')
    print(top_10(tokens_dict[chapter]))
    print('\n')

Top-10 in  CHAPTER I


             tf_idf
bat        0.010529
key        0.008147
candle     0.007301
dark       0.007301
bottle     0.007020
eat        0.006002
fell       0.005431
marked     0.005431
cake       0.005265
telescope  0.005265


Top-10 in  CHAPTER II


         tf_idf
mouse  0.017815
swam   0.012039
pool   0.010746
mabel  0.009631
glove  0.006945
fan    0.006945
dog    0.006945
cat    0.006787
four   0.005373
kid    0.005209


Top-10 in  CHAPTER III


               tf_idf
dodo         0.025325
mouse        0.020624
prize        0.017561
lory         0.012663
dry          0.012663
thimble      0.011707
caucus-race  0.008781
tale         0.006331
bird         0.006187
mercia       0.005854


Top-10 in  CHAPTER IV


           tf_idf
window   0.015702
puppy    0.011777
glove    0.009907
bill     0.008678
chimney  0.008492
bottle   0.008492
fan      0.008492
ann      0.007851
yer      0.007851
honour   0.007851


Top-10 in  CHAPTER V


               tf_idf
caterpillar  0.

# Part 4

In [71]:
# Splitting into sentences
sentences = re.split(r'[\.\?!]', text)

In [72]:
# Removing sentences without Alice

sentences = [sent for sent in sentences if 'Alice' in sent]

In [76]:
# Splitting sentences into words
sent_of_tokens = []

for i in range(len(sentences)):
    # Tokenization
    #removing all symbols except letters and digits
    sent = re.sub(r'[\.,!\?"—\(\);“”:]', ' ', sentences[i]) #except apostrophe (')
    #removing the odd spaces
    sent = re.sub(r' +', ' ', sent)
    sent = re.sub(r'^ ', '', sent)

    #splitting into words
    sent = sent.split(' ')

    # Lower case
    sent = [token.lower() for token in sent]

    # Lemmatization
    sent = [lemmatizer.lemmatize(token) for token in sent]

    # Removing stop words
    sent = [token for token in sent if token not in stop_words]
    sent_of_tokens.append(sent)

In [77]:
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import wordnet

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [79]:
# Saving only the verbs
alice_verbs = []

for i in range(len(sent_of_tokens)):
    verbs = [lemmatizer.lemmatize(token[0], 'v') for token in nltk.pos_tag(sent_of_tokens[i]) if token[1][0] == 'V']
    for verb in verbs: # приведение в начальную форму
        alice_verbs.append(verb)

In [80]:
# Printing the top-10 verbs

unique_verbs = set(alice_verbs)
verbs_count = {}
for verb in unique_verbs:
    verbs_count[verb] = alice_verbs.count(verb)

df = pd.DataFrame.from_dict(verbs_count, columns = ['count'], orient='index')
df = df.sort_values(by='count', ascending = False)
print("Top-10 verbs used with Alice:\n")
df.head(10)

Top-10 verbs used with Alice:



Unnamed: 0,count
say,198
go,65
think,58
get,41
look,40
begin,31
know,26
come,26
see,25
find,21
