Import of the libraries

In [None]:
import re
from nltk.tokenize import TreebankWordTokenizer, WhitespaceTokenizer
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd

Reading the file

In [None]:
# Read the whole book into memory. The context manager closes the file
# handle as soon as the read finishes instead of leaving it open for the
# lifetime of the kernel.
with open("/content/Alice.txt", 'r') as alice_file:
  alicetext = alice_file.read()

Task 1. \
Data preprocessing

In [None]:
# Removing all irrelevant characters.
# Each (pattern, replacement) pair is applied, in order, to the result of
# the previous substitution.
cleanup_steps = (
    ("<.*?>", " "),     # <...> spans are replaced by a space
    (" +", " "),        # runs of spaces collapse to a single space
    (r"[^\w\s]", ""),   # anything that is not a word char or whitespace is dropped
    (r"\d", ""),        # digits are dropped
)
alice = alicetext
for pattern, replacement in cleanup_steps:
  alice = re.sub(pattern, replacement, alice)

In [None]:
# Tokenization: split the cleaned text on whitespace
whitespace_tokenizer = WhitespaceTokenizer()
tokens = whitespace_tokenizer.tokenize(alice)

In [None]:
# Lower case: make the later stop-word and word comparisons case-insensitive
tokens = list(map(str.lower, tokens))

In [None]:
# Removing stop-words
nltk.download('stopwords')
stop_words = stopwords.words("english")
# Membership is tested once per token, so look tokens up in a set rather
# than scanning the stop-word list every time.
stop_word_set = set(stop_words)
# NOTE(review): punctuation has already been stripped from the text, so stop
# words that contain an apostrophe (e.g. "don't") can never match their
# cleaned form ("dont"). Consider removing stop words before stripping
# punctuation if those contractions should be filtered too.
tokens = [token for token in tokens if token not in stop_word_set]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Lemmatization: run every token through the WordNet lemmatizer
nltk.download("wordnet")
lemmatizer = WordNetLemmatizer()
tokens = list(map(lemmatizer.lemmatize, tokens))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Task 2. \
Top 10 most important words from each chapter in the text (excluding "Alice")

In [None]:
# Rebuild one string from the cleaned tokens and cut it at every occurrence
# of the word "chapter".
my_string = ' '.join(tokens)
chapter_parts = my_string.split("chapter")

# Pieces 13..24 are used as the twelve chapter bodies.
# NOTE(review): presumably the first twelve "chapter" hits belong to the
# table of contents - confirm against the input file. Any occurrence of
# "chapter" inside the running text would shift these indices.
#
# The text is split ONCE and indexed. The previous
# `split("chapter", n)[n]` returned everything after the n-th hit up to the
# end of the book, so each "chapter" also contained all later chapters.
#
# Each chapter stays wrapped in a one-element list because the TF-IDF cell
# below consumes `chapter_texts` in that shape.
chapter_texts = [[chapter_parts[chap_n]] for chap_n in range(13, 25)]

In [None]:
def top_words_per_chapter(chapter_docs, top_n=10, excluded=('alice',)):
  """Return the highest-scoring TF-IDF words of every chapter.

  The vectorizer is fitted once on all chapters together. Fitting it on a
  single chapter at a time gives every term the same IDF, which reduces the
  ranking to plain term frequency.

  Args:
    chapter_docs: list of strings, one document per chapter.
    top_n: number of words to keep per chapter.
    excluded: words that must never appear in the result.

  Returns:
    A list with one entry per chapter; each entry is a list of at most
    `top_n` plain `str` words ordered from highest to lowest score.
  """
  if not chapter_docs:
    return []
  vectorizer = TfidfVectorizer()
  scores = vectorizer.fit_transform(chapter_docs).toarray()
  # get_feature_names() was removed in scikit-learn 1.2.
  vocabulary = np.array(vectorizer.get_feature_names_out())
  top_words = []
  for chapter_scores in scores:
    order = np.argsort(chapter_scores)[::-1]
    # Filter first and slice afterwards so exactly `top_n` words are kept
    # whether or not an excluded word ranks near the top. Terms that do not
    # occur in the chapter (score 0) are skipped.
    ranked = [str(vocabulary[j]) for j in order
              if chapter_scores[j] > 0 and vocabulary[j] not in excluded]
    top_words.append(ranked[:top_n])
  return top_words


# `chapter_texts` holds one list of strings per chapter; join each into a
# single document before vectorizing.
chapter_docs = [' '.join(chap) for chap in chapter_texts]
for i, words in enumerate(top_words_per_chapter(chapter_docs)):
  print('\nIn chapter {} Top-10 most important words are:\n{}'.format(i+1, words))


In chapter 1 Top-10 most important words are:
['queen', 'one', 'would', 'know', 'little', 'like', 'said', 'went', 'thing', 'thought']

In chapter 2 Top-10 most important words are:
['thing', 'queen', 'like', 'went', 'said', 'thought', 'would', 'little', 'one', 'know']

In chapter 3 Top-10 most important words are:
['little', 'thing', 'king', 'one', 'would', 'said', 'went', 'queen', 'like', 'know']

In chapter 4 Top-10 most important words are:
['said', 'turtle', 'would', 'like', 'king', 'went', 'queen', 'one', 'know', 'little']

In chapter 5 Top-10 most important words are:
['turtle', 'little', 'know', 'king', 'one', 'would', 'went', 'said', 'queen', 'hatter']

In chapter 6 Top-10 most important words are:
['said', 'little', 'mock', 'king', 'went', 'one', 'hatter', 'turtle', 'queen', 'know']

In chapter 7 Top-10 most important words are:
['went', 'turtle', 'king', 'know', 'queen', 'said', 'mock', 'one', 'gryphon', 'hatter']

In chapter 8 Top-10 most important words are:
['one', 'queen

Task 3. \
Finding the Top 10 most used verbs in sentences with Alice

In [None]:
# Load the verb table. The file is semicolon-separated and is decoded with
# the 'unicode_escape' codec.
verbs_csv = pd.read_csv(
    "/content/englishverbs.csv",
    sep=';',
    encoding='unicode_escape',
)

In [None]:
# Preview the first rows to check the column names used when building the verb list
verbs_csv.head()

Unnamed: 0,Num,Base Form,Past Form,Past Participle Form
0,1,abash,abashed,abashed
1,2,abate,abated,abated
2,3,abide,abode,abode
3,4,absorb,absorbed,absorbed
4,5,accept,accepted,accepted


In [None]:
# Creating a list of the verbs: one flat array holding every value of the
# three form columns (all base forms first, then past forms, then past
# participles).
verb_columns = ['Base Form', 'Past Form', 'Past Participle Form']
verbs = np.concatenate([verbs_csv[column].values for column in verb_columns])

Removing all irrelevant characters

In [None]:
# Replace <...> spans and line breaks with a space. The previous pattern
# "<*>" only matched a bare ">" (zero or more "<" followed by ">"), and its
# "\n\n" alternative could never match because "\n" was tried first.
text = re.sub(r"<.*?>|\n", " ", alicetext)
# Collapse runs of spaces.
text = re.sub(" +", " ", text)
# Drop punctuation but keep . ; ? ! and " because the sentence split in the
# next cell cuts on exactly these characters. Stripping them here left "."
# as the only working delimiter.
text = re.sub(r'[^\w\s.;?!"]', "", text)
# Drop digits.
text = re.sub(r"\d", "", text)

In [None]:
# Tokenization: cut the text into sentence-like pieces at . ; ? ! and ".
# Inside a character class "|" is a literal pipe, not alternation, so it is
# left out of the set.
tokens = re.split(r'[.;?!"]', text)
# Lower case, so the search for 'alice' and the verb lookup are case-insensitive
tokens = [token.lower() for token in tokens]

Finding all the sentences with the name Alice

In [None]:
# Count how often each known verb form occurs in the sentences that
# mention Alice. `verbs_freq` maps verb form -> number of occurrences.
verbs_freq = dict()
verbs = list(verbs)
# Set lookup is O(1) per word; scanning the verb list for every word is not.
verb_set = set(verbs)
sentence_tokenizer = WhitespaceTokenizer()
for sent in tokens:
  # Substring test: also matches forms such as "alices".
  if 'alice' not in sent:
    continue
  # Update the frequency of the verbs
  for w in sentence_tokenizer.tokenize(sent):
    if w in verb_set:
      verbs_freq[w] = verbs_freq.get(w, 0) + 1

Printing the results

In [None]:
# Rank the verbs from most to least frequent. Sorting on the negated count
# is stable, so verbs with equal counts keep their first-seen order.
ranked_verbs = sorted(verbs_freq.items(), key=lambda pair: -pair[1])
top10 = dict(ranked_verbs[:10])
print('Top-10 verbs in sentences with Alice:')
print(top10, '\n')

Top-10 verbs in sentences with Alice:
{'said': 209, 'was': 184, 'had': 96, 'be': 81, 'thought': 61, 'like': 53, 'do': 47, 'went': 43, 'know': 41, 'see': 35} 



As we can see, the most frequent actions in sentences with Alice are ("was" and "be" are both forms of "to be", so the ten entries above give nine distinct verbs):\
To say\
To be\
To have\
To think\
To like\
To do\
To go\
To know\
To see