In [57]:
'''Packages'''
import re
import codecs
import pandas as pd
from nltk.tokenize import TreebankWordTokenizer, WhitespaceTokenizer
from nltk.corpus import stopwords
import numpy as np
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from heapq import nlargest
from operator import itemgetter
from collections import Counter
from nltk import tokenize
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
import spacy
nlp = spacy.load("en_core_web_sm")

In [2]:
'''Download the data'''
#Read the whole book; "utf_8_sig" drops a leading byte-order mark if present.
#The context manager closes the file even when reading raises.
with codecs.open("text.txt", "r", "utf_8_sig") as fileObj:
    text = fileObj.read()

#Flatten line breaks and underscores to single spaces
text = re.sub(r"[\r\n_]", " ", text)
#Detach every dot that is followed by a space so it becomes its own token
text = re.sub(r"\. ", " . ", text)

#clean the beginning and the end:
#the body starts at the SECOND "CHAPTER I ." and stops at "THE END"
first_heading = text.find("CHAPTER I .")
body_start = text.find("CHAPTER I .", first_heading + 1)
if body_start == -1:
    #only one (or no) heading found: fall back to it instead of slicing from -1
    body_start = max(first_heading, 0)
body_end = text.find("THE END")
if body_end == -1:
    #no end marker: keep everything up to the end of the file
    body_end = len(text)
text = text[body_start:body_end]


In [3]:
'''2nd point'''

#to lower
text = text.lower()

#split on chapters: every occurrence of the word "chapter" opens a new chapter
chapters = [m.start() for m in re.finditer('chapter', text)]
print("There are ",len(chapters), " chapters")


def _clean_chapter(raw_chapter):
    """Return one chapter's text without its heading and without stray symbols.

    Letters, digits, '.', '!', '?' and '-' are kept (the punctuation marks are
    needed later to detect sentence ends); every other run of characters is
    collapsed to a single space. The word "chapter", the roman numeral that
    follows it and the dot after the numeral are then removed.

    Parameters
    ----------
    raw_chapter : str
        Slice of the lower-cased book that starts at the word "chapter".

    Returns
    -------
    str
        The cleaned chapter text.
    """
    #Delete non-alphabetical symbols
    cleaned = re.sub(r'[^A-Za-z0-9.!?\-]+', ' ', raw_chapter)
    #Delete "chapter" heading
    cleaned = re.sub("chapter", "", cleaned)
    #Delete the roman number of the chapter, then the dot that follows it.
    #The same two cuts are applied to EVERY chapter, including the last one.
    for _ in range(2):
        cleaned = cleaned[cleaned.find(" ", 1):]
    return cleaned


#Create df with text split on chapters.
#A chapter runs from its own offset to the next offset; the last one runs to
#the end of the text.
chapter_ends = chapters[1:] + [len(text)]
list_for_text = [_clean_chapter(text[start:end]) for start, end in zip(chapters, chapter_ends)]

text_chapters = pd.DataFrame({
    'Chapter': list(range(1, len(chapters) + 1)),
    'text': list_for_text,
})

text_chapters.head()


There are  12  chapters


Unnamed: 0,Chapter,text
0,1,down the rabbit-hole alice was beginning to g...
1,2,the pool of tears curiouser and curiouser! cr...
2,3,a caucus-race and a long tale they were indee...
3,4,the rabbit sends in a little bill it was the ...
4,5,advice from a caterpillar the caterpillar and...


In [4]:
'''Tokenization ,lemmatization and deleting stop-words'''

#tokenizer: one instance is reused for every chapter
treebank_tokenizer = TreebankWordTokenizer()
array_for_tokens = [treebank_tokenizer.tokenize(chapter_text) for chapter_text in text_chapters["text"]]
text_chapters['tokens'] = array_for_tokens

#stopwords: the list is kept under its original name, the set is what the
#filter below tests against so that each membership check is constant-time
stop_words = stopwords.words("english")
stop_word_set = set(stop_words)

lemmatizer = WordNetLemmatizer()

#deleting stopwords, then lemmatization of the remaining tokens
clean = [
    [lemmatizer.lemmatize(token) for token in chapter_tokens if token not in stop_word_set]
    for chapter_tokens in array_for_tokens
]
text_chapters['tokens_ns'] = clean

text_chapters.head()

Unnamed: 0,Chapter,text,tokens,tokens_ns
0,1,down the rabbit-hole alice was beginning to g...,"[down, the, rabbit-hole, alice, was, beginning...","[rabbit-hole, alice, beginning, get, tired, si..."
1,2,the pool of tears curiouser and curiouser! cr...,"[the, pool, of, tears, curiouser, and, curious...","[pool, tear, curiouser, curiouser, !, cried, a..."
2,3,a caucus-race and a long tale they were indee...,"[a, caucus-race, and, a, long, tale, they, wer...","[caucus-race, long, tale, indeed, queer-lookin..."
3,4,the rabbit sends in a little bill it was the ...,"[the, rabbit, sends, in, a, little, bill, it, ...","[rabbit, sends, little, bill, white, rabbit, t..."
4,5,advice from a caterpillar the caterpillar and...,"[advice, from, a, caterpillar, the, caterpilla...","[advice, caterpillar, caterpillar, alice, look..."


In [5]:
'''3rd point - TF-IDF'''

def _tf_idf_by_chapter(chapter_token_lists):
    """Compute a {word: TF-IDF} dictionary for every chapter.

    TF is the number of occurrences of a word in a chapter divided by the
    number of tokens of that chapter, where "!", "." and "?" are excluded
    from both. IDF is log(number of chapters / number of chapters whose token
    list contains the word).

    Parameters
    ----------
    chapter_token_lists : iterable of list of str
        One token list per chapter.

    Returns
    -------
    list of dict
        One dictionary per chapter, in chapter order. Keys appear in order of
        first occurrence inside the chapter. A chapter without any countable
        token yields an empty dictionary.
    """
    chapter_token_lists = list(chapter_token_lists)
    n_chapters = len(chapter_token_lists)

    #Count once, up front, in how many chapters each word is present
    document_frequency = Counter(
        word
        for chapter_tokens in chapter_token_lists
        for word in set(chapter_tokens)
    )

    #delete .!? in order not to capture this in TF-IDF calculations
    sentence_marks = {"!", ".", "?"}

    scores_per_chapter = []
    for chapter_tokens in chapter_token_lists:
        tokens_nopunct = [token for token in chapter_tokens if token not in sentence_marks]
        #store particular chapter length for TF calculation
        len_chapter = len(tokens_nopunct)
        #Count number of instances of each unique word
        counted = Counter(tokens_nopunct)
        #key - unique word, value - IDF * TF
        scores_per_chapter.append({
            word: np.log(n_chapters / document_frequency[word]) * (count / len_chapter)
            for word, count in counted.items()
        })
    return scores_per_chapter


TF_IDF_list = _tf_idf_by_chapter(text_chapters['tokens_ns'])
text_chapters["TF-IDF"] = TF_IDF_list

text_chapters.head()


Unnamed: 0,Chapter,text,tokens,tokens_ns,TF-IDF
0,1,down the rabbit-hole alice was beginning to g...,"[down, the, rabbit-hole, alice, was, beginning...","[rabbit-hole, alice, beginning, get, tired, si...","{'rabbit-hole': 0.005616800843975094, 'alice':..."
1,2,the pool of tears curiouser and curiouser! cr...,"[the, pool, of, tears, curiouser, and, curious...","[pool, tear, curiouser, curiouser, !, cried, a...","{'pool': 0.011698686591729034, 'tear': 0.00554..."
2,3,a caucus-race and a long tale they were indee...,"[a, caucus-race, and, a, long, tale, they, wer...","[caucus-race, long, tale, indeed, queer-lookin...","{'caucus-race': 0.006189057658251558, 'long': ..."
3,4,the rabbit sends in a little bill it was the ...,"[the, rabbit, sends, in, a, little, bill, it, ...","[rabbit, sends, little, bill, white, rabbit, t...","{'rabbit': 0.006874955366488354, 'sends': 0.00..."
4,5,advice from a caterpillar the caterpillar and...,"[advice, from, a, caterpillar, the, caterpilla...","[advice, caterpillar, caterpillar, alice, look...","{'advice': 0.0018395887774415348, 'caterpillar..."


In [6]:
'''Top 10 TF-IDF words for each chapter'''
#Walk the per-chapter score dictionaries, numbering chapters from 1
for chapter_number, chapter_scores in enumerate(text_chapters["TF-IDF"], start=1):
    #the ten words with the highest TF-IDF value, best first
    top_words = nlargest(10, chapter_scores, key=chapter_scores.get)
    print("Chapter ", chapter_number, " ", top_words)
    print("")

Chapter  1   ['bat', 'key', 'dark', 'poison', 'candle', 'bottle', 'eat', 'fell', 'marked', 'dinah']

Chapter  2   ['mouse', 'swam', 'pool', 'mabel', 'glove', 'fan', 'cat', 'four', 'kid', 'capital']

Chapter  3   ['dodo', 'mouse', 'prize', 'lory', 'thimble', 'dry', 'dinah', 'tale', 'bird', 'caucus-race']

Chapter  4   ['window', 'puppy', 'bill', 'glove', 'fan', 'bottle', 'chimney', 'mary', 'ann', 'yer']

Chapter  5   ['caterpillar', 'serpent', 'pigeon', 'youth', 'egg', 'father', 'size', 'hookah', 'green', 'taller']

Chapter  6   ['footman', 'baby', 'cat', 'pig', 'wow', 'mad', 'grunted', 'cook', 'duchess', 'livery']

Chapter  7   ['dormouse', 'hatter', 'march', 'hare', 'twinkle', 'clock', 'draw', 'tea', 'asleep', 'civil']

Chapter  8   ['queen', 'hedgehog', 'gardener', 'king', 'procession', 'executioner', 'five', 'soldier', 'rose-tree', 'cat']

Chapter  9   ['turtle', 'mock', 'moral', 'gryphon', 'duchess', 'queen', 'tortoise', 'school', 'chin', 'ti']

Chapter  10   ['turtle', 'mock', 'lo

Based on the most important words in each chapter, I assign the following names:
$\circ$ Chapter 1 - "Dark rabbit-hole" 

$\circ$ Chapter 2 - "Cat and mouse"

$\circ$ Chapter 3 - "Dodo pizza"

$\circ$ Chapter 4 - "Home"

$\circ$ Chapter 5 - "Hookah lounge"

$\circ$ Chapter 6 - "mad creatures"

$\circ$ Chapter 7 - "Hatter"

$\circ$ Chapter 8 - "King palace"

$\circ$ Chapter 9 - "Moral turtle"

$\circ$ Chapter 10 - "Sea creatures"

$\circ$ Chapter 11 - "102"

$\circ$ Chapter 12 - "Competition"



In [58]:
'''4th point - top 10 words in sentences with alice'''

#create full cleaned text by joining chapters; chapters are separated by a
#space so that the last token of one chapter does not fuse with the first
#token of the next one
full_text_clean = ' '.join(
    ' '.join(chapter_tokens) for chapter_tokens in text_chapters['tokens_ns']
)

#Tokenize on sentences
all_sentences = tokenize.sent_tokenize(full_text_clean)
#remove every dot that is left inside the sentences
all_sentences = [sentence.replace(".", "") for sentence in all_sentences]

def words_alice(all_sentences):
    """Collect the verbs of every sentence that contains "alice".

    Each sentence containing the substring "alice" is word-tokenized and
    POS-tagged with NLTK; tokens whose tag starts with "V" are kept.

    Parameters
    ----------
    all_sentences : iterable of str
        Sentences to scan.

    Returns
    -------
    list of str
        The verb tokens, in the order they were encountered.
    """
    tokens_alice = []
    for sent in all_sentences:
        #skip sentences that do not mention alice
        if "alice" not in sent:
            continue
        #keep only the verbs of this sentence
        for token, tag in nltk.pos_tag(word_tokenize(sent)):
            if tag[0] == "V":
                tokens_alice.append(token)
    return tokens_alice

#count how often each verb occurs and show the ten most popular
alice_tokens = Counter(words_alice(all_sentences))
for word_with_count in alice_tokens.most_common(10):
    print(word_with_count)


('said', 175)
('thought', 33)
('went', 30)
('looked', 22)
('say', 21)
('began', 21)
('got', 18)
('know', 18)
('think', 15)
('see', 15)


As we can see, the most common thing Alice does is speak - she is a chatterbox :). The second most frequent verb is "thought", so she likes to analyze information and would make a great data science engineer (given her tendency to talk, team lead is the best position for Alice).

In [59]:
#The 100 most frequent verbs (without their counts), most frequent first
keys = [verb for verb, _count in alice_tokens.most_common(100)]

In [60]:
import os

import gensim

#Location of the pre-trained word2vec binary. Set the WORD2VEC_PATH
#environment variable to run the notebook on another machine; when it is not
#set, the original local path is used.
WORD2VEC_PATH = os.environ.get(
    "WORD2VEC_PATH",
    "F:/Documents/ITMO/1курс/Machine_Learning_2022/Lab4/Seminar/GoogleNews-vectors-negative300.bin",
)

model = gensim.models.KeyedVectors.load_word2vec_format(WORD2VEC_PATH, binary=True)

In [61]:
import plotly.express as px

In [62]:
#Three parallel lists: neighbour vector, neighbour word, and the verb it came from
embeddings, words, clusters = [], [], []

for source_verb in keys:
    try:
        #the 30 words closest to the verb in the word2vec model
        neighbours = model.most_similar(source_verb, topn=30)
        for neighbour, _similarity in neighbours:
            words.append(neighbour)
            embeddings.append(model[neighbour])
            clusters.append(source_verb)
    except KeyError:
        #proceed only applicable for dict words
        continue


In [63]:
from sklearn.manifold import TSNE

#Project the collected word vectors to two dimensions; the fixed random_state
#is passed so the projection is seeded.
#NOTE(review): recent scikit-learn releases rename `n_iter` to `max_iter` -
#confirm against the installed version.
tsne = TSNE(
    n_components=2,
    perplexity=15,
    n_iter=3500,
    random_state=32,
)
#`embeddings` is rebound from the list of vectors to the projected coordinates
embeddings = tsne.fit_transform(embeddings)

In [64]:
#Scatter plot of the projected coordinates: one colour per source verb,
#each point labelled with its neighbour word
fig = px.scatter(
    x=embeddings[:, 0],
    y=embeddings[:, 1],
    text=words,
    color=clusters,
    template="plotly_white",
)
#place the labels above the markers
fig.update_traces(textposition="top center")
fig.show()