In [1]:
import pandas as pd
import numpy as np

In [2]:
import spacy 
from spacy.lang.en.stop_words import STOP_WORDS

In [3]:
import pickle

In [4]:
import re
import networkx as nx

# Preprocessing

In [5]:
file_path = 'Anne-of-Green-Gables.txt';
with open(file_path, 'r', encoding='utf8') as rf:
    book = rf.read().replace("\n\n", " ").replace("\n", " ")
    book = re.sub("[ \t]{2}[ \t]*", " ", book)
    book = book[book.find("*** START OF THE PROJECT GUTENBERG EBOOK") : book.find("*** END OF THE PROJECT GUTENBERG EBOOK")].strip()

In [6]:
# Divide text into chapters
chapters = re.split(r"CHAPTER ((X{0,3})(IX|IV|V?I{0,3}))+\.", book)

chapters = list(filter(lambda x: (x != ""), chapters))
chapters = chapters[1:] #remove first element which is just the table of content
len(chapters)

38

In [7]:
# Divide text into chapters - Alternative
chapters_alternative = book.split("CHAPTER ")[1:]
len(chapters_alternative)

76

In [8]:
nlp = spacy.load("en_core_web_md")
doc = nlp(book)

In [9]:
# Separate sentences
sentences = list(doc.sents)
sentences[100] 

Marilla and Matthew Cuthbert of all people adopting a boy!

In [10]:
def get_quotes(text):
    quotes = re.findall(r"“(.*?)”", text)
    return quotes

In [11]:
for sent in sentences:
    found_quotes = get_quotes(str(sent))
    if(len(found_quotes) > 0):
        print(found_quotes)

['ran']
['cotton warp']
['Rachel Lynde’s husband']
['I’ll just step over to Green Gables after tea and find out from Marilla where he’s gone and why,']
['It’s just _staying_, that’s what,']
['Good evening, Rachel,']
['We’re all pretty well,']
['Oh, no, I’m quite well although I had a bad headache yesterday,']
['Are you in earnest, Marilla?']
['Yes, of course,']
['What on earth put such a notion into your head?']
['Well, we’ve been thinking about it for some time--all winter in fact,']
['Well, I hope it will turn out all right,']
['Well, we’re not getting a girl,']
['Well, of all things that ever were or will be!']
['The little birds sang as if it were The one day of summer in all the year.']
['The five-thirty train has been in and gone half an hour ago,']
['I’m not expecting a girl,']
['Guess there’s some mistake,']
['I don’t understand,']
['Well, you’d better question the girl,']
['I suppose you are Mr. Matthew Cuthbert of Green Gables?']
['I’m sorry I was late,']
['Oh, I can carry it

['What will you recite if they encore you?']
['They won’t dream of encoring me,']
['country bumpkins', 'rustic belles', 'such fun']
['rustic']
['My dear, you did splendidly,']
['Oh, I can’t go,']
['Then don’t disappoint Matthew,']
['interpreted']
['Hasn’t it been a perfectly splendid time?']
['Oh, no, don’t say things like that, Jane,']
['I’ve a compliment for you, Anne,']
['Being interpreted it means plain red, I guess,']
['Titian was a very famous artist who liked to paint red-haired women.']
['_Did_ you see all the diamonds those ladies wore?']
['We _are_ rich,']
['I _don’t_ know--exactly,']
['I think diamonds would comfort a person for a good deal.']
['Well, I don’t want to be anyone but myself, even if I go uncomforted by diamonds all my life,']
['Oh, Marilla, it’s just lovely,']
['The Maiden’s Vow']
['I declare, my recitation has made you cry, Marilla,']
['Now, I call that a positive triumph.']
['No, I wasn’t crying over your piece,']
['Marilla!']
['Well now, I guess she ain’t be

# Named Entity Recognition

In [12]:
# Generate all possible character mentions
import string

personEnts = [e.text for e in doc.ents if e.label_=="PERSON"]

titles = ["Mr.", "Mrs.", "Mr", "Mrs", "Ms", "Ms.", "Mr. and Mrs.", "Dr.", "Miss", "Madame", "Professor", "Doctor", "Aunt", "Uncle", "the", "The"]
titles_lowercase = [title.lower() for title in titles]

def get_person_ents(doc):
    personEnts = [e.text for e in doc.ents if e.label_=="PERSON"]
    return personEnts

def clean_character_entities(entities):
    new_chars = []
    for entity in entities:
        names = entity.split(" ")
        for name in names:
            name = name.strip()
            if(len(name) <= 1 or bool(re.match("^[A-Z]\w+$", name)) == False or all(map(lambda x: x != name.lower(), titles_lowercase)) == False): # Exclude lowercase words which are not likely to be a person's name
                names.remove(name)
            else:
                name = re.sub("’\w*", "", name) #remove apostrophes
                #name = re.sub("[-_]", " ", name) #remove hyphens
                name = re.sub('[%s]' % re.escape(string.punctuation), '', name)
        item = " ".join(names)
        new_chars.append(item)
    
    return new_chars

def get_character_mentions(doc):
    chars = get_person_ents(doc)
    cleaned_ents = list(set(clean_character_entities(chars)))
    new_chars = [item for item in cleaned_ents]
    for item in cleaned_ents:
        names = item.split(" ")
        for name in names:
            name = name.strip()
            new_chars.append(name)
    final_chars = [] 
    for character in new_chars:
        final_chars.append(character) 
        for title in titles:
            char_with_title = f"{title} {character}"
            final_chars.append(char_with_title)
            final_chars = list(set(final_chars))
    final_chars = [char.strip() for char in final_chars]
    final_chars = [char for char in final_chars if (char != "" and all(map(lambda x: x != char.lower(), titles_lowercase)))]
    return final_chars

# Not used
def clean_character_mentions(chars): 
    chars = [char.strip() for char in chars]
    for char in chars:
        if (char == "" or all(map(lambda x: x != char.lower(), titles_lowercase)) == False):
            chars.remove(char)
    
character_mentions = get_character_mentions(doc)

In [13]:
# Adapt Kathrin's code

# goal: get dictionary of first and last names as key and their assigned full name as value
cleaned_ents = list(set(clean_character_entities(personEnts)))
char_names_arrays = list(map(lambda x: x.split(" "), cleaned_ents))
fullnameChars = [item for item in char_names_arrays if len(item) >= 2]
fullnames = list(map(lambda x: " ".join(x), fullnameChars))

# seperate name list into first and last and full names
def split_full_names(nameList):
    fnames = []
    lnames = []
    for name in nameList:
        fname = name[0]
        fnames.append(fname)
        if len(name) > 2:
            lname = name[-1] 
        else:
            lname = name[1]
        lnames.append(lname)

    return fnames, lnames
                    
fnames, lnames = split_full_names(fullnameChars)   
duplicates = [i for i in lnames if i in fnames]
duplicates
#fullnameChars
#fullnameLookup

['Diana', 'Jane', 'Anne']

In [14]:
# Adapt Kathrin's code
# count how many times each "person" occurs and sort list in descending order
cleaned_ents = clean_character_entities(personEnts)
mainCharsFreq = sorted([(person, cleaned_ents.count(person)) for person in set(cleaned_ents)], key=lambda x: x[1], reverse=True)

# list divided into char and freq list (but still descending order of frequency)
mainChars = [x[0] for x in mainCharsFreq]
freqs = [x[1] for x in mainCharsFreq]
charsFreqDict = dict(mainCharsFreq)

# separate name list into first and last and full names
# cleaned_ents = list(set(clean_character_entities(personEnts)))
cleaned_chars = list(dict.fromkeys(clean_character_entities(mainChars))) # K: to keep order of frequency (set doesn't keep order), also use mainChars for freq information
char_names_arrays = list(map(lambda x: x.split(" "), cleaned_chars))
fullnameChars = [item for item in char_names_arrays if len(item) >= 2] # this disturbs order, because it only adds name to list once full name appears in list
fullnames = list(map(lambda x: " ".join(x), fullnameChars))

def split_full_names(nameList):
    fnames = []
    lnames = []
    for name in nameList:
        fname = name[0]
        fnames.append(fname)
        if len(name) > 2: # In case the name includes multiple middle names
            lname = name[-1] 
        else:
            lname = name[1]
        lnames.append(lname)

    return fnames, lnames
                    
fnames, lnames = split_full_names(fullnameChars)

# Apply Kathrin's code to create a dictionary of first and last names as key and their assigned full name as value
fullnameLookup = {}

# since mainChars is ordered by frequency of occurence, only index of first match has to be found
for person in mainChars:
    
    # iterate over firstnames to find index of the lastname they occur with the most often
    for i, fn in enumerate(fnames):
        if fn not in fnames[:i]:
            
            # get first associated last name by index i
            ln = lnames[i]

            # rule 1: if firstname appears without lastname, but lastname is known,
            # add it to the lookup dictionary
            # e.g. "Matthew" --> "Matthew Cuthbert" 
            if (person.startswith(fn) and person in fullnames):
                    fullnameLookup[fn] = fn + " " + ln
                
            # rule 2: assign last name only to first name that occurs most often in the book
            # e.g. "Barry" is assigned to "Diana Barry", not to "Josephine Barry"
            # condition: lastname is not in list of firstnames (e.g. Jackson can be first or last name) 
            elif (person==fn and lnames.index(ln) == i and ln not in fnames and ln not in list(fullnameLookup.keys())):
                fullnameLookup[fn] = fn + " " + ln
                fullnameLookup[lnames[i]] = fn + " " + ln

fullnameLookup

In [15]:
# Adapt Kathrin's code
# replace names in mainChars list if they can be found in lookup dictionary
for i, name in enumerate(mainChars):
    if name in fullnameLookup.keys():
        mainChars[i] = fullnameLookup[name]
        
# merge new mainChars list with frequency information
mainCharsFreq = list(map(list, zip(mainChars, freqs)))

# add frequencies of occurrence of characters with the same name

# create dict to keep track of names that occur multiple times
freqDict = {}
for key, value in mainCharsFreq:
    if key not in freqDict.keys():
        freqDict[key] = [key]
    freqDict[key].append(value)
    
# turn dict values (in the form [charname, freq1, freq2, ...] into list of list
freqList = list(freqDict.values())

# sum over freq values for each charname (i.e. [charname, freq])
mainCharsFreq = [[person[0]] + [sum(person[1:])] for person in freqList]

# limitation to five main characters for now
mainCharsFreq = mainCharsFreq[:5]
mainChars = [x[0] for x in mainCharsFreq]
mainCharsFreq

[['Anne Shirley', 1116],
 ['Marilla Cuthbert', 625],
 ['Diana Barry', 492],
 ['Matthew Cuthbert', 316],
 ['Rachel Lynde', 220]]

In [16]:
# Adapt Kathrin's code
# Find character occurences

import pandas as pd
occ = pd.DataFrame(mainCharsFreq)
occ.columns = ["charname", "freq"]

# for simplicity, only search for firstnames of main characters for finding co-occurrences
aliasMains = [n.split(" ", 1)[0] for n in mainChars]

# create a cooccurrence matrix

coList = []

# iterate over list of main character first names
for i, n in enumerate(aliasMains):
    
    # initialise each main character co-occurrence list to array of zeros
    # i.e. since it's five main characters, [0, 0, 0, 0, 0]
    charCo = [0] * len(aliasMains)
    
    # find out if character appears in sentence
    for sent in doc.sents:
        sent_tokens = [token.text for token in sent]
        if n in sent_tokens:
            
            # if found in sentence, iterate over main character list again
            for j, m in enumerate(aliasMains):
                
                # if character isn't the same as currently observed, add 1
                # index j is the place in charCo array that the co-occurring character corresponds with
                if (m!=n and m in sent_tokens):
                    charCo[j] += 1
                    
    # append list of this character's occurrences after iteration over book is done
    coList.append(charCo)
    
    
# put results into DataFrame to save it as csv

coMatrix = pd.DataFrame(mainChars)
for i, l in enumerate(coList):
    coMatrix[mainChars[i]] = l
coMatrix.columns = ["charname"] + mainChars
coMatrix

Unnamed: 0,charname,Anne Shirley,Marilla Cuthbert,Diana Barry,Matthew Cuthbert,Rachel Lynde
0,Anne Shirley,0,202,94,66,16
1,Marilla Cuthbert,202,0,22,72,18
2,Diana Barry,94,22,0,6,0
3,Matthew Cuthbert,66,72,6,0,9
4,Rachel Lynde,16,18,0,9,0


In [17]:
# Test creating Entity Ruler - not used
def create_patterns(doc, ent_label):
    data = get_character_mentions(doc)
    patterns = []
    for item in data:
        pattern = {"label": ent_label, "pattern": item}
        patterns.append(pattern)
    return patterns

def create_ruler(patterns):
    ruler = nlp.add_pipe("entity_ruler")
    ruler.add_patterns(patterns)

def apply_ruler(model, text):
    doc = model(text)
    results = []
    for ent in doc.ents:
        results.append(ent.text)
    return results

patterns = create_patterns(doc, "PERSON")

# Context of conversations / Topic analysis

## Preprocessing, get NER, etc.

In [20]:
# Split UNPROCESSED text into conversations
with open(file_path, 'r', encoding='utf8') as rf:
    book_cv = rf.read()
    book_cv = book_cv[book_cv.find("*** START OF THE PROJECT GUTENBERG EBOOK"):book_cv.find("*** END OF THE PROJECT GUTENBERG EBOOK")].strip()

def split_book_to_chapters(book):
    chapters_cv = re.split(r"CHAPTER ((X{0,3})(IX|IV|V?I{0,3}))+\.", book)
    chapters_cv = list(filter(lambda x: (x != ""), chapters_cv))
    chapters_cv = chapters_cv[1:]
    return chapters_cv

def get_conversations(text): # text can be a chapter or an entire book
    conversations = re.split('\n\n', text)
    conversations = list(filter(lambda x: x.startswith('“'), conversations))
    return conversations


In [21]:
chapter_dict = {}
chapters = split_book_to_chapters(book_cv)
for i, chapter in enumerate(chapters):
    chapter_dict[i+1] = [chapter]  

def find_chapter(chapter_num):
    for number, text in chapter_dict.items():
        if number == chapter_num:
            return text

def get_ents_by_chapter_dict(book):
    chapters = split_book_to_chapters(book)
    ents_by_chapter_dict = {}
    for i, chapter in enumerate(chapters):
        chapter_num = i + 1
        person_ents = []
        chapter = chapter.replace("\n\n", " ").replace("\n", " ").strip()
        doc = nlp(chapter)
        for ent in get_character_mentions(doc):
            if ent in chapter:
                person_ents.append(ent)
        ents_by_chapter_dict[chapter_num] = person_ents

    return ents_by_chapter_dict

def get_ents_by_convo_dict(text): # text can be a chapter or an entire book
    ents_by_convo_dict = {}
    conversations = get_conversations(text)
    convo_num = 0 # temporary keys - needs more meaningful keys
    for cv in conversations:
        convo_num += 1 
        person_ents = []
        cv = cv.replace("\n\n", " ").replace("\n", " ").strip()
        doc = nlp(cv)
        for ent in get_character_mentions(doc):
            if ent in cv:
                person_ents.append(ent)
        ents_by_convo_dict[convo_num] = person_ents

    return ents_by_convo_dict

## Term Frequency

### Prepare and clean dataframes and document-term matrix

In [22]:
pd.set_option("max_colwidth", 150)

chapters_df = pd.DataFrame(chapter_dict).transpose()
chapters_df.columns = ['text']

In [23]:
# Clean chapter text
def lemmatize(text, allowed_postags=["NOUN", "ADJ", "VERB", "ADV"]):
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"]) # disable redundant packages to reduce running time
    doc = nlp(text)
    lemmatized_text = []
    for token in doc:
        if token.pos_ in allowed_postags and len(token) >= 2:
            lemmatized_text.append(token.lemma_)
    lemmatized_text = " ".join(lemmatized_text)
    return lemmatized_text

def clean_text(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('[‘’“”…]', '', text)
    text = re.sub('\w*\d\w*', '', text) # remove numeric values
    text = re.sub('(\n)+', ' ', text)
    text = lemmatize(text)
    return text

cleaned_chapters_df = pd.DataFrame(chapters_df.text.apply(lambda x: clean_text(x)))

In [24]:
cleaned_chapters_df.head()

Unnamed: 0,text
1,lynde surprise lynde live just avonlea main road dip little hollow fringe alder lady eardrop traverse brook source away back wood old cuthbert pla...
2,matthew cuthbert surprised matthew cuthbert jog comfortably mile bright river pretty road run along farmstead now again bit balsamy fir wood drive...
3,cuthbert surprised marilla come briskly forward matthew open door eye fall odd little figure stiff ugly dress long braid red hair eager luminous e...
4,morning green gable broad daylight awake sit bed stare confusedly window flood cheery sunshine pour white feathery wave glimpse blue sky moment re...
5,anne history know say confidentially ve make mind enjoy drive experience nearly always enjoy thing make mind firmly of course make firmly go think...


In [25]:
# Pickle df for later
chapters_df.to_pickle("corpus.pkl")

In [26]:
#create a document-term matrix using CountVectorizer, and exclude common English stop words
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(cleaned_chapters_df.text)
data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm.index = cleaned_chapters_df.index

In [27]:
data_dtm.to_pickle("dtm.pkl")
cleaned_chapters_df.to_pickle('cleaned_chapters_df.pkl')
pickle.dump(cv, open("cv.pkl", "wb"))

### Top words per character

In [55]:
quotedict = pd.read_pickle('quotedict.pickle')
for key, value in quotedict.items() :
    print (key)

Anne Shirley
Marilla Cuthbert
Diana Barry
Matthew Cuthbert
Rachel Lynde
Gilbert Blythe
Jane Andrews
Josie Pye
Ruby Gillis
Mrs. Allan
Miss Stacy
Mrs. Barry
Josephine Barry
Mr. Phillips
Alexander Spencer


In [29]:
quotes_df = pd.DataFrame([quotedict]).transpose() #square brackets around quotedict are necessary to avoid ValueError: If using all scalar values, you must pass an index
quotes_df.columns = ['text']
quotes_df

Unnamed: 0,text
Anne Shirley,"“I shall never forgive Gilbert Blythe,” “And Mr. Phillips spelled my name without an e, too. The iron has entered into my soul, Diana.” “I am very..."
Marilla Cuthbert,"“Matthew Cuthbert, who’s that?” “Where is the boy?” “This is a real fine evening, isn’t it? Won’t you sit down? How are all your folks?” “Good eve..."
Diana Barry,"“I never heard of but one kind,” “There really is another. Oh, it isn’t wicked at all. It just means vowing and promising solemnly.” “Well, I don’..."
Matthew Cuthbert,"“There wasn’t any boy,” “There was only her.” “I’m not expecting a girl,” “It’s a boy I’ve come for. He should be here. Mrs. Alexander Spencer was..."
Rachel Lynde,"“I’ll just step over to Green Gables after tea and find out from Marilla where he’s gone and why,” “He doesn’t generally go to town this time of y..."
Gilbert Blythe,"“I’m awfully sorry I made fun of your hair, Anne,” “Honest I am. Don’t be mad for keeps, now.” “Bingen on the Rhine” “Anne Shirley! How on earth d..."
Jane Andrews,"“Now, she’s all ready,” “We must kiss her quiet brows and, Diana, you say, ‘Sister, farewell forever,’ and Ruby, you say, ‘Farewell, sweet sister,..."
Josie Pye,"“Are you going to be back next year, Miss Stacy?” “I don’t care,” “If I don’t pass this year I’m coming back next. My father can afford to send me..."
Ruby Gillis,"“Nor I,” “I don’t mind floating down when there’s two or three of us in the flat and we can sit up. It’s fun then. But to lie down and pretend I w..."
Mrs. Allan,"“In that case I must sample it,” “When Matthew was here he liked to hear you laugh and he liked to know that you found pleasure in the pleasant th..."


In [30]:
cleaned_quotes_df = pd.DataFrame(quotes_df.text.apply(lambda x: clean_text(x)))
cleaned_quotes_df

Unnamed: 0,text
Anne Shirley,never forgive gilbert blythe spell name too iron enter soul very sorry like seem interesting lady even kindre spirit look very much duty go home m...
Marilla Cuthbert,cuthbert boy real fine evening sit folk good evening rachel quite well bad headache yesterday earnest marilla course ve think time winter fact up ...
Diana Barry,never hear kind there really wicked at all just mean vow promise solemnly mind do re queer girl anne hear queer believe go like real well good jus...
Matthew Cuthbert,boy only expect girl boy ve come here bring understand well now dunno well now dunno well now seem reasonable spencer say tongue hang middle isnt ...
Rachel Lynde,ill just step green gable tea find marilla go generally go town time year never visit run turnip seed dress take buggy go more drive fast enough g...
Gilbert Blythe,awfully sorry make fun hair honest mad keep now bingen rhine anne earth get there carrot carrot priscilla sweet take thing home happen anne drift ...
Jane Andrews,now all ready kiss quiet brow say sister farewell forever say sweet sister as sorrowfully possibly anne goodness sake smile little know lie smile ...
Josie Pye,go back next year miss stacy care pass year come back next father afford send say professor tremaine say gilbert sure get medal emily clay likely ...
Ruby Gillis,mind float flat sit fun then lie pretend dead just die really fright look really dead make feel frightened girl suppose really right act lynde say...
Mrs. Allan,case sample matthew here like hear laugh like know find pleasure pleasant thing just away now like know just same sure shut heart healing influenc...


In [31]:
quotes_cv = CountVectorizer(stop_words='english')
quotes_data_cv = quotes_cv.fit_transform(cleaned_quotes_df.text)
quotes_data_dtm = pd.DataFrame(quotes_data_cv.toarray(), columns=quotes_cv.get_feature_names())
quotes_data_dtm.index = cleaned_quotes_df.index
quotes_data_dtm = quotes_data_dtm.transpose()
quotes_data_dtm

Unnamed: 0,Anne Shirley,Marilla Cuthbert,Diana Barry,Matthew Cuthbert,Rachel Lynde,Gilbert Blythe,Jane Andrews,Josie Pye,Ruby Gillis,Mrs. Allan,Miss Stacy,Mrs. Barry,Josephine Barry,Mr. Phillips,Alexander Spencer
abbey,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
abet,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
able,16,3,1,1,0,0,0,0,0,0,0,0,0,0,0
abominably,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
absolutely,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yesterday,5,3,0,0,1,0,0,0,0,0,0,0,0,0,1
youll,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
young,6,7,3,0,1,0,0,0,0,0,0,0,0,0,0
youre,1,0,0,2,0,0,0,0,0,0,0,0,0,0,0


In [32]:
top_words_quotes_dict = {}
for character in quotes_data_dtm.columns:
    top = quotes_data_dtm[character].sort_values(ascending=False).head(30)
    top_words_quotes_dict[character] = list(zip(top.index, top.values))

top_words_quotes_dict

{'Anne Shirley': [('think', 198),
  ('say', 198),
  ('marilla', 173),
  ('just', 156),
  ('know', 118),
  ('make', 89),
  ('tell', 88),
  ('look', 81),
  ('feel', 81),
  ('come', 81),
  ('good', 80),
  ('thing', 77),
  ('ve', 71),
  ('little', 68),
  ('girl', 62),
  ('ill', 61),
  ('imagine', 60),
  ('time', 60),
  ('really', 56),
  ('like', 48),
  ('lynde', 46),
  ('want', 42),
  ('hair', 41),
  ('mean', 40),
  ('love', 39),
  ('matthew', 39),
  ('right', 37),
  ('let', 37),
  ('ask', 37),
  ('lovely', 37)],
 'Marilla Cuthbert': [('say', 102),
  ('know', 74),
  ('think', 69),
  ('good', 63),
  ('anne', 59),
  ('ve', 58),
  ('thing', 53),
  ('make', 48),
  ('tell', 47),
  ('just', 46),
  ('come', 46),
  ('want', 40),
  ('matthew', 40),
  ('girl', 37),
  ('marilla', 36),
  ('talk', 34),
  ('child', 33),
  ('look', 33),
  ('right', 33),
  ('ill', 33),
  ('little', 30),
  ('time', 29),
  ('let', 26),
  ('hear', 26),
  ('school', 25),
  ('suppose', 25),
  ('home', 24),
  ('stay', 23),
  ('

### Top words per chapter

In [33]:
data = pd.read_pickle('dtm.pkl')
data = data.transpose()
data.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,29,30,31,32,33,34,35,36,37,38
abandonment,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abasement,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
abash,0,0,0,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
abated,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
abbey,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0


In [34]:
# Find top 30 words in each chapter
top_words_dict = {}
for chapter_num in data.columns:
    top = data[chapter_num].sort_values(ascending=False).head(30)
    top_words_dict[chapter_num] = list(zip(top.index, top.values))

top_words_dict

{1: [('matthew', 24),
  ('say', 17),
  ('marilla', 15),
  ('know', 12),
  ('cuthbert', 12),
  ('boy', 11),
  ('little', 10),
  ('green', 10),
  ('avonlea', 9),
  ('orphan', 8),
  ('child', 8),
  ('good', 8),
  ('gable', 8),
  ('sit', 7),
  ('think', 7),
  ('asylum', 7),
  ('lynde', 6),
  ('come', 6),
  ('road', 6),
  ('hollow', 6),
  ('live', 6),
  ('house', 6),
  ('home', 6),
  ('white', 5),
  ('bright', 5),
  ('run', 5),
  ('people', 5),
  ('thing', 5),
  ('away', 5),
  ('set', 5)],
 2: [('say', 40),
  ('matthew', 31),
  ('little', 21),
  ('imagine', 20),
  ('come', 19),
  ('just', 19),
  ('look', 18),
  ('think', 17),
  ('thing', 17),
  ('girl', 15),
  ('know', 14),
  ('eye', 14),
  ('long', 14),
  ('feel', 14),
  ('white', 13),
  ('ve', 13),
  ('make', 12),
  ('pretty', 11),
  ('drive', 11),
  ('red', 11),
  ('glad', 10),
  ('green', 10),
  ('road', 10),
  ('child', 10),
  ('place', 10),
  ('home', 10),
  ('talk', 9),
  ('away', 9),
  ('people', 9),
  ('ask', 9)],
 3: [('say', 23),

In [35]:
# Add the most common top words among all chapters to the stop word list
from collections import Counter

words = []
for chapter_num in data.columns:
    top_words = [word for (word, count) in top_words_dict[chapter_num]]
    for w in top_words:
        words.append(w)
        
# Identify the most common words along with how many chapters they occur in
Counter(words).most_common()

[('say', 38),
 ('think', 38),
 ('anne', 36),
 ('know', 34),
 ('marilla', 33),
 ('just', 33),
 ('come', 32),
 ('little', 28),
 ('good', 28),
 ('thing', 28),
 ('make', 27),
 ('look', 26),
 ('tell', 25),
 ('ve', 24),
 ('time', 22),
 ('matthew', 21),
 ('girl', 21),
 ('feel', 21),
 ('lynde', 14),
 ('want', 14),
 ('child', 12),
 ('eye', 12),
 ('home', 11),
 ('gable', 9),
 ('long', 9),
 ('green', 8),
 ('white', 8),
 ('face', 8),
 ('ill', 8),
 ('old', 8),
 ('people', 7),
 ('away', 7),
 ('imagine', 7),
 ('school', 7),
 ('gilbert', 7),
 ('sit', 6),
 ('talk', 6),
 ('year', 6),
 ('read', 6),
 ('hand', 6),
 ('night', 6),
 ('hair', 6),
 ('really', 6),
 ('right', 6),
 ('jane', 6),
 ('avonlea', 5),
 ('ask', 5),
 ('way', 5),
 ('friend', 5),
 ('room', 5),
 ('day', 5),
 ('head', 5),
 ('heart', 5),
 ('bad', 5),
 ('miss', 5),
 ('queen', 5),
 ('boy', 4),
 ('road', 4),
 ('pretty', 4),
 ('glad', 4),
 ('bed', 4),
 ('bring', 4),
 ('mean', 4),
 ('love', 4),
 ('big', 4),
 ('nice', 4),
 ('suppose', 4),
 ('try', 4)

In [36]:
# If around 1/3 of the chapters (= 38/3 ~ 12) have a word as a top word, exclude it from the list
additional_stop_words = [word for word, count in Counter(words).most_common() if count > 12]
additional_stop_words
# REMARK: Names of all main characters appear when setting count > 12 --> Kept count > 12 under the assumption that names of main characters do not provide much informaation as to the distinctive topic(s) of a chapter

['say',
 'think',
 'anne',
 'know',
 'marilla',
 'just',
 'come',
 'little',
 'good',
 'thing',
 'make',
 'look',
 'tell',
 've',
 'time',
 'matthew',
 'girl',
 'feel',
 'lynde',
 'want']

In [37]:
# Update document-term matrix with the additional stop words
from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import CountVectorizer

data_cleaned = pd.read_pickle('cleaned_chapters_df.pkl')

# Add new stop words
stop_words = text.ENGLISH_STOP_WORDS.union(additional_stop_words)

# Add to stop words list the stem words of current stop words
stop_words_string = ""
stop_words_lemma = []
for sw in stop_words:
    stop_words_string += sw + " "
    
doc_sw = nlp(stop_words_string)
stop_words_lemma = [word.lemma_ for word in doc_sw]
stop_words = stop_words.union(stop_words_lemma)

# Add contraction words to stop words list
contraction_words = ["isnt", "arent", "aint", "wasnt", "werent", "theres", "therere", "thats", "thatd", "ive", "weve", "theyve", "youre", "were", "theyre", "hes", "shes", "its", "havent", "hasnt", "mightnt", "maynt", "neednt", "mayve", "mightve", "couldve", "didnt", "dont", "wont", "cant", "couldnt", "shouldnt", "oughtnt", "mustnt", "mustve", "shouldve", "shant", "ill", "youll", "theyll", "itll", "id", "youd", "hed", "theyd", "itd", "im", "maam", "hows", "gonna"]
stop_words = stop_words.union(contraction_words)

# Add contraction words to stop words list
other_stop_words = ["thing", "things", "person", "people", "know", "think", "suppose", "let", "make", "like", "really", "just", "quite", "here", "there"]
stop_words = stop_words.union(other_stop_words)

# Recreate document-term matrix
cv = CountVectorizer(stop_words=stop_words)
data_cv = cv.fit_transform(data_cleaned.text)
dtm_new_stopw = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
dtm_new_stopw.index = data_cleaned.index

pickle.dump(cv, open("cv_new_stopw.pkl", "wb"))
dtm_new_stopw.to_pickle("dtm_new_stopw.pkl")

In [38]:
data = dtm_new_stopw.transpose()
for chapter_num in data.columns:
    top = data[chapter_num].sort_values(ascending=False).head(30)
    top_values_native_int = map(lambda x: int(x), top.values) # use the built-in int() (or float) constructor to convert the numpy integer to a native Python integer before serializing it, because json.dumps() method doesn't handle numpy integers by default (or else get TypeError: Object of type int64 is not JSON serializable).
    top_words_dict[chapter_num] = list(zip(top.index, top_values_native_int))

top_words_dict

{1: [('cuthbert', 12),
  ('boy', 11),
  ('green', 10),
  ('avonlea', 9),
  ('gable', 8),
  ('child', 8),
  ('orphan', 8),
  ('sit', 7),
  ('asylum', 7),
  ('road', 6),
  ('house', 6),
  ('home', 6),
  ('hollow', 6),
  ('live', 6),
  ('set', 5),
  ('bright', 5),
  ('brook', 5),
  ('away', 5),
  ('white', 5),
  ('man', 5),
  ('run', 5),
  ('risk', 4),
  ('river', 4),
  ('step', 4),
  ('course', 4),
  ('afternoon', 4),
  ('ask', 4),
  ('hill', 4),
  ('bring', 4),
  ('expect', 4)],
 2: [('imagine', 20),
  ('long', 14),
  ('eye', 14),
  ('white', 13),
  ('red', 11),
  ('drive', 11),
  ('pretty', 11),
  ('green', 10),
  ('glad', 10),
  ('home', 10),
  ('road', 10),
  ('place', 10),
  ('child', 10),
  ('talk', 9),
  ('ask', 9),
  ('away', 9),
  ('big', 8),
  ('beautiful', 8),
  ('use', 8),
  ('asylum', 8),
  ('tree', 8),
  ('expect', 7),
  ('hair', 7),
  ('wild', 7),
  ('gable', 7),
  ('dream', 7),
  ('love', 6),
  ('sit', 6),
  ('train', 6),
  ('live', 6)],
 3: [('boy', 12),
  ('bed', 11),
 

### Top words per character (cont.)

In [59]:
# Recreate top words dict per character
quotes_cv = CountVectorizer(stop_words=stop_words.union(['ii', 'itoh', 'mean', 'ba', 'right', 'away', 'talk', 'believe', 'guess']))
quotes_data_cv = quotes_cv.fit_transform(cleaned_quotes_df.text)
quotes_data_new_dtm = pd.DataFrame(quotes_data_cv.toarray(), columns=quotes_cv.get_feature_names())
quotes_data_new_dtm.index = cleaned_quotes_df.index
quotes_data_new_dtm = quotes_data_new_dtm.transpose()
quotes_data_new_dtm

Unnamed: 0,Anne Shirley,Marilla Cuthbert,Diana Barry,Matthew Cuthbert,Rachel Lynde,Gilbert Blythe,Jane Andrews,Josie Pye,Ruby Gillis,Mrs. Allan,Miss Stacy,Mrs. Barry,Josephine Barry,Mr. Phillips,Alexander Spencer
abbey,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
abet,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
able,16,3,1,1,0,0,0,0,0,0,0,0,0,0,0
abominably,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
absolutely,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yellow,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0
yes,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
yesterday,5,3,0,0,1,0,0,0,0,0,0,0,0,0,1
young,6,7,3,0,1,0,0,0,0,0,0,0,0,0,0


In [60]:
for character in quotes_data_new_dtm.columns:
    top = quotes_data_new_dtm[character].sort_values(ascending=False).head(30)
    top_values_native_int = map(lambda x: int(x), top.values) 
    top_words_quotes_dict[character] = list(zip(top.index, top_values_native_int))
    top_words_quotes_dict[character] = list(filter(lambda x: x[1] > 0, top_words_quotes_dict[character]))
    
top_words_quotes_dict

{'Anne Shirley': [('imagine', 60),
  ('hair', 41),
  ('love', 39),
  ('ask', 37),
  ('lovely', 37),
  ('glad', 36),
  ('course', 35),
  ('live', 35),
  ('miss', 34),
  ('pretty', 33),
  ('school', 33),
  ('way', 32),
  ('night', 32),
  ('stacy', 29),
  ('use', 29),
  ('home', 29),
  ('read', 29),
  ('life', 29),
  ('long', 28),
  ('nice', 28),
  ('green', 27),
  ('grow', 27),
  ('try', 26),
  ('day', 26),
  ('mind', 26),
  ('stay', 26),
  ('hear', 26),
  ('friend', 26),
  ('morning', 25),
  ('red', 25)],
 'Marilla Cuthbert': [('child', 33),
  ('hear', 26),
  ('school', 25),
  ('home', 24),
  ('real', 23),
  ('stay', 23),
  ('bring', 20),
  ('bad', 19),
  ('hair', 19),
  ('use', 18),
  ('try', 17),
  ('old', 17),
  ('learn', 16),
  ('mind', 16),
  ('hope', 15),
  ('way', 14),
  ('night', 13),
  ('man', 13),
  ('boy', 13),
  ('work', 13),
  ('send', 13),
  ('prayer', 13),
  ('matter', 13),
  ('nice', 12),
  ('word', 12),
  ('tea', 12),
  ('head', 12),
  ('course', 12),
  ('leave', 12),
 

### Top words per chapter (cont.) - Switch to ngrams

In [41]:
!pip3 install gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim import matutils, models

[0m

In [42]:
chapters_texts = list(cleaned_chapters_df["text"].values)
chapters_texts_split = [text.split() for text in chapters_texts]

bigram_phrases = gensim.models.Phrases(chapters_texts_split, min_count=2, threshold=50)
trigram_phrases = gensim.models.Phrases(bigram_phrases[chapters_texts_split], threshold=50)

bigram = gensim.models.phrases.Phraser(bigram_phrases)
trigram = gensim.models.phrases.Phraser(trigram_phrases)

def get_bigrams(texts):
    return([bigram[doc] for doc in texts])

def get_trigrams(texts):
    return ([trigram[bigram[doc]] for doc in texts])

chapters_bigrams = get_bigrams(chapters_texts_split)
chapters_bigrams_trigrams = get_trigrams(chapters_bigrams)

def exclude_stop_words(word_list):
    word_list = list(filter(lambda x: x not in stop_words and len(x) >= 2, word_list))
    return word_list

chapters_bigrams_trigrams = [exclude_stop_words(word_list) for word_list in chapters_bigrams_trigrams]

print(chapters_bigrams_trigrams)



In [43]:
chapters_ngrams_dict = {}
for i, chapter in enumerate(chapters_bigrams_trigrams):
    chapters_ngrams_dict[i+1] = [" ".join(chapter)]
pd.set_option("max_colwidth", 150)

chapters_ngrams_df = pd.DataFrame(chapters_ngrams_dict).transpose()
chapters_ngrams_df.columns = ['text']
chapters_ngrams_df

Unnamed: 0,text
1,surprise live avonlea main_road dip hollow fringe alder lady eardrop traverse brook source away wood old cuthbert place repute intricate headlong ...
2,matthew_cuthbert surprised matthew_cuthbert jog comfortably mile bright_river pretty road run farmstead bit balsamy fir wood drive hollow wild_plu...
3,cuthbert surprised briskly forward open_door eye fall odd figure stiff ugly dress long braid red_hair eager luminous eye stop short amazement matt...
4,morning green_gable broad daylight awake sit bed stare confusedly window flood cheery sunshine pour white feathery wave glimpse blue sky moment re...
5,history confidentially mind enjoy drive experience nearly enjoy mind firmly course firmly asylum drive drive early wild rise lovely glad rose nice...
6,mind season spencer live big yellow house white_sand cove door surprise welcome mingle benevolent face dear dear exclaim folk today real glad hors...
7,prayer bed night notice last_night throw clothe floor untidy habit allow at_all as_soon article clothing fold neatly place chair use at_all neat h...
8,bringingup begin reason known stay green_gable afternoon forenoon child busy various task watch keen eye noon conclude smart obedient willing work...
9,properly horrified fortnight green_gable arrive inspect justice blame severe unseasonable attack grippe confine lady house occasion visit green_ga...
10,apology affair evening prove refractory next_morning explanation account absence breakfast table story pain impress sense enormity behavior callin...


In [44]:
cv = CountVectorizer(stop_words=stop_words)
data_ngrams_cv = cv.fit_transform(chapters_ngrams_df.text)
dtm_ngrams_new_stopw = pd.DataFrame(data_ngrams_cv.toarray(), columns=cv.get_feature_names())
dtm_ngrams_new_stopw.index = chapters_ngrams_df.index
dtm_ngrams = dtm_ngrams_new_stopw.transpose()

In [45]:
top_ngrams_dict = {}
for chapter_num in dtm_ngrams.columns:
    top = dtm_ngrams[chapter_num].sort_values(ascending=False).head(40)
    top_values_native_int = map(lambda x: int(x), top.values) 
    top_ngrams_dict[chapter_num] = list(zip(top.index, top_values_native_int))

top_ngrams_dict

{1: [('boy', 11),
  ('avonlea', 9),
  ('child', 8),
  ('green_gable', 8),
  ('sit', 7),
  ('matthew_cuthbert', 7),
  ('home', 6),
  ('orphan_asylum', 6),
  ('live', 6),
  ('house', 6),
  ('cuthbert', 5),
  ('run', 5),
  ('brook', 5),
  ('away', 5),
  ('man', 5),
  ('set', 5),
  ('hollow', 5),
  ('bright_river', 4),
  ('ask', 4),
  ('bring', 4),
  ('course', 4),
  ('expect', 4),
  ('risk', 4),
  ('step', 4),
  ('hill', 4),
  ('afternoon', 4),
  ('window', 4),
  ('place', 4),
  ('turn', 4),
  ('mind', 4),
  ('use', 4),
  ('lane', 4),
  ('buggy', 3),
  ('hear', 3),
  ('wood', 3),
  ('adopt', 3),
  ('knit', 3),
  ('company', 3),
  ('road', 3),
  ('main_road', 3)],
 2: [('imagine', 20),
  ('long', 14),
  ('white', 13),
  ('drive', 11),
  ('eye', 11),
  ('home', 10),
  ('pretty', 10),
  ('road', 10),
  ('place', 10),
  ('glad', 10),
  ('child', 10),
  ('away', 9),
  ('talk', 9),
  ('big', 8),
  ('tree', 8),
  ('beautiful', 8),
  ('ask', 8),
  ('use', 8),
  ('red', 7),
  ('dream', 7),
  ('exp

In [46]:
# Pull out the top 30 words for all chapters after adding stopwords
ngrams = []
for chapter_num in dtm_ngrams.columns:
    top_words = [word for (word, count) in top_ngrams_dict[chapter_num]]
    for w in top_words:
        ngrams.append(w)
        
# Identify the most common ngrams along with how many chapters they occur in
top_ngrams_all_chapters = Counter(ngrams).most_common(40)
top_ngrams_all_chapters

[('eye', 22),
 ('home', 20),
 ('child', 18),
 ('hear', 17),
 ('long', 16),
 ('talk', 16),
 ('day', 16),
 ('sit', 15),
 ('mean', 15),
 ('imagine', 14),
 ('head', 14),
 ('away', 13),
 ('school', 13),
 ('old', 13),
 ('heart', 13),
 ('ask', 12),
 ('mind', 12),
 ('glad', 12),
 ('night', 12),
 ('way', 12),
 ('bring', 11),
 ('course', 11),
 ('white', 11),
 ('right', 11),
 ('face', 11),
 ('bad', 11),
 ('big', 10),
 ('nice', 10),
 ('love', 10),
 ('life', 10),
 ('believe', 10),
 ('room', 10),
 ('avonlea', 9),
 ('green_gable', 9),
 ('afternoon', 9),
 ('use', 9),
 ('try', 9),
 ('hard', 9),
 ('hope', 9),
 ('real', 9)]

In [47]:
# Pull out the top 30 words for all chapters after adding stopwords
ngrams = []
for chapter_num in dtm_ngrams.columns:
    top_words = [(word, count) for (word, count) in top_ngrams_dict[chapter_num]]
    for (word, count) in top_words:
        ngrams.append((word, count))

# Identify the most common ngrams along with how many times they appear (DIFFERENT from above version of top_ngrams_all_chapters)
ngrams_unique = {word:0 for word, _ in ngrams}
  
for word, count in ngrams: 
    ngrams_unique[word] += count

ngrams_unique = list(map(tuple, ngrams_unique.items()))

top_ngrams_all_chapters_new = list(sorted(ngrams_unique, key=lambda x: x[1], reverse=True))[:40]
top_ngrams_all_chapters_new

[('home', 122),
 ('eye', 119),
 ('child', 114),
 ('school', 112),
 ('sit', 92),
 ('imagine', 84),
 ('long', 77),
 ('talk', 76),
 ('old', 75),
 ('day', 74),
 ('away', 69),
 ('mean', 69),
 ('hear', 67),
 ('hair', 67),
 ('ask', 63),
 ('heart', 63),
 ('face', 61),
 ('glad', 60),
 ('way', 60),
 ('room', 60),
 ('love', 59),
 ('jane', 58),
 ('white', 57),
 ('head', 57),
 ('avonlea', 56),
 ('night', 56),
 ('dress', 55),
 ('boy', 54),
 ('gilbert', 54),
 ('big', 50),
 ('right', 49),
 ('bring', 48),
 ('course', 47),
 ('bad', 47),
 ('use', 46),
 ('pretty', 46),
 ('nice', 46),
 ('live', 45),
 ('green_gable', 44),
 ('mind', 44)]

In [48]:
top_ngrams_dict_full = {}
top_ngrams_dict_full[0] = top_ngrams_all_chapters
for chapter_num in dtm_ngrams.columns:
    top_ngrams_dict_full[chapter_num] = top_ngrams_dict[chapter_num]
top_ngrams_dict_full

{0: [('eye', 22),
  ('home', 20),
  ('child', 18),
  ('hear', 17),
  ('long', 16),
  ('talk', 16),
  ('day', 16),
  ('sit', 15),
  ('mean', 15),
  ('imagine', 14),
  ('head', 14),
  ('away', 13),
  ('school', 13),
  ('old', 13),
  ('heart', 13),
  ('ask', 12),
  ('mind', 12),
  ('glad', 12),
  ('night', 12),
  ('way', 12),
  ('bring', 11),
  ('course', 11),
  ('white', 11),
  ('right', 11),
  ('face', 11),
  ('bad', 11),
  ('big', 10),
  ('nice', 10),
  ('love', 10),
  ('life', 10),
  ('believe', 10),
  ('room', 10),
  ('avonlea', 9),
  ('green_gable', 9),
  ('afternoon', 9),
  ('use', 9),
  ('try', 9),
  ('hard', 9),
  ('hope', 9),
  ('real', 9)],
 1: [('boy', 11),
  ('avonlea', 9),
  ('child', 8),
  ('green_gable', 8),
  ('sit', 7),
  ('matthew_cuthbert', 7),
  ('home', 6),
  ('orphan_asylum', 6),
  ('live', 6),
  ('house', 6),
  ('cuthbert', 5),
  ('run', 5),
  ('brook', 5),
  ('away', 5),
  ('man', 5),
  ('set', 5),
  ('hollow', 5),
  ('bright_river', 4),
  ('ask', 4),
  ('bring', 

In [49]:
import json
def save_data(file, data):
    with open(file, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)

In [50]:
save_data("top_words_by_chapter.json", top_words_dict)

In [51]:
save_data("top_ngrams_by_chapter.json", top_ngrams_dict_full)

In [61]:
save_data("top_words_quotes_by_character.json", top_words_quotes_dict)

In [53]:
save_data("top_words_quotes_by_character_new.json", top_words_quotes_dict)

## Topic Modelling with LDA

In [None]:
!pip3 install gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim import matutils, models

In [None]:
# Trial attempt - Sparse Matrix
import scipy.sparse
sparse_counts = scipy.sparse.csr_matrix(data)
corpus = matutils.Sparse2Corpus(sparse_counts)
cv = pickle.load(open("cv_new_stopw.pkl", "rb"))
id2word = dict((v, k) for k, v in cv.vocabulary_.items())
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=6, passes=50)
lda.print_topics()

In [None]:
# Trial attempt - including only Nouns and Adjectives
def nouns_adj(text):
    doc = nlp(text)
    nouns_adjs = []
    for token in doc: 
        if token.pos_ == "NN" or token.pos_ == "ADJ":
            nouns_adjs.append(token.text)
    return ' '.join(nouns_adjs)

data_nouns_adjs = pd.DataFrame(cleaned_chapters_df.text.apply(nouns_adj))
data_nouns_adjs

In [None]:
cv_noun_adj = CountVectorizer(stop_words=stop_words, max_df=.8)
data_cv_noun_adj = cv_noun_adj.fit_transform(data_nouns_adjs.text)
dtm_noun_adj = pd.DataFrame(data_cv_noun_adj.toarray(), columns=cv_noun_adj.get_feature_names())
dtm_noun_adj.index = data_nouns_adjs.index

#Create gensim corpus
corpus_noun_adj = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(dtm_noun_adj.transpose()))

# Create vocabulary dictionary
id2word_noun_adj = dict((v, k) for k, v in cv_noun_adj.vocabulary_.items())

# Apply LDA
lda_noun_adj = models.LdaModel(corpus=corpus_noun_adj, num_topics=4, id2word=id2word_noun_adj, passes=50)
lda_noun_adj.print_topics()

In [None]:
corpus_transformed = lda_noun_adj[corpus_noun_adj]
list(zip([a for [(a,b)] in corpus_transformed], dtm_noun_adj.index))

In [None]:
# Final approach
chapters_texts = list(cleaned_chapters_df["text"].values)
chapters_texts_split = [text.split() for text in chapters_texts]

bigram_phrases = gensim.models.Phrases(chapters_texts_split, min_count=2, threshold=50)
trigram_phrases = gensim.models.Phrases(bigram_phrases[chapters_texts_split], threshold=50)

bigram = gensim.models.phrases.Phraser(bigram_phrases)
trigram = gensim.models.phrases.Phraser(trigram_phrases)

def get_bigrams(texts):
    return([bigram[doc] for doc in texts])

def get_trigrams(texts):
    return ([trigram[bigram[doc]] for doc in texts])

chapters_bigrams = get_bigrams(chapters_texts_split)
chapters_bigrams_trigrams = get_trigrams(chapters_bigrams)

def exclude_stop_words(word_list):
    word_list = list(filter(lambda x: x not in stop_words and len(x) >= 2, word_list))
    return word_list

chapters_bigrams_trigrams = [exclude_stop_words(word_list) for word_list in chapters_bigrams_trigrams]

print(chapters_bigrams_trigrams[37])

In [None]:
from gensim.models import TfidfModel

id2word = corpora.Dictionary(chapters_bigrams_trigrams)

texts = chapters_bigrams_trigrams

corpus = [id2word.doc2bow(text) for text in texts] 

tfidf = TfidfModel(corpus, id2word=id2word)

low_value = 0.02
words  = []
words_missing_in_tfidf = []
for i in range(0, len(corpus)):
    bow = corpus[i]
    low_value_words = []
    tfidf_ids = [id for id, value in tfidf[bow]]
    bow_ids = [id for id, value in bow]
    low_value_words = [id for id, value in tfidf[bow] if value < low_value]
    drops = low_value_words+words_missing_in_tfidf
    for item in drops:
        words.append(id2word[item])
    words_missing_in_tfidf = [id for id in bow_ids if id not in tfidf_ids] # exclude words with tf-idf score = 0 

    new_bow = [b for b in bow if b[0] not in low_value_words and b[0] not in words_missing_in_tfidf]
    corpus[i] = new_bow
    
num_topics = 10
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=num_topics,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=50,
                                           alpha="auto")

lda_model.print_topics(num_words=40)

In [None]:
num_topics = 7
lda_model_7 = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=num_topics,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=50,
                                           passes=100,
                                           alpha="auto")

lda_results = lda_model_7.print_topics(num_words=40)

In [None]:
from itertools import chain

lda_corpus = lda_model_7[corpus]
# Sort topics' probabilities in each chapter descendingly
lda_corpus = [sorted(array, key=lambda i:i[1],reverse=True) for array in lda_corpus]

cluster1 = [j for i,j in zip(lda_corpus,cleaned_chapters_df.index) if i[0][0] == 0] # i[0] is selected so that only the topic with the highest probability is assigned to the chapter
cluster2 = [j for i,j in zip(lda_corpus,cleaned_chapters_df.index) if i[0][0] == 1]
cluster3 = [j for i,j in zip(lda_corpus,cleaned_chapters_df.index) if i[0][0] == 2]
cluster4 = [j for i,j in zip(lda_corpus,cleaned_chapters_df.index) if i[0][0] == 3]
cluster5 = [j for i,j in zip(lda_corpus,cleaned_chapters_df.index) if i[0][0] == 4]
cluster6 = [j for i,j in zip(lda_corpus,cleaned_chapters_df.index) if i[0][0] == 5]
cluster7 = [j for i,j in zip(lda_corpus,cleaned_chapters_df.index) if i[0][0] == 6]

topic_clusters = [cluster1, cluster2, cluster3, cluster4, cluster5, cluster6, cluster7]
print(cluster1)
print(cluster2)
print(cluster3)
print(cluster4)
print(cluster5)
print(cluster6)
print(cluster7)
topic_clusters

In [None]:
lda_results

In [None]:
# (using directly lda results) For 7 topics
top_terms_arr_per_topic_7 = []

for topic in lda_results:
    top_terms_arr_per_topic_7.append({"topic": topic[0]+1, "chapters": topic_clusters[topic[0]], "terms": []})

for item in top_terms_arr_per_topic_7:    
    terms = lda_results[item['topic']-1][1].split(" + ")
    top_terms = []
    for term in terms:
        word = term.split("*")[1]
        word = re.sub('["]', '', word)
        value = term.split("*")[0]
        top_terms.append({"name": word, "value": float(value)})
    item['terms'] = top_terms
top_terms_arr_per_topic_7

In [None]:
save_data("top_terms_arr_per_topic_7.json", top_terms_arr_per_topic_7)

In [None]:
# For 7 topics
topics_terms = lda_model_7.state.get_lambda() 
topics_terms_proba = np.apply_along_axis(lambda x: x/x.sum(),1,topics_terms)
ttm = pd.DataFrame(topics_terms_proba, columns=lda_model_7.id2word.values(), index=[f'topic {i+1}' for i in range(lda_model_7.num_topics)]).transpose()
values = [np.sort(arr)[::-1] for arr in ttm.values]

ttm = pd.DataFrame(values, ttm.index, ttm.columns)
ttm['topic 5'].picnic

In [None]:
# (using ttm) For 7 topics
top_terms_dict_per_topic_7 = {}

for topic in ttm.columns:
    top = ttm[topic].sort_values(ascending=False).head(30)
    #(np.sort(ttm.topic)[::-1][:30])
    top_values_to_string = map(lambda x: str(x), top.values)
    key_value_arr = list(zip(top.index, top_values_to_string))
    top_terms_dict_per_topic_7[topic] = {"chapters": topic_clusters[int(topic[-1]) - 1], "terms": [{"key": key, "value": value} for key, value in key_value_arr]}
    
top_terms_dict_per_topic_7

In [None]:
# Get (sorted) document-topic distribution
sorted_doc_topic_dists = []
for bow in corpus:
    doc_topic_dist = lda_model.get_document_topics(bow, minimum_probability=0.0)
    doc_topic_dist = sorted(doc_topic_dist, key=lambda i:i[1], reverse=True)
    sorted_doc_topic_dists.append(doc_topic_dist)

sorted_doc_topic_dists

In [None]:
# Create document-topic matrix
unsorted_doc_topic_dists = [lda_model.get_document_topics(bow, minimum_probability=0.0) for bow in corpus] # unsorted doc_topic_dists
unsorted_doc_topic_dists = [[tupel[1] for tupel in arr] for arr in unsorted_doc_topic_dists]
dtopicm = pd.DataFrame(unsorted_doc_topic_dists, columns=[f'topic {i+1}' for i in range(lda_model.num_topics)], index=[f'chapter {i+1}' for i in range(len(corpus))])

doc_topic_dists = dtopicm.values
dtopicm

In [None]:
# Get documents' (i.e. bigrams and trigrams in each chapter') lengths
doc_lengths = []
for chapter in chapters_bigrams_trigrams:
    doc_lengths.append(len(chapter)) 
    
np.array(doc_lengths).shape

In [None]:
# Get vocab in the corpus
vocab = list(id2word.values())

In [None]:
from itertools import chain

lda_corpus = lda_model[corpus]
# Sort topics' probabilities in each chapter descendingly
lda_corpus = [sorted(array, key=lambda i:i[1],reverse=True) for array in lda_corpus]

list(zip([j for i, j in zip(lda_corpus,cleaned_chapters_df.index)], [i for i, j in zip(lda_corpus,cleaned_chapters_df.index)]))

In [None]:
cluster1 = [j for i,j in zip(lda_corpus,cleaned_chapters_df.index) if i[0][0] == 0] 
cluster2 = [j for i,j in zip(lda_corpus,cleaned_chapters_df.index) if i[0][0] == 1]
cluster3 = [j for i,j in zip(lda_corpus,cleaned_chapters_df.index) if i[0][0] == 2]
cluster4 = [j for i,j in zip(lda_corpus,cleaned_chapters_df.index) if i[0][0] == 3]
cluster5 = [j for i,j in zip(lda_corpus,cleaned_chapters_df.index) if i[0][0] == 4]
cluster6 = [j for i,j in zip(lda_corpus,cleaned_chapters_df.index) if i[0][0] == 5]
cluster7 = [j for i,j in zip(lda_corpus,cleaned_chapters_df.index) if i[0][0] == 6]
cluster8 = [j for i,j in zip(lda_corpus,cleaned_chapters_df.index) if i[0][0] == 7]
cluster9 = [j for i,j in zip(lda_corpus,cleaned_chapters_df.index) if i[0][0] == 8]
cluster10 = [j for i,j in zip(lda_corpus,cleaned_chapters_df.index) if i[0][0] == 9]

topic_clusters = [cluster1, cluster2, cluster3, cluster4, cluster5, cluster6, cluster7, cluster8, cluster9, cluster10]
print(cluster1)
print(cluster2)
print(cluster3)
print(cluster4)
print(cluster5)
print(cluster6)
print(cluster7)
print(cluster8)
print(cluster9)
print(cluster10)

In [None]:
import itertools
import networkx as nx
from gensim.matutils import jaccard

def get_most_likely_topic(model, doc):
    bow = model.id2word.doc2bow(doc)
    topics, probabilities = zip(*model.get_document_topics(bow))
    max_p = max(probabilities)
    topic = topics[probabilities.index(max_p)]
    return topic

In [None]:
bow = lda_model.id2word.doc2bow(chapters_bigrams_trigrams[28])
topics, probabilities = zip(*lda_model.get_document_topics(bow))
max_p = max(probabilities)
topic = topics[probabilities.index(max_p)]
topic

In [None]:
# Create topic-term matrix
ttm = pd.DataFrame(lda_model.get_topics(), columns=lda_model.id2word.values(), index=[f'topic {i+1}' for i in range(lda_model.num_topics)]).transpose()
values = [np.sort(arr)[::-1] for arr in ttm.values]

ttm = pd.DataFrame(values, ttm.index, ttm.columns)

# Get topic-term distributions
topic_term_dists = ttm.transpose().values
topic_term_dists

In [None]:
# Alternative way to create topic-term-matrix
topics_terms = lda_model.state.get_lambda() 
topics_terms_proba = np.apply_along_axis(lambda x: x/x.sum(),1,topics_terms)
ttm = pd.DataFrame(topics_terms_proba, columns=lda_model.id2word.values(), index=[f'topic {i+1}' for i in range(lda_model.num_topics)]).transpose()
values = [np.sort(arr)[::-1] for arr in ttm.values]

ttm = pd.DataFrame(values, ttm.index, ttm.columns)
ttm


In [None]:
# Create document-term matrix
doc_term_matrix = [id2word.doc2bow(doc) for doc in chapters_bigrams_trigrams] # doc = chapter
doc_term_matrix

In [None]:
# (ADJUSTED DATA FORMAT BELOW!) Find top 30 words that represents each topic
top_terms_dict_per_topic = {}

for topic in ttm.columns:
    top = ttm[topic].sort_values(ascending=False).head(30)
    top_values_to_string = map(lambda x: str(x), top.values)
    top_terms_dict_per_topic[topic] = {"chapters": topic_clusters[int(topic[-1]) - 1], "terms": list(zip(top.index, top_values_to_string))}
    
top_terms_dict_per_topic

In [None]:
# ADJUSTED DATA FORMAT: Find top 30 words that represents each topic
top_terms_dict_per_topic_new = {}

for topic in ttm.columns:
    top = ttm[topic].sort_values(ascending=False).head(30)
    top_values_to_string = map(lambda x: str(x), top.values)
    key_value_arr = list(zip(top.index, top_values_to_string))
    top_terms_dict_per_topic_new[topic] = {"chapters": topic_clusters[int(topic[-1]) - 1], "terms": [{"key": key, "value": value} for key, value in key_value_arr]}
    
top_terms_dict_per_topic_new

In [None]:
save_data("top_terms_per_topic.json", top_terms_dict_per_topic_new)

In [None]:
# Find number of terms in the entire corpus that happen to belong to the top terms of each topic 
#--> the higher the frequency, the more frequent the topic is in the corpus

topic_terms_count = {}
for topic in top_terms_dict_per_topic:
    terms_of_topic = [i[0] for i in top_terms_dict_per_topic[topic]['terms']]
    count = 0
    for chapter in chapters_bigrams_trigrams:
        count += len([term for term in chapter if term in terms_of_topic])
    topic_terms_count[topic] = count

topic_terms_count

In [None]:
# Find term frequency
term_frequency = {}
for term in id2word.values():
    term_frequency[term] = 0
    for chapter in chapters_bigrams_trigrams:
        if term in chapter:
            term_frequency[term] += 1
term_frequency

In [None]:
def tf(corpus):
    dic={}
    for document in corpus:
        for word in document:
            if word in dic:
                dic[word] = dic[word] + 1
            else:
                dic[word]=1
    for word,freq in dic.items():
        dic[word]=freq/sum(map(len, (document for document in corpus)))
    return dic
tf(chapters_bigrams_trigrams)

### Dimensionality reduction

In [None]:
from __future__ import absolute_import
from past.builtins import basestring
from collections import namedtuple
import json
import logging
from joblib import Parallel, delayed, cpu_count
import numpy as np
import pandas as pd
from scipy.stats import entropy
from scipy.spatial.distance import pdist, squareform
try:
    from sklearn.manifold import MDS, TSNE
    sklearn_present = True
except ImportError:
    sklearn_present = False

# pyLDAvis
def _jensen_shannon(_P, _Q):
    _M = 0.5 * (_P + _Q)
    return 0.5 * (entropy(_P, _M) + entropy(_Q, _M))

def _pcoa(pair_dists, n_components=2):
    """Principal Coordinate Analysis,
    aka Classical Multidimensional Scaling
    """
    # code referenced from skbio.stats.ordination.pcoa
    # https://github.com/biocore/scikit-bio/blob/0.5.0/skbio/stats/ordination/_principal_coordinate_analysis.py

    # pairwise distance matrix is assumed symmetric
    pair_dists = np.asarray(pair_dists, np.float64)

    # perform SVD on double centred distance matrix
    n = pair_dists.shape[0]
    H = np.eye(n) - np.ones((n, n)) / n
    B = - H.dot(pair_dists ** 2).dot(H) / 2
    eigvals, eigvecs = np.linalg.eig(B)

    # Take first n_components of eigenvalues and eigenvectors
    # sorted in decreasing order
    ix = eigvals.argsort()[::-1][:n_components]
    eigvals = eigvals[ix]
    eigvecs = eigvecs[:, ix]

    # replace any remaining negative eigenvalues and associated eigenvectors with zeroes
    # at least 1 eigenvalue must be zero
    eigvals[np.isclose(eigvals, 0)] = 0
    if np.any(eigvals < 0):
        ix_neg = eigvals < 0
        eigvals[ix_neg] = np.zeros(eigvals[ix_neg].shape)
        eigvecs[:, ix_neg] = np.zeros(eigvecs[:, ix_neg].shape)

    return np.sqrt(eigvals) * eigvecs

#Dimension reduction via Jensen-Shannon Divergence & Metric Multidimensional Scaling
def js_MMDS(distributions, **kwargs):
    dist_matrix = squareform(pdist(distributions, metric=_jensen_shannon))
    model = MDS(n_components=2, random_state=0, dissimilarity='precomputed', **kwargs)
    return model.fit_transform(dist_matrix)

In [None]:
topic_terms_count.values()

In [None]:
mmds_data = js_MMDS(list(lda_model.get_topics()))
mmds_data

In [None]:
# (ADJUSTED DATA FORMAT BELOW!) Create for each topic an object out of its frequency (= occurrences of its top terms) in corpus and MMDS-coordinates
topic_bubbles_data = {}
for i in range(num_topics):
    topic_bubbles_data[f'topic {i+1}'] = {"count": list(topic_terms_count.values())[i], "coordinates": list(mmds_data[i])}
topic_bubbles_data

In [None]:
# ADJUSTED DATA FORMAT:
topic_bubbles_data = {}
for i in range(num_topics):
    x_y_coords = list(mmds_data[i])
    x = x_y_coords[0]
    y = x_y_coords[1]
    topic_bubbles_data[f'topic {i+1}'] = {"count": list(topic_terms_count.values())[i], "x": x, "y": y}
topic_bubbles_data

In [None]:
save_data("topic_bubbles_data.json", topic_bubbles_data)

In [None]:
#(for 7 topics) Dimension reduction via Jensen-Shannon Divergence & Metric Multidimensional Scaling
mmds_data_7 = js_MMDS(list(lda_model_7.get_topics()))
mmds_data_7

In [None]:
# For 7 topics:
topic_terms_count_7 = {}
for topic in top_terms_arr_per_topic_7:
    terms_of_topic = [i['key'] for i in topic['terms']]
    print(terms_of_topic)
    count = 0
    for chapter in chapters_bigrams_trigrams:
        count += len([term for term in chapter if term in terms_of_topic])
    topic_name = "topic " + str(topic['topic'])
    topic_terms_count_7[topic_name] = count

#topic_terms_count_7

In [None]:
# For 7 topics:
topic_bubbles_data_7 = {}
num_topics = 7
for i in range(num_topics):
    x_y_coords = list(mmds_data_7[i])
    x = x_y_coords[0]
    y = x_y_coords[1]
    topic_bubbles_data_7[f'topic {i+1}'] = {"count": list(topic_terms_count_7.values())[i], "x": x, "y": y}
topic_bubbles_data_7

In [None]:
save_data("topic_bubbles_data_7.json", topic_bubbles_data_7)

In [None]:
# Test - Plot js_MMDS results
plt.figure(figsize=(10,5))
plt.scatter(mmds_data[:, 0], mmds_data[:, 1])

for i, data in enumerate(zip(mmds_data[:, 0],mmds_data[:, 1])):
    x = data[0]
    y = data[1]
    label = f"Topic {i+1} ({round(x,2)}, {round(y,2)})"

    plt.annotate(label, 
                 (x, y), # coordinates to position the label
                 textcoords="offset points", # how to position the label
                 xytext=(0,10), # distance from label to points (x,y)
                 ha='center') 

In [None]:
# Test - Dimension reduction via PCA
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca_data = pca.fit_transform(list(lda_model.get_topics()))
vis_data = pca_data

In [None]:
len(list(lda_model.get_topics()))

In [None]:
!pip install pyLDAvis
import pyLDAvis
import pyLDAvis.gensim_models
pyLDAvis.enable_notebook()
#vis=pyLDAvis.gensim_models.prepare(lda_model,corpus,id2word)

vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds="mmds", R=30)
vis

In [None]:
# Get topic distribution per chapter
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from scipy import stats

color = []
for corpus_line in corpus:
    sorted_topic_line = list(sorted(lda_model[corpus_line], key=lambda x: x[1], reverse=True))
    color.append(sorted_topic_line[0][0])
    
    
lda_output = []
for line in corpus:
    lda_output.append(lda_model[line])
    
topics_data = np.zeros(shape=(38,10))

for i, line in enumerate(lda_output):
    for topic_line in line:
            topics_data[i][topic_line[0]] = topic_line[1]
print(topics_data)

In [None]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca_data = pca.fit_transform(topics_data)
pca_data

In [None]:
plt.figure(figsize=(10,5))
plt.scatter(pca_data[:, 0], pca_data[:, 1], c=color)

for x,y in zip(pca_data[:, 0],pca_data[:, 1]):

    label = f"({round(x,2)}, {round(y,2)})"

    plt.annotate(label, 
                 (x, y), 
                 textcoords="offset points", 
                 xytext=(0,10), 
                 ha='center') 