In [25]:
import nltk
import os
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

from WordBank import word_bank



[nltk_data] Downloading package punkt to
[nltk_data]     /home/malvinaclavering/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/malvinaclavering/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
with open("BooksCleaned/Book46.txt") as book:
    contents = book.read()
    #print(contents)
    tokens = nltk.word_tokenize(contents)
    #print(tokens)
    parts_of_speech = nltk.pos_tag(tokens)
    #print(parts_of_speech)

In [4]:
import nltk
from nltk.corpus import wordnet   #Import wordnet from the NLTK
def find_syn(keyword):    
    syn = list()
    for synset in wordnet.synsets(keyword):
        for lemma in synset.lemmas():
            syn.append(lemma.name())    #add the synonyms
    #print('Synonyms: ' + str(syn))
    return str(syn)

In [5]:
def find_occurances_of_keyword(keyword,text):
    list_of_indexes=[]
    for index,word in enumerate(text):
        if keyword == word:
            list_of_indexes.append(index)
    return list_of_indexes

In [6]:
def find_adjectives(indexes, parts_of_speech_list):
    list_of_adjectives=[]
    for index in indexes:
        if parts_of_speech_list[index-1][1]=="JJ" or \
        parts_of_speech_list[index-1][1]=="JJR" \
        or parts_of_speech_list[index-1][1]=="JJS":
            list_of_adjectives.append(parts_of_speech_list[index-1][0])
    return list_of_adjectives

In [7]:
def find_words_that_describe_keyword(keyword,path):
    with open(path) as book:
        contents = book.read()
        tokens = nltk.word_tokenize(contents)
        parts_of_speech = nltk.pos_tag(tokens)
    keyword_indexes=find_occurances_of_keyword(keyword,tokens)
    adjectives=find_adjectives(keyword_indexes,parts_of_speech)
    return adjectives
find_words_that_describe_keyword("gentleman", "BooksCleaned/Book922.txt")

['old', 'old', 'old']

In [8]:
def find_words_in_all_books_that_describe_keyword(keyword):
    adj=[]
    for book in os.listdir("BooksCleaned"):
        adj+=(find_words_that_describe_keyword(keyword,f'BooksCleaned/{book}')) 
    return(adj)
#adj_seamstress=find_words_in_all_books_that_describe_keyword("seamstress")

In [8]:
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
#def sentiment_countifier(adjectives):
#    sia = SentimentIntensityAnalyzer()
#    str1 = " " 
#    words =str1.join(adjectives)    
#    print(words)
#    sentiments = sia.polarity_scores(words)
#    return sentiments
#print(sentiment_countifier(adj_lawyer))
def sentiment_countifier_individual(adjectives):
    sia = SentimentIntensityAnalyzer()
    pos_scores=[]    
    neu_scores=[]    
    neg_scores=[] 
    for word in adjectives:    
        sentiments = sia.polarity_scores(word)
        print(sentiments)
        pos_scores.append(sentiments["pos"])
        neu_scores.append(sentiments["neu"])
        neg_scores.append(sentiments["neg"])
    pos_score= sum(pos_scores) / len(pos_scores)
    neu_score= sum(neu_scores) / len(neu_scores)
    neg_score= sum(neg_scores) / len(neg_scores)    
    return [pos_score,neu_score,neg_score]
#print(sentiment_countifier_individual(adj_lawyer))


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/malvinaclavering/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [26]:
from nltk.tokenize.treebank import TreebankWordDetokenizer
def find_sentences_with_keyword(keyword, text):
    """
    Find the sentence that contextualizes the occurence of a keyword and return a list
    of indices that represent the range of each sentence containing the keyword.
    
    Args:
        keyword: a string representing a word to contextualize in a sentence.
        text: a tokenized text given as a list of strings of words and punctuation.
        
    Returns:
        A list of strings that are sentences containing the keyword.
    """
    keyword_locations = find_occurances_of_keyword(keyword, text)
    end_punctuation = [".", "!", "?"]
    sentence_ranges = []
    for i in keyword_locations:
        next_word = i + 1
        while text[next_word] not in end_punctuation:
            next_word += 1
        end_location = next_word + 1
        previous_word = i-1
        while text[previous_word] not in end_punctuation:
            previous_word -= 1
        start_location = previous_word + 1
        if (start_location, end_location) not in sentence_ranges:
            sentence_ranges.append((start_location, end_location))
    sentences=[]
    for range_ in sentence_ranges:
        new_sentence=TreebankWordDetokenizer().detokenize((text[range_[0]:range_[1]]))
        sentences.append(new_sentence.replace(" ’ ", "’"))
    return sentences


In [27]:
import spacy
def look_for_adjectives(word,sentence):
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(sentence)
    adjectives=[]
    for token in doc:
        #print(token.text +" "+token.dep_+" "+token.head.text+" "+token.pos_)
        working_token=token
        while working_token.dep_ != "ROOT":
            if working_token.head.text==word and (token.pos_==("ADJ")):
                adjectives.append(token.text)
                break
            if working_token.head.text==working_token.text:
                break
            working_token=working_token.head
    return adjectives



In [28]:
def find_adj_in_all_sentences(keyword,path):
    with open(path) as book:
        contents = book.read()
        text = nltk.word_tokenize(contents)
        #parts_of_speech = nltk.pos_tag(tokens)
    adj=[]
    for item in find_sentences_with_keyword(keyword, text):
        adj+=look_for_adjectives(keyword,item)
    return adj


In [29]:
def expand_keywords(keywords_):
    new_keywords=keywords_[:]
    for word in keywords_:
        new_keywords.append(word+"s")
        new_keywords.append(word.capitalize())
        new_keywords.append((word+"s").capitalize())
    return new_keywords


In [31]:
def find_adj_in_all_books(keywords):
    keywords=expand_keywords(keywords)
    adj=[]
    for word in keywords:
        for book in os.listdir("BooksCleaned"):
            adj+=(find_adj_in_all_sentences(word,f'BooksCleaned/{book}')) 
        return(adj)


In [33]:
def find_adj_all_words_all_books(wordbank):
    adj_list = []
    for list in wordbank:
        adj_list.append(find_adj_in_all_books(list))
    return adj_list

In [25]:
print(find_adj_all_words_all_books(word_bank[:3]))

[['nice', 'unfortunate', 'short', 'rusty', 'able', 'able', 'pleasant', 'able', 'great', 'imperturbable', 'shrewd', 'damp', 'fine', 'English', 'English', 'old', 'English', 'pertinacious', 'Swiss', 'certain', 'large', 'mysterious', 'great'], ['faded', 'bereaved', 'strange', 'dear', 'other', 'other', 'young', 'old', 'old', 'old', 'old', 'old', 'old', 'old', 'wholesome', 'own', 'little', 'more', 'monthly', 'little', 'old', 'elder', 'professional', 'small', 'little', 'fond', 'faithful', 'other', 'dear', 'old', 'much', 'dear', 'old', 'other', 'dear', 'old', 'much', 'other', 'Catholic', 'Catholic', 'best', 'kindest', 'little', 'good', 'dear', 'old', 'tender', 'little', 'little', 'little', 'kind', 'old', 'Kind', 'gentlest', 'best', 'whole', 'monthly', 'sympathetic', 'remorseless', 'unsympathetic', 'sick', 'poor', 'new', 'insalubrious', 'flabby', 'boned', 'coarse', '’The', 'old', 'old', 'minded', 'old', 'old', 'dead', 'long', 'gentle', 'little', 'unconscious', 'little', 'other', 'dear', 'old', 

In [30]:
def write_adj_list_to_file(wordbank, filename):
    adj_list = find_adj_all_words_all_books(wordbank)
    with open(f'Outputs/{filename}.txt', 'w') as f:
        for item in adj_list:
            f.write(str(item) + "\n")

In [31]:
write_adj_list_to_file(word_bank, "all_adjectives_expanded")