# Textual analysis of "The Secret History"

## Import packages

In [1]:
import requests
import re
import tqdm
import os
import string
import operator

from os.path import join

import nltk

from nltk import word_tokenize,sent_tokenize,pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn

stopwords = stopwords.words('english')

import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from collections import Counter

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
ana = SentimentIntensityAnalyzer()


import warnings
warnings.filterwarnings("ignore")


import ssl

# If the ssl module exposes the private `_create_unverified_context` factory,
# install it as the module's default HTTPS context factory.
# NOTE(review): this presumably exists to get past certificate errors in the
# nltk.download() calls below, but it disables certificate verification for
# any code in this session that relies on ssl's default HTTPS context --
# confirm that this is acceptable.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # The factory is not available on this Python build; leave ssl untouched.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Download the NLTK resources requested by name.
# NOTE(review): `stopwords.words('english')` is called earlier in this cell,
# but the 'stopwords' corpus is not downloaded here -- confirm it is already
# installed.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')
nltk.download('wordnet')
nltk.download('sentiwordnet')

def remove_punctuation(words):
    """Return the tokens in *words* that are purely alphanumeric.

    A token is kept only when ``str.isalnum()`` is true for it, so
    punctuation marks, empty strings and tokens containing apostrophes or
    hyphens are dropped. The order of the remaining tokens is preserved.
    """
    return [token for token in words if token.isalnum()]

def ptb_to_wordnet(PTT):
    """Map a Penn Treebank POS tag to a WordNet part-of-speech letter.

    Returns 'a' (adjective) for tags starting with 'J', 'v' (verb) for 'V',
    'n' (noun) for 'N' and 'r' (adverb) for 'R'. Tags starting with 'NNP'
    and every other tag yield the empty string.
    """
    # Tags starting with 'NNP' are deliberately left without a WordNet class.
    if PTT.startswith('NNP'):
        return ''

    first_letter_to_wordnet = {
        'J': 'a',  # adjective
        'V': 'v',  # verb
        'N': 'n',  # noun
        'R': 'r',  # adverb
    }
    return first_letter_to_wordnet.get(PTT[:1], '')

def wordnet_hypernyms(token):
    """Return the names of all WordNet hypernyms of *token*.

    For every sense returned by ``wn.synsets(token)`` the transitive closure
    of the hypernym relation is walked, and the part of each synset name
    before the first '.' (e.g. 'garment' from 'garment.n.01') is collected.

    Returns:
        A list of hypernym names. It may contain duplicates when several
        senses share a hypernym, and it is empty when WordNet has no
        synset for *token*.
    """
    def direct_hypernyms(synset):
        return synset.hypernyms()

    all_hypernyms = []

    # The relation function must keep its own name: the previous version
    # rebound it to a list inside the loop, so every token with more than
    # one sense failed with "'list' object is not callable".
    for sense in wn.synsets(token):
        for hypernym in sense.closure(direct_hypernyms):
            all_hypernyms.append(hypernym.name().partition('.')[0])

    return all_hypernyms

def intersection(list1,list2):
    """Return the distinct items that occur in both *list1* and *list2*.

    The result is a list built from a set, so its order is not defined.
    """
    first = set(list1)
    second = set(list2)
    return list(first.intersection(second))


def collocation( text , regex , distance ):
    """Count the words that occur near tokens matching *regex*.

    The text is split into sentences, each sentence is tokenised and
    stripped of non-alphanumeric tokens. For every token that matches
    *regex* (case-insensitive ``re.search``), the tokens up to *distance*
    positions before and after it within the same sentence are counted.

    Args:
        text: The full text to analyse.
        regex: Regular expression identifying the target words.
        distance: Number of tokens to inspect on each side of a match.

    Returns:
        A dict mapping each lower-cased neighbouring word to the number of
        times it was seen in a window. Neighbours equal to the matched word
        itself are not counted; overlapping windows count a word repeatedly.
    """
    # Compile once instead of once per token.
    pattern = re.compile( regex , re.IGNORECASE )
    freq_c = dict()

    for sentence in sent_tokenize( text ):

        words = remove_punctuation( word_tokenize( sentence ) )

        for i,w in enumerate(words):
            if not pattern.search( w ):
                continue

            target = w.lower()
            # "+ 1" makes the window symmetric: the previous
            # range(i - distance, i + distance) looked at one word fewer
            # after the match than before it.
            start = max( 0 , i - distance )
            stop = min( len(words) , i + distance + 1 )

            for neighbour in words[ start : stop ]:
                word = neighbour.lower()
                if word != target:
                    freq_c[ word ] = freq_c.get( word , 0 ) + 1

    return freq_c


ParseError: mismatched tag: line 9, column 2 (<string>)

## Create a lemmatised version of the book

In [None]:
# Build a lower-cased, lemmatised copy of the novel with all
# non-alphanumeric tokens removed, and save it next to the original.
lemmatiser = WordNetLemmatizer()

with open('secret_history.txt',encoding='utf-8') as fh:
    full_text = fh.read()

words = remove_punctuation(word_tokenize(full_text))
pos = nltk.pos_tag(words)

# Collect the pieces in a list and join once; repeated `+=` on a string of
# novel length is quadratic.
pieces = []
for word, tagged in zip(words, pos):

    wn_pos = ptb_to_wordnet( tagged[1] )

    if wn_pos:
        # The tag maps to a WordNet class, so the lemmatiser can use it.
        lemma = lemmatiser.lemmatize( word , wn_pos )
        pieces.append(f' {lemma.lower()} ')
    else:
        # No WordNet class (e.g. proper nouns, function words): keep the
        # token and only lower-case it.
        pieces.append(f' {word.lower()}')

lemmatised = ''.join(pieces)

with open('secret_history_lemmatised.txt','w',encoding='utf-8') as out:
    out.write(lemmatised)
            

## Create an XML version with chapter divisions

In [None]:
from xml.sax.saxutils import escape

# Read the original novel here so that this cell does not depend on which
# earlier cell last assigned `full_text`.
with open('secret_history.txt',encoding='utf-8') as fh:
    full_text = fh.read()

# Wrap the novel in <novel>/<div> elements. A new <div> is started at every
# line that contains 'Chapter', 'Prologue' or 'Epilogue', and that line
# becomes the <title> of the division.
with open('secret_history.xml','w',encoding='utf-8') as out:
    out.write('<novel>\n<div>')

    for section in full_text.split('\n'):
        if re.search('(Chapter)|(Prologue)|(Epilogue)',section):
            # Drop '^' characters from headings, as before.
            heading = section.replace('^','').strip()
            out.write('</div>\n<div>')
            out.write(f'<title>{escape(heading)}</title>\n')
        else:
            # Escape &, < and > so that the file stays well-formed XML, and
            # restore the line break removed by the split so that the last
            # word of one line is not fused with the first word of the next.
            out.write(escape(section) + '\n')
    out.write('</div>\n</novel>')

## Most frequent words

In [None]:
# Print the 50 most frequent non-stopword tokens of the lemmatised novel.
with open('secret_history_lemmatised.txt',encoding='utf-8') as fh:
    full_text = fh.read()

words = [
    token
    for token in remove_punctuation(word_tokenize(full_text))
    if token not in stopwords
]
freq = Counter(words)

for entry, occurrences in freq.most_common(50):
    print(f'{entry} ({occurrences})')

## Most frequent adjectives, adverbs, nouns

In [None]:
# Build a table of (word, POS tag) pairs for adjectives, nouns and adverbs
# in the lemmatised novel, skipping the black-listed words.
relevant_tags = ['JJ','JJR','JJS','NN','NNS','RB','RBR','RBS']
black_list = ['i','julian','francis','henry','charles','camilla','corcoran']

with open('secret_history_lemmatised.txt',encoding='utf-8') as fh:
    full_text = fh.read()

sentences = sent_tokenize(full_text)

data = []
for sentence in sentences:
    tagged_tokens = nltk.pos_tag(remove_punctuation(word_tokenize(sentence)))
    # One [word, tag] row per token that has a relevant tag.
    data.extend(
        [token, tag]
        for token, tag in tagged_tokens
        if tag in relevant_tags and token not in black_list
    )

words = pd.DataFrame(data,columns=['word','code'])


In [None]:
# Keep the rows tagged as adjective (base, comparative or superlative) and
# list the 50 most frequent ones.
adjective_codes = ['JJR', 'JJS', 'JJ']
adjectives = words[words['code'].isin(adjective_codes)]
adjectives_freq = Counter( adjectives['word'].tolist() )

for adjective, occurrences in adjectives_freq.most_common(50):
    print(f'{adjective} ({occurrences})')

In [None]:
# Keep the rows tagged as adverb (base, comparative or superlative) and
# list the 50 most frequent ones.
adverb_codes = ['RB', 'RBR', 'RBS']
adverbs = words[words['code'].isin(adverb_codes)]
adverbs_freq = Counter( adverbs['word'].tolist() )

for adverb, occurrences in adverbs_freq.most_common(50):
    print(f'{adverb} ({occurrences})')

In [None]:
# Keep the rows tagged as singular or plural common noun and list the 50
# most frequent ones.
noun_codes = ['NN', 'NNS']
nouns = words[words['code'].isin(noun_codes)]
nouns_freq = Counter( nouns['word'].tolist() )

for noun, occurrences in nouns_freq.most_common(50):
    print(f'{noun} ({occurrences})')

## Which adjectives are used to describe 'educational institutions'?

In [None]:
# Find the words in the novel that WordNet classifies under
# 'educational_institution' or 'educator', then list the words that occur
# within five tokens of them.
with open('secret_history.txt',encoding='utf-8') as fh:
    full_text = fh.read()

words = word_tokenize(full_text)

target_hypernyms = {'educational_institution', 'educator'}

# Look every distinct token up once; walking the WordNet closure twice for
# each occurrence of each token is needlessly slow.
school_words = sorted({
    word.lower()
    for word in set(words)
    if target_hypernyms.intersection(wordnet_hypernyms(word))
})

print('Educational institutions:')
print(school_words)

print('\nWords used in the context:')
if school_words:
    # Escape the tokens and anchor the pattern so that only whole tokens
    # match; unanchored, a short word such as "don" would also match inside
    # "London" or "done".
    regex = '^(?:' + '|'.join(re.escape(token) for token in school_words) + ')$'
    freq = collocation(full_text,regex,5)
else:
    # An empty alternation would match every token in the novel.
    freq = {}

sorted_freq = sorted(freq.items(), key=operator.itemgetter(1),reverse=True)

for word,count in sorted_freq:
    if word not in stopwords:
        print(word)

## Find sentences containing synonyms of the word 'dark'

In [None]:
# Collect the distinct lemma names of every WordNet synset of 'dark'.
synonyms = list({
    lemma_name
    for synset in wn.synsets('dark')
    for lemma_name in synset.lemma_names()
})
print(synonyms)

In [None]:
# Print every sentence that contains a synonym of 'dark', count the
# synonyms found, and count all tokens for the percentage computed later.
freq = Counter()
total_nr_words = 0

# Compare case-insensitively, otherwise a capitalised "Dark" is missed.
synonym_set = {synonym.lower() for synonym in synonyms}

with open('secret_history.txt',encoding='utf-8') as fh:
    full_text = fh.read()

sentences = sent_tokenize(full_text)
for sentence in sentences:
    words = word_tokenize(sentence)
    total_nr_words += len(words)

    # Count every occurrence: a set intersection registers a synonym only
    # once per sentence and so understates the share of matching words.
    matches = [word.lower() for word in words if word.lower() in synonym_set]
    if matches:
        freq.update( matches )
        sentence = re.sub('\n+',' ',sentence)
        print(sentence)

In [None]:
# List every counted word with its number of hits, most frequent first.
ranked = freq.most_common()
for term, hits in ranked:
    print(f'{term} ({hits})')

In [None]:
# Share of all counted tokens that matched a synonym of 'dark'.
if total_nr_words:
    share = (sum(freq.values())/total_nr_words)*100
    print( f'{share}% of the words in the novel are a synonym of "dark"')
else:
    # Nothing was tokenised, so a percentage cannot be computed.
    print('No words were counted, so the share of synonyms of "dark" cannot be computed.')

## Clothing

References to tweed jackets?

In [None]:
# Count the words in the lemmatised novel that WordNet classifies under
# 'clothing' or 'fabric'.
freq = Counter()

with open('secret_history_lemmatised.txt',encoding='utf-8') as fh:
    full_text = fh.read()
    words = word_tokenize(full_text)

relevant_categories =  ['clothing','fabric']

# Look each distinct word up once and reuse its token count, instead of
# walking the WordNet closure again for every occurrence. Counter keeps
# first-occurrence order, so ties are ranked as before.
for word, occurrences in Counter(words).items():
    if intersection(wordnet_hypernyms(word),relevant_categories):
        freq[word] = occurrences

for word,count in freq.most_common(50):
    print(f'{word} ({count})')


## Substances

In [None]:
# Count the words in the lemmatised novel that WordNet classifies under
# 'narcotic' or 'drug'.
freq = Counter()

with open('secret_history_lemmatised.txt',encoding='utf-8') as fh:
    full_text = fh.read()
    words = word_tokenize(full_text)

relevant_categories =  ['narcotic','drug']

# Look each distinct word up once and reuse its token count, instead of
# walking the WordNet closure again for every occurrence. Counter keeps
# first-occurrence order, so ties are ranked as before.
for word, occurrences in Counter(words).items():
    if intersection(wordnet_hypernyms(word),relevant_categories):
        freq[word] = occurrences

for word,count in freq.most_common(50):
    print(f'{word} ({count})')


## Lexicon

In [None]:
# Download the lexicon word lists into the local 'Lexicon' folder.
# The name `dir` shadows the builtin, but it is kept because the code that
# loads the lexicons reads it.
dir = 'Lexicon'
os.makedirs(dir, exist_ok=True)

base_url = 'https://raw.githubusercontent.com/peterverhaar/dark_academia/refs/heads/main/Lexicon/'

lexicon_files = [
    'academia.txt',
    'literature_and_culture.txt',
    'mood.txt',
    'objects.txt',
    'spaces.txt'
]

failed = []

for l in lexicon_files:
    try:
        # A timeout keeps the notebook from hanging on a stalled connection.
        response = requests.get( base_url + l , timeout = 30 )
        response.raise_for_status()
    except requests.RequestException as error:
        # Report the failure instead of skipping the file silently.
        print(f'Could not download {l}: {error}')
        failed.append(l)
        continue

    response.encoding = 'utf-8'
    with open( os.path.join( dir , l ) , 'w' , encoding = 'utf-8' ) as out:
        out.write( response.text )

if failed:
    print(f'{len(failed)} of {len(lexicon_files)} lexicons could not be downloaded.')
else:
    print('Lexicons have been downloaded!')



# Read every lexicon file in the lexicon folder into `lexicons`, a dict that
# maps the file name without its '.txt' suffix to the list of lower-cased,
# stripped lines that contain at least one word character.
lexicons = dict()

for filename in os.listdir(dir):
    if not re.search(r'txt$',filename):
        continue

    topic = re.sub( r'\.txt$','',filename )

    with open( join(dir,filename) , encoding = 'utf-8' ) as file_handler:
        lexicons[topic] = [
            entry.strip().lower()
            for entry in file_handler
            if re.search( r'\w' , entry )
        ]


In [None]:
import os
from os.path import join

# NOTE(review): wildcard import of a project module; it may rebind names
# defined earlier in this notebook -- confirm which names it provides.
from tdmh import *
from nltk.stem import WordNetLemmatizer


# Write lexicon.csv: one row per lexicon with the relative frequency of its
# words in the lemmatised novel.
with open('secret_history_lemmatised.txt',encoding='utf-8') as fh:
    lemmatised = fh.read()

words = word_tokenize(lemmatised)
words = remove_punctuation(words)
freq = Counter(words)
# Number of tokens, not len(lemmatised), which is the number of characters.
tokens = len(words)

with open( 'lexicon.csv' , 'w' , encoding = 'utf-8' ) as csv_file:

    # Header row
    csv_file.write('category,count\n')

    for l in lexicons:
        print(f'{l} ...')

        # Iterate over the words of the lexicon; iterating over `l` itself
        # walks through the characters of the category name. The set keeps
        # a word that is listed twice from being counted twice.
        count_occurrences = sum(
            freq.get(word.lower(),0) for word in set(lexicons[l])
        )
        relative_frequency = count_occurrences / tokens if tokens else 0.0
        csv_file.write( f'{l},{relative_frequency}\n' )

print("Done!")



In [None]:


# For every lexicon, print its 20 most frequent words in the lemmatised
# novel.
with open('secret_history_lemmatised.txt',encoding='utf-8') as fh:
    lemmatised = fh.read()

words = word_tokenize(lemmatised)
words = remove_punctuation(words)

for l in lexicons:
    # A set gives constant-time membership tests; scanning the lexicon list
    # for every token of the novel is needlessly slow.
    vocabulary = set(lexicons[l])
    lexicon_freq = Counter(
        token for token in words if token.lower() in vocabulary
    )
    print(f'\n{l}')
    for word,count in lexicon_freq.most_common(20):
        print(f"{word} ({count})")
   


In [None]:
%matplotlib inline




fig = plt.figure( figsize=( 7 ,6 ) )
ax = plt.axes()

x = 'category'
y = 'count'

bar_width = 0.45
opacity = 0.8

ax.bar( df[x] , df[y] , width = bar_width, alpha = opacity , color = '#fcc11c')

plt.xticks(rotation= 75)

ax.set_xlabel('Category' , fontsize= 12)
ax.set_ylabel('Relative frequency' , fontsize = 12 )
ax.set_title( y.title() , fontsize=20 )


plt.show()

## Words in other domains

In [None]:
# WordNet hypernym names ("domains") that are tracked in the analysis below.
selected_domains = [
    'psychological_feature',
    'situation',
    'state',
    'location',
    'idea',
    'illumination',
    'natural_object',
    'building',
    'dwelling',
    'housing',
    'physical_phenomenon',
    'natural_phenomenon',
    'educational_institution',
    'social_group',
    'bedroom_furniture',
    'achromatic_color',
    'cognitive_state',
    'psychological_state',
    'condition',
    'emotion',
    'expressive_style',
    'college_student',
    'certificate',
    'color',
]



In [None]:
# Build a table with one (word, domain) row for every token of the
# lemmatised novel and every selected domain among its WordNet hypernyms.
data = []

# Use a context manager so that the file is closed again.
with open('secret_history_lemmatised.txt',encoding='utf-8') as fh:
    full_text = fh.read()

words = word_tokenize(full_text.lower())
words = remove_punctuation(words)

domain_lookup = set(selected_domains)
# Cache the matching domains per distinct word; the WordNet closure is
# expensive and most words occur many times.
domains_by_word = {}

for word in words:
    if word not in domains_by_word:
        domains_by_word[word] = [
            h for h in wordnet_hypernyms(word) if h in domain_lookup
        ]
    for h in domains_by_word[word]:
        data.append([word, h])

domains_df = pd.DataFrame(data,columns=['word','domain'])

In [None]:
# The 20 most frequent words that fall under the chosen domain.
domain = 'color'

words = domains_df[domains_df['domain'] == domain]
words_freq = Counter(words['word'].tolist())
for entry, occurrences in words_freq.most_common(20):
    print(f'{entry} ({occurrences})')

In [None]:
# The 20 most frequent words that fall under the chosen domain.
domain = 'emotion'

words = domains_df[domains_df['domain'] == domain]
words_freq = Counter(words['word'].tolist())
for entry, occurrences in words_freq.most_common(20):
    print(f'{entry} ({occurrences})')

In [None]:
# The 20 most frequent words that fall under the chosen domain.
domain = 'educational_institution'

words = domains_df[domains_df['domain'] == domain]
words_freq = Counter(words['word'].tolist())
for entry, occurrences in words_freq.most_common(20):
    print(f'{entry} ({occurrences})')

## Sentences containing words with selected hypernyms

In [None]:
# Collect every sentence of the novel that contains at least one word with
# one of the specific domains among its WordNet hypernyms.
specific_domains = ['emotion']
all_sentences = []

with open('secret_history.txt',encoding='utf-8') as fh:
    full_text = fh.read()

sentences = sent_tokenize(full_text)

domain_lookup = set(specific_domains)
# Cache the verdict per distinct word; the WordNet closure is expensive.
word_in_domain = {}

for sentence in sentences:
    words = word_tokenize(sentence)
    words = remove_punctuation(words)
    for word in words:
        if word not in word_in_domain:
            # Do not store the overlap in a variable named `intersection`:
            # that would replace the helper function of the same name and
            # break earlier cells when they are run again.
            word_in_domain[word] = not domain_lookup.isdisjoint(
                wordnet_hypernyms(word)
            )
        if word_in_domain[word]:
            sentence = re.sub('\n',' ',sentence)
            all_sentences.append(sentence.strip())
            # One matching word is enough to select the sentence.
            break

In [None]:
# Report how many sentences were selected and show the first fifteen.
print(f'The novel contains {len(sentences)} sentences.')
print(f'{len(all_sentences)} sentences were selected.')

preview = all_sentences[:15]
for selected in preview:
    print(f'{selected}\n')

In [None]:
## Sentiment analysis

In [None]:
## Named entity recognition