In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.text import Text
import os

In [2]:
texts = ['Emma', 
         'Northanger_Abbey',
         'Persuasion',
         'Mansfield_Park',
         'Pride_and_Prejudice',
         'Sense_and_Sensibility']

# Concordance

In [4]:
path = os.path.join('Corpus',f'{texts[0]}.txt')

with open( path , encoding = 'utf-8') as file:
    full_text = file.read()

tokens = word_tokenize(full_text)
novel = Text(tokens)

In [6]:
novel.concordance('marriage' , width = 50 , lines = 10)

Displaying 10 of 33 matches:
ce of her sister ’ s marriage , been mistress of h
ollowed Isabella ’ s marriage , on their being lef
e knows how much the marriage is to Miss Taylor ’ 
 to bring about this marriage . A worthy employmen
e dissuaded from the marriage , and it took place 
fter a three years ’ marriage , he was rather a po
 , even in his first marriage ; but his second mus
 upon his father ’ s marriage , it was very genera
ratulation which her marriage had already secured 
ince Mrs. Weston ’ s marriage her exercise had bee


In [7]:
import math
import re
from tdmh import *

def concordance_word( text, regex , width = 10 ):

    concordance = []
    distance = math.floor( width /2 )

    segment_length = 0

    words = word_tokenize( text )
    words = remove_punctuation( words )
    i = 0
    for w in words:
        if re.search( regex , w , re.IGNORECASE ):
            match = ''
            for x in range( i - distance , ( i + distance ) + 1 ):
                if x >= 0 and x < len(words):
                    if len(words[x]) >= 0:
                        match += words[x] + ' '
            concordance.append( match )

        i += 1

    return concordance

In [8]:
with open( path , encoding = 'utf-8') as file:
    full_text = file.read()
    
fragments = concordance_word( full_text , r'marriage' , 16)

print( f'There are {len(fragments)} ocurrences of the word "marriage".')

number_of_results = 5

print( f'Here are the first {number_of_results} occurrences:\n\n')
for f in fragments[:number_of_results]:
    print( f'{f}\n')

There are 34 ocurrences of the word "marriage".
Here are the first 5 occurrences:


and had in consequence of her sister s marriage been mistress of his house from a very 

perfect unreserve which had soon followed Isabella s marriage on their being left to each other was 

suppose it but she knows how much the marriage is to Miss Taylor s advantage she knows 

the last four years to bring about this marriage A worthy employment for a young lady s 

to the not to be dissuaded from the marriage and it took place to the infinite mortification 



In [10]:
fragments = concordance_word(full_text , r'(\bhates?\b)|(\bloves?\b)' , 25)

for f in fragments:
    print( f'{f}\n')

you What a horrible idea Oh no I meant only myself Knightley loves to find fault with me you a is all a joke We 

Miss Churchill of a great Yorkshire family and Miss Churchill fell in love with him nobody was surprized except her brother and his wife who 

due to her in return for the great goodness of being in love with him but though she had one sort of spirit she had 

it was nothing in comparison of Enscombe she did not cease to love her husband but she wanted at once to be the wife of 

intellectual superiority to make atonement to herself or frighten those who might hate her into outward respect She had never boasted either beauty or cleverness 

through the fluctuations of this speech and saw no alarming symptoms of love The young man had been the first admirer but she trusted there 

with her person he replied I think her all you describe I love to look at her and I will add this praise that I 

keep my spleen to myself till Christmas brings John and Isabella John loves Emma wi

In [11]:
def collocation( text , regex , width ):

    freq_c = dict()
    distance = math.floor(width/2)

    sentences = sent_tokenize( text )

    for sentence in sentences:

        words = word_tokenize( sentence )
        words = remove_punctuation(words)

        for i,w in enumerate(words):
            if re.search( regex , w , re.IGNORECASE ):
                index_regex = i 

                for x in range( i - distance , i + distance ):
                    if x >= 0 and x < len(words) and words[x].lower() != words[index_regex].lower():
                        if len(words[x]) > 0:
                            word = words[x].lower()
                            freq_c[ word ] = freq_c.get( word , 0 ) + 1
            
    return freq_c

In [13]:
nearby_words = collocation( full_text , r'marriage' , 20)

from nltk.corpus import stopwords
stopwords = stopwords.words('english')

nearby_words_sorted = sortedByValue( nearby_words , ascending = False)

for word in list( nearby_words_sorted.keys() ):
    freq = nearby_words_sorted[word]
    if word not in stopwords and freq > 1:
        print( f'{word} => {freq}')

every => 5
since => 4
miss => 4
father => 4
till => 3
martin => 3
must => 3
much => 3
isabella => 3
soon => 3
emma => 2
might => 2
elton => 2
never => 2
however => 2
hartfield => 2
find => 2
event => 2
offer => 2
smith => 2
harriet => 2
quite => 2
proposal => 2
weston => 2
year => 2
proposed => 2
secured => 2
first => 2
man => 2
place => 2
years => 2
knows => 2
yet => 2


# Coorccurrence

In [14]:
def cooccurrence( text , word1 , word2 , width ):
    
    relevant_sentences = []
    
    text = re.sub( '\s+' , ' ' , text )
    sentences = sent_tokenize( text )

    for s in sentences:
        if re.search( r'\b' + word1 + r'\b' , s , re.IGNORECASE ) and re.search( r'\b' + word2 + r'\b' , s , re.IGNORECASE ):

            words = word_tokenize(s)
            word1_indexes = []
            word2_indexes = []
            
            for i,w in enumerate(words):
                if re.search( r'\b' + word1 + r'\b' , w , re.IGNORECASE ):
                    word1_indexes.append(i)
                elif re.search( r'\b' + word2 + r'\b' , w , re.IGNORECASE ):
                    word2_indexes.append(i)

            if word1_indexes[0] > word2_indexes[0]:
                difference = word1_indexes[0] - word2_indexes[0]
            else:
                difference = word2_indexes[0] - word1_indexes[0]

            if difference <= width:
                relevant_sentences.append(s)
    return relevant_sentences
                       

In [15]:
sentences = cooccurrence( full_text , 'marriage' , 'lydia' , 10 )

for s in sentences:
    print( f'{s}\n')