# Getting "context chunks"

For a specified character index range, let's retrieve some of the context to the left and to the right of that range.


# Initial setup

In [1]:
import pandas as pd
from text_matcher.matcher import Text, Matcher
from IPython.display import clear_output

In [2]:
# ACTION: copy path to results JSONL file here (filename should end "_results_[hyperparameters].jsonl")
# The next cell parses this path by splitting on its last four underscores and
# last four slashes, so it must look like
#   <dataDir>/<Author>/<Year>_<Title>/Results/<Author>_<Year>_<Title>_results_<hyperparameters>.jsonl
# and the hyperparameters part must not itself contain an underscore.

startData = "/Users/milan/Library/CloudStorage/GoogleDrive-mtt2126@columbia.edu/My Drive/iAnnotate/MIT/Quotable Content/Data/Dickens/1853_BleakHouse/Results/Dickens_1853_BleakHouse_results_t2-c3-n2-m3-nostops.jsonl"

In [3]:
# Infer naming variables from path

# Split once on the last four underscores, giving
# [<path>/<Author>, <Year>, <Title>, "results", "<hyperparameters>.jsonl"]
nameParts = startData.rsplit("_", 4)

authorSurname = nameParts[-5].rsplit("/", 1)[-1]   # drop the leading directories
publicationYear = nameParts[-4]
textTitle = nameParts[-3]
hyperparSuffix = f"_{nameParts[-1][:-6]}"          # [:-6] strips the trailing ".jsonl"

# The data directory sits four path components above the results file
dataDir = startData.rsplit("/", 4)[0]

print(f"Author surname: {authorSurname}\nPublication year: {publicationYear}\nText title: {textTitle}\nHyperparameters suffix: {hyperparSuffix}\nData directory:{dataDir}")

projectName = f"{authorSurname}_{publicationYear}_{textTitle}"

# All per-project folders share the same parent directory
projectDir = f"{dataDir}/{authorSurname}/{publicationYear}_{textTitle}"
sourceDir = f"{projectDir}/SourceText"
corpusDir = f"{projectDir}/TargetCorpus"
resultsDir = f"{projectDir}/Results"

Author surname: Dickens
Publication year: 1853
Text title: BleakHouse
Hyperparameters suffix: _t2-c3-n2-m3-nostops
Data directory:/Users/milan/Library/CloudStorage/GoogleDrive-mtt2126@columbia.edu/My Drive/iAnnotate/MIT/Quotable Content/Data


In [4]:
# Load source text

# Read explicitly as UTF-8 (the same encoding this notebook uses when writing
# its output) rather than relying on the platform's default encoding.
with open(f"{sourceDir}/{projectName}_plaintext.txt", encoding="utf-8") as f:
    rawText = f.read()

# Label the source text with the title inferred from the results filename
# instead of a hard-coded 'Middlemarch' left over from another project.
# The variable name `mm` is kept in case other cells refer to it.
mm = Text(rawText, textTitle)

In [5]:
# Load in the JSONL file with the full text of JSTOR articles
# (one JSON record per line, hence lines=True)

fulltextPath = f"{corpusDir}/{projectName}_fulltext.jsonl"
fulltextDF = pd.read_json(fulltextPath, lines=True)
print(f"Loaded {len(fulltextDF)} full-text items")

Loaded 39549 full-text items


In [6]:
# Load in the JSONL file with the results of text matcher
# (startData is already the full path, so it is passed through as-is)

resultsDF = pd.read_json(startData, lines=True)
print(f"Loaded {len(resultsDF)} results from text-matcher")

Loaded 39548 results from text-matcher


In [7]:
# Join full text and matcher results on the shared "id" column (inner join, so
# only ids present in both frames survive; clashing column names receive the
# default "_x" / "_y" suffixes)
df = fulltextDF.merge(resultsDF, on="id")

In [8]:
# Free up memory from initial separate dataframes
# (everything needed from here on lives in the merged `df`)

del fulltextDF, resultsDF

In [9]:
# Drop items with no matches from results dataframe

print(f"Total number of items from JSTOR: {len(df)}")

# Keep only the rows where text-matcher reported at least one match
hasQuotation = df['numMatches'] >= 1
df = df[hasQuotation]

print(f"Total number of items with at least one quotation detected: {len(df)}")

Total number of items from JSTOR: 39548
Total number of items with at least one quotation detected: 3969


In [10]:
# Expand the dataframe so each quotation becomes its own row

# Both location columns are exploded together so the i-th entry of one stays
# on the same row as the i-th entry of the other
locationColumns = ['Locations in A', 'Locations in B']
df = df.explode(locationColumns)
print(f"Total number of quotations detected: {len(df)}")

Total number of quotations detected: 6348


In [11]:
# Replace old indices with new indices
# (explode repeats the original index label on every row it produces; after
# this call the labels run 0..n-1 and the previous labels are kept as a column)
df = df.reset_index(drop=False)

# OPTIONAL: filter results down to specific passage

In [None]:
# Define a function that can compare two intervals in the form of listed tuples, eg [x0, x1], [y0,y1]
def overlaps(range1, range2):
    """Return whether two [start, end] intervals share any interior extent.

    Each argument is expected to be a two-element sequence. Intervals that
    merely touch at an endpoint (e.g. [1, 4] and [4, 8]) do not count as
    overlapping. Anything that is not a two-element sequence, including
    values without a length such as None or NaN, yields False.
    """
    try:
        # Guard clause: both inputs must be exactly [start, end]
        if len(range1) != 2 or len(range2) != 2:
            return False
        latest_start = max(range1[0], range2[0])
        earliest_end = min(range1[1], range2[1])
        return latest_start < earliest_end
    except TypeError:
        # Raised by len() on unsized values or by comparing incompatible items
        return False

In [None]:
# This cell specifies the start and end indices of the passage
# quoteIndex is [start, end]; it is compared against each row's
# 'Locations in A' value with overlaps(), so it should use the same index
# units as that column (presumably character offsets in the source text --
# TODO confirm).
# passageName is a label for the chosen passage; it is not used by the cells
# visible in this section.

passageName = "romanticSide"
quoteIndex = [4375, 4407]

In [None]:
# Flag each quotation whose 'Locations in A' span overlaps the chosen passage.
# Only that one column is needed, so map over it directly instead of using a
# row-wise DataFrame.apply, which builds a Series for every row. The cast
# guarantees a boolean column that is safe to use as a mask when filtering.
df["doesOverlap"] = (
    df['Locations in A']
    .map(lambda location: overlaps(quoteIndex, location))
    .astype(bool)
)


In [None]:
# Keep only the quotations flagged as overlapping the passage, then renumber
# the rows 0..n-1 (the previous index labels are kept as a column)
overlapMask = df["doesOverlap"]
df = df[overlapMask].reset_index()

# Generate context chunks

In [12]:
# Specify chunk size left and right
# Number of characters taken immediately before the start, and immediately
# after the end, of each quotation's 'Locations in B' span when the context
# chunks are built below. The quotation itself is not included in the chunk.
num_characters_before_quote = 250
num_characters_after_quote = 750

In [13]:
# Create empty lists that we will populate with the contexts for quotations
# and, in parallel, the id of the article each context came from
context_chunks_for_quotations = []
context_chunks_ids = []

# Loop over each of the quotation start and end locations to produce
# left and right context chunks of specified sizes.
#
# Each row of df is one quotation and already carries its article's title and
# full text, so the row's own values are used directly instead of searching
# the whole frame for the article on every iteration.

total_quotations = len(df)
last_article_URL = None
cleaned_article_text = None

quotation_rows = zip(df['id'], df['title_y'], df['fullText'], df['Locations in B'])

for item, (article_URL, article_title, article_text, startandEndLocations) in enumerate(quotation_rows):
    # Quotations from one article sit on consecutive rows after explode(), so
    # build the Text object once per run of identical ids, not once per quotation.
    # The slices below are taken from Text(...).text rather than the raw
    # 'fullText' value, presumably because 'Locations in B' are offsets into
    # that text -- TODO confirm against text_matcher.
    if cleaned_article_text is None or article_URL != last_article_URL:
        cleaned_article_text = Text(article_text, article_title)
        last_article_URL = article_URL

    clear_output()
    print(f"Now extracting context chunks for quotation {item + 1} of {total_quotations}")

    quote_start = startandEndLocations[0]
    quote_end = startandEndLocations[1]

    # Clamp at 0: a negative slice start would count from the END of the text
    # and silently drop the left context for quotations near the beginning.
    left_start = max(0, quote_start - num_characters_before_quote)
    left_context = cleaned_article_text.text[left_start:quote_start]
    right_context = cleaned_article_text.text[quote_end:quote_end + num_characters_after_quote]

    # The quotation itself is replaced by a "[...]" marker between the contexts
    context_chunks_for_quotations.append(left_context + "[...]" + right_context)
    context_chunks_ids.append(article_URL)

Now extracting context chunks for quotation 6348 of 6348


In [None]:
# Sanity-check the extraction: report how many chunks and ids were collected
# and preview only the first few chunks. Printing the whole list would put
# every chunk into the cell output.
print(f"{len(context_chunks_for_quotations)} context chunks, {len(context_chunks_ids)} ids")

for preview_chunk in context_chunks_for_quotations[:5]:
    print(preview_chunk)
    print("-" * 40)

In [15]:
# Write output to a single text file

# Each chunk is followed by a newline. The `with` block guarantees the file is
# flushed and closed even if a write fails part-way through.
with open(f'{resultsDir}/{projectName}-quotation-contexts.txt', mode='w', encoding='utf-8') as output_file:
    for context in context_chunks_for_quotations:
        output_file.write(context)
        output_file.write('\n')

# Generate word frequency list for all context chunks