# Getting "context chunks"

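For each detected quotation, given as a character index range within an article's full text, this notebook retrieves a fixed number of characters of context immediately to the left and right of that range.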


# Initial setup

In [112]:
import pandas as pd
from text_matcher.matcher import Text, Matcher
from IPython.display import clear_output

In [113]:
# ACTION: copy path to results JSONL file here (filename should end "_results_[hyperparameters].jsonl")
# The cells below split this path on its last four "_" characters and its last four "/"
# characters to infer the author surname, publication year, text title, hyperparameter
# suffix and data directory, so the path must keep that layout.

startData = "/Users/milan/Library/CloudStorage/GoogleDrive-mtt2126@columbia.edu/My Drive/iAnnotate/MIT/Quotable Content/Data/Woolf/1925_Dalloway/Results/Woolf_1925_Dalloway_results_t2-c3-n2-m3-nostops.jsonl"

In [114]:
# Derive the naming variables from the results-file path.
# Splitting on the last four underscores yields, from left to right:
# <everything up to the surname>, <year>, <title>, "results", "<hyperparameters>.jsonl"

pathFields = startData.rsplit("_", 4)
authorSurname = pathFields[-5].rsplit("/", 1)[-1]  # keep only what follows the last "/"
publicationYear = pathFields[-4]
textTitle = pathFields[-3]
hyperparSuffix = f"_{pathFields[-1][:-6]}"  # drop the last 6 characters (the ".jsonl" extension)
dataDir = startData.rsplit("/", 4)[0]  # drop the last four path components

print(f"Author surname: {authorSurname}\nPublication year: {publicationYear}\nText title: {textTitle}\nHyperparameters suffix: {hyperparSuffix}\nData directory:{dataDir}")

# All project folders share the same "<dataDir>/<surname>/<year>_<title>" parent
projectName = f"{authorSurname}_{publicationYear}_{textTitle}"
projectDir = f"{dataDir}/{authorSurname}/{publicationYear}_{textTitle}"
sourceDir = f"{projectDir}/SourceText"
corpusDir = f"{projectDir}/TargetCorpus"
resultsDir = f"{projectDir}/Results"

Author surname: Woolf
Publication year: 1925
Text title: Dalloway
Hyperparameters suffix: _t2-c3-n2-m3-nostops
Data directory:/Users/milan/Library/CloudStorage/GoogleDrive-mtt2126@columbia.edu/My Drive/iAnnotate/MIT/Quotable Content/Data


In [115]:
# Load source text

# Read explicitly as UTF-8 so the result does not depend on the platform's default encoding
with open(f"{sourceDir}/{projectName}_plaintext.txt", encoding="utf-8") as f:
    rawText = f.read()

# Label the source text with the title inferred from the results path; the previous
# hard-coded label 'Middlemarch' did not match the text actually being loaded.
mm = Text(rawText, textTitle)

In [117]:
# Read the JSONL file holding the full text of the JSTOR articles

corpusPath = f"{corpusDir}/{projectName}_fulltext.jsonl"
corpusDF = pd.read_json(corpusPath, lines=True)
print(f"Loaded {len(corpusDF)} full-text items")

# Keep only the full text and id columns, then release the complete corpus frame
keptColumns = ["fullText", "id"]
fulltextDF = corpusDF[keptColumns]
del corpusDF

Loaded 2495 full-text items


In [118]:
# Read the text-matcher results (one JSON record per line) from the path set above

resultsDF = pd.read_json(startData, lines=True)
print(f"Loaded {len(resultsDF)} results from text-matcher")

Loaded 2495 results from text-matcher


In [119]:
# Combine full text and matcher results into one frame, matching rows on "id"
df = fulltextDF.merge(resultsDF, on="id")

In [120]:
# The merged frame now holds everything needed, so release the two input frames

del fulltextDF, resultsDF

In [121]:
# Keep only the items in which at least one match was detected

print(f"Total number of items from JSTOR: {len(df)}")
hasMatch = df['numMatches'] >= 1
df = df[hasMatch]
print(f"Total number of items with at least one quotation detected: {len(df)}")

Total number of items from JSTOR: 2495
Total number of items with at least one quotation detected: 518


In [122]:
# Give every quotation its own row by exploding both location columns together

locationColumns = ['Locations in A', 'Locations in B']
df = df.explode(locationColumns)
print(f"Total number of quotations detected: {len(df)}")

Total number of quotations detected: 1758


In [123]:
# Renumber the rows 0..n-1; drop=False (the default) keeps the previous index as a column
df = df.reset_index(drop=False)

# OPTIONAL: filter results down to a specific passage

In [None]:
# Compare two intervals given as two-element sequences, e.g. [x0, x1] and [y0, y1]
def overlaps(range1, range2):
    """Return True when two intervals share more than a single boundary point.

    Each argument is expected to be a two-element sequence [start, end].
    The result is False when either argument does not have exactly two
    elements, or when a TypeError is raised while measuring or comparing
    them (for example a float NaN, which has no length).
    """
    try:
        if len(range1) != 2 or len(range2) != 2:
            return False
        latest_start = max(range1[0], range2[0])
        earliest_end = min(range1[1], range2[1])
        # Strict inequality: intervals that merely touch do not count as overlapping
        return latest_start < earliest_end
    except TypeError:
        return False

In [None]:
# Name the passage of interest and give its start and end character indices

passageName = "romanticSide"
passageStart, passageEnd = 4375, 4407
quoteIndex = [passageStart, passageEnd]

In [None]:
# Flag each row whose 'Locations in A' range overlaps the passage given by quoteIndex
overlapFlags = df.apply(lambda row: overlaps(quoteIndex, row['Locations in A']), axis=1)
df["doesOverlap"] = overlapFlags


In [None]:
# Keep only the flagged rows, then renumber them (the previous index is kept as a column)
df = df[df["doesOverlap"]].reset_index()

# Generate context chunks

In [124]:
# Context window sizes, in characters, taken before and after each quotation
num_characters_before_quote, num_characters_after_quote = 250, 750

In [105]:
# Loop over each of the quotation start and end locations to produce
# left and right context chunks of specified sizes

allChunksLeft = []
allChunksRight = []

# Cache the processed text per article id so that an article containing several
# quotations is only passed through Text() once instead of once per quotation.
cleanedTextByArticle = {}

for item in range(len(df)):
    article_URL = df['id'].iloc[item]
    startandEndLocations = df['Locations in B'].iloc[item]

    if article_URL not in cleanedTextByArticle:
        # After explode(), every row carries its own article's full text, so it can be
        # read positionally from the current row; no lookup of the article's first row
        # is needed. The article id doubles as the Text label (the previously used
        # `article_title` was never defined in this notebook).
        article_text = df['fullText'].iloc[item]
        cleanedTextByArticle[article_URL] = Text(article_text, article_URL).text
    cleaned_text = cleanedTextByArticle[article_URL]

    matchStart, matchEnd = startandEndLocations[0], startandEndLocations[1]
    # Clamp at 0: a negative slice start would count from the END of the text and
    # yield a wrong (usually empty) left chunk for quotations near the beginning.
    leftStart = max(0, matchStart - num_characters_before_quote)

    # wait=True defers clearing until the next output arrives, avoiding flicker
    clear_output(wait=True)
    print(f"Now extracting context chunks for quotation {item + 1} of {len(df)}")

    allChunksLeft.append(cleaned_text[leftStart:matchStart])
    allChunksRight.append(cleaned_text[matchEnd:matchEnd + num_characters_after_quote])

Now extracting context chunks for quotation 1758 of 1758


In [108]:
# Wrap each list of chunks in a single-column frame, one row per quotation
leftColumns = ['contextChunksLeft']
rightColumns = ['contextChunksRight']
dfLeft = pd.DataFrame(data=allChunksLeft, columns=leftColumns)
dfRight = pd.DataFrame(data=allChunksRight, columns=rightColumns)

In [125]:
# Merge left and right context chunks into main dataframe.
# Rows are aligned on the index, so df must still carry the 0..n-1 index it had
# when the chunks were extracted.
df = pd.merge(df, dfLeft, left_index=True, right_index=True)
# Merge into df (not the undefined `dfTest`) so the left chunks added above are kept
df = pd.merge(df, dfRight, left_index=True, right_index=True)

0       Margaret Harris "THE FRATERNITY OF OLD LAMPS":...
1       Margaret Harris "THE FRATERNITY OF OLD LAMPS":...
2       Margaret Harris "THE FRATERNITY OF OLD LAMPS":...
3       Margaret Harris "THE FRATERNITY OF OLD LAMPS":...
4       [Authority and Invention in the Fiction of Bes...
                              ...                        
1753    ['DIE MORAL VON DER GESCHICHT': ART AND ARTIFI...
1754    The Modernist Inkblot The Modernist Inkblot Em...
1755    The Modernist Inkblot The Modernist Inkblot Em...
1756    The Modernist Inkblot The Modernist Inkblot Em...
1757    [INDIVIDUAL AUTHORS PETER ABRAHAMS C Chiwengo,...
Name: fullText, Length: 1758, dtype: object

In [128]:
# The article full text is no longer needed once the chunks exist; remove that column
df = df.drop(columns="fullText")

In [129]:
# Write the frame as JSON Lines (one record per line) into the results directory
outputPath = f"{resultsDir}/{projectName}_results{hyperparSuffix}_contextChunks.jsonl"
df.to_json(outputPath, orient='records', lines=True)

# Generate word frequency list for all context chunks