# Retroactively add quoted passages and context chunks to existing results

This Notebook is a temporary one to retroactively add:
- quoted passages in A
- quoted passages in B
- context chunks left and right of quoted passage in B

... to any results JSONL file that was generated without these data.

Once these data have been added to all JSONL results files, this Notebook will become unnecessary and can be deleted.

# Initial setup

In [None]:
# Import necessary libraries

import pandas as pd
from matcher import Text, Matcher
from IPython.display import clear_output
import json

In [None]:
# ACTION: copy path to results JSONL file here (filename should end "_results_[hyperparameters].jsonl")

startData = "/Users/milan/Library/CloudStorage/GoogleDrive-mtt2126@columbia.edu/My Drive/iAnnotate/MIT/Quotable Content/Data/Proust/1922_SwannsMoncrieff/Results/Proust_1922_SwannsMoncrieff_results_t2-c3-n2-m3-nostops.jsonl"

In [None]:
# Infer naming variables from path

# Cut the path once at its last four underscores and pick the pieces by position
# (counted from the end, so underscores earlier in the path do not matter)
nameParts = startData.rsplit("_", 4)

textTitle = nameParts[-3]
publicationYear = nameParts[-4]
# The first piece still carries the folders in front of the file name: keep only what follows the last "/"
authorSurname = nameParts[-5].rsplit("/", 1)[-1]
# Drop the last 6 characters of the final piece (".jsonl" for the expected file name)
# and put the separating underscore back in front
hyperparSuffix = "_" + nameParts[-1][:-6]
# Strip the last four "/"-separated components off the path
dataDir = startData.rsplit("/", 4)[0]

print(
    f"Author surname: {authorSurname}\n"
    f"Publication year: {publicationYear}\n"
    f"Text title: {textTitle}\n"
    f"Hyperparameters suffix: {hyperparSuffix}\n"
    f"Data directory:{dataDir}"
)

# Project name and the three per-project folders, all rebuilt from the inferred parts
projectName = f"{authorSurname}_{publicationYear}_{textTitle}"
projectDir = f"{dataDir}/{authorSurname}/{publicationYear}_{textTitle}"
sourceDir = f"{projectDir}/SourceText"
corpusDir = f"{projectDir}/TargetCorpus"
resultsDir = f"{projectDir}/Results"

In [None]:
# Load the source text in which you detected quotations

sourceTextPath = f"{sourceDir}/{projectName}_plaintext.txt"

# Decode explicitly as UTF-8 instead of relying on the platform-dependent default encoding,
# so the characters (and the offsets later sliced out of them) are the same on every machine.
# NOTE(review): assumes the plaintext file is stored as UTF-8 — confirm
with open(sourceTextPath, encoding="utf-8") as f:
    rawText = f.read()

# Build the matcher Text object from the raw text (the project name is passed as its second argument)
sourceText = Text(rawText, projectName)

In [None]:
# Load in the JSONL file with the full text of JSTOR articles

corpusPath = f"{corpusDir}/{projectName}_fulltext.jsonl"
corpusDF = pd.read_json(corpusPath, lines=True)
print(f"Loaded {corpusDF.shape[0]} full-text items")

# Keep only the full text and the id in a new dataframe, then release the complete corpus dataframe
fulltextDF = corpusDF[["fullText", "id"]]
del corpusDF

In [None]:
# Load in the JSONL file with the results of text matcher
# (the same file whose path was pasted into `startData`; one JSON record per line)

resultsDF = pd.read_json(startData, lines=True)
resultCount = len(resultsDF)
print(f"Loaded {resultCount} results from text-matcher")

In [None]:
# Merge full text column with results dataset
# Rows are paired on the shared "id" column with an inner join (the pandas default), so only
# ids present in both dataframes are kept.
# NOTE(review): results whose id is missing from the full-text file are dropped here — confirm this is intended

df = fulltextDF.merge(resultsDF, how="inner", on="id")

In [None]:
# Free up memory from initial separate dataframes
# (only the merged dataframe `df` is used from here on)

del fulltextDF, resultsDF

# Generate data on quoted passages and context chunks

In [None]:
# ACTION: specify size of context chunks in characters
# First value: characters kept to the left of a quoted passage; second value: characters kept to its right

chunkSizeLeft, chunkSizeRight = 250, 750

In [None]:
# For each match, this cell extracts the quoted passage in the source text, the quoted passage in the corpus
# and context chunks left and right of the quoted passage in the corpus.
# Each of the four new columns ends up holding, for every corpus item, a list with one string per match.

# Text of the source that the "Locations in A" offsets are sliced from (looked up once, not per match)
sourceString = sourceText.text

# One entry (a list of strings) per corpus item for each new column. The columns are attached to the
# dataframe in a single step after the loop rather than cell by cell through `df[column].iat[row] = ...`:
# that chained form writes into a temporary object and leaves `df` unchanged when pandas Copy-on-Write is enabled.
passagesInAByItem = []
passagesInBByItem = []
chunksLeftByItem = []
chunksRightByItem = []

totalItems = len(df)
itemRows = zip(df["fullText"], df["Locations in A"], df["Locations in B"])

# Loop over each item in the corpus
for itemNumber, (itemFullText, locationsInA, locationsInB) in enumerate(itemRows, start=1):

    # wait=True delays clearing until the next message is ready, so the progress line does not flicker
    clear_output(wait=True)
    print(f"Now extracting quotations and context chunks for item {itemNumber} of {totalItems}")

    # Start with empty lists for quoted passages and context chunks
    allPassagesInA = []
    allPassagesInB = []
    allChunksLeft = []
    allChunksRight = []

    # Process the given corpus item text to be in matcher format
    corpusItemString = Text(itemFullText, projectName).text

    # Loop over each match for the given item
    # (assumes the two location lists are parallel, one [start, end] pair per match — TODO confirm)
    for locInA, locInB in zip(locationsInA, locationsInB):

        # Start and end indexes of the match in the item from the corpus
        startInB, endInB = locInB[0], locInB[1]

        # Quoted passage in A (source text) and in B (corpus item)
        allPassagesInA.append(sourceString[locInA[0]:locInA[1]])
        allPassagesInB.append(corpusItemString[startInB:endInB])

        # Left context chunk. The start is clamped at 0: for a match that begins within the first
        # `chunkSizeLeft` characters the raw start would be negative, and a negative slice start counts
        # from the END of the text, which produced an empty or unrelated chunk.
        leftStart = max(0, startInB - chunkSizeLeft)
        allChunksLeft.append(corpusItemString[leftStart:startInB])

        # Right context chunk (a slice end beyond the end of the text is simply cut off there)
        allChunksRight.append(corpusItemString[endInB:endInB + chunkSizeRight])

    passagesInAByItem.append(allPassagesInA)
    passagesInBByItem.append(allPassagesInB)
    chunksLeftByItem.append(allChunksLeft)
    chunksRightByItem.append(allChunksRight)

# Assign lists of quoted passages and context chunks to the relevant columns of the dataframe.
# dtype=object keeps every cell as one Python list; index=df.index lines the values up with the rows.
newColumns = {
    "quotedPassageinA": passagesInAByItem,
    "quotedPassageinB": passagesInBByItem,
    "contextChunkLeft": chunksLeftByItem,
    "contextChunkRight": chunksRightByItem,
}
for columnName, columnValues in newColumns.items():
    df[columnName] = pd.Series(columnValues, index=df.index, dtype=object)

In [None]:
# Drop corpus full text from dataset (important for copyright protection and to reduce file size)
# The quoted passages and context chunks extracted earlier stay in the dataframe; only "fullText" goes.

df = df.drop(columns=["fullText"])

In [None]:
# Save pandas dataframe as JSONL file (one JSON record per dataframe row, one record per line)
# NOTE(review): for a path laid out as expected this rebuilds the path held in `startData`,
# i.e. the original results file is overwritten in place — confirm this is intended

outputPath = f"{resultsDir}/{projectName}_results{hyperparSuffix}.jsonl"
df.to_json(outputPath, orient="records", lines=True)