# Run matcher

This notebook is the core of Phase 2, which is one of 3 phases in the quotation detection pipeline:

1. Prepare inputs: prepare the Source Text and Target Corpus - assign standard filenames, convert to the correct formats, and organize in the standard folder structure

2. Detect quotations: run matcher algorithm on Source Text and Target Corpus, optionally do quality control on results

3. Analyze results: various ways to analyze the data on quotations

Technical note: if you're working with large files in this notebook, memory or Jupyter's output data-rate limit may become a problem. Closing Jupyter Notebook and restarting it with the following command raises the output data-rate limit and may improve things:

> jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10

In [1]:
# import libraries needed
import sys
import os

import pandas as pd
import numpy as np


try:   
   from IPython.display import clear_output
except:
   !{sys.executable} -m pip install IPython.display 
   from IPython.display import clear_output

try:
   from matcher import Text, Matcher
except:
   !{sys.executable} -m pip install matcher
   from matcher import Text, Matcher

try:
    import re
except:
    !{sys.executable} -m pip install re
    import re


try:
    import json
except:
    !{sys.executable} -m pip install json
    import json

# Specify project variables

In [2]:
# ACTION: copy path to Target Corpus JSONL file here
# The following cell derives all naming variables by splitting this path on "_" and "/",
# so it has to follow the layout
#   <dataDir>/<AuthorSurname>/<Year>_<Title>/TargetCorpus/<AuthorSurname>_<Year>_<Title>_fulltext.jsonl
# with "/" as the path separator.

startData = "/Users/milan/Library/CloudStorage/GoogleDrive-mtt2126@columbia.edu/My Drive/iAnnotate/MIT/Quotable Content/Data/Price/2000_AnthologyRise/TargetCorpus/Price_2000_AnthologyRise_fulltext.jsonl"

In [4]:
# Derive the naming variables from the corpus file path.
# Splitting from the right on "_" yields, in order: everything up to and including the
# author surname, the publication year, the text title and the file-name suffix.

pathFields = startData.rsplit("_", 3)
textTitle = pathFields[-2]
publicationYear = pathFields[-3]
# The surname is the last "/"-separated component of the leading field
authorSurname = pathFields[-4].rsplit("/", 1)[-1]
# The data directory is the path with its last four "/"-separated components removed
dataDir = startData.rsplit("/", 4)[0]

print(f"Author surname: {authorSurname}\nPublication year: {publicationYear}\nText title: {textTitle}\nData directory:{dataDir}")

# Standard project name and folder locations built from the inferred parts
projectName = f"{authorSurname}_{publicationYear}_{textTitle}"
sourceDir = f"{dataDir}/{authorSurname}/{publicationYear}_{textTitle}/SourceText"
corpusDir = f"{dataDir}/{authorSurname}/{publicationYear}_{textTitle}/TargetCorpus"
resultsDir = f"{dataDir}/{authorSurname}/{publicationYear}_{textTitle}/Results"

Author surname: Price
Publication year: 2000
Text title: AnthologyRise
Data directory:/Users/milan/Library/CloudStorage/GoogleDrive-mtt2126@columbia.edu/My Drive/iAnnotate/MIT/Quotable Content/Data


In [5]:
# Specify hyperparameters for matcher algorithm (adjust if desired)
# Each value is forwarded to the Matcher call in the matching cell:
#   thresh  -> threshold
#   cut     -> cutoff
#   ngram   -> ngramSize
#   mindist -> minDistance
#   nostops -> removeStopwords (also passed to Text when each corpus article is wrapped)
# NOTE(review): the exact meaning of each setting is defined by the matcher library — see its documentation.

thresh = 2
cut = 3
ngram = 2
mindist = 3
nostops = True

# The suffix encodes the settings above so result files from different runs do not overwrite each other
hyperparSuffix = f"_t{thresh}-c{cut}-n{ngram}-m{mindist}-{'nostops' if nostops else 'stops'}"
print(f"Suffix to be appended to results: {hyperparSuffix}")

Suffix to be appended to results: _t2-c3-n2-m3-nostops


In [10]:
# Load the text you want to find quotations from.
# The encoding is passed explicitly so non-ASCII characters (e.g. curly quotes) are read
# the same way on every platform instead of depending on the system default.
# NOTE(review): assumes the files prepared in Phase 1 are UTF-8 encoded — confirm.

sourceTextPath = f"{sourceDir}/{projectName}_plaintext.txt"
with open(sourceTextPath, encoding="utf-8") as f:
    rawText = f.read()

# Use the same stopword setting as for the corpus articles and the Matcher call,
# so the source text and the articles are always processed consistently.
sourceText = Text(rawText, projectName, removeStopwords=nostops)

# Load the corpus you want to find results in (JSONL: one JSON record per line)

corpusFile = f"{corpusDir}/{projectName}_fulltext.jsonl"

with open(corpusFile, encoding="utf-8") as f:
    # Parse line by line rather than holding all raw lines in memory next to the parsed
    # records; blank lines (e.g. a trailing newline at the end of the file) are skipped.
    data = [json.loads(line) for line in f if line.strip()]

In [7]:
# Run the matcher on every article in the corpus and store the results on the article record
# under 'numMatches', 'Locations in A' (source text) and 'Locations in B' (article).
# Articles that already carry a 'numMatches' key are not matched again, so re-running
# this cell after an interruption resumes where it stopped.

matchesTally = 0

# Number the articles from 1 so the progress line ends at "N of N"
for articleNumber, article in enumerate(data, start=1):
    # wait=True delays clearing until the next output arrives, which avoids flicker
    clear_output(wait=True)
    print('\r', '%s matches made so far. Now matching article %s of %s' % (matchesTally, articleNumber, len(data)), end='')
    if 'numMatches' not in article:
        articleText = Text(article['fullText'], article['id'], removeStopwords=nostops)
        article['numMatches'], article['Locations in A'], article['Locations in B'] = Matcher(
            sourceText, articleText,
            threshold=thresh, cutoff=cut, ngramSize=ngram,
            removeStopwords=nostops, minDistance=mindist,
        ).match()
    # Count every article, including ones matched in an earlier run,
    # so the running total stays correct when the cell is resumed
    matchesTally += article['numMatches']
    # 'fullText' is deliberately left on the record: the context-extraction cell further down still needs it

 7455 matches made so far. Now matching article 1686 of 16871 total matches found.


match 1:
[32mPrice_2000_AnthologyRise[0m: (437726, 437756) Writing Women’s Literary History (Baltimore [31mJohns Hopkins University Press[0m Richard Halpern, The Poetics of Primitive Accumulation
[32mhttp://www.jstor.org/stable/461833[0m: (2479674, 2479704) discussion of a specific topic or area). $19.95 ] Johns Hopkins [31mJohns Hopkins University Press[0m Baltimore, Maryland 21218 807 f POETS 1 POETRY 1 PRINCETON


In [8]:
# Build a dataframe with one row per corpus article

df = pd.DataFrame(data)

# The publication year is the part of 'datePublished' before the first hyphen, stored as an integer

df["Year"] = df["datePublished"].str.split(pat="-", n=1).str[0].astype("int64")

# Round each year down to the first year of its decade

df["Decade"] = df["Year"] // 10 * 10

In [9]:
# Rearrange columns to be more user-friendly:
# take "Year" and "Decade" out of the frame and re-insert them at column positions 2 and 3

for targetPosition, columnName in enumerate(["Year", "Decade"], start=2):
    df.insert(targetPosition, columnName, df.pop(columnName))

In [97]:
# Create new column and fill with quoted passages from the source text:
# one list per corpus item, holding the source-text slice for each (start, end) location.

passagesInA = [
    [sourceText.text[loc[0]:loc[1]] for loc in locations]
    for locations in df["Locations in A"]
]

# Assign the whole column in one step. Writing cell by cell through
# df["quotedPassageinA"].iat[...] is chained assignment, which does not update the
# dataframe when pandas Copy-on-Write is active.
# dtype=object keeps each cell as a Python list.
df["quotedPassageinA"] = pd.Series(passagesInA, index=df.index, dtype=object)

In [107]:
# ACTION: specify size of context chunks in characters
# chunkSizeLeft: number of characters taken immediately before each quoted passage in the corpus item
# chunkSizeRight: number of characters taken immediately after each quoted passage in the corpus item

chunkSizeLeft = 250
chunkSizeRight = 750

In [128]:
# For each match, this cell extracts the quoted passage in the source text, the quoted passage in the corpus
# and context chunks left and right of the quoted passage in the corpus.
# Every new column holds one list per corpus item, with one entry per match.

# One list per new column; each gets one entry (itself a list) per corpus item
passagesInA = []
passagesInB = []
chunksLeft = []
chunksRight = []

totalItems = len(df)

# Loop over each item in the corpus
for itemNumber, (fullText, locationsInA, locationsInB) in enumerate(
        zip(df["fullText"], df["Locations in A"], df["Locations in B"]), start=1):

    # wait=True delays clearing until the next output arrives, which avoids flicker
    clear_output(wait=True)
    print(f"Now extracting quotations and context chunks for item {itemNumber} of {totalItems}")

    # Create empty lists for quotations and context chunks
    allPassagesInA = []
    allPassagesInB = []
    allChunksLeft = []
    allChunksRight = []

    # Items without matches keep empty lists and do not need to be processed
    if len(locationsInB) > 0:

        # Process the corpus item text to be in matcher format
        corpusItemText = Text(fullText, projectName)

        # Loop over each match for the given item
        # (assumes the two location lists are parallel, one entry per match)
        for locInA, locInB in zip(locationsInA, locationsInB):

            # Quoted passage in the source text
            allPassagesInA.append(sourceText.text[locInA[0]:locInA[1]])

            # Quoted passage in the corpus item
            allPassagesInB.append(corpusItemText.text[locInB[0]:locInB[1]])

            # Left context chunk. Clamp the start at 0: a negative start index would make
            # Python count from the end of the text and return an empty or wrong chunk
            # for matches closer than chunkSizeLeft characters to the beginning.
            leftStart = max(0, locInB[0] - chunkSizeLeft)
            allChunksLeft.append(corpusItemText.text[leftStart:locInB[0]])

            # Right context chunk (slicing past the end of the text is safe)
            allChunksRight.append(corpusItemText.text[locInB[1]:locInB[1] + chunkSizeRight])

    passagesInA.append(allPassagesInA)
    passagesInB.append(allPassagesInB)
    chunksLeft.append(allChunksLeft)
    chunksRight.append(allChunksRight)

# Assign whole columns in one step. Writing cell by cell through df[column].iat[...] is
# chained assignment, which does not update the dataframe when pandas Copy-on-Write is active.
# dtype=object keeps each cell as a Python list.
df = df.assign(
    quotedPassageinA=pd.Series(passagesInA, index=df.index, dtype=object),
    quotedPassageinB=pd.Series(passagesInB, index=df.index, dtype=object),
    contextChunkLeft=pd.Series(chunksLeft, index=df.index, dtype=object),
    contextChunkRight=pd.Series(chunksRight, index=df.index, dtype=object),
)

Now extracting quotations and context chunks for item 1687 of 1687


In [130]:
# Remove the corpus full-text column from the dataset

df = df.drop(columns=["fullText"])

In [131]:
# Display the results table (pandas abbreviates long or wide frames in the rendered output)
df

Unnamed: 0,creator,datePublished,Year,Decade,docSubType,docType,doi,id,identifier,isPartOf,...,Locations in A,Locations in B,issueNumber,placeOfPublication,abstract,subTitle,quotedPassageinA,quotedPassageinB,contextChunkLeft,contextChunkRight
0,"[Henry Lowood, Stephen H. Cutcliffe, Katalin H...",1996-01-01,1996,1990,research-article,article,10.2307/3107088,http://www.jstor.org/stable/3107088,"[{'name': 'doi', 'value': '10.2307/3107088'}, ...",Technology and Culture,...,"[(3973, 4017), (142736, 142758), (437706, 4377...","[(260284, 260328), (316650, 316672), (538838, ...",,,,,[Ecole des Hautes\nEtudes en Sciences Sociales...,"[Ecole des hautes etudes en sciences sociales,...","[ zur Zephyr. Motorrader, die Geschichte macht...","[, 1994. Pp. 240; illustrations; bibliography...."
1,[W. B. Worthen],2002-04-01,2002,2000,research-article,article,10.2307/1556121,http://www.jstor.org/stable/1556121,"[{'name': 'doi', 'value': '10.2307/1556121'}, ...","Studies in English Literature, 1500-1900",...,[],[],2,,,,[],[],[],[]
2,,1968-03-08,1968,1960,misc,article,10.2307/1723593,http://www.jstor.org/stable/1723593,"[{'name': 'doi', 'value': '10.2307/1723593'}, ...",Science,...,"[(1432, 1451), (544001, 544019)]","[(99594, 99613), (130022, 130040)]",3819,,,,"[th Street, New York, New York: New York]","[th Street, New York, NEW YORK, NEW YORK]",[colonies. Request complete 12-page Heavy Oxyg...,"[, 28, N.Y. A | RESEARCH PRODUCTS DEPT. i_""_""r..."
3,[Robin L. Cadwallader],1997-01-01,1997,1990,misc,article,10.2307/25679222,http://www.jstor.org/stable/25679222,"[{'name': 'doi', 'value': '10.2307/25679222'},...",Legacy,...,[],[],1,,,,[],[],[],[]
4,,1995-04-01,1995,1990,misc,article,10.2307/467848,http://www.jstor.org/stable/467848,"[{'name': 'doi', 'value': '10.2307/467848'}, {...",MELUS,...,[],[],1,,,,[],[],[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1682,"[W Maxwell Cowan, Steven E Hyman, Thomas M Jes...",2002-12-01,2002,2000,book-review,article,10.1086/374516,http://www.jstor.org/stable/10.1086/374516,"[{'name': 'doi', 'value': '10.1086/374516'}, {...",The Quarterly Review of Biology,...,"[(1425, 1455)]","[(301207, 301236)]",4,,,,"[West 20th Street, New York, NY]","[West 57th Street, New York NY]","[iversity Press, 41 William Street, Prince- Un...","[ University Press of Mississippi, 3825 Ridgew..."
1683,"[Nigel Rapport, Esther Hertzog, Orit Abuhav, H...",2013-04-01,2013,2010,book-review,article,10.2307/23486376,http://www.jstor.org/stable/23486376,"[{'name': 'doi', 'value': '10.2307/23486376'},...",Anthropology Today,...,"[(469022, 469059)]","[(32269, 32306)]",2,,,,[Edinburgh: Edinburgh University Press],[Edinburgh: Edinburgh University Press],[7. Existentialism and humanism. London: build...,[. - 1994. The prose and the passion: Anthropo...
1684,,1984-12-01,1984,1980,misc,article,10.2307/341966,http://www.jstor.org/stable/341966,"[{'name': 'doi', 'value': '10.2307/341966'}, {...",Hispania,...,"[(544001, 544019)]","[(741628, 741646)]",4,,,,[New York: New York],"[New York, New York]","[er company's catalogue, we can supply it to y...",[ 10003 / (212) 673-7400 610 Fifth Avenue / Ne...
1685,,1995-11-01,1995,1990,misc,article,10.2307/462924,http://www.jstor.org/stable/462924,"[{'name': 'doi', 'value': '10.2307/462924'}, {...",PMLA,...,[],[],6,,,,[],[],[],[]


In [132]:
# Save as JSONL file for analysis and visualization

# Create the results folder if it is missing, so the export cannot fail on a non-existent directory
os.makedirs(resultsDir, exist_ok=True)

# The file name carries the hyperparameter suffix, so runs with different settings are kept apart
resultsFile = f"{resultsDir}/{projectName}_results{hyperparSuffix}.jsonl"
df.to_json(path_or_buf=resultsFile, orient='records', lines=True)

# Quality control

In [None]:
# Optional: reload results file (if resuming later)

#resultsData = f"{resultsDir}/{projectName}_results{hyperparSuffix}.jsonl"
#df = pd.read_json(resultsData, lines=True)

In [133]:
# Number of corpus items that contain at least 1 match

int((df["numMatches"] >= 1).sum())

1340

In [134]:
# Check how many matches in total
# Sum the column through the Series method: calling the unbound pd.DataFrame.sum on a
# Series depends on pandas internals and is not a supported way to sum a column.

int(df["numMatches"].sum())

7456

In [135]:
# Verify that the matcher ran on every line of the dataset:
# rows with a missing 'Locations in A' value are listed here, so the table should have no rows

df[df["Locations in A"].isna()]

Unnamed: 0,creator,datePublished,Year,Decade,docSubType,docType,doi,id,identifier,isPartOf,...,Locations in A,Locations in B,issueNumber,placeOfPublication,abstract,subTitle,quotedPassageinA,quotedPassageinB,contextChunkLeft,contextChunkRight


In [136]:
# List the items without any match; spot-check a few of them for false negatives
# (easiest done by referring to the PDF)

df[df["numMatches"].eq(0)]

Unnamed: 0,creator,datePublished,Year,Decade,docSubType,docType,doi,id,identifier,isPartOf,...,Locations in A,Locations in B,issueNumber,placeOfPublication,abstract,subTitle,quotedPassageinA,quotedPassageinB,contextChunkLeft,contextChunkRight
1,[W. B. Worthen],2002-04-01,2002,2000,research-article,article,10.2307/1556121,http://www.jstor.org/stable/1556121,"[{'name': 'doi', 'value': '10.2307/1556121'}, ...","Studies in English Literature, 1500-1900",...,[],[],2,,,,[],[],[],[]
3,[Robin L. Cadwallader],1997-01-01,1997,1990,misc,article,10.2307/25679222,http://www.jstor.org/stable/25679222,"[{'name': 'doi', 'value': '10.2307/25679222'},...",Legacy,...,[],[],1,,,,[],[],[],[]
4,,1995-04-01,1995,1990,misc,article,10.2307/467848,http://www.jstor.org/stable/467848,"[{'name': 'doi', 'value': '10.2307/467848'}, {...",MELUS,...,[],[],1,,,,[],[],[],[]
6,[VeVe A. Clark],1979-09-01,1979,1970,research-article,article,10.2307/41066511,http://www.jstor.org/stable/41066511,"[{'name': 'doi', 'value': '10.2307/41066511'},...",The Black Scholar,...,[],[],1,,,,[],[],[],[]
11,,2001-12-01,2001,2000,misc,article,10.2307/359086,http://www.jstor.org/stable/359086,"[{'name': 'doi', 'value': '10.2307/359086'}, {...",College Composition and Communication,...,[],[],2,,,,[],[],[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1664,[Leah Pennywark],2018-07-01,2018,2010,research-article,article,10.5325/jmodeperistud.9.2.0220,http://www.jstor.org/stable/10.5325/jmodeperis...,"[{'name': 'doi', 'value': '10.5325/jmodeperist...",The Journal of Modern Periodical Studies,...,[],[],2,,ABSTRACT From its origins in the 1940s through...,,[],[],[],[]
1665,"[Katharina J. Schreiber, Christina Conlee, Daw...",1999-12-31,1999,1990,misc,article,10.2307/23850500,http://www.jstor.org/stable/23850500,"[{'name': 'doi', 'value': '10.2307/23850500'},...",Latin American Antiquity,...,[],[],,,,,[],[],[],[]
1673,,1994-01-01,1994,1990,misc,article,10.2307/2706918,http://www.jstor.org/stable/2706918,"[{'name': 'doi', 'value': '10.2307/2706918'}, ...",International Organization,...,[],[],1,,,,[],[],[],[]
1675,,2003-05-01,2003,2000,misc,article,10.2307/3246411,http://www.jstor.org/stable/3246411,"[{'name': 'doi', 'value': '10.2307/3246411'}, ...",PAJ: A Journal of Performance and Art,...,[],[],2,,,,[],[],[],[]
