<font size=5>Run Text_Matcher Algorithm</font>

In [1]:
# Troubleshooting: if you get a message about the IOPub data rate limit, restart the
# notebook server with: jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10
import json
import os

import pandas as pd
from IPython.display import clear_output

from matcher import Text, Matcher

In [3]:
# ACTION: paste the path to the corpus data JSONL file between the quotes below.
# The file name should end in "_fulltext.jsonl"; the cells that follow derive the
# author, year, title and data directory from this path.

startData = (
    "/Users/milan/Library/CloudStorage/GoogleDrive-mtt2126@columbia.edu/My Drive/iAnnotate/MIT/Quotable Content/Data/Proust/1922_SwannsMoncrieff/Corpus/Proust_1922_SwannsMoncrieff_fulltext.jsonl"
)

In [4]:
# Infer naming variables from path.
#
# The corpus file is expected to be named
#     <AuthorSurname>_<PublicationYear>_<TextTitle>_fulltext.jsonl
# and to sit four path levels below the data directory:
#     <dataDir>/<AuthorSurname>/<PublicationYear>_<TextTitle>/Corpus/<file>

# Split once from the right: only the last three underscores are used, so any
# underscore earlier in the path stays inside the first piece.
nameParts = startData.rsplit("_", 3)
pathParts = startData.rsplit("/", 4)

# Fail early with a readable message instead of an IndexError (too few
# underscores) or silently wrong directories (an underscore taken from a folder
# name, or a path that is too shallow).
if (
    len(nameParts) != 4
    or len(pathParts) != 5
    or "/" in nameParts[1]
    or "/" in nameParts[2]
):
    raise ValueError(
        "startData must look like "
        "<dataDir>/<Author>/<Year>_<Title>/Corpus/<Author>_<Year>_<Title>_fulltext.jsonl, "
        f"got: {startData}"
    )

authorPath, publicationYear, textTitle = nameParts[:3]
# The first piece still carries the leading directories; keep the last component only.
authorSurname = authorPath.rsplit("/", 1)[-1]
dataDir = pathParts[0]

print(f"Author surname: {authorSurname}\nPublication year: {publicationYear}\nText title: {textTitle}\nData directory:{dataDir}")

# Names and folders used by the cells that load the inputs and save the results.
projectName = f"{authorSurname}_{publicationYear}_{textTitle}"
projectDir = f"{dataDir}/{authorSurname}/{publicationYear}_{textTitle}"
sourceDir = f"{projectDir}/Source"
corpusDir = f"{projectDir}/Corpus"
resultsDir = f"{projectDir}/Results"

Author surname: Proust
Publication year: 1922
Text title: SwannsMoncrieff
Data directory:/Users/milan/Library/CloudStorage/GoogleDrive-mtt2126@columbia.edu/My Drive/iAnnotate/MIT/Quotable Content/Data


In [5]:
# Hyperparameters for the matcher algorithm (adjust if desired).
# Each value is handed to Matcher(...) in the matching cell under the keyword
# named in the trailing comment.

thresh = 2       # -> threshold
cut = 3          # -> cutoff
ngram = 2        # -> ngramSize
mindist = 3      # -> minDistance
nostops = True   # -> removeStopwords (also passed to Text for each article)

# Encode the settings in a suffix so result files from different runs do not
# overwrite one another.
stopwordLabel = "nostops" if nostops else "stops"
hyperparSuffix = "_t{}-c{}-n{}-m{}-{}".format(thresh, cut, ngram, mindist, stopwordLabel)
print("Suffix to be appended to results: " + hyperparSuffix)

Suffix to be appended to results: _t2-c3-n2-m3-nostops


In [10]:
# Load the text you want to find quotations from.
# The encoding is given explicitly so the file is read the same way on every
# platform (open() otherwise falls back to a locale-dependent default).

sourceText = f"{sourceDir}/{projectName}_plaintext.txt"
with open(sourceText, encoding="utf-8") as f:
    rawText = f.read()

# Pass the stop-word setting explicitly so the source text is built with the same
# option as each article Text and the Matcher itself.
# NOTE(review): assumes Text accepts removeStopwords here exactly as it does in the
# matching cell - confirm against matcher.Text.
tx = Text(rawText, projectName, removeStopwords=nostops)

# Load the corpus you want to find results in.
# Each non-blank line of the JSONL file is parsed into one dict. Lines are read
# straight from the file handle so the raw text is not held in memory alongside
# the parsed records.

corpusFile = f"{corpusDir}/{projectName}_fulltext.jsonl"
with open(corpusFile, encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

In [11]:
# Run the matcher between the source text (tx) and every article in the corpus.
# Each processed article dict gains 'numMatches', 'Locations in A' and
# 'Locations in B' (the three values returned by Matcher.match()), and its
# 'fullText' is blanked afterwards.
totalArticles = len(data)
for i, article in enumerate(data, start=1):
    # wait=True defers clearing until the next message is printed, so the
    # progress line is replaced without flickering.
    clear_output(wait=True)
    # Count from 1 so the last message reads "N of N" rather than "N-1 of N".
    print('\r', 'Matching article %s of %s' % (i, totalArticles), end='')
    # Articles that already carry results are skipped, so re-running this cell
    # after an interruption does not redo them.
    if 'numMatches' not in article:
        articleText = Text(article['fullText'], article['id'], removeStopwords=nostops)
        articleMatcher = Matcher(tx, articleText,
                                 threshold=thresh, cutoff=cut, ngramSize=ngram,
                                 removeStopwords=nostops, minDistance=mindist)
        article['numMatches'], article['Locations in A'], article['Locations in B'] = \
            articleMatcher.match()
        # The full text is not needed once matched (the column is dropped before
        # saving); presumably blanked here to release memory.
        article['fullText'] = ''

 Matching article 21817 of 21818

In [12]:
# Turn the per-article results into a pandas DataFrame in one chained expression:
#   1. build the frame from the list of article dicts,
#   2. drop the corpus full text column,
#   3. add Year  = the part of datePublished before the first "-", as int64,
#   4. add Decade = Year rounded down to a multiple of 10.
# Year and Decade are appended as the last two columns.

df = (
    pd.DataFrame(data)
    .drop(columns=["fullText"])
    .assign(
        Year=lambda frame: (
            frame["datePublished"].str.split(pat="-", n=1).str[0].astype("int64")
        ),
        Decade=lambda frame: frame["Year"] - (frame["Year"] % 10),
    )
)

In [13]:
# Move Year and Decade to column positions 2 and 3 so they sit near the front
# of the table, which is more user-friendly.

for position, columnName in enumerate(("Year", "Decade"), start=2):
    # pop() removes the column and returns it; insert() puts it back at the new position.
    df.insert(position, columnName, df.pop(columnName))

In [14]:
# Save as JSONL file for analysis and visualization

# Create the results folder if it is missing, so the write cannot fail on a
# missing directory after the long matching run.
os.makedirs(resultsDir, exist_ok=True)

# One JSON record per line; the hyperparameter suffix keeps runs with different
# settings in separate files.
resultsFile = f"{resultsDir}/{projectName}_results{hyperparSuffix}.jsonl"
df.to_json(path_or_buf=resultsFile, orient='records', lines=True)

# Quality control

In [None]:
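# Optional: reload a previously saved results file (if resuming later).
# Uncomment both lines below; edit the file name if the run you want to reload
# used a different project or different hyperparameters.

#resultsData = f"{resultsDir}/{projectName}_results{hyperparSuffix}.jsonl"
#df = pd.read_json(resultsData, lines=True)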

In [10]:
# Number of items with at least one match: count the rows where numMatches >= 1.

int((df["numMatches"] >= 1).sum())

4851

In [11]:
# Check how many matches in total.
# Summing the column through its own Series.sum() avoids calling the unbound
# DataFrame.sum on a Series, which only works on pandas versions that tolerate it.

df["numMatches"].sum()

16686

In [12]:
# Check that the matcher ran on every line of the dataset: rows whose
# 'Locations in A' is null were never processed, so the table shown below
# should be empty (0 rows).

df[df["Locations in A"].isna()]

Unnamed: 0,datePublished,docSubType,Year,Decade,docType,doi,id,identifier,isPartOf,issueNumber,...,url,wordCount,numMatches,Locations in A,Locations in B,creator,volumeNumber,abstract,placeOfPublication,subTitle


In [13]:
# List the items without any match. Spot-check a few of them for false
# negatives - easiest to refer to the PDF.

df[df["numMatches"].eq(0)]

Unnamed: 0,datePublished,docSubType,Year,Decade,docType,doi,id,identifier,isPartOf,issueNumber,...,url,wordCount,numMatches,Locations in A,Locations in B,creator,volumeNumber,abstract,placeOfPublication,subTitle
0,2021-06-01,misc,2021,2020,article,10.2307/48660064,http://www.jstor.org/stable/48660064,"[{'name': 'doi', 'value': '10.2307/48660064'},...",James Joyce Broadsheet,119,...,http://www.jstor.org/stable/48660064,1966,0,[],[],,,,,
1,1978-10-01,,1978,1970,document,10.2307/26281985,http://www.jstor.org/stable/26281985,"[{'name': 'doi', 'value': '10.2307/26281985'},...",Modern Fiction Studies,3,...,http://www.jstor.org/stable/26281985,5634,0,[],[],[T. O. Beachcroft],24,,,
2,2002-10-01,research-article,2002,2000,article,,http://www.jstor.org/stable/3831651,"[{'name': 'issn', 'value': '0022281X'}, {'name...",Journal of Modern Literature,1,...,http://www.jstor.org/stable/3831651,6286,0,[],[],[Mauro Piccinini],26,,,
4,1999-11-01,book-review,1999,1990,article,,http://www.jstor.org/stable/517434,"[{'name': 'issn', 'value': '00346551'}, {'name...",The Review of English Studies,200,...,http://www.jstor.org/stable/517434,789,0,[],[],[Katherine Mullin],50,,,
5,2019-04-01,,2019,2010,document,10.2307/26885292,http://www.jstor.org/stable/26885292,"[{'name': 'doi', 'value': '10.2307/26885292'},...",James Joyce Literary Supplement,1,...,http://www.jstor.org/stable/26885292,3575,0,[],[],"[Michael Patrick Gillespie, Chrissie Van Mierlo]",33,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19707,2020-01-01,research-article,2020,2020,article,,http://www.jstor.org/stable/26924868,"[{'name': 'issn', 'value': '11108673'}, {'name...",Alif: Journal of Comparative Poetics,40,...,http://www.jstor.org/stable/26924868,8903,0,[],[],"[Levi Thompson, ليڤاي تومسون]",,This article argues for a new direction in com...,,Mapping East-East Exchanges between Arabic and...
19708,2013-01-01,book-review,2013,2010,article,10.2307/26376124,http://www.jstor.org/stable/26376124,"[{'name': 'doi', 'value': '10.2307/26376124'},...",PAJ: A Journal of Performance and Art,1,...,http://www.jstor.org/stable/26376124,2369,0,[],[],"[Jason Fitzgerald, David Greenspan, David Gree...",35,,,
19709,2007-01-01,research-article,2007,2000,article,10.2307/25571016,http://www.jstor.org/stable/25571016,"[{'name': 'doi', 'value': '10.2307/25571016'},...",,2,...,http://www.jstor.org/stable/25571016,1670,0,[],[],[Sean Latham],44,,,
19710,1957-02-01,research-article,1957,1950,article,,http://www.jstor.org/stable/372469,"[{'name': 'issn', 'value': '00100994'}, {'name...",College English,5,...,http://www.jstor.org/stable/372469,4056,0,[],[],[Eugene M. Waith],18,,,
