<font size=5>Run Text_Matcher Algorithm</font>

In [4]:
# Troubleshooting: if you get a message about the IOPub data rate limit, run
# !jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10
import json
import os

import pandas as pd
from IPython.display import clear_output

from matcher import Text, Matcher

In [1]:
# ACTION: put the full path to the corpus JSONL file here
# (the filename should end in "_fulltext.jsonl").
# The path is written as adjacent string pieces, which Python joins into one
# string; a single quoted path works just as well.
startData = (
    "/Users/milan/Library/CloudStorage/GoogleDrive-mtt2126@columbia.edu/My Drive"
    "/iAnnotate/MIT/Quotable Content/Data"
    "/Woolf/1925_Dalloway/Corpus"
    "/Woolf_1925_Dalloway_fulltext.jsonl"
)

In [2]:
# Infer naming variables from path.
#
# The corpus filename is expected to look like
#   <authorSurname>_<publicationYear>_<textTitle>_fulltext.jsonl
# and the data directory is taken to be four path components above it:
#   <dataDir>/<author folder>/<text folder>/Corpus/<filename>

# Split only the filename, so that underscores in directory names can never be
# mistaken for the author / year / title separators.
corpusFileName = startData.rsplit("/", 1)[-1]
nameParts = corpusFileName.rsplit("_", 3)
if len(nameParts) != 4:
    raise ValueError(
        "Expected a filename like <Author>_<Year>_<Title>_fulltext.jsonl, "
        f"got: {corpusFileName!r}"
    )
authorSurname, publicationYear, textTitle = nameParts[:3]

# Drop the last four path components (author folder, text folder, Corpus, filename).
dataDir = startData.rsplit("/", 4)[0]

print(f"Author surname: {authorSurname}\nPublication year: {publicationYear}\nText title: {textTitle}\nData directory:{dataDir}")

Author surname: Woolf
Publication year: 1925
Text title: Dalloway
Data directory:/Users/milan/Library/CloudStorage/GoogleDrive-mtt2126@columbia.edu/My Drive/iAnnotate/MIT/Quotable Content/Data


In [6]:
# Derive the project name and the Source / Corpus / Results folders for this text
projectName = f"{authorSurname}_{publicationYear}_{textTitle}"

# All three folders live under <dataDir>/<author>/<year>_<title>
textDir = f"{dataDir}/{authorSurname}/{publicationYear}_{textTitle}"
sourceDir, corpusDir, resultsDir = (
    f"{textDir}/{subfolder}" for subfolder in ("Source", "Corpus", "Results")
)

In [7]:
# Load the corpus: one JSON record per line (JSONL).
corpusFile = f"{corpusDir}/{projectName}_fulltext.jsonl"
# Read explicitly as UTF-8 so the result does not depend on the platform's
# default encoding (the corpus metadata contains non-ASCII characters).
with open(corpusFile, encoding="utf-8") as f:
    rawProcessedData = f.readlines()
# Skip blank lines (e.g. an extra empty line at the end of the file), which
# json.loads would reject.
data = [json.loads(line) for line in rawProcessedData if line.strip()]

# Load the text you want to find quotations from.
# To use a different text, point sourceText at that plain-text file.
sourceText = f"{sourceDir}/{projectName}_plaintext.txt"
# NOTE(review): assumes the plain-text source is saved as UTF-8 - confirm.
with open(sourceText, encoding="utf-8") as f:
    rawText = f.read()

# Wrap the source text in the matcher's Text object, passing the project name
# as its second argument.
tx = Text(rawText, projectName)

In [8]:
# Run the matcher on every corpus article against the source text (tx).
# Articles that already have a 'numMatches' key are skipped, so re-running this
# cell in the same session continues where it left off instead of starting over.
totalArticles = len(data)
for articleNum, article in enumerate(data, start=1):
    # wait=True defers clearing until the next message is printed (no flicker).
    clear_output(wait=True)
    # Count from 1 so the final message reads "N of N" rather than "N-1 of N".
    print('\r', 'Matching article %s of %s' % (articleNum, totalArticles), end='')
    if 'numMatches' not in article:
        articleText = Text(article['fullText'], article['id'])
        # match() is unpacked into three values: the match count and the
        # match locations in each of the two texts.
        article['numMatches'], article['Locations in A'], article['Locations in B'] = \
            Matcher(tx, articleText).match()
        # Blank the article text once it has been matched
        # (presumably to keep memory use and the saved results small).
        article['fullText'] = ''

 Matching article 2494 of 2495

In [9]:
# Build the results table (one row per article) and leave out the 'fullText'
# column so article text is not written to the results file.
df = pd.DataFrame(data).drop(columns=['fullText'])

# Make sure the Results folder exists, so the save cannot fail on a missing
# directory after the long matching run.
os.makedirs(resultsDir, exist_ok=True)
df.to_json(path_or_buf=f"{resultsDir}/{projectName}_results.jsonl", orient='records', lines=True)

# Quality control

In [14]:
# Optional: when resuming in a later session, read the saved results back in
# instead of re-running the matcher.
resultsData = f"{resultsDir}/{projectName}_results.jsonl"
df = pd.read_json(path_or_buf=resultsData, lines=True)

In [10]:
# Sanity check: every row should have been processed by the matcher.
# Rows with no 'Locations in A' value are listed here - expect 0 rows.
missingLocations = df['Locations in A'].isna()
df[missingLocations]

Unnamed: 0,creator,datePublished,docSubType,docType,id,identifier,isPartOf,issueNumber,language,outputFormat,...,volumeNumber,wordCount,numMatches,Locations in A,Locations in B,doi,keyphrase,abstract,placeOfPublication,subTitle


In [17]:
# Show the items in which at least one match was found
hasMatch = df["numMatches"].ge(1)
df[hasMatch]

Unnamed: 0,creator,datePublished,docSubType,docType,id,identifier,isPartOf,language,outputFormat,pageCount,...,wordCount,numMatches,Locations in A,Locations in B,issueNumber,abstract,subTitle,keyphrase,collection,hasPartTitle
2,[Ian Finseth],1999-04-01,research-article,article,http://www.jstor.org/stable/27746772,"[{'name': 'issn', 'value': '00029823'}, {'name...","American Literary Realism, 1870-1910",[eng],"[unigram, bigram, trigram]",20.0,...,10192,1,"[[113193, 113396]]","[[52257, 52460]]",3,,,,,
15,[Gunther Teubner],1989-01-01,research-article,article,http://www.jstor.org/stable/3053760,"[{'name': 'issn', 'value': '00239216'}, {'name...",Law & Society Review,[eng],"[unigram, bigram, trigram]",31.0,...,14124,1,"[[121523, 121709]]","[[23103, 23288]]",5,,,,,
38,[Karen Elizabeth Bishop],2011-10-01,book-review,article,http://www.jstor.org/stable/41238302,"[{'name': 'issn', 'value': '02613050'}, {'name...",Bulletin of Latin American Research,[eng],"[unigram, bigram, trigram]",3.0,...,865,1,"[[333653, 333937]]","[[206, 489]]",4,,,,,
56,[Heike Schaefer],2015-01-01,research-article,article,http://www.jstor.org/stable/44071899,"[{'name': 'issn', 'value': '03402827'}, {'name...",Amerikastudien / American Studies,[eng],"[unigram, bigram, trigram]",18.0,...,9602,1,"[[52485, 52730]]","[[10716, 10960]]",1,While the study of networks has proliferated i...,,,,
63,[Henk-Jan Kooij],2015-11-01,research-article,article,http://www.jstor.org/stable/26098720,"[{'name': 'issn', 'value': '14730952'}, {'name...",Planning Theory,[eng],"[unigram, bigram, trigram]",21.0,...,10661,1,"[[106799, 106888]]","[[11698, 11788]]",4,AbstractA central question within planning the...,The innovation campus in the Netherlands,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10332,[Colin Pedley],1990-04-01,research-article,article,http://www.jstor.org/stable/24042557,"[{'name': 'issn', 'value': '00438006'}, {'name...",The Wordsworth Circle,[eng],"[unigram, bigram, trigram]",7.0,...,6019,1,"[[52485, 52730]]","[[30417, 30663]]",2,,,,,
10336,[Tim Murphy],2003-01-01,research-article,article,http://www.jstor.org/stable/23550011,"[{'name': 'issn', 'value': '09433058'}, {'name...",Method & Theory in the Study of Religion,[eng],"[unigram, bigram, trigram]",20.0,...,8693,1,"[[52485, 52730]]","[[46238, 46484]]",1,This work is a synopsis of an argument for a s...,,,,
10378,[Samuel Hideo Yamashita],1992-12-01,book-review,article,http://www.jstor.org/stable/2719183,"[{'name': 'issn', 'value': '00730548'}, {'name...",Harvard Journal of Asiatic Studies,[eng],"[unigram, bigram, trigram]",14.0,...,4781,1,"[[89391, 89609]]","[[8971, 9195]]",2,,,,,
10406,,1976-11-01,misc,article,http://www.jstor.org/stable/3129679,"[{'name': 'issn', 'value': '0065972X'}, {'name...",Proceedings and Addresses of the American Phil...,[eng],"[unigram, bigram, trigram]",49.0,...,12039,1,"[[887, 1029]]","[[52708, 52849]]",2,,,,,


In [13]:
# List the items with no matches; spot-check a few of them for false negatives
# (easiest to compare against the PDF).
noMatch = df["numMatches"].eq(0)
df[noMatch]

Unnamed: 0,creator,datePublished,docSubType,docType,id,identifier,isPartOf,issueNumber,language,outputFormat,...,volumeNumber,wordCount,numMatches,Locations in A,Locations in B,doi,keyphrase,abstract,placeOfPublication,subTitle
0,[Ellen Tremper],1994-07-01,research-article,article,http://www.jstor.org/stable/3831453,"[{'name': 'issn', 'value': '0022281X'}, {'name...",Journal of Modern Literature,1,[eng],"[unigram, bigram, trigram]",...,19,5607,0,[],[],,,,,
1,"[Beth Rigel Daugherty, Arnold Weinstein]",2007-01-01,,document,http://www.jstor.org/stable/24907105,"[{'name': 'doi', 'value': '10.2307/24907105'},...",Woolf Studies Annual,,[eng],"[unigrams, bigrams, trigrams]",...,13,3021,0,[],[],10.2307/24907105,"[weinstein, readers, moderns, woolf faulkner, ...",,,
2,[Christine MacLeod],1997-01-01,book-review,article,http://www.jstor.org/stable/3509206,"[{'name': 'issn', 'value': '03062473'}, {'name...",The Yearbook of English Studies,,[eng],"[unigram, bigram, trigram]",...,27,1210,0,[],[],,,,,
4,[Frank Baldanza],1958-07-01,research-article,article,http://www.jstor.org/stable/41395513,"[{'name': 'doi', 'value': '10.2307/41395513'},...",,2,[eng],"[unigrams, bigrams, trigrams]",...,12,7337,0,[],[],10.2307/41395513,"[mccullers, miss mccullers, carson mccullers, ...",,,
5,"[Jeannette H. Foster, David Daiches]",1940-07-01,,document,http://www.jstor.org/stable/4302768,"[{'name': 'doi', 'value': '10.2307/4302768'}, ...","The Library Quarterly: Information, Community,...",3,[eng],"[unigrams, bigrams, trigrams]",...,10,1652,0,[],[],10.2307/4302768,"[daiches, bourgeois intellectual, civilization...",,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2490,"[Mia Carter, ﻣﻴﺎ ﻛﺎﺭﺗﺭ]",2007-01-01,,document,http://www.jstor.org/stable/30197973,"[{'name': 'doi', 'value': '10.2307/30197973'},...",Alif: Journal of Comparative Poetics,27,[eng],"[unigrams, bigrams, trigrams]",...,,11192,0,[],[],10.2307/30197973,"[woolfs, passionate apprentice, virginia woolf...","This article argues that Woolf's juvenilia, yo...",,
2491,"[Shyam M. Asnani, Mulk Raj Anand]",1983-03-01,,document,http://www.jstor.org/stable/23331532,"[{'name': 'doi', 'value': '10.2307/23331532'},...",Indian Literature,2,[eng],"[unigrams, bigrams, trigrams]",...,26,1706,0,[],[],10.2307/23331532,"[indian, mulk raj, raj anand, dobree, british ...",,,
2492,[Susan Stanford Friedman],2013-04-01,,document,http://www.jstor.org/stable/43653363,"[{'name': 'doi', 'value': '10.2307/43653363'},...",Tulsa Studies in Women's Literature,1,[eng],"[unigrams, bigrams, trigrams]",...,32,13023,0,[],[],10.2307/43653363,"[cosmopolitanism, three guineas, persepolis, w...",This comparative essay argues that Virginia Wo...,,
2493,[Judith Yaross Lee],2009-01-01,,document,http://www.jstor.org/stable/42573565,"[{'name': 'doi', 'value': '10.2307/42573565'},...",Studies in American Humor,19,[eng],"[unigrams, bigrams, trigrams]",...,,13888,0,[],[],10.2307/42573565,"[keatons, dialect, vernacular, american, socia...",,,
