<font size=5>Run Text_Matcher Algorithm</font>

In [1]:
# Troubleshooting: if you get a message about the IOPub data rate limit, restart Jupyter
# from a terminal with: jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10
import json
import os

import pandas as pd
from IPython.display import clear_output

from matcher import Text, Matcher

In [2]:
# ACTION: copy path to corpus data JSONL file here (filename should end "_fulltext.jsonl")
# The next cell parses this path: the filename is expected to look like
# <AuthorSurname>_<Year>_<Title>_fulltext.jsonl, and the data directory is taken to be
# four path components above the file.

startData = "/Users/milan/Library/CloudStorage/GoogleDrive-mtt2126@columbia.edu/My Drive/iAnnotate/MIT/Quotable Content/Data/Woolf/1925_Dalloway/Corpus/Woolf_1925_Dalloway_fulltext.jsonl"

In [3]:
# Derive the naming variables and the project folders from the corpus path

# Split once on the last three underscores: the trailing fields are <year>, <title> and
# "fulltext.jsonl"; the remaining head of the path ends with the author surname.
nameFields = startData.rsplit("_", 3)
textTitle = nameFields[-2]
publicationYear = nameFields[-3]
authorSurname = nameFields[-4].rsplit("/", 1)[-1]

# Drop the last four path components to reach the data directory.
dataDir = startData.rsplit("/", 4)[0]

print(f"Author surname: {authorSurname}\nPublication year: {publicationYear}\nText title: {textTitle}\nData directory:{dataDir}")

# Every project folder lives under <dataDir>/<author>/<year>_<title>/
projectName = f"{authorSurname}_{publicationYear}_{textTitle}"
projectDir = f"{dataDir}/{authorSurname}/{publicationYear}_{textTitle}"
sourceDir = f"{projectDir}/Source"
corpusDir = f"{projectDir}/Corpus"
resultsDir = f"{projectDir}/Results"

Author surname: Woolf
Publication year: 1925
Text title: Dalloway
Data directory:/Users/milan/Library/CloudStorage/GoogleDrive-mtt2126@columbia.edu/My Drive/iAnnotate/MIT/Quotable Content/Data


In [5]:
# Hyperparameters for the matcher algorithm (adjust if desired).
# They are handed to Matcher in the matching cell as threshold, cutoff, ngramSize,
# minDistance and removeStopwords respectively.

thresh = 2
cut = 3
ngram = 2
mindist = 3
nostops = True

# Encode the settings in a filename suffix so results from different runs stay distinguishable.
stopwordTag = "nostops" if nostops else "stops"
hyperparSuffix = f"_t{thresh}-c{cut}-n{ngram}-m{mindist}-{stopwordTag}"
print(f"Suffix to be appended to results: {hyperparSuffix}")

Suffix to be appended to results: _t2-c3-n2-m3-nostops


In [6]:
# Load the text you want to find quotations from.
# The encoding is given explicitly so reading does not depend on the platform's default
# locale (assumes the plain-text file is UTF-8 — TODO confirm for sources prepared elsewhere).

sourceText = f"{sourceDir}/{projectName}_plaintext.txt"
with open(sourceText, encoding="utf-8") as f:
    rawText = f.read()

# Build the source Text with the same stopword setting that is applied to every corpus
# article in the matching cell, so both sides of each comparison are preprocessed alike.
tx = Text(rawText, projectName, removeStopwords=nostops)

# Load the corpus you want to find results in (JSONL: one JSON object per line)

corpusFile = f"{corpusDir}/{projectName}_fulltext.jsonl"
with open(corpusFile, encoding="utf-8") as f:
    rawProcessedData = f.readlines()
# Skip blank lines (e.g. a stray empty line at the end of the file); json.loads rejects them.
data = [json.loads(line) for line in rawProcessedData if line.strip()]

In [7]:
# Run the matcher on every corpus article that has not been processed yet.
# Articles that already carry a 'numMatches' key are skipped, so re-running this cell
# continues where an interrupted run stopped.
totalArticles = len(data)
for i, article in enumerate(data, start=1):
    # wait=True postpones clearing until new output arrives, which reduces flicker.
    clear_output(wait=True)
    # i is 1-based so the final article is reported as "N of N" rather than "N-1 of N".
    print('\r', 'Matching article %s of %s' % (i, totalArticles), end='')
    if 'numMatches' not in article:
        articleText = Text(article['fullText'], article['id'], removeStopwords=nostops)
        # match() yields three values, stored here as the match count and the two location lists.
        article['numMatches'], article['Locations in A'], article['Locations in B'] = Matcher(
            tx, articleText,
            threshold=thresh, cutoff=cut, ngramSize=ngram,
            removeStopwords=nostops, minDistance=mindist,
        ).match()
        # Blank the article body once matched; the fullText column is dropped before saving anyway.
        article['fullText'] = ''

 Matching article 2494 of 24951 total matches found.


match 1:
[32mhttp://www.jstor.org/stable/3831515[0m: (396883, 396915) Women's Studies, XII:2 (1986), 167-178. Walker, Ronald [31mLeaden Circles Dissolving in Air[0m Narrative Rhythm and Meaning in Mrs. Dalloway


In [19]:
# Convert results to a pandas dataframe and drop the (blanked) corpus full text

df = pd.DataFrame(data).drop(columns=['fullText'])

# Extract year from date published: keep the text before the first "-" (e.g. "1994-07-01" -> 1994)
# and store it as an integer. Assigning the column directly avoids a temporary frame and a concat.

df['Year'] = df['datePublished'].str.split(pat="-", n=1).str[0].astype('int64')

# Derive decade from year (e.g. 1994 -> 1990)

df['Decade'] = df['Year'] - (df['Year'] % 10)

In [20]:
# Sanity check: list every column's dtype; the derived Year and Decade columns should be int64.
df.dtypes

creator                object
datePublished          object
docSubType             object
docType                object
id                     object
identifier             object
isPartOf               object
issueNumber            object
language               object
outputFormat           object
pageCount             float64
pageEnd                object
pageStart              object
pagination             object
provider               object
publicationYear         int64
publisher              object
sourceCategory         object
tdmCategory            object
title                  object
url                    object
volumeNumber           object
wordCount               int64
numMatches              int64
Locations in A         object
Locations in B         object
doi                    object
keyphrase              object
abstract               object
placeOfPublication     object
subTitle               object
Year                    int64
Decade                  int64
dtype: obj

In [21]:
# Move the derived columns next to datePublished so the table is easier to read

for position, columnName in ((2, "Year"), (3, "Decade")):
    # pop() removes the column and returns it; insert() puts it back at the wanted position.
    colToMove = df.pop(columnName)
    df.insert(position, columnName, colToMove)

In [22]:
# Confirm the new column order: Year and Decade should now appear at positions 2 and 3.
df.dtypes

creator                object
datePublished          object
Year                    int64
Decade                  int64
docSubType             object
docType                object
id                     object
identifier             object
isPartOf               object
issueNumber            object
language               object
outputFormat           object
pageCount             float64
pageEnd                object
pageStart              object
pagination             object
provider               object
publicationYear         int64
publisher              object
sourceCategory         object
tdmCategory            object
title                  object
url                    object
volumeNumber           object
wordCount               int64
numMatches              int64
Locations in A         object
Locations in B         object
doi                    object
keyphrase              object
abstract               object
placeOfPublication     object
subTitle               object
dtype: obj

In [23]:
# Save as JSONL file for analysis and visualization

# Create the results folder first if it is missing, so the output of the matcher run
# is not lost to a missing-directory error at save time.
os.makedirs(resultsDir, exist_ok=True)
df.to_json(path_or_buf=f"{resultsDir}/{projectName}_results{hyperparSuffix}.jsonl", orient='records', lines=True)

# Quality control

In [26]:
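# Optional: reload a previously saved results file (if resuming later).
# Uncomment the two lines below and edit the filename so that its hyperparameter suffix
# matches the results file you want to load.

#resultsData = f"{resultsDir}/Woolf_1925_Dalloway_results_t3-c3-n2-m5-nostops.jsonl"
#df = pd.read_json(resultsData, lines=True)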

In [27]:
# Count the items that include at least 1 match

df[df["numMatches"].ge(1)].shape[0]

518

In [28]:
# Check how many matches in total
# Sum the column through its own Series method; calling DataFrame.sum unbound on a Series
# is not a supported use and is liable to break across pandas versions.

df["numMatches"].sum()

1758

In [None]:
# Check that matcher ran on all lines in the dataset - the displayed frame should have 0 rows

df[df['Locations in A'].isna()]

In [14]:
# List the items without any match; spot-check a few for false negatives - easiest to refer to PDF

df[df["numMatches"].eq(0)]

Unnamed: 0,creator,datePublished,docSubType,docType,id,identifier,isPartOf,issueNumber,language,outputFormat,...,volumeNumber,wordCount,numMatches,Locations in A,Locations in B,doi,keyphrase,abstract,placeOfPublication,subTitle
0,[Ellen Tremper],1994-07-01,research-article,article,http://www.jstor.org/stable/3831453,"[{'name': 'issn', 'value': '0022281X'}, {'name...",Journal of Modern Literature,1,[eng],"[unigram, bigram, trigram]",...,19,5607,0,[],[],,,,,
1,"[Beth Rigel Daugherty, Arnold Weinstein]",2007-01-01,,document,http://www.jstor.org/stable/24907105,"[{'name': 'doi', 'value': '10.2307/24907105'},...",Woolf Studies Annual,,[eng],"[unigrams, bigrams, trigrams]",...,13,3021,0,[],[],10.2307/24907105,"[weinstein, readers, moderns, woolf faulkner, ...",,,
2,[Christine MacLeod],1997-01-01,book-review,article,http://www.jstor.org/stable/3509206,"[{'name': 'issn', 'value': '03062473'}, {'name...",The Yearbook of English Studies,,[eng],"[unigram, bigram, trigram]",...,27,1210,0,[],[],,,,,
4,[Frank Baldanza],1958-07-01,research-article,article,http://www.jstor.org/stable/41395513,"[{'name': 'doi', 'value': '10.2307/41395513'},...",,2,[eng],"[unigrams, bigrams, trigrams]",...,12,7337,0,[],[],10.2307/41395513,"[mccullers, miss mccullers, carson mccullers, ...",,,
5,"[Jeannette H. Foster, David Daiches]",1940-07-01,,document,http://www.jstor.org/stable/4302768,"[{'name': 'doi', 'value': '10.2307/4302768'}, ...","The Library Quarterly: Information, Community,...",3,[eng],"[unigrams, bigrams, trigrams]",...,10,1652,0,[],[],10.2307/4302768,"[daiches, bourgeois intellectual, civilization...",,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2490,"[Mia Carter, ﻣﻴﺎ ﻛﺎﺭﺗﺭ]",2007-01-01,,document,http://www.jstor.org/stable/30197973,"[{'name': 'doi', 'value': '10.2307/30197973'},...",Alif: Journal of Comparative Poetics,27,[eng],"[unigrams, bigrams, trigrams]",...,,11192,0,[],[],10.2307/30197973,"[woolfs, passionate apprentice, virginia woolf...","This article argues that Woolf's juvenilia, yo...",,
2491,"[Shyam M. Asnani, Mulk Raj Anand]",1983-03-01,,document,http://www.jstor.org/stable/23331532,"[{'name': 'doi', 'value': '10.2307/23331532'},...",Indian Literature,2,[eng],"[unigrams, bigrams, trigrams]",...,26,1706,0,[],[],10.2307/23331532,"[indian, mulk raj, raj anand, dobree, british ...",,,
2492,[Susan Stanford Friedman],2013-04-01,,document,http://www.jstor.org/stable/43653363,"[{'name': 'doi', 'value': '10.2307/43653363'},...",Tulsa Studies in Women's Literature,1,[eng],"[unigrams, bigrams, trigrams]",...,32,13023,0,[],[],10.2307/43653363,"[cosmopolitanism, three guineas, persepolis, w...",This comparative essay argues that Virginia Wo...,,
2493,[Judith Yaross Lee],2009-01-01,,document,http://www.jstor.org/stable/42573565,"[{'name': 'doi', 'value': '10.2307/42573565'},...",Studies in American Humor,19,[eng],"[unigrams, bigrams, trigrams]",...,,13888,0,[],[],10.2307/42573565,"[keatons, dialect, vernacular, american, socia...",,,
