# Getting "context chunks"

This notebook serves to generate data on the immediate contexts before and after every quotation detected.

# Initial setup

In [122]:
import pandas as pd
from text_matcher.matcher import Text, Matcher
from IPython.display import clear_output

In [123]:
# ACTION: copy path to results JSONL file here (filename should end "_results_[hyperparameters].jsonl")

startData = "/Users/milan/Library/CloudStorage/GoogleDrive-mtt2126@columbia.edu/My Drive/iAnnotate/MIT/Quotable Content/Data/Woolf/1925_Dalloway/Results/Woolf_1925_Dalloway_results_t2-c3-n2-m3-nostops.jsonl"

In [3]:
# Infer naming variables from path

textTitle = startData.rsplit("_", 4)[-3]
publicationYear = startData.rsplit("_", 4)[-4]
authorSurname = startData.rsplit("_", 4)[-5]
authorSurname = authorSurname.rsplit("/", 1)[-1]
hyperparSuffix = startData.rsplit("_", 4)[-1]
hyperparSuffix = f"_{hyperparSuffix[:-6]}"
dataDir = startData.rsplit("/", 4)[0]

print(f"Author surname: {authorSurname}\nPublication year: {publicationYear}\nText title: {textTitle}\nHyperparameters suffix: {hyperparSuffix}\nData directory:{dataDir}")

projectName = f"{authorSurname}_{publicationYear}_{textTitle}"
sourceDir = f"{dataDir}/{authorSurname}/{publicationYear}_{textTitle}/SourceText"
corpusDir = f"{dataDir}/{authorSurname}/{publicationYear}_{textTitle}/TargetCorpus"
resultsDir = f"{dataDir}/{authorSurname}/{publicationYear}_{textTitle}/Results"

Author surname: Woolf
Publication year: 1925
Text title: Dalloway
Hyperparameters suffix: _t2-c3-n2-m3-nostops
Data directory:/Users/milan/Library/CloudStorage/GoogleDrive-mtt2126@columbia.edu/My Drive/iAnnotate/MIT/Quotable Content/Data


In [4]:
# Load source text

with open(f"{sourceDir}/{projectName}_plaintext.txt") as f: 
    rawText = f.read()

mm = Text(rawText, 'Middlemarch')

In [5]:
# Load in the JSONL file with the full text of JSTOR articles

corpusDF = pd.read_json(f"{corpusDir}/{projectName}_fulltext.jsonl", lines=True)
print(f"Loaded {len(corpusDF)} full-text items")

# Create new dataframe with just full text and id
fulltextDF = corpusDF[["fullText","id"]]
del corpusDF

Loaded 2495 full-text items


In [6]:
# Load in the JSONL file with the results of text matcher

resultsDF = pd.read_json(f"{startData}", lines=True)
print(f"Loaded {len(resultsDF)} results from text-matcher")

Loaded 2495 results from text-matcher


In [7]:
df = pd.merge(fulltextDF, resultsDF, on="id")

In [8]:
# Free up memory from initial separate dataframes

del fulltextDF
del resultsDF

In [9]:
# Drop items with no matches from results dataframe

print(f"Total number of items from JSTOR: {len(df)}")
df = df[df['numMatches']>=1]
print(f"Total number of items with at least one quotation detected: {len(df)}")

Total number of items from JSTOR: 2495
Total number of items with at least one quotation detected: 518


In [10]:
# Expand the dataframe so each quotation becomes its own row

df = df.explode(['Locations in A', 'Locations in B'])
print(f"Total number of quotations detected: {len(df)}")

Total number of quotations detected: 1758


In [11]:
# Replace old indices with new indices
df = df.reset_index()

# Generate context chunks

In [12]:
# Specify chunk size left and right
num_characters_before_quote = 250
num_characters_after_quote = 750

In [14]:
# Loop over each of the quotation start and end locations to produce
# left and right context chunks of specified sizes

allChunksLeft = []
allChunksRight = []
allQuotedPassagesinB = []

for item in range(len(df)):
    article_URL = df['id'].iloc[item]
    article_title = df['title'].iloc[item]
    startandEndLocations = df['Locations in B'].iloc[item]
    article_index = df[df['id'] == article_URL].index[0]
    article_text = df['fullText'].iloc[article_index]

    cleaned_article_text = Text(article_text, article_title)

    clear_output()
    print(f"Now extracting context chunks for quotation {item + 1} of {len(df)}")
    
    # Append the specified-size chunk left to complete list of left chunks
    allChunksLeft.append((cleaned_article_text.text[startandEndLocations[0]-num_characters_before_quote:startandEndLocations[0]]))
    # Append the specified-size chunk right to complete list of right chunks
    allChunksRight.append((cleaned_article_text.text[startandEndLocations[1]:startandEndLocations[1]+num_characters_after_quote]))
    # Append the quotation as it appears in the item to complete list of quoted passages
    allQuotedPassagesinB.append((cleaned_article_text.text[startandEndLocations[0]:startandEndLocations[1]]))

Now extracting context chunks for quotation 1758 of 1758


In [15]:
# Convert lists into pandas dataframes
dfLeft = pd.DataFrame(allChunksLeft, columns=['contextChunksLeft'])
dfRight = pd.DataFrame(allChunksRight, columns=['contextChunksRight'])
dfQuotesB = pd.DataFrame(allQuotedPassagesinB, columns=['quotedPassagesinB'])

In [16]:
# Merge left and right context chunks and quoted passages into main dataframe
df = pd.merge(df, dfLeft, left_index=True, right_index=True)
df = pd.merge(df, dfRight, left_index=True, right_index=True)
df = pd.merge(df, dfQuotesB, left_index=True, right_index=True)

In [17]:
# Drop full text column
df = df.drop("fullText", axis=1)

In [19]:
# Save as JSONL file
df.to_json(path_or_buf=f"{resultsDir}/{projectName}_results{hyperparSuffix}_contextChunks.jsonl", orient='records', lines=True)

# Output entire left/right contexts and quoted passages as text files (eg for word cloud)

In [22]:
# Output all left/right contexts as single text file

output_file = open(f'{resultsDir}/{projectName}-allQuotationContexts.txt', mode='w', encoding='utf-8')

for chunk in allChunksLeft:
    output_file.write(chunk)
    output_file.write('\n')

for chunk in allChunksRight:
    output_file.write(chunk)
    output_file.write('\n')
    
output_file.close()

# Output all quoted passages as single text file

output_file = open(f'{resultsDir}/{projectName}-allQuotedPassagesinB.txt', mode='w', encoding='utf-8')
for passage in allQuotedPassagesinB:
    output_file.write(passage)
    output_file.write('\n')

output_file.close()

# OPTIONAL: reimport JSONL file

In [4]:
df = pd.read_json(f"{resultsDir}/{projectName}_results{hyperparSuffix}_contextChunks.jsonl", lines=True)


# Calculate percentage change for keywords over time

In [5]:
# Merge left Chunk and right Chunk for each subset
df["fullContextChunks"] = df["contextChunksLeft"] + " " + df["contextChunksRight"]

In [7]:
# Stem all text

import nltk

stemmer = nltk.SnowballStemmer("english")
df['stemmedChunks'] = df.fullContextChunks.map(lambda x: ' '.join([stemmer.stem(y) for y in x.split(' ')]))
df.stemmedChunks.head()

0    rther comparison which illumin what meredith i...
1    h a littl squeak of the hinges" (p. 5), and th...
2     dalloway from an acquaint pass by, which prov...
3    he air. (p. 6) the leaden circl are to provid ...
4    e and make it possibl for the reader to map th...
Name: stemmedChunks, dtype: object

In [8]:
# Specify year at which the second period should start

transitionPoint = 2010

In [9]:
# Create lists to contain text chunks

firstPeriodChunks = []
secondPeriodChunks = []

# Append stemmed chunks to respective lists

for item in range(len(df)):
    currentChunk = df['stemmedChunks'].iloc[item]
    if df['Year'].iloc[item] < transitionPoint:
        firstPeriodChunks.append(currentChunk)
    if df['Year'].iloc[item] >= transitionPoint:
        secondPeriodChunks.append(currentChunk)
        
print(f"Items in first period up to (but excluding) {transitionPoint}: {len(firstPeriodChunks)}")
print(f"Items in second period starting from (and including) {transitionPoint}: {len(secondPeriodChunks)}")

Items in first period up to (but excluding) 2010: 1356
Items in second period starting from (and including) 2010: 402


In [10]:
# Convert lists into strings

firstPeriodString = ' '.join(firstPeriodChunks)
secondPeriodString = ' '.join(secondPeriodChunks)

In [92]:
tokenizer = nltk.RegexpTokenizer(r'\w+')

firstPeriodTokens = tokenizer.tokenize(firstPeriodString)
secondPeriodTokens = tokenizer.tokenize(secondPeriodString)

In [93]:
print("Total tokens in first period: " , len(firstPeriodTokens))
print("Total tokens in second period: " , len(secondPeriodTokens))

Total tokens in first period:  228723
Total tokens in second period:  66704


In [106]:
# Tally up unique tokens

from collections import Counter

firstPeriodTallies = Counter()
for word in firstPeriodTokens:
    firstPeriodTallies[word] += 1

secondPeriodTallies = Counter()
for word in secondPeriodTokens:
    secondPeriodTallies[word] += 1

In [107]:
# Sort tallies and reduce to top n tokens

firstPeriodTallies = firstPeriodTallies.most_common()[0:500]
secondPeriodTallies = secondPeriodTallies.most_common()[0:500]

In [108]:
# Create dataframe for top first period keyword  tallies

firstTalliesDF = pd.DataFrame(firstPeriodTallies, columns =['Token', 'FirstPeriodRawFreq'])
firstTalliesDF

Unnamed: 0,Token,FirstPeriodRawFreq
0,the,14077
1,of,8388
2,and,5957
3,to,5065
4,a,4902
...,...,...
495,character,56
496,art,56
497,role,56
498,people,56


In [109]:
# Create dataframe for top first period keyword  tallies

secondTalliesDF = pd.DataFrame(secondPeriodTallies, columns =['Token', 'SecondPeriodRawFreq'])

In [110]:
percentChangeDF = pd.merge(firstTalliesDF, secondTalliesDF, on="Token")

In [111]:
# Divide raw frequencies by total number of tokens for each period

percentChangeDF["FirstPeriodRelativeFreq"] = percentChangeDF["FirstPeriodRawFreq"] / len(firstPeriodTokens)
percentChangeDF["SecondPeriodRelativeFreq"] = percentChangeDF["SecondPeriodRawFreq"] / len(secondPeriodTokens)

In [112]:
percentChangeDF["PercentageChange"] = (percentChangeDF["SecondPeriodRelativeFreq"] - percentChangeDF["FirstPeriodRelativeFreq"]) / percentChangeDF["FirstPeriodRelativeFreq"] * 100

In [113]:
pd.set_option('display.max_rows', 100)

In [121]:
# Output 20 biggest increases

percentChangeDF.sort_values(by=["PercentageChange"], ascending=False)[0:20]

Unnamed: 0,Token,FirstPeriodRawFreq,SecondPeriodRawFreq,FirstPeriodRelativeFreq,SecondPeriodRelativeFreq,PercentageChange
377,space,63,49,0.000275,0.000735,166.694151
330,tree,75,54,0.000328,0.00081,146.882586
254,connect,102,73,0.000446,0.001094,145.403442
325,car,77,53,0.000337,0.000795,136.016902
324,move,77,51,0.000337,0.000765,127.110604
383,hous,62,41,0.000271,0.000615,126.751479
307,modern,82,53,0.000359,0.000795,121.625627
193,present,139,89,0.000608,0.001334,119.549861
318,kind,79,46,0.000345,0.00069,99.658913
391,dure,59,33,0.000258,0.000495,91.787319


In [119]:
# Output 20 biggest decreases

percentChangeDF.sort_values(by=["PercentageChange"])[0:20]

Unnamed: 0,Token,FirstPeriodRawFreq,SecondPeriodRawFreq,FirstPeriodRelativeFreq,SecondPeriodRelativeFreq,PercentageChange
29,p,912,41,0.003987,0.000615,-84.584878
116,walsh,229,25,0.001001,0.000375,-62.566323
115,did,230,26,0.001006,0.00039,-61.238241
157,hand,169,20,0.000739,0.0003,-59.421008
76,kilman,325,43,0.001421,0.000645,-54.632687
186,bruton,143,20,0.000625,0.0003,-52.04301
170,sir,157,22,0.000686,0.00033,-51.951372
160,william,162,23,0.000708,0.000345,-51.317734
71,some,328,49,0.001434,0.000735,-48.775209
140,power,191,29,0.000835,0.000435,-47.937791


# TBD: run tf-idf on filtered version of df compared to rest

In [None]:
# OPTIONAL: re-import JSONL file as pandas dataframe
df = pd.read_csv('my_data_with_text.csv')

In [26]:
df["fullContextChunks"] = df["contextChunksLeft"] + " " + df["contextChunksRight"]

In [54]:
from nltk.stem.snowball import SnowballStemmer

In [57]:
stemmer = SnowballStemmer("english")
df['stemmed'] = df.fullContextChunks.map(lambda x: ' '.join([stemmer.stem(y) for y in x.split(' ')]))
df.stemmed.head()

0    rther comparison which illumin what meredith i...
1    h a littl squeak of the hinges" (p. 5), and th...
2     dalloway from an acquaint pass by, which prov...
3    he air. (p. 6) the leaden circl are to provid ...
4    e and make it possibl for the reader to map th...
Name: stemmed, dtype: object

In [58]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [76]:
cvec = CountVectorizer(stop_words='english', min_df=2, max_df=0.5, ngram_range=(1,2))
cvec

In [77]:
# Calculate all the n-grams found in all documents
from itertools import islice
cvec.fit(df.stemmed)
list(islice(cvec.vocabulary_.items(), 20))

[('comparison', 6963),
 ('illumin', 15414),
 ('meredith', 19899),
 ('provid', 24771),
 ('begin', 4077),
 ('mrs', 20710),
 ('dalloway', 8154),
 ('virginia', 33543),
 ('woolf', 34783),
 ('open', 22031),
 ('novel', 21433),
 ('charact', 5663),
 ('foot', 12951),
 ('london', 18531),
 ('appar', 2855),
 ('arbitrari', 2989),
 ('sentenc', 27854),
 ('gradual', 13879),
 ('explained', 11726),
 ('fresh', 13216)]

In [78]:
# Check how many total n-grams we have
len(cvec.vocabulary_)

35472

In [81]:
cvec_counts = cvec.transform(df.stemmed)
print('sparse matrix shape:', cvec_counts.shape)
print('nonzero count:', cvec_counts.nnz)
print('sparsity: %.2f%%' % (100.0 * cvec_counts.nnz / (cvec_counts.shape[0] * cvec_counts.shape[1])))

sparse matrix shape: (1758, 35472)
nonzero count: 202159
sparsity: 0.32%


In [82]:
occ = np.asarray(cvec_counts.sum(axis=0)).ravel().tolist()
counts_df = pd.DataFrame({'term': cvec.get_feature_names(), 'occurrences': occ})
counts_df.sort_values(by='occurrences', ascending=False).head(20)

AttributeError: 'CountVectorizer' object has no attribute 'get_feature_names'

# OPTIONAL: filter results down to specific passage

In [None]:
# Define a function that can compare two intervals in the form of listed tuples, eg [x0, x1], [y0,y1]
def overlaps(range1, range2):
    try:
        if len(range1) == 2 and len(range2) == 2:
            return max(range1[0],range2[0]) < min(range1[1],range2[1])
        else:
            return False
    except TypeError:
        return False

In [None]:
# This cell specifies the start and end indices of the passage 

passageName = "romanticSide"
quoteIndex = [4375, 4407]

In [None]:
df["doesOverlap"] = df.apply(lambda x: overlaps(quoteIndex, x['Locations in A']), axis=1)


In [None]:
df = df[df["doesOverlap"]]
df = df.reset_index()