# Analyze text data

This notebook serves to analyze data on two kinds of text from the results:
- quoted passages as they appear in the source text (quoted passages in A) and as they appear in the target corpus (quoted passages in B)
- context chunks to the left and right of each quoted passage, as it appears in the target corpus

# Initial setup

In [1]:
import pandas as pd
from text_matcher.matcher import Text, Matcher
from IPython.display import clear_output

In [46]:
# ACTION: copy path to results JSONL file here (filename should end "_results_[hyperparameters].jsonl")

startData = "/Users/milan/Library/CloudStorage/GoogleDrive-mtt2126@columbia.edu/My Drive/iAnnotate/MIT/Quotable Content/Data/Price/2000_AnthologyRise/Results/Price_2000_AnthologyRise_results_t2-c3-n2-m3-nostops.jsonl"

In [47]:
# Infer naming variables from path

textTitle = startData.rsplit("_", 4)[-3]
publicationYear = startData.rsplit("_", 4)[-4]
authorSurname = startData.rsplit("_", 4)[-5]
authorSurname = authorSurname.rsplit("/", 1)[-1]
hyperparSuffix = startData.rsplit("_", 4)[-1]
hyperparSuffix = f"_{hyperparSuffix[:-6]}"
dataDir = startData.rsplit("/", 4)[0]

print(f"Author surname: {authorSurname}\nPublication year: {publicationYear}\nText title: {textTitle}\nHyperparameters suffix: {hyperparSuffix}\nData directory:{dataDir}")

projectName = f"{authorSurname}_{publicationYear}_{textTitle}"
sourceDir = f"{dataDir}/{authorSurname}/{publicationYear}_{textTitle}/SourceText"
corpusDir = f"{dataDir}/{authorSurname}/{publicationYear}_{textTitle}/TargetCorpus"
resultsDir = f"{dataDir}/{authorSurname}/{publicationYear}_{textTitle}/Results"

Author surname: Price
Publication year: 2000
Text title: AnthologyRise
Hyperparameters suffix: _t2-c3-n2-m3-nostops
Data directory:/Users/milan/Library/CloudStorage/GoogleDrive-mtt2126@columbia.edu/My Drive/iAnnotate/MIT/Quotable Content/Data


In [48]:
# Load in the JSONL file with the results of text matcher

df = pd.read_json(f"{startData}", lines=True)
print(f"Loaded {len(df)} results from text-matcher")

Loaded 1687 results from text-matcher


In [58]:
# Check that the results file includes data on quoted passages and context chunks
# (Some older datasets didn't include this.)

# Check that the df includes the relevant columns

if {'quotedPassageinA', 'quotedPassageinB', 'contextChunkLeft', 'contextChunkRight'}.issubset(df.columns):
    print("All columns present.")

    # Check that all columns are completely filled
    
    if all(df[['quotedPassageinA', 'quotedPassageinB', 'contextChunkLeft', 'contextChunkRight']].notnull().all(axis=0)) == True:
        print("All columns are filled. You can proceed!")
        
    else:
        print("Some columns are missing data. Return to phase 02 to troubleshoot.")
    
else:
    print("Error: columns missing. Run the results through Notebook 'add-quoted-passages-context-chunks-to-results'.")

All columns present.
All columns are filled. You can proceed!


# Output entire left/right contexts and quoted passages as text files (eg for word cloud)

In [66]:
# Resume here: trying to output relevant df columns as string separated by newline

testString = df["contextChunkLeft"].to_string

print(testString)

<bound method Series.to_string of 0       [ zur Zephyr. Motorrader, die Geschichte macht...
1                                                      []
2       [colonies. Request complete 12-page Heavy Oxyg...
3                                                      []
4                                                      []
                              ...                        
1682    [iversity Press, 41 William Street, Prince- Un...
1683    [7. Existentialism and humanism. London: build...
1684    [er company's catalogue, we can supply it to y...
1685                                                   []
1686    [ its functions. The book can be employed eith...
Name: contextChunkLeft, Length: 1687, dtype: object>


In [60]:
# Output all left/right contexts as single text file

output_file = open(f'{resultsDir}/{projectName}-allQuotationContexts.txt', mode='w', encoding='utf-8')

for chunk in allChunksLeft:
    output_file.write(chunk)
    output_file.write('\n')

for chunk in allChunksRight:
    output_file.write(chunk)
    output_file.write('\n')
    
output_file.close()


TypeError: write() argument must be str, not list

In [None]:
# Output all quoted passages as single text file

output_file = open(f'{resultsDir}/{projectName}-allQuotedPassagesinB.txt', mode='w', encoding='utf-8')
for passage in allQuotedPassagesinB:
    output_file.write(passage)
    output_file.write('\n')

output_file.close()

# OPTIONAL: reimport JSONL file

In [4]:
df = pd.read_json(f"{resultsDir}/{projectName}_results{hyperparSuffix}_contextChunks.jsonl", lines=True)


# Calculate percentage change for keywords over time

In [18]:
# Merge left Chunk and right Chunk for each subset
df["fullContextChunks"] = df["contextChunksLeft"] + " " + df["contextChunksRight"]

In [59]:
# Stem all text

import nltk

stemmer = nltk.SnowballStemmer("english")
df['stemmedChunks'] = df.fullContextChunks.map(lambda x: ' '.join([stemmer.stem(y) for y in x.split(' ')]))
df.stemmedChunks.head()

0     defin the channel within which resourc flow. ...
1     the university, manchest mi3 9pl. o'higgins, ...
2    sequilibrium. this is particular the case in d...
3    properti and that the wealthiest among them in...
4    eyes, in my own hall. i'll find out how that d...
Name: stemmedChunks, dtype: object

In [60]:
# Specify year at which the second period should start

transitionPoint = 2010

In [61]:
# Create lists to contain text chunks

firstPeriodChunks = []
secondPeriodChunks = []

# Append stemmed chunks to respective lists

for item in range(len(df)):
    currentChunk = df['stemmedChunks'].iloc[item]
    if df['Year'].iloc[item] < transitionPoint:
        firstPeriodChunks.append(currentChunk)
    if df['Year'].iloc[item] >= transitionPoint:
        secondPeriodChunks.append(currentChunk)
        
print(f"Items in first period up to (but excluding) {transitionPoint}: {len(firstPeriodChunks)}")
print(f"Items in second period starting from (and including) {transitionPoint}: {len(secondPeriodChunks)}")

Items in first period up to (but excluding) 2010: 4893
Items in second period starting from (and including) 2010: 1455


In [62]:
# Convert lists into strings

firstPeriodString = ' '.join(firstPeriodChunks)
secondPeriodString = ' '.join(secondPeriodChunks)

In [88]:
tokenizer = nltk.RegexpTokenizer(r'\w+')

firstPeriodTokens = tokenizer.tokenize(firstPeriodString)
secondPeriodTokens = tokenizer.tokenize(secondPeriodString)

In [91]:
print("Total tokens in first period: " , len(firstPeriodTokens))
print("Total tokens in second period: " , len(secondPeriodTokens))

Total tokens in first period:  833072
Total tokens in second period:  243876


In [92]:
# Tally up unique tokens

from collections import Counter

firstPeriodTallies = Counter()
for word in firstPeriodTokens:
    firstPeriodTallies[word] += 1

secondPeriodTallies = Counter()
for word in secondPeriodTokens:
    secondPeriodTallies[word] += 1

In [93]:
# Sort tallies and reduce to top n tokens

firstPeriodTallies = firstPeriodTallies.most_common()[0:500]
secondPeriodTallies = secondPeriodTallies.most_common()[0:500]

In [96]:
firstPeriodTallies[0][0]

'the'

In [27]:
# Create dataframe for top first period keyword  tallies

firstTalliesDF = pd.DataFrame(firstPeriodTallies, columns =['Token', 'FirstPeriodRawFreq'])

In [28]:
# Create dataframe for top first period keyword  tallies

secondTalliesDF = pd.DataFrame(secondPeriodTallies, columns =['Token', 'SecondPeriodRawFreq'])

In [29]:
percentChangeDF = pd.merge(firstTalliesDF, secondTalliesDF, on="Token")

In [30]:
# Divide raw frequencies by total number of tokens for each period

percentChangeDF["FirstPeriodRelativeFreq"] = percentChangeDF["FirstPeriodRawFreq"] / len(firstPeriodTokens)
percentChangeDF["SecondPeriodRelativeFreq"] = percentChangeDF["SecondPeriodRawFreq"] / len(secondPeriodTokens)

In [31]:
# Calculuate percentage change between first and second period
percentChangeDF["PercentageChange"] = (percentChangeDF["SecondPeriodRelativeFreq"] - percentChangeDF["FirstPeriodRelativeFreq"]) / percentChangeDF["FirstPeriodRelativeFreq"] * 100

In [104]:
# Drop 1-character tokens

percentChangeDF = percentChangeDF[percentChangeDF['Token'].map(len) > 1]

In [116]:
# Output 20 biggest increases

percentChangeDF.sort_values(by=["PercentageChange"], ascending=False)[0:20]

Unnamed: 0,Token,FirstPeriodRawFreq,SecondPeriodRawFreq,FirstPeriodRelativeFreq,SecondPeriodRelativeFreq,PercentageChange
298,fog,322,194,0.000387,0.000795,105.806617
118,ch,741,442,0.000889,0.001812,103.759345
397,continu,230,130,0.000276,0.000533,93.07631
370,object,254,134,0.000305,0.000549,80.212353
309,figur,315,152,0.000378,0.000623,64.83389
106,narrat,809,386,0.000971,0.001583,62.986734
391,paper,233,109,0.00028,0.000447,59.802677
334,victorian,289,135,0.000347,0.000554,59.569322
242,bucket,392,183,0.000471,0.00075,59.469818
350,question,277,123,0.000333,0.000504,51.683666


In [106]:
# Output 20 biggest decreases

percentChangeDF.sort_values(by=["PercentageChange"])[0:20]

Unnamed: 0,Token,FirstPeriodRawFreq,SecondPeriodRawFreq,FirstPeriodRelativeFreq,SecondPeriodRelativeFreq,PercentageChange
101,mrs,825,134,0.00099,0.000549,-44.516439
225,tulkinghorn,420,71,0.000504,0.000291,-42.253917
256,american,374,68,0.000449,0.000279,-37.891537
175,letter,509,95,0.000611,0.00039,-36.244259
229,general,413,80,0.000496,0.000328,-33.831177
276,whole,341,67,0.000409,0.000275,-32.882789
222,richard,422,84,0.000507,0.000344,-32.004478
163,studi,540,108,0.000648,0.000443,-31.68069
310,societi,314,63,0.000377,0.000258,-31.463113
167,court,525,106,0.00063,0.000435,-31.03003


In [117]:
# Output just tokens and percentage changes for top 20 increases and decreases

percentChangeDF[["Token", "PercentageChange"]].sort_values(by=["PercentageChange"], ascending=False)[0:20].to_csv(f'{resultsDir}/{projectName}_biggestIncreases_{transitionPoint}.csv',index=False)

percentChangeDF[["Token", "PercentageChange"]].sort_values(by=["PercentageChange"])[0:20].to_csv(f'{resultsDir}/{projectName}_biggestDecreases_{transitionPoint}.csv',index=False)

# TBD: run tf-idf on filtered version of df compared to rest

In [None]:
# OPTIONAL: re-import JSONL file as pandas dataframe
df = pd.read_csv('my_data_with_text.csv')

In [26]:
df["fullContextChunks"] = df["contextChunksLeft"] + " " + df["contextChunksRight"]

In [54]:
from nltk.stem.snowball import SnowballStemmer

In [57]:
stemmer = SnowballStemmer("english")
df['stemmed'] = df.fullContextChunks.map(lambda x: ' '.join([stemmer.stem(y) for y in x.split(' ')]))
df.stemmed.head()

0    rther comparison which illumin what meredith i...
1    h a littl squeak of the hinges" (p. 5), and th...
2     dalloway from an acquaint pass by, which prov...
3    he air. (p. 6) the leaden circl are to provid ...
4    e and make it possibl for the reader to map th...
Name: stemmed, dtype: object

In [58]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [76]:
cvec = CountVectorizer(stop_words='english', min_df=2, max_df=0.5, ngram_range=(1,2))
cvec

In [77]:
# Calculate all the n-grams found in all documents
from itertools import islice
cvec.fit(df.stemmed)
list(islice(cvec.vocabulary_.items(), 20))

[('comparison', 6963),
 ('illumin', 15414),
 ('meredith', 19899),
 ('provid', 24771),
 ('begin', 4077),
 ('mrs', 20710),
 ('dalloway', 8154),
 ('virginia', 33543),
 ('woolf', 34783),
 ('open', 22031),
 ('novel', 21433),
 ('charact', 5663),
 ('foot', 12951),
 ('london', 18531),
 ('appar', 2855),
 ('arbitrari', 2989),
 ('sentenc', 27854),
 ('gradual', 13879),
 ('explained', 11726),
 ('fresh', 13216)]

In [78]:
# Check how many total n-grams we have
len(cvec.vocabulary_)

35472

In [81]:
cvec_counts = cvec.transform(df.stemmed)
print('sparse matrix shape:', cvec_counts.shape)
print('nonzero count:', cvec_counts.nnz)
print('sparsity: %.2f%%' % (100.0 * cvec_counts.nnz / (cvec_counts.shape[0] * cvec_counts.shape[1])))

sparse matrix shape: (1758, 35472)
nonzero count: 202159
sparsity: 0.32%


In [82]:
occ = np.asarray(cvec_counts.sum(axis=0)).ravel().tolist()
counts_df = pd.DataFrame({'term': cvec.get_feature_names(), 'occurrences': occ})
counts_df.sort_values(by='occurrences', ascending=False).head(20)

AttributeError: 'CountVectorizer' object has no attribute 'get_feature_names'

# OPTIONAL: filter results down to specific passage

In [None]:
# Define a function that can compare two intervals in the form of listed tuples, eg [x0, x1], [y0,y1]
def overlaps(range1, range2):
    try:
        if len(range1) == 2 and len(range2) == 2:
            return max(range1[0],range2[0]) < min(range1[1],range2[1])
        else:
            return False
    except TypeError:
        return False

In [None]:
# This cell specifies the start and end indices of the passage 

passageName = "romanticSide"
quoteIndex = [4375, 4407]

In [None]:
df["doesOverlap"] = df.apply(lambda x: overlaps(quoteIndex, x['Locations in A']), axis=1)


In [None]:
df = df[df["doesOverlap"]]
df = df.reset_index()