In [1]:
from collections import defaultdict
from datetime import datetime
import os
import random

import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import ticker
sns.set(style='ticks', font_scale=1.2)

In [2]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 
warnings.filterwarnings("ignore", category=FutureWarning) 

In [3]:
from wimbd.es import es_init
from wimbd.es import count_documents_containing_phrases
from wimbd.es import get_documents_containing_phrases

## **Test WIMBD**

In [4]:
# Initialise the search client from the settings in es_config.yml.
# NOTE(review): none of the query calls visible in this notebook pass `es`
# explicitly — confirm whether the wimbd helpers pick this configuration up
# on their own, or whether `es` should be handed to them.
es = es_init(config='es_config.yml')

In [5]:
# Smoke test: ask the index for the number of documents matching the phrases below.
count_documents_containing_phrases("docs_v1.5_2023-11-02", ["terms of use", "legally binding"])  # list of word sequences

6985853

In [6]:
# Smoke test: fetch the documents matching one well-known line of verse and
# print each raw hit as returned by the helper.
sonnet_hits = get_documents_containing_phrases(
    "docs_v1.5_2023-11-02",
    ["Love is too young to know what conscience is"],
)
for hit in sonnet_hits:
    print(hit)

{'_index': 'docs_v1.5_2023-11-02', '_id': 'afeb9477761ac823d2f30debfa0a92e3', '_score': 33.279945, '_source': {'id': 'afeb9477761ac823d2f30debfa0a92e3', 'dolma_id': 'b93de148eee0e9afe996e1d71d458a93', 'archive': 's3://ai2-llm/pretraining-data/sources/olmo-mix/v1_5/documents/c4/c4-0018.json.gz', 'line': 842670, 'source': 'c4', 'title': 'William Shakespeare online - William Shakespeare Sonnet…', 'snippet': 'William Shakespeare online - William Shakespeare Sonnet 151 Love is too young to know what conscience is. Famous William Shakespeare love poem known as Shakespearean Sonnet 151 Love is too young to know what conscience is. Famous Shakespearean sonnet, or short poem, entitled William Shakespeare…', 'text': 'William Shakespeare online - William Shakespeare Sonnet 151 Love is too young to know what conscience is. Famous William Shakespeare love poem known as Shakespearean Sonnet 151 Love is too young to know what conscience is. Famous Shakespearean sonnet, or short poem, entitled William

## **Load poetry data** 

In [7]:
# Load the poem-level table and show its row count.
# NOTE(review): the path is an empty string here (presumably redacted) — supply
# the real CSV location before running.
poems_df = pd.read_csv('')
len(poems_df)

3875

In [24]:
# Peek at three randomly chosen rows (unseeded, so the rows differ on every run).
poems_df.sample(3)

Unnamed: 0,messy_author,author,additional_authors,birth_death_dates,poem_title,poem_text,form,tags,poem_source,poem_link,...,birth_year,death_year,form_tags,theme_tags,occasion_tags,collected_from,also_appears_in_poetry_foundation,poem_title_lower,author_lower,form_group
2344,By William Butler Yeats,William Butler Yeats,['By William Butler Yeats'],1865–1939,Lapis Lazuli,(for Harry Clifton)\n\nI have heard that hyste...,aubade,"['Arts & Sciences', 'Poetry & Poets', 'Mytholo...","W. B. Yeats, “Lapis Lazuli” from The Poems of ...",https://www.poetryfoundation.org/poems/43297/l...,...,,,,,,Poetry Foundation,,lapis lazuli,william butler yeats,types/modes
2106,By Henry Timrod,Henry Timrod,['By Henry Timrod'],1828–1867,Charleston,Calm as that second summer which precedes\n ...,quatrain,"['Social Commentaries', 'History & Politics', ...",Source:\n “Words for th...,https://www.poetryfoundation.org/poems/55910/c...,...,,,,,,Poetry Foundation,,charleston,henry timrod,stanza forms
1417,By Geoffrey Hill,Geoffrey Hill,['By Geoffrey Hill'],1932–2016,An Apology for the Revival of Christian Archit...,"the spiritual, Platonic old England …\nS. T. C...",sonnet,"['Religion', 'Christianity', 'Arts & Sciences'...","Geoffrey Hill, “An Apology for the Revival of ...",https://www.poetryfoundation.org/poems/48462/a...,...,,,,,,Poetry Foundation,,an apology for the revival of christian archit...,geoffrey hill,verse forms


In [25]:
# Number of rows per form_group; dropna=False keeps missing values as their own bucket.
poems_df['form_group'].value_counts(dropna=False)

form_group
types/modes     1393
verse forms     1193
meters           708
stanza forms     581
Name: count, dtype: int64

In [26]:
# List the available columns of the poem table.
poems_df.columns

Index(['messy_author', 'author', 'additional_authors', 'birth_death_dates',
       'poem_title', 'poem_text', 'form', 'tags', 'poem_source', 'poem_link',
       'author_link', 'pub_year', 'birth_year', 'death_year', 'form_tags',
       'theme_tags', 'occasion_tags', 'collected_from',
       'also_appears_in_poetry_foundation', 'poem_title_lower', 'author_lower',
       'form_group'],
      dtype='object')

In [31]:
def _poem_id_from_link(link):
    """Return the last two '/'-separated segments of a poem URL, joined by '/'."""
    segments = link.split('/')
    return '/'.join(segments[-2:])


# Derive a compact identifier for every poem from the tail of its URL.
poems_df['poem_id'] = poems_df['poem_link'].map(_poem_id_from_link)

In [32]:
# Number of distinct poem ids (a missing id, if any, is counted as one value).
poems_df['poem_id'].nunique(dropna=False)

3692

In [101]:
# Split every poem into lines and keep the lines that are long enough to be
# used as phrase queries.
#
# Outputs of this cell:
#   line_dicts        -- one record per kept line (line_id, poem_id, line_num, text)
#   poem_dropped_dict -- poem_id -> list of non-empty lines that were too short
#   line_df           -- DataFrame built from line_dicts

MIN_WORDS_PER_LINE = 4  # lines with fewer whitespace-separated words are dropped
SHUFFLE_SEED = 0        # fixed seed so re-running this cell reproduces the same poem order

line_dicts = []

poem_dropped_dict = defaultdict(list)

# Keep the first row per poem_id, then shuffle the poems deterministically.
_shuffled_poems = (
    poems_df
    .drop_duplicates(subset=['poem_id'])
    .sample(frac=1, random_state=SHUFFLE_SEED)
)

for _poem_id, _poem_text in zip(_shuffled_poems['poem_id'], _shuffled_poems['poem_text']):

    # Poems without any text contribute no lines.
    if pd.isnull(_poem_text):
        continue

    # line_num counts every physical line, including blank ones, so it keeps
    # pointing at the line's position in the original poem text.
    for _line_num, _line in enumerate(_poem_text.split('\n')):

        _stripped = _line.strip()
        if not _stripped:
            continue  # blank line: neither kept nor recorded as dropped

        if len(_stripped.split()) >= MIN_WORDS_PER_LINE:
            line_dicts.append({'line_id': _poem_id + '_' + str(_line_num),
                               'poem_id': _poem_id,
                               'line_num': _line_num,
                               'text': _stripped})
        else:
            poem_dropped_dict[_poem_id].append(_stripped)

# Explicit columns keep the frame well-formed even when no line qualifies.
line_df = pd.DataFrame(line_dicts, columns=['line_id', 'poem_id', 'line_num', 'text'])
len(line_df.index)

134459

In [102]:
# Number of distinct poems that contributed at least one kept line.
line_df['poem_id'].nunique(dropna=False)

3684

In [103]:
# How many short lines were dropped per poem (only poems with at least one
# dropped line appear in poem_dropped_dict).
dropped_list = list(map(len, poem_dropped_dict.values()))

# mean, median, standard deviation and maximum of the per-poem counts
(
    np.mean(dropped_list),
    np.median(dropped_list),
    np.std(dropped_list),
    np.max(dropped_list),
)

(7.980225988700565, 3.0, 18.31244533606086, 398)

In [104]:
# Print the dropped lines of one randomly chosen poem.
# random.sample requires a sequence: passing the dict's items view directly is
# deprecated in Python 3.9/3.10 and raises TypeError from 3.11 on, so the
# items are materialised into a list first.
_dropped_items = list(poem_dropped_dict.items())

# min(...) keeps the cell from failing when nothing was dropped at all.
for _poem, _dropped in random.sample(_dropped_items, min(1, len(_dropped_items))):
    for d in _dropped:
        print(d)

arthritic from holding


In [105]:
# Preview the first three extracted lines.
line_df.head(3)

Unnamed: 0,line_id,poem_id,line_num,text
0,poem/time-sensitive_0,poem/time-sensitive,0,When the tyrant’s voice comes on the car radio...
1,poem/time-sensitive_2,poem/time-sensitive,2,"Every morning, I stretch, put food in my throa..."
2,poem/time-sensitive_4,poem/time-sensitive,4,"At night, I sit down to watch last year’s exti..."


In [106]:
# line_df.to_csv('') # Don't re-run because it will shuffle!

In [107]:
# Sanity check: number of extracted lines.
len(line_df.index)

134459

## **Run queries!**

In [4]:
# Reload the line table from disk and show its row count.
# NOTE(review): the path is an empty string here (presumably redacted) — supply
# the real CSV location before running.
line_df = pd.read_csv('')
len(line_df.index)

134459

In [5]:
# Word count (whitespace-separated tokens) of every line.
line_lengths = [len(_text.split()) for _text in line_df['text']]

# mean, median, standard deviation, minimum and maximum line length in words
(
    np.mean(line_lengths),
    np.median(line_lengths),
    np.std(line_lengths),
    np.min(line_lengths),
    np.max(line_lengths),
)

(8.440625023241285, 7.0, 14.027444228161551, 4, 1678)

In [7]:
# Query the index once per poem line and checkpoint the hits chunk by chunk.
#
# line_df is cut into N_CHUNKS consecutive pieces; the hits for piece j are
# written to <output_directory_path>/wimbd_results.<j>.csv. A piece whose file
# already exists is skipped, so an interrupted run can simply be restarted.

output_directory_path = ''

INDEX_NAME = 'docs_v1.5_2023-11-02'
N_CHUNKS = 10000       # number of checkpoint files
MAX_QUERY_WORDS = 20   # only the first 20 words of a line are sent as the phrase

# Split row positions instead of handing the DataFrame itself to np.array_split:
# the latter depends on the deprecated DataFrame.swapaxes. The chunk boundaries
# are the same, so existing checkpoint files still line up with their chunk.
_chunk_positions = np.array_split(np.arange(len(line_df.index)), N_CHUNKS)
df_list = [line_df.iloc[_positions] for _positions in _chunk_positions]

for j, _df in enumerate(df_list):

    # os.path.join avoids writing to the filesystem root when the directory is ''.
    _output_path = os.path.join(output_directory_path, 'wimbd_results.' + str(j) + '.csv')

    if os.path.exists(_output_path):
        continue  # this chunk was finished by an earlier run

    print(datetime.now(), '\t', j, '\t', len(_df.index))

    _output_dicts = []
    for i, r in _df.iterrows():

        _query_text = ' '.join(r['text'].split()[:MAX_QUERY_WORDS])

        try:
            for d in get_documents_containing_phrases(INDEX_NAME, [_query_text]):
                # Tag every hit with the line that produced it.
                d['poem_id'] = r['poem_id']
                d['line_num'] = r['line_num']
                d['line_id'] = r['line_id']
                _output_dicts.append(d)
        except Exception as _err:
            # Best effort: report the failing query and move on. Catching
            # Exception (not a bare except) lets Ctrl-C / kernel interrupt
            # still stop the run.
            print('Query failed:', repr(_query_text), '->', repr(_err))

    _output_df = pd.DataFrame(_output_dicts)
    _output_df.to_csv(_output_path)

In [110]:
# Load previously saved query results and show the row count.
# NOTE(review): the path is an empty string here (presumably redacted) — supply
# the real CSV location before running.
results_df = pd.read_csv('')
len(results_df.index)

417

In [111]:
# Peek at three randomly chosen result rows (unseeded, so the rows differ on every run).
results_df.sample(3)

Unnamed: 0.1,Unnamed: 0,_index,_id,_score,_source,poem_id,line_num,line_id
311,311,docs_v1.5_2023-11-02,1bcec670957e904e246ed6b41ded6e78,19.774265,"{'id': '1bcec670957e904e246ed6b41ded6e78', 'do...",poem/listening-0,0,poem/listening-0_0
90,90,docs_v1.5_2023-11-02,1d37cf77548e6dd3e5139e478a514a26,27.505978,"{'id': '1d37cf77548e6dd3e5139e478a514a26', 'do...",poem/clothes,7,poem/clothes_7
335,335,docs_v1.5_2023-11-02,257d21acba102355998545379aebb989,0.699776,"{'id': '257d21acba102355998545379aebb989', 'do...",poem/listening-0,2,poem/listening-0_2


In [112]:
# Number of result rows per poem, most frequent first.
results_df['poem_id'].value_counts()

poem_id
poem/clothes                          120
44194/the-debt-56d22331136b0          110
poem/listening-0                       93
poem/lines-skull                       33
poem/all-about-you                     30
57697/enlightenment-56d23b7175cc0      18
poem/three-demons-sanki-series-i-0     10
51516/sad-wine-i                        3
Name: count, dtype: int64