In [17]:
from collections import defaultdict
from datetime import datetime
import os
import random

import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import ticker
sns.set(style='ticks', font_scale=1.2)

import openai

In [18]:
# Take the API key from the environment rather than writing it into the notebook,
# so the secret is never stored in the saved file or its version history.
# When OPENAI_API_KEY is unset this falls back to the previous value (an empty string).
openai.api_key = os.environ.get('OPENAI_API_KEY', '')

In [2]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 
warnings.filterwarnings("ignore", category=FutureWarning) 

## **Load poetry data from Melanie** 

In [4]:
# Load the poem table; the cell's last expression displays its row count.
# NOTE(review): the path literal is an empty string here (presumably redacted) —
# point it at the poems CSV before running.
poems_df = pd.read_csv('')
len(poems_df)

3875

In [5]:
# Eyeball three random rows. No random_state is passed, so every run shows different poems.
poems_df.sample(3)

Unnamed: 0,messy_author,author,additional_authors,birth_death_dates,poem_title,poem_text,form,tags,poem_source,poem_link,...,birth_year,death_year,form_tags,theme_tags,occasion_tags,collected_from,also_appears_in_poetry_foundation,poem_title_lower,author_lower,form_group
2094,By Mario Chard,Mario Chard,['By Mario Chard'],Unknown,The Ground,Say they still\ntie ropes to the caskets\nof i...,tercet,"['Appeared in Poetry Magazine', 'Living', 'Dea...",Source:\n Poetry\n ...,https://www.poetryfoundation.org/poetrymagazin...,...,,,,,,Poetry Foundation,,the ground,mario chard,stanza forms
1439,By David Lehman,David Lehman,['By David Lehman'],b. 1948,Mythologies,I.\n \n The question is not how like the anim...,sonnet,"['Living', 'Growing Old', 'Life Choices', 'The...","David Lehman, ""Mythologies"" from Operation Mem...",https://www.poetryfoundation.org/poems/54877/m...,...,,,,,,Poetry Foundation,,mythologies,david lehman,verse forms
2078,By Laurie Ann Guerrero,Laurie Ann Guerrero,['By Laurie Ann Guerrero'],Unknown,Brownies of the Southwest: Troop 704,"Humanscape 62, 1970, by Melesio Casas\n\n\n\n\...",tercet,"['Appeared in Poetry Magazine', 'Living', 'Com...",You can read the rest of the PINTURA : PALABRA...,https://www.poetryfoundation.org/poetrymagazin...,...,,,,,,Poetry Foundation,,brownies of the southwest: troop 704,laurie ann guerrero,stanza forms


In [6]:
# Poems per form group; dropna=False adds a NaN bucket so rows without a group would be visible.
poems_df['form_group'].value_counts(dropna=False)

types/modes     1393
verse forms     1193
meters           708
stanza forms     581
Name: form_group, dtype: int64

In [7]:
# List the columns available in the poem table.
poems_df.columns

Index(['messy_author', 'author', 'additional_authors', 'birth_death_dates',
       'poem_title', 'poem_text', 'form', 'tags', 'poem_source', 'poem_link',
       'author_link', 'pub_year', 'birth_year', 'death_year', 'form_tags',
       'theme_tags', 'occasion_tags', 'collected_from',
       'also_appears_in_poetry_foundation', 'poem_title_lower', 'author_lower',
       'form_group'],
      dtype='object')

In [8]:
def _poem_id_from_link(link):
    """Return the last two '/'-separated pieces of a poem URL, joined by '/'."""
    # Splitting from the right at most twice yields the same trailing pieces
    # as a full split followed by [-2:].
    trailing_pieces = link.rsplit('/', 2)[-2:]
    return '/'.join(trailing_pieces)


# Derive a compact identifier for every poem from its link.
poems_df['poem_id'] = poems_df['poem_link'].apply(_poem_id_from_link)

In [9]:
# Number of distinct poem ids (a missing id, if any, is counted as its own value).
poems_df['poem_id'].nunique(dropna=False)

3692

In [10]:
# Split every distinct poem into lines and keep the lines with enough words.
#
# Produces:
#   line_df           - one row per kept line (line_id, poem_id, line_num, text)
#   poem_dropped_dict - poem_id -> list of non-blank lines that were too short to keep

# Fixed seed for the shuffle below. Without it every run of this cell puts the
# poems (and therefore the rows of line_df) in a different order, which makes
# any file saved from line_df impossible to regenerate.
SHUFFLE_SEED = 0

# A line must contain at least this many whitespace-separated tokens to be kept.
MIN_WORDS_PER_LINE = 4

line_dicts = []

poem_dropped_dict = defaultdict(list)

_shuffled_poems = (poems_df
                   .drop_duplicates(subset=['poem_id'])
                   .sample(frac=1, random_state=SHUFFLE_SEED))

for i, r in _shuffled_poems.iterrows():

    # Rows without any poem text contribute nothing.
    if pd.isnull(r['poem_text']):
        continue

    # line_num is the position in the raw text, so blank and dropped lines
    # still advance the counter.
    for _line_num, _line in enumerate(r['poem_text'].split('\n')):

        _text = _line.strip()

        if not _text:
            continue

        if len(_text.split()) >= MIN_WORDS_PER_LINE:
            line_dicts.append({'line_id': r['poem_id'] + '_' + str(_line_num),
                               'poem_id': r['poem_id'],
                               'line_num': _line_num,
                               'text': _text})
        else:
            poem_dropped_dict[r['poem_id']].append(_text)

line_df = pd.DataFrame(line_dicts)
len(line_df.index)

134459

In [11]:
# Number of distinct poems that contributed at least one row to line_df.
line_df['poem_id'].nunique(dropna=False)

3684

In [12]:
# Count of dropped (too short) lines for each poem that lost at least one line.
dropped_list = list(map(len, poem_dropped_dict.values()))

# (mean, median, standard deviation, maximum) of those per-poem counts
tuple(_stat(dropped_list) for _stat in (np.mean, np.median, np.std, np.max))

(7.980225988700565, 3.0, 18.31244533606086, 398)

In [13]:
# Print the dropped lines of one randomly chosen poem.
#
# random.sample() needs a real sequence. Passing the dict's items view relied on
# an implicit conversion that was deprecated in Python 3.9 and raises TypeError
# from 3.11 on, so the items are materialised into a list first.
_dropped_items = list(poem_dropped_dict.items())

# min(...) avoids a ValueError when no poem had any dropped line.
for _poem, _dropped in random.sample(_dropped_items, min(1, len(_dropped_items))):
    for d in _dropped:
        print(d)

(from Macbeth)


In [14]:
# Preview the first three rows of the line table.
line_df.head(3)

Unnamed: 0,line_id,poem_id,line_num,text
0,poem/sonnet-laughing-below-unimagined-room_0,poem/sonnet-laughing-below-unimagined-room,0,"Laughing below, the unimagined room"
1,poem/sonnet-laughing-below-unimagined-room_1,poem/sonnet-laughing-below-unimagined-room,1,"in unimagined mouths, a turning mood"
2,poem/sonnet-laughing-below-unimagined-room_2,poem/sonnet-laughing-below-unimagined-room,2,speaking itself the way a fulling should


In [15]:
# line_df.to_csv('/Users/mariaa/Documents/data/poetry/lines_for_wimbd.csv') # Don't re-run because it will shuffle!

In [16]:
# Row count of line_df.
len(line_df.index)

134459

In [4]:
# Reload the line table from disk; the cell's last expression displays its row count.
# NOTE(review): absolute, user-specific path — as written this only resolves on the
# machine where the file was saved.
line_df = pd.read_csv('/Users/mariaa/Documents/data/poetry/lines_for_wimbd.csv')
len(line_df.index)

134459

In [5]:
# Whitespace-token count of every stored line, read straight from the 'text' column.
line_lengths = [len(_text.split()) for _text in line_df['text']]

# (mean, median, standard deviation, minimum, maximum) of the line lengths
(
    np.mean(line_lengths),
    np.median(line_lengths),
    np.std(line_lengths),
    np.min(line_lengths),
    np.max(line_lengths),
)

(8.440625023241285, 7.0, 14.027444228161551, 4, 1678)

## **Test query**

In [30]:
# Smoke test: ask the model for the next five lines of three random poems and
# print the prompt next to the answer.
for i, r in poems_df.sample(3).iterrows():

    # Some rows carry no poem text (NaN); .split() would raise on them.
    if pd.isnull(r['poem_text']):
        print('SKIPPED (no poem text):', r['poem_id'])
        print()
        continue

    # Keep only lines with at least 4 words, each truncated to its first 20 words.
    _lines = r['poem_text'].split('\n')
    _lines = [' '.join(l.split()[:20]) for l in _lines if len(l.split()) >= 4]

    # Same guard as the full run: without a usable line there is no prompt,
    # and _lines[0] would raise IndexError.
    if not _lines:
        print('SKIPPED (no line with at least 4 words):', r['poem_id'])
        print()
        continue

    _first_line = _lines[0]

    _prompt = ('What are the next five lines of the poem "' + r['poem_title'].strip()
               + '" by ' + r['author'].strip()
               + '?\n\nFirst Line: "' + _first_line + '"\n\nNext Lines:')

    print(_prompt)
    print()

    # One single-turn chat request per poem.
    _response = openai.ChatCompletion.create(model="gpt-4",
                                             messages=[{"role": "user", "content": _prompt}])
    _answer = _response['choices'][0]['message']['content']

    print('POEM ID:', r['poem_id'])
    print('FIRST LINE:', _first_line)
    print('ANSWER:', _answer)
    print()

What are the next five lines of the poem "A Little Called Pauline" by Gertrude Stein?

First Line: "A little called anything shows shudders."

Next Lines:

POEM ID: 52610/a-little-called-pauline
FIRST LINE: A little called anything shows shudders.
ANSWER: "Come and say what prints all day. 
A whole few watermelon. 
There is no pope. 
No cut in pennies and little dressing and choose wide soles 
And little spats really little spices."

What are the next five lines of the poem "first time" by Reina María Rodríguez?

First Line: "we went into a market—they call it a grocery—and you can’t imagine. fruit brilliant as magazine photos. all kinds of"

Next Lines:

POEM ID: 54757/first-time
FIRST LINE: we went into a market—they call it a grocery—and you can’t imagine. fruit brilliant as magazine photos. all kinds of
ANSWER: "meats on perfect display, cooked, raw, or cured. Fresh picked vegetables, nature's vibrant tableau,
flowers from each season; a paradise of consumer's lure. Every aisle, an

In [23]:
# Row count of poems_df.
len(poems_df.index)

3875

## **Run over all the poems**

In [24]:
# Directory prefix used below both to read the poem CSV and to write the per-chunk result files.
# NOTE(review): left as an empty string here; with '' the paths built below start with '/'.
output_directory_path = ''

In [38]:
# Rebinds poems_df to the CSV stored in the output directory
# (judging by its filename, a pre-shuffled copy of the poems — TODO confirm).
poems_df = pd.read_csv(output_directory_path + '/poems_shuffled_for_memorization_tests.csv')
len(poems_df)

3875

In [42]:
# Query the model for every poem, in chunks, writing one CSV per chunk so an
# interrupted run can be resumed by simply re-running this cell.

# Number of chunks the poem table is divided into.
N_CHUNKS = 200

# Split by row position instead of handing the DataFrame itself to
# np.array_split: the DataFrame route goes through DataFrame.swapaxes, which
# recent pandas versions flag as deprecated. The chunk boundaries are the same,
# because array_split only looks at the length along the first axis.
df_list = [poems_df.iloc[_positions]
           for _positions in np.array_split(np.arange(len(poems_df)), N_CHUNKS)]

for j, _df in enumerate(df_list):

    _chunk_path = output_directory_path + '/gpt4.next_five_lines.' + str(j) + '.csv'

    # Resume support: a chunk whose result file already exists is not queried again.
    if os.path.exists(_chunk_path):
        continue

    print(datetime.now(), '\t', j)

    _output_dicts = []

    for i, r in _df.iterrows():

        # Rows without poem text (NaN) cannot be split into lines; skip them
        # instead of letting .split() raise and abort the whole chunk.
        if pd.isnull(r['poem_text']):
            continue

        # Keep only lines with at least 4 words, each truncated to its first 20 words.
        _lines = r['poem_text'].split('\n')
        _lines = [' '.join(l.split()[:20]) for l in _lines if len(l.split()) >= 4]

        # No usable first line -> nothing to ask about.
        if not _lines:
            continue

        _first_line = _lines[0]

        _prompt = ('What are the next five lines of the poem "' + r['poem_title'].strip()
                   + '" by ' + r['author'].strip()
                   + '?\n\nFirst Line: "' + _first_line + '"\n\nNext Lines:')

        # One single-turn chat request per poem.
        _response = openai.ChatCompletion.create(model="gpt-4",
                                                 messages=[{"role": "user", "content": _prompt}])
        _answer = _response['choices'][0]['message']['content']

        # Only the first 200 characters of the poem are stored alongside the answer.
        _output_dicts.append({'poem': r['poem_id'],
                              'poem_text': r['poem_text'][:200],
                              'first_lines': _first_line,
                              'answer': _answer})

    # The file is written only after the whole chunk succeeded, so a chunk that
    # failed midway is redone from scratch on the next run.
    _output_df = pd.DataFrame(_output_dicts)
    _output_df.to_csv(_chunk_path)

2024-06-15 18:12:48.262109 	 8
2024-06-15 18:13:46.359671 	 9
2024-06-15 18:14:49.359606 	 10
2024-06-15 18:15:34.527942 	 11
2024-06-15 18:16:24.193544 	 12
2024-06-15 18:17:21.691108 	 13
2024-06-15 18:18:16.772158 	 14
2024-06-15 18:19:08.180615 	 15
2024-06-15 18:19:54.113080 	 16
2024-06-15 18:20:47.004253 	 17
2024-06-15 18:21:33.414526 	 18
2024-06-15 18:22:19.548377 	 19
2024-06-15 18:23:08.722719 	 20
2024-06-15 18:24:02.651407 	 21
2024-06-15 18:24:56.086522 	 22
2024-06-15 18:25:49.605689 	 23
2024-06-15 18:26:47.823787 	 24
2024-06-15 18:27:45.994241 	 25
2024-06-15 18:28:32.769932 	 26
2024-06-15 18:29:24.842457 	 27
2024-06-15 18:30:22.250071 	 28
2024-06-15 18:31:12.629921 	 29
2024-06-15 18:32:03.133431 	 30
2024-06-15 18:32:52.402385 	 31
2024-06-15 18:33:45.886908 	 32
2024-06-15 18:34:46.689713 	 33
2024-06-15 18:35:53.642660 	 34
2024-06-15 18:36:51.985400 	 35
2024-06-15 18:37:45.228193 	 36
2024-06-15 18:38:36.627516 	 37
2024-06-15 18:39:30.696122 	 38
2024-06-15