In [1]:
import os
import json
import random
import time
import numpy as np
import pandas as pd
import requests
from typing import Any
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm, trange
import functools 
import re

pd.set_option('display.max_rows', 2000)

In [1]:
# https://www.kaggle.com/datasets/carlosgdcj/genius-song-lyrics-with-language-information?resource=download

In [2]:
# Load the poems table. Later cells read its 'artist', 'lyrics', 'title',
# 'language' and 'id' columns.
# NOTE(review): this path is relative to './data', while the default of
# get_english_language_poets points at './../data' - confirm which working
# directory the notebook is meant to run from.
df = pd.read_csv('./data/poets.csv') 

In [24]:
def get_english_language_poets(file_name: str = "./../data/List of English-language poets - Wikipedia.html"):   
    """
    Extract English-language poets' names from a saved Wikipedia list page.

    Every <li> inside a <div class="div-col"> container is treated as one poet
    entry. Bracketed or parenthesised asides are removed from the entry text
    and surrounding whitespace is trimmed.

    Args:
    file_name - HTML file with poets page
    Returns:
    names_list - set of cleaned, non-empty names
    """
    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default encoding.
    with open(file_name, encoding="utf-8") as fp:
        soup = BeautifulSoup(fp, "html.parser")

    name_elements = []
    for container in soup.find_all("div", class_="div-col"):
        name_elements += container.find_all("li")

    names_list = set()
    for element in name_elements:
        # Raw string: "\(" and "\[" are regex escapes, not Python string escapes.
        name = re.sub(r"[\(\[].*?[\)\]]", "", element.get_text())
        # strip() instead of an unconditional [:-1]: entries without a trailing
        # aside used to lose the last letter of the name.
        name = name.strip()
        if name:
            names_list.add(name)

    return names_list

In [None]:
# Keep only the rows whose 'artist' is one of the names scraped from the
# saved Wikipedia list (exact string match).
poets = get_english_language_poets()
df = df.loc[df['artist'].isin(poets)]

In [3]:
# Number of chunks separated by a blank line.
df['lyrics_paragraphs_count'] = df['lyrics'].map(lambda text: len(text.split('\n\n')))
# Number of chunks separated by a single space character (newlines do not split).
df['len_words'] = df['lyrics'].map(lambda text: len(text.split(' ')))

In [4]:
# Artists removed from the dataset by exact name match.
artists_to_filter = [
    'Kendrick Lamar',
    'Bob Dylan',
    'Christopher Nolan',
    'Henry Fielding',
    'Nick Cave',
    'Charlotte Lennox',
    'Ryan Adams'
]

# Keep English texts of at most 650 space-separated tokens, excluding the
# artists above, lyrics that mention "Verse"/"Hook"/"Intro", and titles that
# contain act or chapter markers.
# str.contains interprets the pattern as a regex, so the dots of the
# abbreviations are escaped; unescaped, "Ch. " also matched e.g. "Che ".
# na=False makes missing values count as "no match" so the mask stays boolean.
df = df[(~df['artist'].isin(artists_to_filter))
        & (~df['lyrics'].str.contains("Verse|Hook|Intro", na=False))
        & (df['language'] == "en")
        & (~df['title'].str.contains(r"Act |ACT |Chapter |Chap\. |CHAPTER |Ch\. |CHAP\.", na=False))
        & (df['len_words'] <= 650)
       ]

In [5]:
# Ascending list of ids as plain Python ints.
ids = sorted(df['id'].astype(int).tolist())

In [18]:
len(ids)

19203

In [15]:
df[df['id'] == 123696]

Unnamed: 0.1,Unnamed: 0,title,tag,artist,year,views,features,lyrics,id,language_cld3,language_ft,language,lyrics_paragraphs_count,len_words
3765,113937,A Secret told,misc,Emily Dickinson,2013,660,{},381\n\nA Secret told\nCeases to be a Secret — ...,123696,en,en,en,3,31


In [20]:
ids[19200]

7836984

### Get Annotations from Genius

In [8]:
# Genius API access token. Read it from the environment so the secret is never
# stored in the notebook source or echoed as cell output.
# Set GENIUS_ACCESS_TOKEN before starting the kernel.
access_token = os.environ["GENIUS_ACCESS_TOKEN"]

In [None]:
# Position in `ids` to start from; lower positions are not requested.
START_INDEX = 7780
# Seconds to wait for the API before giving up on a single request.
REQUEST_TIMEOUT_SECONDS = 30
REFERENTS_DIR = './data/referents'

os.makedirs(REFERENTS_DIR, exist_ok=True)

for counter in range(START_INDEX, len(ids)):
    song_id = ids[counter]
    try:
        # Without a timeout a stalled connection would block the loop forever.
        r = requests.get(
            'https://api.genius.com/referents',
            params={'song_id': song_id, 'access_token': access_token},
            timeout=REQUEST_TIMEOUT_SECONDS,
        )
    except requests.RequestException as error:
        print(f'request failed for song {song_id}: {error}')
        continue
    if r.status_code == 200:
        with open(f'{REFERENTS_DIR}/{song_id}.json', 'w') as f:
            json.dump(r.json(), f)
    else:
        # Report skipped songs instead of dropping them silently.
        print(f'song {song_id} skipped: HTTP {r.status_code}')
    # Throttle and report progress regardless of the response status.
    if counter % 20 == 0:
        time.sleep(1)
        print(f'processed {counter} poems')

In [6]:
def traverse_annotation_children(el: Any, annotations: list):
    """
    Depth-first walk over an annotation DOM, collecting its text leaves.

    Args:
    el - a DOM node (dict, descended into via its 'children') or an iterable of child nodes
    annotations - list that collected leaves are appended to, in document order
    Returns:
    None; results are delivered through `annotations`

    String children and one-element list children are appended as they are.
    Dict children are descended into when they have 'children'. Anything else
    (a dict without 'children', None, numbers, ...) is skipped.
    """
    if isinstance(el, dict):
        # A node without 'children' contributes nothing, same as in the loop below.
        return traverse_annotation_children(el.get('children', []), annotations)
    for child in el:
        if isinstance(child, str) or (isinstance(child, list) and len(child) == 1):
            annotations.append(child)
        elif isinstance(child, dict) and 'children' in child:
            traverse_annotation_children(child['children'], annotations)

In [17]:
def get_structured_annotation(referents_object: list):
    """
    Flattens the annotations data structure into the list of annotations

    Args:
    referents_object - list of referent dicts, each with 'fragment',
        'classification' and 'annotations' (every annotation carrying
        'verified' and a DOM under ['body']['dom']['children'])
    Returns:
    outputs - one dict per referent with keys 'fragment', 'classification'
        and 'annotations'; each annotation is {'verified': ..., 'text': ...}.
        Annotations with no text left after filtering are omitted, so
        'annotations' may be an empty list.
    """
    outputs = []
    for ref in referents_object:
        output = {
            'fragment': ref['fragment'],
            'classification': ref['classification'],
            'annotations': []
        }
        for annotation in ref['annotations']:
            annotation_object = {
                'verified': annotation['verified']
            }
            pieces = []
            traverse_annotation_children(annotation['body']['dom']['children'], pieces)
            # Empty pieces become explicit line breaks.
            pieces = [piece if len(piece) > 0 else '\n' for piece in pieces]
            # Drop every piece that contains a URL.
            pieces = [piece for piece in pieces if 'http' not in piece]
            # The former "last piece is empty" trimming was unreachable (empty
            # pieces are already replaced above), so it is removed.
            if pieces:
                annotation_object['text'] = ' '.join(pieces)
                output['annotations'].append(annotation_object)
        outputs.append(output)
    return outputs

In [18]:
def merge_lyrics_with_annotation_by_paragraph(annotation_object: list, lyrics: str):
    """
    Print each annotated fragment together with context taken from its paragraph.

    Paragraphs are the chunks of `lyrics` separated by a blank line. For every
    fragment, the first paragraph containing the fragment's first line (as an
    exact line) gives the context before it, and the first paragraph containing
    its last line gives the context after it. Typographic apostrophes are
    normalised to "'" on both sides before matching.

    Args:
    annotation_object - iterable of dicts with a 'fragment' string
    lyrics - full text of the poem
    Returns:
    None; for each matched fragment prints the lines before, the fragment
    itself and the lines after. Fragments whose first or last line cannot be
    found are skipped.
    """
    LINES_BEFORE = 8
    LINES_AFTER = 8

    normalized_lyrics = lyrics.replace("’", "'")
    paragraphs_lines = [p.split('\n') for p in normalized_lyrics.split('\n\n') if len(p) > 0]

    def locate(line):
        # First paragraph holding `line` exactly, plus the line's position in it.
        for lines in paragraphs_lines:
            if line in lines:
                return lines, lines.index(line)
        return None

    for fragment in annotation_object:
        referent = fragment['fragment'].replace("’", "'")
        if referent.endswith('\n'):
            referent = referent[:-1]
        referent_lines = referent.split('\n')
        referent_start, referent_end = referent_lines[0], referent_lines[-1]

        start_match = locate(referent_start)
        end_match = locate(referent_end)
        if start_match is None or end_match is None:
            # Previously an IndexError/ValueError; skip fragments that do not
            # line up with the lyrics.
            continue
        lines_start, index_start = start_match
        lines_end, index_end = end_match

        context_before = lines_start[max(index_start - LINES_BEFORE, 0):index_start]
        # "+ 1 +" so that LINES_AFTER lines are returned (it used to be one
        # short); slicing clamps at the end of the paragraph.
        context_after = lines_end[index_end + 1:index_end + 1 + LINES_AFTER]
        print(context_before)
        print(referent)
        print(context_after)
        print('\n\n')

In [19]:
def merge_lyrics_with_annotation(annotation_object: list, lyrics: str):
    """
    Function that adds context to each referent with annotation.

    The poem is normalised (typographic apostrophes become "'", ASCII control
    characters other than newline and non-breaking spaces are removed) and
    split into non-empty lines. A fragment is matched when its first line and
    its last line both occur as exact poem lines, the last one at or after the
    first one.

    Args:
    annotation_object - iterable of dicts with 'fragment' and 'annotations'
        (list of dicts with 'text')
    lyrics - full text of the poem
    Returns:
    results - list of dicts with keys "content_before" (up to LINES_BEFORE
        lines), "referent", "context_after" (up to LINES_AFTER lines) and
        "annotation" (text of the first annotation). Fragments that cannot be
        located or carry no annotation are skipped.
    """
    LINES_BEFORE = 12
    LINES_AFTER = 4
    escape_chars_filter = ''.join([chr(i) for i in range(1, 32)]).replace('\n','') + '\xa0'
    # Build the translation table once instead of once per line.
    strip_table = str.maketrans('', '', escape_chars_filter)

    def normalize(text):
        return text.replace("’", "'").translate(strip_table)

    # Strip control characters before dropping empty lines, so lines made only
    # of such characters (e.g. a lone carriage return) do not survive as ''.
    poem_lines = [line for line in normalize(lyrics).split('\n') if len(line) > 0]
    results = []

    for fragment in annotation_object:
        annotations = fragment.get('annotations')
        if not annotations:
            # Nothing to attach; indexing [0] would raise IndexError.
            continue

        referent = normalize(fragment['fragment'])
        if referent.endswith('\n'):
            referent = referent[:-1]
        referent_lines = referent.split('\n')
        referent_start, referent_end = referent_lines[0], referent_lines[-1]

        if referent_start not in poem_lines:
            continue
        index_start = poem_lines.index(referent_start)
        try:
            # Search from the start line onwards: with repeated lines the
            # first occurrence of the end line may lie before the start.
            index_end = poem_lines.index(referent_end, index_start)
        except ValueError:
            continue

        context_before = poem_lines[max(index_start - LINES_BEFORE, 0):index_start]
        # "+ 1 +" so that LINES_AFTER lines are returned (it used to be one short).
        context_after = poem_lines[index_end + 1:index_end + 1 + LINES_AFTER]
        results.append({
            # NOTE(review): key is spelled "content_before" while its sibling is
            # "context_after"; kept as is because consumers may rely on it.
            "content_before": context_before,
            "referent": referent,
            "context_after": context_after,
            "annotation": annotations[0]['text']
        })

    return results

In [23]:
def json_to_annotation_dicts(folder_path: str = './data/referents', n: int = -1):
    """
    Opens json files with response from Genius and prints, per file, the
    accepted annotations merged with their context in the poem.

    Args:
    folder_path - folder with '<id>.json' files holding the API responses
    n - number of files to process (in sorted file-name order); -1 means all
    Returns:
    None; the merged annotations of every file are printed

    Relies on the notebook-level `df` for the lyrics of each id. Files with
    no accepted referents, and ids that are not present in `df`, are skipped.
    """
    # os.listdir returns entries in arbitrary order and may contain non-JSON
    # entries (e.g. checkpoint folders); filter and sort so that `n` selects a
    # reproducible subset.
    files = sorted(name for name in os.listdir(folder_path) if name.endswith('.json'))
    if n != -1:
        files = files[:n]
    for file_name in files:
        with open(os.path.join(folder_path, file_name), "r", encoding="utf-8") as f:
            referents = json.load(f)['response']['referents']
        if not len(referents):
            continue
        referents = [r for r in get_structured_annotation(referents) if r['classification'] == 'accepted']
        if not len(referents):
            continue
        song_id = int(file_name[:-len('.json')])
        lyrics = df.loc[df['id'] == song_id, 'lyrics']
        if lyrics.empty:
            # The id has no row in df; indexing .values[0] would raise IndexError.
            continue
        print(merge_lyrics_with_annotation(referents, lyrics.values[0]))

In [None]:
json_to_annotation_dicts(n=10)

In [None]:
# Ad-hoc check: merge the annotations held in `referents` with the lyrics of the row whose id is 1875.
# NOTE(review): `referents` is not assigned at top level in the cells shown here
# (only inside json_to_annotation_dicts) - confirm where it is defined before
# relying on this cell after a kernel restart.
merge_lyrics_with_annotation(get_structured_annotation(referents), df[df['id'] == 1875]['lyrics'].values[0])