## Ghost Writer Training Test

In [157]:
import pandas as pd
import numpy as np
import ast
import re

In [194]:
hip = pd.read_csv('hiphop_corpus_with_lyrics.csv')

## Data PreProcessing

Since writers is currently stored as a string gonna convert to list

In [195]:
hip['writers'] = hip['writers'].apply(ast.literal_eval)

Break up lyrics by Verse

In [159]:
def clean_lyrics(lyrics):
    #Breaking up lyrics by verse
    verses = re.split(r'\[.*?\]', lyrics)
    #Cleaning up extra whitespace
    verses = [re.sub(r'\s+', ' ', verse).strip() for verse in verses if verse.strip()]
    return verses

In [196]:
hip['cleaned_lyrics'] = hip['lyrics'].apply(clean_lyrics)
hip.head()

Unnamed: 0,artist,album,song_title,date,writers,lyrics,title_clean,cleaned_lyrics
0,Mac Miller,Circles,Circles,"January 17, 2020",[Mac Miller],"[Verse 1]\nWell, this is what it look like rig...",circles,"[Well, this is what it look like right before ..."
1,Mac Miller,Circles,Complicated,"January 17, 2020","[Mac Miller, Jon Brion]","[Verse 1]\nOutside is cloudy, but I like that ...",complicated,"[Outside is cloudy, but I like that better (Be..."
2,Mac Miller,Circles,Blue World,"January 17, 2020","[Mac Miller, Guy Lawrence, Robert Craig Wright...",[Intro: The Four Freshman]\nIt's a blue world ...,blue world,[It's a blue world without you It's a blue wor...
3,Mac Miller,Circles,Good News,"January 9, 2020","[Mac Miller, Jon Brion]",[Verse 1]\nI spent the whole day in my head\nD...,good news,[I spent the whole day in my head Do a little ...
4,Mac Miller,Circles,I Can See,"January 17, 2020","[Mac Miller, Shea Taylor]","[Verse 1]\nI'm so close, I can taste it\nThe m...",i can see,"[I'm so close, I can taste it The man on the m..."


Getting lyrics that was written by the artist. First will get who sang the verse and will then keep only the verses they were a part of

{'2Pac',
 'A Tribe Called Quest',
 'Big L',
 'Drake',
 'J. Cole',
 'Jay-Z',
 'Joey Bada$$',
 'Kanye West',
 'Kendrick Lamar',
 'Mac Miller',
 'Quentin Miller',
 'Soulja Boy',
 'Tyler, The Creator'}

In [202]:
group_members = {
    "A Tribe Called Quest": ["Q-Tip", "Phife Dawg", "Ali Shaheed Muhammad"]
}

def get_main_artist_verses(lyrics, main_artist, group_members=group_members):
    """
    Extract only the verses written/performed by the main artist.
    """
    # Split lyrics into blocks: headers like [Verse 1] or [Intro: Artist] and the text
    blocks = re.split(r'(\[[^\]]+\])', lyrics)  # keep headers
    
    verses = []
    
    allowed_artists = group_members.get(main_artist, [main_artist]) if group_members else [main_artist]
    allowed_artists = [a.lower() for a in allowed_artists]
    
    for i in range(1, len(blocks), 2):
        header = blocks[i]
        text = blocks[i+1] if i+1 < len(blocks) else ""
        
        # Try to extract artists from the header
        match = re.search(r'\[(?:[^\]:]*?:\s*)([^\]]+)\]', header)
        if match:
            artists = [a.strip().lower() for a in re.split(r'\s*(?:&|and)\s*', match.group(1))]
        else:
            # No artist in header → assume main artist
            artists = allowed_artists
        
        if any(a in allowed_artists for a in artists):
            verses.append(text.strip())
    
    return verses

# Apply to entire DataFrame
hip['main_artist_lyrics'] = hip.apply(
    lambda row: get_main_artist_verses(row['lyrics'], row['artist']),
    axis=1
)

# Join the verses back into a single string per song
hip['main_artist_lyrics_joined'] = hip['main_artist_lyrics'].apply(lambda x: "\n".join(x))

hip.head()

Unnamed: 0,artist,album,song_title,date,writers,lyrics,title_clean,cleaned_lyrics,main_artist_lyrics,main_artist_lyrics_joined,pot_ghost,pot_ghost_name
0,Mac Miller,Circles,Circles,"January 17, 2020",[Mac Miller],"[Verse 1]\nWell, this is what it look like rig...",circles,"[Well, this is what it look like right before ...","[Well, this is what it look like right before ...","Well, this is what it look like right before y...",0.0,
1,Mac Miller,Circles,Complicated,"January 17, 2020","[Mac Miller, Jon Brion]","[Verse 1]\nOutside is cloudy, but I like that ...",complicated,"[Outside is cloudy, but I like that better (Be...","[Outside is cloudy, but I like that better (Be...","Outside is cloudy, but I like that better (Bet...",0.0,
2,Mac Miller,Circles,Blue World,"January 17, 2020","[Mac Miller, Guy Lawrence, Robert Craig Wright...",[Intro: The Four Freshman]\nIt's a blue world ...,blue world,[It's a blue world without you It's a blue wor...,"[Yeah, well, this a mad world, it made me craz...","Yeah, well, this a mad world, it made me crazy...",0.0,
3,Mac Miller,Circles,Good News,"January 9, 2020","[Mac Miller, Jon Brion]",[Verse 1]\nI spent the whole day in my head\nD...,good news,[I spent the whole day in my head Do a little ...,[I spent the whole day in my head\nDo a little...,I spent the whole day in my head\nDo a little ...,0.0,
4,Mac Miller,Circles,I Can See,"January 17, 2020","[Mac Miller, Shea Taylor]","[Verse 1]\nI'm so close, I can taste it\nThe m...",i can see,"[I'm so close, I can taste it The man on the m...","[I'm so close, I can taste it\nThe man on the ...","I'm so close, I can taste it\nThe man on the m...",0.0,


In [203]:
def flag_missing_header_artists(lyrics, main_artist, writers, corpus_artists, group_members=None):
    """
    Returns 1 if the lyric header does NOT include the main artist,
    and also does NOT include any other contributing artists from our corpus.
    """
    # Determine allowed names for the main artist
    allowed_artists = group_members.get(main_artist, [main_artist]) if group_members else [main_artist]
    allowed_artists = [a.lower() for a in allowed_artists]
    
    # Get all lyric headers in the song
    headers = re.findall(r'\[(?:[^\]:]*?:\s*)([^\]]+)\]', lyrics)
    
    # Split header artists by & / and, flatten all headers
    header_artists = []
    for h in headers:
        header_artists.extend([a.strip().lower() for a in re.split(r'\s*(?:&|and)\s*', h)])
    
    # Check if main artist is in any header
    main_in_header = any(a in allowed_artists for a in header_artists)
    
    # Get all other contributing artists from writers who are in our corpus
    other_contributors = [w for w in writers if w in corpus_artists and w != main_artist]
    
    # Check if any of these contributors appear in the headers
    contributor_in_header = any(a in other_contributors for a in header_artists)
    
    # Flag = 1 if main artist not in header and none of the other contributors are in header
    if not main_in_header and not contributor_in_header and len(other_contributors) > 0:
        return 1, other_contributors
    return 0, None


In [204]:
all_artists = hip['artist'].unique().tolist()

hip[['pot_ghost', 'pot_ghost_name']] = hip.apply(
    lambda row: pd.Series(flag_missing_header_artists(
        row['lyrics'],
        row['artist'],
        row['writers'],
        all_artists,
        group_members
    )),
    axis=1
)
hip.head()

Unnamed: 0,artist,album,song_title,date,writers,lyrics,title_clean,cleaned_lyrics,main_artist_lyrics,main_artist_lyrics_joined,pot_ghost,pot_ghost_name
0,Mac Miller,Circles,Circles,"January 17, 2020",[Mac Miller],"[Verse 1]\nWell, this is what it look like rig...",circles,"[Well, this is what it look like right before ...","[Well, this is what it look like right before ...","Well, this is what it look like right before y...",0.0,
1,Mac Miller,Circles,Complicated,"January 17, 2020","[Mac Miller, Jon Brion]","[Verse 1]\nOutside is cloudy, but I like that ...",complicated,"[Outside is cloudy, but I like that better (Be...","[Outside is cloudy, but I like that better (Be...","Outside is cloudy, but I like that better (Bet...",0.0,
2,Mac Miller,Circles,Blue World,"January 17, 2020","[Mac Miller, Guy Lawrence, Robert Craig Wright...",[Intro: The Four Freshman]\nIt's a blue world ...,blue world,[It's a blue world without you It's a blue wor...,"[Yeah, well, this a mad world, it made me craz...","Yeah, well, this a mad world, it made me crazy...",0.0,
3,Mac Miller,Circles,Good News,"January 9, 2020","[Mac Miller, Jon Brion]",[Verse 1]\nI spent the whole day in my head\nD...,good news,[I spent the whole day in my head Do a little ...,[I spent the whole day in my head\nDo a little...,I spent the whole day in my head\nDo a little ...,0.0,
4,Mac Miller,Circles,I Can See,"January 17, 2020","[Mac Miller, Shea Taylor]","[Verse 1]\nI'm so close, I can taste it\nThe m...",i can see,"[I'm so close, I can taste it The man on the m...","[I'm so close, I can taste it\nThe man on the ...","I'm so close, I can taste it\nThe man on the m...",0.0,


In [205]:
hip[hip['pot_ghost'] == 1]

Unnamed: 0,artist,album,song_title,date,writers,lyrics,title_clean,cleaned_lyrics,main_artist_lyrics,main_artist_lyrics_joined,pot_ghost,pot_ghost_name
13,Mac Miller,Swimming,Hurt Feelings,"August 3, 2018","[Mac Miller, J. Cole, Jon Brion, Devonté Hynes...",[Intro]\nYeah\nWoah-woah-woah-woah-woah-woah-w...,hurt feelings,[Yeah Woah-woah-woah-woah-woah-woah-woah-woah ...,[Yeah\nWoah-woah-woah-woah-woah-woah-woah-woah...,Yeah\nWoah-woah-woah-woah-woah-woah-woah-woah ...,1.0,[J. Cole]
316,Drake,Take Care,Buried Alive Interlude,"November 15, 2011","[Kendrick Lamar, Supa Dups, 40]",[Intro: Kendrick Lamar]\nBox\nI would surely b...,buried alive interlude,[Box I would surely break the lock I'd jump ri...,[],,1.0,[Kendrick Lamar]
397,J. Cole,Cole World: The Sideline Story,Work Out,"June 15, 2011","[Elliot Wolff, John Legend, Miri Ben-Ari - מיר...","[Intro]\nI want to see you work out for me, wo...",work out,"[I want to see you work out for me, work out f...","[I want to see you work out for me, work out f...","I want to see you work out for me, work out fo...",1.0,[Kanye West]
531,Kanye West,The Life of Pablo,Facts (Charlie Heat Version),"December 31, 2015","[Kanye West, Charlie Heat, Metro Boomin, South...",[Intro]\nDirt and grime and filth inside\nThe ...,facts,[Dirt and grime and filth inside The story of ...,[Dirt and grime and filth inside\nThe story of...,Dirt and grime and filth inside\nThe story of ...,1.0,[Drake]
743,Jay-Z,Watch the Throne,Murder to Excellence,"August 8, 2011","[JAY-Z, Kanye West, Swizz Beatz, Mihaela Modor...",[Produced by S1 and Swizz Beatz]\n\n[Part I: M...,murder to excellence,"[Uh, bloody murder, murder, murder Bloody murd...",[],,1.0,[Kanye West]
746,Jay-Z,Watch the Throne,Illest Motherfucker Alive,"August 8, 2011","[JAY-Z, Kanye West, MIKE DEAN, Southside, Kid ...",[Three minutes of silence]\n\n[Intro: Kanye We...,illest motherfucker alive,"[Uh-oh, damn, uh-oh 1985 white Lamborghini Cou...","[, ]",\n,1.0,[Kanye West]
748,Jay-Z,Watch the Throne,Primetime,"August 8, 2011","[JAY-Z, Kanye West, No I.D., Maureen Reid, Lar...",[Produced by No I.D.]\n\n[Intro: Jay Z]\nWe in...,primetime,"[We in the time of our lives, baby Turn the mu...",[],,1.0,[Kanye West]
751,Jay-Z,The Blueprint 3,Thank You,"September 8, 2009","[JAY-Z, Kanye West, No I.D., Marcos Valle]","[Intro]\nThank you, thank you very much for co...",thank you,"[Thank you, thank you very much for coming out...","[Thank you, thank you very much for coming out...","Thank you, thank you very much for coming out ...",1.0,[Kanye West]
768,Jay-Z,The Black Album,Encore,"November 14, 2003","[JAY-Z, Kanye West, Lennon-McCartney, John Len...",[Produced by Kanye West]\n\n[Intro]\nThank you...,encore,"[Thank you, thank you, thank you! You're far t...","[, Thank you, thank you, thank you!\nYou're fa...","\nThank you, thank you, thank you!\nYou're far...",1.0,[Kanye West]
776,Jay-Z,The Black Album,Lucifer,"November 14, 2003","[JAY-Z, Kanye West, Mamie Smith, Oliver Hugh P...",[Produced by Kanye West]\n\n[Intro]\nKanyeezy ...,lucifer,"[Kanyeezy you did it again, you a genius, nigg...","[, Kanyeezy you did it again, you a genius, ni...","\nKanyeezy you did it again, you a genius, nig...",1.0,[Kanye West]


In [138]:
re.findall(r'\[(?:[^\]]*?:\s*)([^\]]+)\]',hip['lyrics'][100])

['Bēkon & Kid Capri',
 'Kendrick Lamar',
 'Kendrick Lamar',
 'Bono',
 'Kendrick Lamar',
 'Bono']

In [97]:
re.split(r'and|&', t, flags=re.IGNORECASE)

['Kendrick Lamar']

In [82]:
re.split(r'\[.*?\]', hip['lyrics'][100])

['',
 '\n\n',
 "\nAmerica\nGod bless you if it's good to you\nAmerica, please take my hand\nCan you help me underst—\nNew Kung Fu Kenny\n\n",
 '\nThrow a steak off the ark to a pool full of sharks, he\'ll take it\nLeave him in the wilderness with a sworn nemesis, he\'ll make it (He\'ll make it)\nTake the gratitude from him, I bet he\'ll show you somethin\', woah (Woah)\nI chip a nigga lil\' bit of nothin\'\nI chip a nigga lil\' bit of nothin\'\nI chip a nigga lil\' bit of nothin\'\nI chip a nigga, then throw the blower in his lap\nWalk myself to the court like, "Bitch, I did that X-rated"\nJohnny don\'t wanna go to school no more, no more\nJohnny said books ain\'t cool no more (No more)\nJohnny wanna be a rapper like his big cousin\nJohnny caught a body yesterday out hustlin\'\nGod bless America, you know we all love him\nYesterday, I got a call, like, from my dog, like 101\nSaid they killed his only son because of insufficient funds\nHe was sobbin\', he was mobbin\', way belligerent a