In [None]:
import pandas as pd
import re
# load the raw lyrics table; the first CSV column is used as the row index
df = pd.read_csv('../data/lyrics_raw.csv', index_col=0)

In [None]:
# function to clean text and extract section headers given dataframe with 'lyrics' column
def clean_lyrics(df):
    """Clean raw lyrics and collect the section-header words of every song.

    Two columns are added to ``df`` **in place**, and the same object is
    returned so the call can also be used as an expression:

    * ``lyrics_text``   -- the lyrics with the leading "... Lyrics" header on
      the first line removed, every "<digits>Embed" marker removed, and every
      line that contains a bracketed header dropped entirely.
    * ``song_sections`` -- the unique alphabetic words found inside square
      brackets, in order of first appearance. Non-letters are stripped before
      splitting, so ``[Verse 1]`` yields ``'Verse'`` and ``[Pre-Chorus]``
      yields ``'PreChorus'``.

    Rows whose ``lyrics`` value is not a string (for example NaN coming from
    an empty CSV field) get ``''`` and ``[]`` instead of raising ``TypeError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'lyrics'`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying the two new columns.
    """
    # compile once: these patterns are loop-invariant
    bracketed_header = re.compile(r'\[.*?\]')
    non_letters = re.compile(r'[^a-zA-Z\s]')
    title_header = re.compile(r'^.*?Lyrics')
    embed_marker = re.compile(r'\d*Embed\b')
    bracket_line = re.compile(r'.*\[.*\].*\n?')

    lyrics_text = []
    song_sections = []
    for lyrics in df['lyrics']:
        # missing / non-text lyrics: nothing to clean, no sections to report
        if not isinstance(lyrics, str):
            lyrics_text.append('')
            song_sections.append([])
            continue

        # extract section headers in brackets and keep only their letters
        headers = [non_letters.sub('', header) for header in bracketed_header.findall(lyrics)]
        # unique words; dict.fromkeys keeps first-appearance order so the
        # result is reproducible across runs (a plain set is not)
        words = list(dict.fromkeys(word for header in headers for word in header.split()))
        song_sections.append(words)

        # remove first line header and 'embed' text
        text = title_header.sub('', lyrics)
        text = embed_marker.sub('', text)
        # remove any line containing brackets (all headers + surrounding text);
        # this must run after the header removal, because the first section
        # header may share a line with the "... Lyrics" title
        text = bracket_line.sub('', text)
        lyrics_text.append(text)

    df['lyrics_text'] = lyrics_text
    df['song_sections'] = song_sections
    return df

In [None]:
# clean_lyrics adds 'lyrics_text' and 'song_sections' and returns the frame;
# bind the result explicitly instead of relying on the in-place side effect,
# and preview only the first rows rather than dumping the whole frame
df = clean_lyrics(df)
df.head()

In [None]:
# example bad lyrics data: print the cleaned text stored under index labels 2 and 4
for row_label in (2, 4):
    print(df['lyrics_text'][row_label])

In [None]:
# find most common song sections:
# one row per (song, section word), then count how often each word occurs
section_counts = (
    df['song_sections']
    .explode()
    .value_counts()
)
section_counts.head(20)

In [None]:
# create list of most common section titles
song_sections = [
    'Verse', 'Chorus', 'Bridge', 'Outro', 'Intro', 'PreChorus', 'PostChorus',
    'Instrumental', 'Break', 'Solo', 'Refrain', 'Interlude', 'Hook',
]

# inspect rows to be dropped: songs that have section headers, but none of the
# common titles. Songs with no headers at all are treated as "keep" by this
# mask, so they are NOT listed here (the drop step removes them as well).
rows_to_drop = df[
    ~df['song_sections'].apply(
        lambda found: not found or any(title in found for title in song_sections)
    )
].copy()
rows_to_drop['lyrics_text'].head(20)

In [None]:
# drop rows without common section titles
# (an empty section list never matches, so songs with no headers are dropped too)
df = df[
    df['song_sections'].apply(
        lambda found: bool(found) and any(title in found for title in song_sections)
    )
].copy()

# inspect remaining rows
df['lyrics_text'].head(20)

In [None]:
# write the cleaned, filtered frame to disk; index=False leaves the row index out of the file
df.to_csv('../data/lyrics.csv', index=False)