In [4]:
# *** This is the notebook used for data cleaning trials ***

# Add this line to resolve SSL error while scraping table from pandas read HTML
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

In [5]:
import pandas as pd

table = pd.read_html('https://en.wikipedia.org/wiki/List_of_songs_by_Taylor_Swift', flavor='html5lib')

In [None]:
df = table[2]

In [7]:
df.drop(columns=["Ref."], inplace=True)
df["Song"] = df["Song"].apply(lambda x: x.replace('"', ''))

In [8]:
from lyricsgenius import Genius
from local import *

genius = Genius(access_token)
genius.timeout = 100

In [21]:
import re

def clean_lyrics(lyrics):
    # Replace special quotes with normal quotes
    lyrics = re.sub(r'\u2018|\u2019', "'", lyrics)
    lyrics = re.sub(r'\u201C|\u201D', '"', lyrics)

    # Replace special unicode spaces with standard space
    lyrics = re.sub(
        r'[\u00A0\u1680​\u180e\u2000-\u2009\u200a​\u200b​\u202f\u205f​\u3000]',
        " ", lyrics)
        
    # Replace dashes with space and single hyphen
    lyrics = re.sub(r'\u2013|\u2014', " - ", lyrics)
    
    lyrics_split = lyrics.split("\n")
    cleaned_lyrics_split = []

    for line in lyrics_split:
        # remove comments like "[Verse 1]", "[Chorus]", etc
        if "[" not in line:
            cleaned_lyrics_split.append(line)

    # remove "(xy)Embed" from the last line
    cleaned_lyrics_split[-1] = cleaned_lyrics_split[-1].replace('Embed', '')
    cleaned_lyrics_split[-1] = re.sub(r'[0-9]+', '', cleaned_lyrics_split[-1])

    # remove "You might also like" from the last line
    cleaned_lyrics_split[-1] = cleaned_lyrics_split[-1].replace('You might also like', '')

    cleaned_lyrics = "\n".join(cleaned_lyrics_split)

    return cleaned_lyrics

In [51]:
songs = df["Song"]
artists = df["Artist(s)"]

lyrics_list = []
type_list = []
cleaned_lyrics = ""

for song, artist in zip(songs, artists):
    found_song = genius.search_song(song, artist)
    song_type = "Solo"

    if "cover" in song:
        song_type = "Cover"
        found_song = genius.search_song(song)
    elif "featuring" in artist:
        song_type = "Feature"
        artist_list = artist.split("featuring")
        found_song = genius.search_song(song, artist_list[0])
    elif "and" in artist:
        song_type = "Feature"
        artist_list = artist.split("and")
        found_song = genius.search_song(song, artist_list[0])
    elif artist != "Taylor Swift":
        song_type = "Songwriting Credit"
    type_list.append(song_type)

    if found_song is not None:
        cleaned_lyrics = clean_lyrics(found_song.lyrics)
    lyrics_list.append(cleaned_lyrics)

df["Lyrics"] = lyrics_list
df["Type"] = type_list

Searching for "The 1" by Taylor Swift...
Done.
Searching for "22" by Taylor Swift...
Done.
Searching for "Afterglow" by Taylor Swift...
Done.
Searching for "The Alcott" by The National featuring Taylor Swift...
Done.
Searching for "The Alcott" by The National ...
Done.
Searching for "All of the Girls You Loved Before" by Taylor Swift...
Done.
Searching for "All Too Well" by Taylor Swift...
Done.
Searching for "All Too Well (10 minute version)" by Taylor Swift...
Done.
Searching for "All You Had to Do Was Stay" by Taylor Swift...
Done.
Searching for "American Girl (cover)" by Taylor Swift...
Done.
Searching for "American Girl (cover)"...
Done.
Searching for "Anti-Hero" by Taylor Swift...
Done.
Searching for "The Archer" by Taylor Swift...
Done.
Searching for "August" by Taylor Swift...
Done.
Searching for "Babe" by Sugarland featuring Taylor Swift...
Done.
Searching for "Babe" by Sugarland ...
Done.
Searching for "Babe (Taylor's Version)" by Taylor Swift...
Done.
Searching for "Baby (li

In [54]:
# Print all songs that are covers as they tend to produce bad results
df.loc[df['Type'] == 'Cover']

Unnamed: 0,Song,Artist(s),Writer(s),Album,Year,Lyrics,Type
8,American Girl (cover),Taylor Swift,Tom Petty,Non-album promotional single[b],2009,1 ContributorMy Girlfriend is a Serial Killer ...,Cover
14,"Baby (live cover of Baby, Don't You Break My H...",Taylor Swift,James Newton Howard Vonda Shepard,Napster Live,2006,1 ContributorДискография (Discography) full Ly...,Cover
24,Bette Davis Eyes (live cover),Taylor Swift,Donna Weiss Jackie DeShannon,Speak Now World Tour – Live,2011,There is some unbelievable music that has come...,Cover
35,Breathless (cover),Taylor Swift,Kevin Griffin,Hope for Haiti Now,2010,4 ContributorsShow Me Heaven LyricsThere you g...,Cover
65,Drops of Jupiter (live cover),Taylor Swift,Charlie Colin Rob Hotchkiss Jimmy Stafford ...,Speak Now World Tour – Live,2011,You know... You guys have a lot of amazing ban...,Cover
93,Hold On (live cover),Jack Ingram featuring Taylor Swift,Blu Sanders,Rhapsody Originals[e],2007,1 ContributorSilent Rain ( The Real ) LyricsIn...,Cover
105,I Want You Back (live cover),Taylor Swift,Freddie Perren Deke Richards Berry Gordy Jr. A...,Speak Now World Tour – Live,2011,Ooh baby give me one more chance\n(I'll show y...,Cover
122,Last Christmas (cover),Taylor Swift,George Michael,Sounds of the Season,2007,"3 ContributorsLast Christmas (Cover) LyricsOh,...",Cover
136,Macavity (cover),Taylor Swift and Idris Elba,Andrew Lloyd Webber T. S. Eliot,Cats: Highlights from the Motion Picture Sound...,2019,"\nOkay, good boy\nI will be happy to send you ...",Cover
179,Santa Baby (cover),Taylor Swift,Joan Javits Philip Springer Tony Springer,Sounds of the Season,2007,3 ContributorsSANTA GIRLI (santa baby cover) L...,Cover


In [151]:
# Print all songs with "Contributor" still in the lyrics as they tend to be ones with invalid results
df[df["Lyrics"].str.contains("Contributor")]

Unnamed: 0,Song,Artist(s),Writer(s),Album,Year,Lyrics,Type


In [131]:
# Genius can't find these songs automatically so we have to input the song IDs manually
df.iloc[df[df["Song"] == "American Girl (cover)"].index, 5] = clean_lyrics(genius.search_song(song_id=187003).lyrics)
df.iloc[df[df["Song"] == "Baby (live cover of Baby, Don't You Break My Heart Slow)"].index, 5] = clean_lyrics(genius.search_song(song_id=1598390).lyrics)
df.iloc[df[df["Song"] == "Bad Blood (remixed single version)"].index, 5] = clean_lyrics(genius.search_song(song_id=1866151).lyrics)
df.iloc[df[df["Song"] == "Breathless (cover)"].index, 5] = clean_lyrics(genius.search_song(song_id=187089).lyrics)
df.iloc[df[df["Song"] == "Don't You"].index, 5] = clean_lyrics(genius.search_song(song_id=4499926).lyrics)
df.iloc[df[df["Song"] == "Half of My Heart (album version)"].index, 5] = clean_lyrics(genius.search_song(song_id=182948).lyrics)
df.iloc[df[df["Song"] == "Hold On (live cover)"].index, 5] = clean_lyrics(genius.search_song(song_id=1406636).lyrics)
df.iloc[df[df["Song"] == "I Bet You Think About Me"].index, 5] = clean_lyrics(genius.search_song(song_id=4499978).lyrics)
df.iloc[df[df["Song"] == "Last Christmas (cover)"].index, 5] = clean_lyrics(genius.search_song(song_id=186824).lyrics)
df.iloc[df[df["Song"] == "Long Story Short"].index, 5] = clean_lyrics(genius.search_song(song_id=6260174).lyrics)
df.iloc[df[df["Song"] == "Message in a Bottle"].index, 5] = clean_lyrics(genius.search_song(song_id=7078185).lyrics)
df.iloc[df[df["Song"] == "Mr. Perfectly Fine"].index, 5] = clean_lyrics(genius.search_song(song_id=4499981).lyrics)
df.iloc[df[df["Song"] == "Nothing New"].index, 5] = clean_lyrics(genius.search_song(song_id=4809175).lyrics)
df.iloc[df[df["Song"] == "Santa Baby (cover)"].index, 5] = clean_lyrics(genius.search_song(song_id=186846).lyrics)
df.iloc[df[df["Song"] == "September (cover)"].index, 5] = clean_lyrics(genius.search_song(song_id=3646014).lyrics)
df.iloc[df[df["Song"] == "Umbrella (live cover)"].index, 5] = clean_lyrics(genius.search_song(song_id=3022978).lyrics)
df.iloc[df[df["Song"] == "The Very First Night"].index, 5] = clean_lyrics(genius.search_song(song_id=7076625).lyrics)
df.iloc[df[df["Song"] == "You'll Always Find Your Way Back Home"].index, 5] = clean_lyrics(genius.search_song(song_id=187340).lyrics)

Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.


In [150]:
# These songs contain the contributor... line even after manual correction
df.iloc[df[df["Song"] == "Baby (live cover of Baby, Don't You Break My Heart Slow)"].index, 5] = "\n".join(df[df["Song"] == "Baby (live cover of Baby, Don't You Break My Heart Slow)"].iloc[0,5].split("\n")[1:])
df.iloc[df[df["Song"] == "Umbrella (live cover)"].index, 5] = "\n".join(df[df["Song"] == "Umbrella (live cover)"].iloc[0,5].split("\n")[1:])

In [130]:
# Print all songs that start with new line
df[df["Lyrics"].str.startswith("\n")]

Unnamed: 0,Song,Artist(s),Writer(s),Album,Year,Lyrics,Type


In [88]:
df["Lyrics"] = df["Lyrics"].str.strip("\n")

In [129]:
# These songs need to be searched manually as well
df.iloc[df[df["Song"] == "Macavity (cover)"].index, 5] = clean_lyrics(genius.search_song(song_id=5114093).lyrics)
df.iloc[df[df["Song"] == "Right Where You Left Me"].index, 5] = clean_lyrics(genius.search_song(song_id=6263242).lyrics)

Done.
Done.


In [152]:
df.to_csv("taylor_swift_lyrics.csv")