In [2]:
import pandas as pd
from lyricsgenius import Genius
import os
import re
from getting_lyrics1 import (title_similarity_check, 
                             artist_similarity_check, 
                             lyric_modification)
from time import sleep

In [19]:
# Merge the per-chunk CSV files of each category into a single DataFrame
# and write one combined CSV per category.

df_ranking = pd.read_csv("data/ranking.csv")

# The ranking table is split into 16 chunks; the sub-file names carry the
# first and last row number of each chunk ("<start>-<end>").
# NOTE(review): presumably these boundaries mirror the ones used when the
# sub-files were written — confirm against the script that produced them.
data_size = len(df_ranking)
num = data_size // 16
# Boundaries: 0, num, 2*num, ..., 15*num, data_size. The last chunk absorbs
# the remainder, so the final boundary is always the table length (for the
# 6989-row table this is identical to the previous hard-coded computation).
idxs = [k * num for k in range(16)] + [data_size]


def _load_chunks(path_template):
    """Read every existing chunk file and stack them into one DataFrame.

    Parameters
    ----------
    path_template : str
        ``str.format`` template taking the first and the last row number of
        a chunk, e.g. ``"data_lyrics_sub/lyrics{}-{}.csv"``.

    Returns
    -------
    pandas.DataFrame
        All chunks that were found, concatenated in chunk order with a fresh
        0-based index; an empty DataFrame when no chunk file exists.
    """
    frames = []
    for start, stop in zip(idxs[:-1], idxs[1:]):
        try:
            frames.append(pd.read_csv(path_template.format(start, stop - 1)))
        except FileNotFoundError:
            # Not every chunk has a file in every category; skip missing ones.
            continue
    if not frames:
        return pd.DataFrame()
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(frames, axis=0, ignore_index=True)


df_search_error = _load_chunks("data_search_error_sub/search_error{}-{}.csv")
df_unreliable_result = _load_chunks("data_unreliable_result_sub/unreliable_result{}-{}.csv")
df_lyrics = _load_chunks("data_lyrics_sub/lyrics{}-{}.csv")

df_search_error.to_csv("data/search_error.csv", index=False)
df_unreliable_result.to_csv("data/unreliable_result.csv", index=False)
df_lyrics.to_csv("data/lyrics.csv", index=False)

In [16]:
# Search Genius again for the songs whose first search raised an error.
# Each song ends up with its lyrics or with one of the status markers
# "search_failed", "unreliable" or "no_lyrics".

genius = Genius(access_token=os.environ["GeniusAPIClientAccessToken"], 
                verbose=True, 
                remove_section_headers=True, 
                retries=2)

pattern = re.compile("\n+")
r = len(df_search_error)

lyrics = [""] * r

for i in range(r):
    # Columns 4 and 5 are modified_title / modified_artist (see the column
    # order of the displayed result frame).
    title = df_search_error.iloc[i, 4]
    artist = df_search_error.iloc[i, 5]

    # Try the search at most twice. Only ordinary exceptions are caught so
    # that Ctrl-C / kernel interrupt still stops the loop.
    for attempt in range(2):
        try:
            song = genius.search_song(title, artist, get_full_info=False)
            break
        except Exception:
            continue
    else:
        # Both attempts raised.
        print("Search failed.")
        lyrics[i] = "search_failed"
        continue

    if song is not None:
        # Keep the lyrics only when both the title and the artist checks pass.
        if title_similarity_check(title, song.title) and artist_similarity_check(artist, song.artist):
            lyrics[i] = lyric_modification(song.lyrics, pattern)
        else:
            lyrics[i] = "unreliable"
    else:
        lyrics[i] = "no_lyrics"

# Attach the new results as a "lyrics" column next to the original rows.
df_search_error_lyrics = pd.DataFrame(data=lyrics, columns=["lyrics"])
df_search_error_new_result = pd.concat([df_search_error, df_search_error_lyrics], axis=1)

Searching for "The Wanderer" by Dion...
Done.
Searching for "Bridge Over Troubled Water" by Simon...
Done.
Searching for "I Want You Back" by The Jackson 5...
Done.
Searching for "Y.M.C.A." by Village People...
Done.
Searching for "Never Gonna Give You Up" by Rick Astley...
Done.
Searching for "Barbie Girl" by Aqua...
Searching for "Barbie Girl" by Aqua...
Done.
Searching for "Write This Down" by George Strait...
Done.
Searching for "Say My Name" by Destiny's Child...
Done.
Searching for "Hot in Herre" by Nelly...
Done.
Searching for "Call Me Maybe" by Carly Rae Jepsen...
Done.
Searching for "Lucid Dreams" by Juice Wrld...
Done.


In [20]:
# Display the re-searched songs together with the new "lyrics" column.
df_search_error_new_result

Unnamed: 0,year,rank,title,artist,modified_title,modified_artist,lyrics
0,1962,12,"""The Wanderer""",Dion,The Wanderer,Dion,"Oh well, I'm the type of guy who will never se..."
1,1970,1,"""Bridge Over Troubled Water""",Simon & Garfunkel,Bridge Over Troubled Water,Simon,When you're weary|Feeling small|When tears are...
2,1970,28,"""I Want You Back""",The Jackson 5,I Want You Back,The Jackson 5,Uh-huh huh huh huh|Let me tell ya now|Uh-huh|W...
3,1979,8,"""Y.M.C.A.""",Village People,Y.M.C.A.,Village People,"Young man, there's no need to feel down, I sai..."
4,1988,4,"""Never Gonna Give You Up""",Rick Astley,Never Gonna Give You Up,Rick Astley,Desert you|Ooh-ooh-ooh-ooh|Hurt you|We're no s...
5,1997,94,"""Barbie Girl""",Aqua,Barbie Girl,Aqua,"Hiya, Barbie|Hi, Ken|You wanna go for a ride?|..."
6,1999,98,"""Write This Down""",George Strait,Write This Down,George Strait,I never saw the end in sight|Fools are kind of...
7,2000,6,"""Say My Name""",Destiny's Child,Say My Name,Destiny's Child,"(Darkchild '99)|Say my name, say my name|If no..."
8,2002,3,"""Hot in Herre""",Nelly,Hot in Herre,Nelly,"It's hot in, so hot in here|So hot in... (Ah)|..."
9,2012,2,"""Call Me Maybe""",Carly Rae Jepsen,Call Me Maybe,Carly Rae Jepsen,"I threw a wish in the well|Don't ask me, I'll ..."


In [37]:
# Replace the missing (NaN) lyrics in df_lyrics with the new search results.
# Rows are matched on (year, rank); a row that already holds a value is left
# untouched and its (year, rank) is printed instead.

r = len(df_search_error_new_result)
for i in range(r):
    year = df_search_error_new_result["year"].iloc[i]
    rank = df_search_error_new_result["rank"].iloc[i]
    lyrics = df_search_error_new_result["lyrics"].iloc[i]

    # Select by boolean mask and column name so the update does not depend on
    # df_lyrics having a 0..n-1 index or on "lyrics" being the 7th column.
    is_target = (df_lyrics["year"] == year) & (df_lyrics["rank"] == rank)
    if is_target.sum() != 1:
        raise ValueError(
            f"expected exactly one row for year={year}, rank={rank}, "
            f"found {int(is_target.sum())}"
        )

    if pd.isnull(df_lyrics.loc[is_target, "lyrics"].item()):
        df_lyrics.loc[is_target, "lyrics"] = lyrics
    else:
        print(f"{year}, {rank}")

In [None]:
# Check the unreliable results by hand.
# For every entry the two titles and the two artists are printed (first the
# ranking value, then the Genius value). Enter 0 to reject the match, or any
# other integer to accept it and copy the stored lyrics into df_lyrics.

r = len(df_unreliable_result)

# (year, rank) pairs that were rejected here although df_lyrics no longer
# holds an "unreliable" marker for them.
need_to_check = []

for i in range(r):
    # NOTE(review): column order assumed to be year, rank, title,
    # genius_title, artist, genius_artist, lyrics — confirm against the CSV.
    year = df_unreliable_result.iloc[i, 0]
    rank = df_unreliable_result.iloc[i, 1]
    title = df_unreliable_result.iloc[i, 2]
    genius_title = df_unreliable_result.iloc[i, 3]
    artist = df_unreliable_result.iloc[i, 4]
    genius_artist = df_unreliable_result.iloc[i, 5]
    lyrics = df_unreliable_result.iloc[i, 6]

    # Look the song up before prompting so a missing row does not abort the
    # manual session.
    is_target = (df_lyrics["year"] == year) & (df_lyrics["rank"] == rank)
    if is_target.sum() != 1:
        print(f"skipped (expected exactly one row in df_lyrics): {year}, {rank}")
        continue
    is_marked_unreliable = df_lyrics.loc[is_target, "lyrics"].item() in ("unreliable", "both_unreliable")

    print(f"title: {title}")
    print(f"title: {genius_title}")
    print(f"artist: {artist}")
    print(f"artist: {genius_artist}")
    print("----------------------------------------------------------------")
    # Presumably gives the notebook time to render the lines above before the
    # input box appears.
    sleep(0.5)

    # Re-prompt on a typo instead of losing the whole session to a ValueError.
    while True:
        try:
            r1 = int(input())
            break
        except ValueError:
            print("Enter an integer: 0 to reject, any other integer to accept.")

    if r1 == 0:
        # Rejected: remember songs whose stored value is not an unreliable marker.
        if not is_marked_unreliable:
            need_to_check.append([year, rank])
    elif is_marked_unreliable:
        # Accepted: replace the marker with the lyrics kept in the unreliable file.
        df_lyrics.loc[is_target, "lyrics"] = lyrics

In [14]:
# Rows still marked "no_lyrics", as an independent frame with a fresh 0-based index.
df_no_lyrics = df_lyrics.loc[df_lyrics["lyrics"].eq("no_lyrics")].reset_index(drop=True)

In [17]:
# Display the songs currently marked "no_lyrics".
df_no_lyrics

Unnamed: 0,year,rank,title,artist,modified_title,modified_artist,lyrics
0,1946,27,"""Hey! Ba-Ba-Re-Bop""",Tex Beneke with the Glenn Miller Orchestra,Hey! Ba-Ba-Re-Bop,Tex Beneke,no_lyrics
1,1947,2,"""Peg o' My Heart""",The Harmonicats,Peg o' My Heart,The Harmonicats,no_lyrics
2,1947,7,"""Peg o' My Heart""",The Three Suns,Peg o' My Heart,The Three Suns,no_lyrics
3,1947,15,"""Temptation (Tim-Tayshun)""",Red Ingle and The Natural Seven and Jo Stafford,Temptation (Tim-Tayshun),Red Ingle,no_lyrics
4,1948,1,"""Twelfth Street Rag""",Pee Wee Hunt,Twelfth Street Rag,Pee Wee Hunt,no_lyrics
...,...,...,...,...,...,...,...
103,1996,65,"""Children""",Robert Miles,Children,Robert Miles,no_lyrics
104,1996,66,"""Theme from Mission: Impossible""",Adam Clayton and Larry Mullen,Theme from Mission: Impossible,Adam Clayton,no_lyrics
105,1997,72,"""ESPN Presents The Jock Jam""",Various Artists,ESPN Presents The Jock Jam,Various Artists,no_lyrics
106,1997,82,"""Macarena (Bayside Boys Mix)""",Los del Río,Macarena (Bayside Boys Mix),Los del Río,no_lyrics


In [None]:
# genius.search_song returns None both when a song has no lyrics and when no
# result was found. Search the "no_lyrics" songs again (with verbose output)
# to tell the two cases apart.

for i in range(len(df_no_lyrics)):
    year = df_no_lyrics.iloc[i, 0]
    rank = df_no_lyrics.iloc[i, 1]
    modified_title = df_no_lyrics.iloc[i, 4]
    modified_artist = df_no_lyrics.iloc[i, 5]
    try:
        song = genius.search_song(title=modified_title, artist=modified_artist, get_full_info=False)

    # The search itself failed. Only ordinary exceptions are caught so that a
    # kernel interrupt still stops the loop.
    except Exception:
        print(f"search error: {year}, {rank}")
        continue

    # The search succeeded and the song has lyrics.
    if song is not None:

        # Compare against this row's own artist; the previous code used a
        # stale `artist` variable left over from an earlier cell.
        if title_similarity_check(modified_title, song.title) and artist_similarity_check(modified_artist, song.artist):
            print("reliable")

        # The song has lyrics, but its title or artist does not match Genius.
        else:
            print("unreliable")

    # The song has no lyrics, or possibly no result was found.
    else:
        print(f"song info: {year}, {rank}")

In [21]:
# Songs for which the repeated search found no result at all, as
# [year, rank] pairs. Their "no_lyrics" marker is replaced by
# "no_results_found".

no_result = [[1946, 27],
             [1947, 15],
             [1948, 33],
             [1948, 36],
             [1949, 25],
             [1949, 27],
             [1949, 29],
             [1950, 11],
             [1952, 15],
             [1952, 25],
             [1953, 27],
             [1956, 11],
             [1956, 31],
             [1958, 6],
             [1959, 71],
             [1962, 39],
             [1963, 91],
             [1966, 95],
             [1972, 51],
             [1981, 11],
             [1983, 21],
             [1996, 1],
             [1997, 72],
             [1997, 82]]

for year, rank in no_result:
    # Select by boolean mask and column name so the update does not depend on
    # df_lyrics having a 0..n-1 index or on "lyrics" being the 7th column.
    is_target = (df_lyrics["year"] == year) & (df_lyrics["rank"] == rank)
    if not is_target.any():
        print(f"not found in df_lyrics: {year}, {rank}")
        continue
    # Only rows still marked "no_lyrics" are relabelled.
    df_lyrics.loc[is_target & (df_lyrics["lyrics"] == "no_lyrics"), "lyrics"] = "no_results_found"

In [27]:
# Rows of df_lyrics whose lyrics value is the "no_lyrics" marker.
df_lyrics.loc[df_lyrics["lyrics"].eq("no_lyrics")]

Unnamed: 0,year,rank,title,artist,modified_title,modified_artist,lyrics
42,1947,2,"""Peg o' My Heart""",The Harmonicats,Peg o' My Heart,The Harmonicats,no_lyrics
47,1947,7,"""Peg o' My Heart""",The Three Suns,Peg o' My Heart,The Three Suns,no_lyrics
88,1948,1,"""Twelfth Street Rag""",Pee Wee Hunt,Twelfth Street Rag,Pee Wee Hunt,no_lyrics
210,1951,24,"""Down Yonder""",Del Wood,Down Yonder,Del Wood,no_lyrics
217,1952,1,"""Blue Tango""",Leroy Anderson,Blue Tango,Leroy Anderson,no_lyrics
...,...,...,...,...,...,...,...
3343,1987,55,"""Songbird""",Kenny G,Songbird,Kenny G,no_lyrics
3961,1993,73,"""Forever in Love""",Kenny G,Forever in Love,Kenny G,no_lyrics
4253,1996,65,"""Children""",Robert Miles,Children,Robert Miles,no_lyrics
4254,1996,66,"""Theme from Mission: Impossible""",Adam Clayton and Larry Mullen,Theme from Mission: Impossible,Adam Clayton,no_lyrics


In [26]:
# Rows of df_lyrics whose lyrics value is the "no_results_found" marker.
df_lyrics.loc[df_lyrics["lyrics"].eq("no_results_found")]

Unnamed: 0,year,rank,title,artist,modified_title,modified_artist,lyrics
26,1946,27,"""Hey! Ba-Ba-Re-Bop""",Tex Beneke with the Glenn Miller Orchestra,Hey! Ba-Ba-Re-Bop,Tex Beneke,no_results_found
55,1947,15,"""Temptation (Tim-Tayshun)""",Red Ingle and The Natural Seven and Jo Stafford,Temptation (Tim-Tayshun),Red Ingle,no_results_found
120,1948,33,"""(I'd Like to Get You on a) Slow Boat to China""",Kay Kyser,(I'd Like to Get You on a) Slow Boat to China,Kay Kyser,no_results_found
123,1948,36,"""The Dicky-Bird Song""",Freddy Martin,The Dicky-Bird Song,Freddy Martin,no_results_found
151,1949,25,"""Whispering Hope""",Jo Stafford and Gordon MacRae with Paul Weston,Whispering Hope,Jo Stafford,no_results_found
153,1949,27,"""Careless Hands""",Sammy Kaye Orchestra,Careless Hands,Sammy Kaye Orchestra,no_results_found
155,1949,29,"""The Hucklebuck""",Tommy Dorsey Orchestra and Charlie Shavers,The Hucklebuck,Tommy Dorsey Orchestra,no_results_found
167,1950,11,"""If I Knew You Were Coming I'd have Baked a Cake""",Eileen Barton with Morty Craft,If I Knew You Were Coming I'd have Baked a Cake,Eileen Barton,no_results_found
231,1952,15,"""Jambalaya (On the Bayou)""",Jo Stafford with Paul Weston and Norman Luboff,Jambalaya (On the Bayou),Jo Stafford,no_results_found
241,1952,25,"""Meet Mister Callaghan""",Les Paul,Meet Mister Callaghan,Les Paul,no_results_found


In [28]:
# Rows of df_lyrics whose lyrics value is the "unreliable" marker.
df_lyrics.loc[df_lyrics["lyrics"].eq("unreliable")]

Unnamed: 0,year,rank,title,artist,modified_title,modified_artist,lyrics
5,1946,6,"""Oh! What It Seemed to Be""",Frankie Carle,Oh! What It Seemed to Be,Frankie Carle,unreliable
20,1946,21,"""Symphony""",Bing Crosby,Symphony,Bing Crosby,unreliable
21,1946,22,"""The Gypsy""",Sammy Kaye,The Gypsy,Sammy Kaye,unreliable
24,1946,25,"""Five Minutes More""",Tex Beneke with the Glenn Miller Orchestra,Five Minutes More,Tex Beneke,unreliable
30,1946,31,"""Symphony""",Benny Goodman,Symphony,Benny Goodman,unreliable
...,...,...,...,...,...,...,...
6631,2020,43,"""Death Bed""",Powfu featuring Beabadoobee,Death Bed,Powfu,unreliable
6682,2020,94,"""Pussy Fairy (OTW)""",Jhené Aiko,Pussy Fairy (OTW),Jhené Aiko,unreliable
6711,2021,23,"""My Ex's Best Friend""",Machine Gun Kelly featuring Blackbear,My Ex's Best Friend,Machine Gun Kelly,unreliable
6845,2022,57,"""Hrs and Hrs""",Muni Long,Hrs and Hrs,Muni Long,unreliable


In [29]:
# Rows of df_lyrics whose lyrics value is the "both_unreliable" marker.
df_lyrics.loc[df_lyrics["lyrics"].eq("both_unreliable")]

Unnamed: 0,year,rank,title,artist,modified_title,modified_artist,lyrics
2359,1977,71,"""Star Wars Theme/Cantina Band""",Meco,Star Wars Theme|Cantina Band,Meco,both_unreliable


In [30]:
# Summarize the columns, dtypes and non-null counts of df_lyrics.
df_lyrics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6989 entries, 0 to 6988
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   year             6989 non-null   int64 
 1   rank             6989 non-null   int64 
 2   title            6989 non-null   object
 3   artist           6989 non-null   object
 4   modified_title   6989 non-null   object
 5   modified_artist  6989 non-null   object
 6   lyrics           6989 non-null   object
dtypes: int64(2), object(5)
memory usage: 382.3+ KB


In [None]:
# Write the updated df_lyrics to disk without the index column.
df_lyrics.to_csv("data/modified_lyrics.csv", index=False)