## Ghost Writer Detector



In [None]:
from lyricsgenius import Genius
import pandas as pd
import time

# -----------------------------
# CONFIGURATION
# -----------------------------
with open("genius_token.txt", "r") as f:
    token = f.read().strip()

genius = Genius(token, timeout=20, sleep_time=3, retries=5)

# List of artists
artists = ["Mac Miller", "Kendrick Lamar", "Quentin Miller", "Drake", "J. Cole", "A Tribe Called Quest", "Kanye West", "Soulja Boy","Jay-Z", "Big L", "Tyler, The Creator", "2Pac", "Joey Bada$$"]

# Optionally: limit to certain albums for each artist
# Leave list empty [] to pull all albums
target_albums = {
    "Mac Miller": ["Swimming", "Circles", "K.I.D.S.", "The Divine Feminine", "Faces"],
    "Kendrick Lamar": ["DAMN.", "good kid, m.A.A.d city", "To Pimp a Butterfly", "Mr. Morale & the Big Steppers", "Section.80"],
    "Quentin Miller": ["Essentials, Vol. 2", "Q.M.", "Hey! Thanks a Lot 3", "na fr.", "Falco", "X.X."],
    "Drake": ["So Far Gone", "Take Care", "Nothing Was the Same", "If You're Reading This It's Too Late", "More Life", "Scorpion", "Certified Lover Boy"],
    "J. Cole": ["The Warm Up", "Friday Night Lights", "Cole World: The Sideline Story", "2014 Forest Hills Drive", "4 Your Eyez Only", "KOD"],
    "A Tribe Called Quest": ["People’s Instinctive Travels and the Paths of Rhythm", "The Low End Theory", "Midnight Marauders", "The Anthology","We got it from Here... Thank You 4 Your service","The Lost Tribes"],
    "Kanye West": ["The College Dropout", "Late Registration", "Graduation", "808s & Heartbreak", "My Beautiful Dark Twisted Fantasy", "The Life of Pablo"],
    "Soulja Boy": ["Pretty Boy Millionaires", "Greatest Entertainer Alive", "Tell Em Tv", "iSouljaBoyTellem","The DeAndre Way"],
    "Jay-Z": ["Reasonable Doubt", "The Blueprint", "The Black Album", "The Blueprint 3", "Watch the Throne", "4:44"],
    "Big L": ["Now or Never", "The Danger Zone", "Return of the Devil's Son", "139 & Lenox", "Lifestylez Ov Da Poor & Dangerous"],
    "Tyler, The Creator": ["Bastard", "Goblin", "Flower Boy", "IGOR", "CALL ME IF YOU GET LOST: The Estate Sale", "CHROMAKOPIA"],
    "2Pac": ["Greatest Hits", "I Ain't Mad At 'Cha", "All Eyez On Me", "Strictly 4 My N.I.G.G.A.Z...", "Me Against The World", "So Many Tears"],
    "Joey Bada$$": ["1999", "B4.DA.$$", "ALL-AMERIKKKAN BADA$$", "The Light Pack", "2000", "Rejex"]
}

# -----------------------------
# FUNCTION DEFINITIONS
# -----------------------------

def get_artist_id(artist_name):
    """Search Genius API for an artist's ID."""
    try:
        search = genius.search_artists(artist_name)
        return search["sections"][0]["hits"][0]["result"]["id"]
    except Exception as e:
        print(f"⚠️ Could not find artist ID for {artist_name}: {e}")
        return None


def get_albums_by_artist(artist_id):
    """Return all albums for a given artist ID."""
    all_albums = []
    page = 1
    while True:
        response = genius.artist_albums(artist_id, page=page)
        albums_page = response.get("albums", [])
        if not albums_page:
            break
        all_albums.extend(albums_page)
        page += 1
    return [
        {"album_name": a["name"], "album_id": a["id"], "url": a["url"]}
        for a in all_albums
    ]


def get_album_tracks(album_id):
    """Fetch all tracks for a Genius album safely."""
    all_tracks = []
    page = 1
    while True:
        try:
            response = genius._make_request(
                f"albums/{album_id}/tracks",
                params_={"page": page},
                public_api=True
            )
        except AssertionError as e:
            if "403" in str(e):
                print(f"🚫 Forbidden: no access to album {album_id}")
            elif "404" in str(e):
                print(f"❌ Album {album_id} not found")
            else:
                print(f"⚠️ Unexpected error fetching album {album_id}: {e}")
            break  # stop paging if request fails

        # safely extract tracks
        page_tracks = response.get("tracks", [])
        if not page_tracks:
            break
        all_tracks.extend(page_tracks)
        page += 1
    return [
        {"song_id": t["song"]["id"], "song_title": t["song"]["title"], "date": t["song"]["release_date_for_display"],"url": t["song"]["url"]}
        for t in all_tracks
    ]


def get_lyrics(url):
    """Fetch lyrics safely."""
    try:
        return genius.lyrics(song_url=url)
    except AssertionError as e:
        if "403" in str(e):
            print(f"🚫 Lyrics not accessible for {url}")
        else:
            print(f"⚠️ AssertionError fetching lyrics: {e}")
        return None
    except Exception as e:
        print(f"⚠️ Error fetching lyrics from {url}: {e}")
        return None
    
def safe_get_writers(songid, artist_name):
    """Safely get writer credits for a Genius song ID."""
    try:
        song_data = genius.song(songid)
        if song_data and 'song' in song_data:
            writers_cred = song_data['song'].get('writer_artists', [])
            return [w['name'] for w in writers_cred] if writers_cred else [artist_name]
        else:
            return [artist_name]
    except AssertionError as e:
        if "403" in str(e):
            print(f"🚫 Skipping song {songid} — 403 Forbidden (private/unreleased)")
        else:
            print(f"⚠️ AssertionError on song {songid}: {e}")
        return [artist_name]
    except Exception as e:
        print(f"⚠️ Unexpected error on song {songid}: {e}")
        return [artist_name]



# -----------------------------
# MAIN LOOP
# -----------------------------

all_results = []
seen_tracks = set()

for artist_name in artists:
    print(f"\n🎤 Processing artist: {artist_name}")
    artist_id = get_artist_id(artist_name)
    if artist_id is None:
        continue

    albums = get_albums_by_artist(artist_id)
    if not albums:
        print(f"  ⚠️ No albums found for {artist_name}")
        continue

    df_albums = pd.DataFrame(albums)

    # Filter if target albums specified
    if target_albums.get(artist_name):
        df_albums = df_albums[df_albums["album_name"].isin(target_albums[artist_name])]

    for _, album_row in df_albums.iterrows():
        album_name = album_row["album_name"]
        album_id = album_row["album_id"]
        print(f"  💿 Getting tracks for album: {album_name}")

        tracks = get_album_tracks(album_id)
        for t in tracks:
            songid = t['song_id']
            if songid not in seen_tracks:
                seen_tracks.add(songid)
                print(f"     🎶 {t['song_title']}")
                lyrics = get_lyrics(t["url"])
                
                writers = safe_get_writers(songid, artist_name)
                
                all_results.append({
                    "artist": artist_name,
                    "album": album_name,
                    "song_title": t["song_title"],
                    "date": t['date'],
                    "writers": writers,
                    "lyrics": lyrics
                })
                time.sleep(1)  # be nice to the API
                
for artist_name in artists:
    print(f"\n🎤 Processing artist: {artist_name}")
    artist_id = get_artist_id(artist_name)
    if artist_id is None:
        continue
    newest_50 = genius.artist_songs(artist_id, sort='release_date', per_page=50)
    print(f"  🚀 Getting latest songs from: {artist_name}")
    for t in newest_50['songs']:
        songid = t['id']
        if songid not in seen_tracks:
            seen_tracks.add(songid)
            s = genius.song(songid)
            if s['song']['album'] is not None:
                album_name = s['song']['album']['name']
            else: 
                album_name = None
            print(f"     🎶 {t['title']}")
            lyrics = get_lyrics(t["url"])
            
            writers = safe_get_writers(songid, artist_name)
            
            all_results.append({
                "artist": artist_name,
                "album": album_name,
                "song_title": t["title"],
                "date": t["release_date_for_display"],
                "writers": writers,
                "lyrics": lyrics
            })
            time.sleep(1)
    popular_50 = genius.artist_songs(artist_id, sort='popularity', per_page=50)
    print(f"  🔥 Getting most popular tracks from: {artist_name}")
    for t in newest_50['songs']:
        songid = t['id']
        if songid not in seen_tracks:
            seen_tracks.add(songid)
            s = genius.song(songid)
            if s['song']['album'] is not None:
                album_name = s['song']['album']['name']
            else: 
                album_name = None
            print(f"     🎶 {t['song_title']}")
            lyrics = get_lyrics(t["url"])
            
            writers = safe_get_writers(songid, artist_name)
            
            all_results.append({
                "artist": artist_name,
                "album": album_name,
                "song_title": t["title"],
                "date": t["release_date_for_display"],
                "writers": writers,
                "lyrics": lyrics
            })
            time.sleep(1)

# -----------------------------
# SAVE RESULTS
# -----------------------------
df = pd.DataFrame(all_results)
df.head()
print("\n✅ Done!'")



🎤 Processing artist: Mac Miller
  💿 Getting tracks for album: Circles
     🎶 Circles
     🎶 Complicated
     🎶 Blue World
     🎶 Good News
     🎶 I Can See
     🎶 Everybody
     🎶 Woods
     🎶 Hand Me Downs
     🎶 That’s on Me
     🎶 Hands
     🎶 Surf
     🎶 Once a Day
  💿 Getting tracks for album: Swimming
     🎶 Come Back to Earth
     🎶 Hurt Feelings
     🎶 What’s the Use?
     🎶 Perfecto
     🎶 Self Care
     🎶 Wings
     🎶 Ladders
     🎶 Small Worlds
     🎶 Conversation, Pt. 1
     🎶 Dunno
     🎶 Jet Fuel
     🎶 2009
     🎶 So It Goes
  💿 Getting tracks for album: The Divine Feminine
     🎶 Congratulations
     🎶 Dang!
     🎶 Stay
     🎶 Skin
     🎶 Cinderella
     🎶 Planet God Damn
     🎶 Soulmate
     🎶 We
     🎶 My Favorite Part
     🎶 God Is Fair, Sexy Nasty
  💿 Getting tracks for album: Faces
     🎶 Inside Outside
     🎶 Here We Go
     🎶 Friends
     🎶 Angel Dust
     🎶 Malibu
     🎶 What Do You Do
     🎶 It Just Doesn’t Matter
     🎶 Therapy
     🎶 Polo Jeans
     🎶 Happy 

In [294]:
df['artist'].value_counts()

artist
Drake                   173
Kanye West              141
Jay-Z                   134
J. Cole                 129
Tyler, The Creator      128
Mac Miller              124
2Pac                    124
Joey Bada$$             123
A Tribe Called Quest    113
Kendrick Lamar          107
Quentin Miller          105
Soulja Boy               91
Big L                    83
Name: count, dtype: int64

In [324]:
filtered = df[df['lyrics'].notna()]

filtered['artist'].value_counts()

artist
Drake                   172
Kanye West              132
Jay-Z                   130
J. Cole                 129
Tyler, The Creator      127
2Pac                    121
Mac Miller              120
Joey Bada$$             119
A Tribe Called Quest    111
Kendrick Lamar           94
Quentin Miller           89
Big L                    77
Soulja Boy               56
Name: count, dtype: int64

In [None]:
supp_tar_albums = {
    "Kendrick Lamar": ["untitled unmastered.", "Black Panther: The Album"],
    "Quentin Miller": ["Hey! Thanks a Lot 2", "Shredded Metal"],
    "Big L": ["Devil’s Son EP (From The Vaults)", "Harlem’s Finest: Return of the King"],
    "Childern of the Corn": ["Children of the Corn: Collector’s Edition", "Welcome to the Dangerzone"],
    "Soulja Boy": ["Supaman", "Unsigned and Still Major: Da Album Before Da Album", "souljaboytellem.com", "The Teen of the South", "P.B.M. Pretty Boy Millionaires (Streaming Version)", "King Soulja 3", "Successful"]
}

supp_artists = list(supp_tar_albums.keys())

for artist in supp_artists:
    artist_name = artist
    if artist == "Childern of the Corn":
        artist_name = "Big L"
    print(f"\n🎤 Processing artist: {artist_name}")
    artist_id = get_artist_id(artist)
    if artist_id is None:
        continue
    
    albums = get_albums_by_artist(artist_id)
    if not albums:
        print(f"  ⚠️ No albums found for {artist_name}")
        continue
    
    df_albums = pd.DataFrame(albums)

    # Filter if target albums specified
    if supp_tar_albums.get(artist):
        df_albums = df_albums[df_albums["album_name"].isin(supp_tar_albums[artist])]

    for _, album_row in df_albums.iterrows():
        album_name = album_row["album_name"]
        album_id = album_row["album_id"]
        print(f"  💿 Getting tracks for album: {album_name}")

        tracks = get_album_tracks(album_id)
        for t in tracks:
            songid = t['song_id']
            if songid not in seen_tracks:
                seen_tracks.add(songid)
                print(f"     🎶 {t['song_title']}")
                lyrics = get_lyrics(t["url"])
                
                writers = safe_get_writers(songid, artist)
                
                all_results.append({
                    "artist": artist_name,
                    "album": album_name,
                    "song_title": t["song_title"],
                    "date": t['date'],
                    "writers": writers,
                    "lyrics": lyrics
                })
                time.sleep(1)
print("\n✅ Done!'")


🎤 Processing artist: Kendrick Lamar
  💿 Getting tracks for album: Black Panther: The Album
  💿 Getting tracks for album: untitled unmastered.

🎤 Processing artist: Quentin Miller
  💿 Getting tracks for album: Shredded Metal
  💿 Getting tracks for album: Hey! Thanks a Lot 2

🎤 Processing artist: Big L
  💿 Getting tracks for album: Harlem’s Finest: Return of the King
  💿 Getting tracks for album: Devil’s Son EP (From The Vaults)

🎤 Processing artist: Big L
  💿 Getting tracks for album: Welcome to the Dangerzone
     🎶 Harlem U.S.A. (Harlem Version)
     🎶 American Dream
     🎶 Harlem Nights
     🎶 Fair One, Part 1


In [338]:
df['artist'].value_counts()

artist
Drake                   173
Kanye West              141
Jay-Z                   134
J. Cole                 129
Tyler, The Creator      128
Mac Miller              124
2Pac                    124
Joey Bada$$             123
A Tribe Called Quest    113
Kendrick Lamar          107
Quentin Miller          105
Soulja Boy               91
Big L                    83
Name: count, dtype: int64