# Approach 


- Get lyrics from 5 or 6 rappers using the Genius API
- Use the Datamuse API to get words that rhyme with a sample of words from those lyrics
- Get words that don't rhyme with those words also
- Make a dataframe of wordA | wordB | rhyme
- Train the model in a Jupyter notebook on Google Colab

In [3]:
import os
import re
import time
import requests
import itertools
import pandas as pd
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
from sklearn.feature_extraction.text import CountVectorizer
load_dotenv()

# Functions 

`get_artist_songs` and `scrape_song_lyrics`

In [2]:
def get_artist_songs(artist_id, access_token=None):
    """Fetch a page of songs for an artist from the Genius API.

    The song entries returned here carry the page paths that are scraped
    for lyrics later on.

    Args:
        artist_id: Genius artist ID, interpolated into the request path.
        access_token: Genius API bearer token. When omitted, the
            ``ACCESS_TOKEN`` environment variable is read at call time, so
            it does not matter whether ``load_dotenv()`` ran before or
            after this function was defined.

    Returns:
        The decoded JSON body of the API response.

    Raises:
        ValueError: if no access token was passed and ``ACCESS_TOKEN`` is
            not set.
        requests.HTTPError: if the API answers with a 4xx/5xx status.
    """
    if access_token is None:
        access_token = os.getenv("ACCESS_TOKEN")
    if not access_token:
        raise ValueError(
            "No Genius access token: pass access_token or set ACCESS_TOKEN"
        )

    # HTTPS so the bearer token is never sent in clear text
    url = f"https://api.genius.com/artists/{artist_id}/songs"
    headers = {"Authorization": f"Bearer {access_token}"}

    # without a timeout a stalled connection would block the notebook forever
    response = requests.get(url, headers=headers, timeout=30)
    # surface auth / rate-limit failures here rather than as a KeyError later
    response.raise_for_status()

    return response.json()

In [3]:
def scrape_song_lyrics(song_url_extension):
    """Download a Genius song page and return its lyrics as plain text.

    Args:
        song_url_extension: path of the song page, appended directly to
            ``https://genius.com`` (e.g. the ``path`` field of a song entry
            returned by the API).

    Returns:
        The text of the first ``<div class="lyrics">`` on the page, with
        bracketed section tags such as ``[Verse 1: <ARTIST_NAME>]`` removed
        and surrounding whitespace stripped.

    Raises:
        requests.HTTPError: if the page request returns a 4xx/5xx status.
        IndexError: if the page contains no ``<div class="lyrics">``.
    """
    # get the web page link using the songs route
    url = f"https://genius.com{song_url_extension}"

    # get the text from the webpage; the timeout keeps a stalled connection
    # from hanging the caller indefinitely
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()

    # parse HTML
    html = BeautifulSoup(response.text, "html.parser")

    # get all divs from the page which contain the lyrics
    lyric_divs = html.find_all("div", {"class": "lyrics"})
    if not lyric_divs:
        # same exception type the bare lyrics[0] lookup used to raise,
        # but with a message that says what went wrong
        raise IndexError(f"no <div class=\"lyrics\"> found at {url}")

    # finally remove any tags that specify verses and so on,
    # e.g. [Verse 1: <ARTIST_NAME>]
    lyrics = re.sub(r"\[.+\]\n", '', lyric_divs[0].text)

    return lyrics.strip()

In [4]:
def get_vocab(corpus):
    """Return the distinct words of a corpus, minus short and numeric tokens.

    Args:
        corpus: iterable of text documents (here: one string per song).

    Returns:
        A list of the tokens learned by a default ``CountVectorizer`` that
        are longer than two characters and are not purely digits.
    """
    vectorizer = CountVectorizer()
    # only the learned vocabulary is needed, not the document-term matrix
    vectorizer.fit(corpus)

    # get_feature_names() was removed in scikit-learn 1.2; use its
    # replacement when available and fall back on older releases
    if hasattr(vectorizer, "get_feature_names_out"):
        vocab = vectorizer.get_feature_names_out()
    else:
        vocab = vectorizer.get_feature_names()

    # get rid of any short words and pure numbers
    return [str(word) for word in vocab if len(word) > 2 and not word.isdigit()]

In [5]:
def get_rhymes(word):
    """Look up rhymes for a word with the Datamuse API.

    Args:
        word: the word to find rhymes for; it is lower-cased before the
            lookup.

    Returns:
        The decoded JSON body of the response. Callers in this notebook
        treat it as a list of dicts that each have a ``'word'`` key.

    Raises:
        requests.HTTPError: if the API answers with a 4xx/5xx status.
    """
    # pass the word via params so requests URL-encodes it; characters such
    # as '&' or '#' would otherwise corrupt the query string
    response = requests.get(
        "https://api.datamuse.com/words",
        params={"rel_rhy": word.lower()},
        timeout=30,
    )
    response.raise_for_status()

    return response.json()

# Iterate through artists and download song lyrics 

Here we will simply iterate through some artists and download lyrics from a number of their songs. When we train our model later, we will use a sample of these lyrics together with words that rhyme with them.

In [4]:
# maps a short label for each artist to that artist's Genius artist_id;
# the label is also used as the prefix of the saved lyric file names
artists = dict(
    MF_DOOM=70,
    wu_tang=21,
    outkast=105,
    aesop_rock=178,
    biggie=22,
    big_l=103,
    mos_def=156,
    kendrick_lamar=1421,
    tribe=519,
    talib_kweli=388,
)

In [57]:
# Download the lyrics of each artist's songs into data/lyrics/<artist>_<n>.txt.
# Scraping fails intermittently, so every song is retried, but only a bounded
# number of times, so a permanently broken page cannot stall the whole run.
MAX_SCRAPE_ATTEMPTS = 20
RETRY_DELAY_SECONDS = 5

# make sure the output directory exists before the first write
os.makedirs("data/lyrics", exist_ok=True)

# iterate through each artist
for artist, artist_id in artists.items():
    # get some songs by that artist
    artist_songs = get_artist_songs(artist_id)
    print(f"Getting sample lyrics for {artist}")
    # download the lyrics for these songs; i is the song's position in the
    # API response, so a skipped song leaves a gap in the file numbering
    for i, song in enumerate(tqdm(artist_songs['response']['songs'])):
        song_url_extension = song['path']
        lyrics = None
        for attempt in range(1, MAX_SCRAPE_ATTEMPTS + 1):
            try:
                lyrics = scrape_song_lyrics(song_url_extension)
                break
            # Exception, not a bare except, so Ctrl-C still stops the cell
            except Exception as error:
                last_error = error
                if attempt < MAX_SCRAPE_ATTEMPTS:
                    time.sleep(RETRY_DELAY_SECONDS)
        if lyrics is None:
            print(
                f"Skipping {song_url_extension} after "
                f"{MAX_SCRAPE_ATTEMPTS} attempts: {last_error!r}"
            )
            continue
        with open(f"data/lyrics/{artist}_{i}.txt", "w") as text_file:
            text_file.write(lyrics)

Getting sample lyrics for MF_DOOM


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


Getting sample lyrics for wu_tang


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))


Getting sample lyrics for outkast


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))


Getting sample lyrics for aesop_rock


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


Getting sample lyrics for biggie


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


Getting sample lyrics for big_l


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


Getting sample lyrics for mos_def


HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))


Getting sample lyrics for kendrick_lamar


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


Getting sample lyrics for tribe


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))


Getting sample lyrics for talib_kweli


HBox(children=(FloatProgress(value=0.0, max=18.0), HTML(value='')))




# Get the vocabulary we will look up rhymes for 

## Let's load these files and store them as a corpus in a list 

In [6]:
# read every saved .txt lyric file into memory, one string per song
lyric_corpus = []
for entry in os.listdir('data/lyrics/'):
    # anything that is not a lyric text file is ignored
    if not entry.endswith(".txt"):
        continue
    with open(f"data/lyrics/{entry}", "r") as handle:
        lyric_corpus.append(handle.read())

## Get the vocab 

In [7]:
# build the vocabulary from all loaded lyrics and report how many words it has
vocab = get_vocab(lyric_corpus)
vocab_size = len(vocab)
print(vocab_size)

11259


# Iterate through each word and get all the rhymes for each word 

To do this we will look up each word and get all the rhymes using the Datamuse API. We will then create a row for each rhyme combination and assign them a rhyme group ID.

In [8]:
# Build the positive samples: for every vocabulary word, ask Datamuse for its
# rhymes and record every pair that can be formed from (rhymes + the word).
# All pairs coming from one lookup share a rhyme_group_id.
RHYME_COLUMNS = ['rhyme_id', 'rhyme_group_id', 'word_a', 'word_b', 'rhyme']
CHECKPOINT_EVERY = 500  # vocabulary words between two checkpoint saves

# make sure the output directory exists before the first save
os.makedirs('data/rhymes', exist_ok=True)

rhyme_data = []
rhyme_id = 1
rhyme_group_id = 1
for words_done, word in enumerate(tqdm(vocab), start=1):
    rhyme_response = get_rhymes(word)
    if rhyme_response:
        # list all words returned by the response, plus the looked-up word
        rhyming_words = [rhyme['word'] for rhyme in rhyme_response] + [word]
        # create an entry for all possible rhyme pairs returned
        for word_a, word_b in itertools.combinations(rhyming_words, 2):
            rhyme_data.append(
                {
                    'rhyme_id': rhyme_id,
                    'rhyme_group_id': rhyme_group_id,
                    'word_a': word_a,
                    'word_b': word_b,
                    'rhyme': 1,
                }
            )
            rhyme_id += 1
        rhyme_group_id += 1
    # Checkpoint every CHECKPOINT_EVERY processed words. Counting words
    # (not groups) means each checkpoint is written exactly once, even when
    # a run of words has no rhymes and rhyme_group_id stands still.
    if words_done % CHECKPOINT_EVERY == 0:
        rhyme_df = pd.DataFrame(rhyme_data, columns=RHYME_COLUMNS)
        rhyme_df.to_pickle('data/rhymes/rhyme_df.pkl')

# convert to dataframe; explicit columns keep drop_duplicates working even
# when no rhymes were found at all
rhyme_df = pd.DataFrame(rhyme_data, columns=RHYME_COLUMNS)
rhyme_df = rhyme_df.drop_duplicates(subset=['word_a', 'word_b'], keep='first')
rhyme_df.to_csv('data/rhymes/rhyme_df.csv', index=False)

HBox(children=(FloatProgress(value=0.0, max=11259.0), HTML(value='')))




In [8]:
# Reload the saved rhyme pairs and preview a few rows of rhyme group 10.
# keep_default_na=False: by default pandas parses strings such as "null",
# "nan" or "NA" as missing values, which would turn real words into NaN.
rhyme_df = pd.read_csv('data/rhymes/rhyme_df.csv', keep_default_na=False)
group_rows = rhyme_df.loc[rhyme_df['rhyme_group_id'] == 10]
# sample at most 5 rows so a small (or missing) group does not raise
group_rows.sample(min(5, len(group_rows))).reset_index(drop=True)

Unnamed: 0,rhyme_id,rhyme_group_id,word_a,word_b,rhyme
0,37493,10,deported,warted,1
1,37476,10,transported,unreported,1
2,37339,10,courted,extorted,1
3,37606,10,shorted,aborted,1
4,37496,10,deported,port id,1


# Now create a dataframe of negative samples (i.e. words that don't rhyme) 

To do this we will basically iterate through each rhyme group ID, select a sample of words from another rhyme group (that shouldn't rhyme) and assign those words to the `word_b` column.

In [26]:
# Build the negative samples: keep every word_a, but replace its word_b with
# a word drawn at random from a *different* rhyme group.
non_rhyme_df = rhyme_df.copy()
for rhyme_group in tqdm(rhyme_df['rhyme_group_id'].drop_duplicates()):

    in_group = rhyme_df['rhyme_group_id'] == rhyme_group
    words_in_group = int(in_group.sum())

    # Draw from the untouched rhyme_df, not from non_rhyme_df: the latter is
    # being rewritten in this loop, so its other groups may already hold
    # words that originally belonged to the current group.
    candidates = rhyme_df.loc[~in_group, 'word_b']
    if candidates.empty:
        # no other group to draw from; these rows still hold their rhyming
        # word_b and are removed by the known-rhyme filter below
        continue

    other_rhyme_samples = candidates.sample(
        n=words_in_group,
        # only sample with replacement when there are too few candidates
        replace=len(candidates) < words_in_group,
    ).tolist()

    non_rhyme_df.loc[in_group, 'word_b'] = other_rhyme_samples

# Different rhyme groups can still share rhyming words, so discard every
# candidate pair that is a known rhyme (in either order) or pairs a word
# with itself.
known_rhymes = pd.concat(
    [
        rhyme_df[['word_a', 'word_b']],
        rhyme_df[['word_a', 'word_b']].rename(
            columns={'word_a': 'word_b', 'word_b': 'word_a'}
        ),
    ]
).drop_duplicates()
non_rhyme_df = non_rhyme_df.merge(
    known_rhymes, on=['word_a', 'word_b'], how='left', indicator=True
)
is_negative = (non_rhyme_df['_merge'] == 'left_only') & (
    non_rhyme_df['word_a'] != non_rhyme_df['word_b']
)
non_rhyme_df = non_rhyme_df.loc[is_negative].drop(columns='_merge')

non_rhyme_df['rhyme'] = 0
non_rhyme_df = non_rhyme_df.drop_duplicates(subset=['word_a', 'word_b'], keep='first')
non_rhyme_df.to_csv('data/rhymes/non_rhyme_df.csv', index=False)
print('Done!')

HBox(children=(FloatProgress(value=0.0, max=9003.0), HTML(value='')))


3933721
Done!
