In [2]:
import pandas as pd
import sklearn
from collections import Counter, defaultdict
import math
import random
import re
import numpy as np

In [5]:
df = pd.read_csv('data/english_cleaned_lyrics.csv')

df = df.dropna()
df = df.drop_duplicates()
genres = df['genre'].unique()
print(df.columns)
print(df.head)

Index(['Unnamed: 0', 'index', 'song', 'year', 'artist', 'genre', 'lyrics'], dtype='object')
<bound method NDFrame.head of         Unnamed: 0   index                         song  year  \
0                0       0                    ego-remix  2009   
1                1       1                 then-tell-me  2009   
2                2       2                      honesty  2009   
3                3       3              you-are-my-rock  2009   
4                4       4                black-culture  2009   
...            ...     ...                          ...   ...   
218205      362232  362232    who-am-i-drinking-tonight  2012   
218206      362233  362233                         liar  2012   
218207      362234  362234                  last-supper  2012   
218208      362235  362235  christ-alone-live-in-studio  2012   
218209      362236  362236                         amen  2012   

                 artist    genre  \
0       beyonce-knowles      Pop   
1       beyonce-knowles  

In [4]:
df['genre'].unique()

array(['Pop', 'Hip-Hop', 'Rock', 'Metal', 'Other', 'Country', 'Jazz',
       'Electronic', 'Folk', 'R&B', 'Indie'], dtype=object)

In [10]:
def preprocess_text(text):
    
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    return text


df['lyrics'] = df['lyrics'].apply(preprocess_text)

def build_ngram_model(texts, n):
    model = defaultdict(list)
    for text in texts:
        words = text.split()
        for i in range(len(words) - n):
            gram = tuple(words[i:i+n])
            next_word = words[i+n]
            model[gram].append(next_word)
    return model



ngram_size = 3
genre_models = {}

for genre in genres:
    genre_lyrics = df[df['genre'] == genre]['lyrics']
    genre_models[genre] = build_ngram_model(genre_lyrics, ngram_size)

def generate_lyrics(model, n, num_words=50):
    current_gram = random.choice(list(model.keys()))
    result = list(current_gram)
    for i in range(num_words - n):
        possibilities = model[current_gram]
        if not possibilities:
            break
        next_word = random.choice(possibilities)
        result.append(next_word)
        current_gram = tuple(result[-n:])
    return ' '.join(result)


In [11]:
genre_to_generate = 'Pop'
new_lyrics = generate_lyrics(genre_models[genre_to_generate], ngram_size, 200)
print(new_lyrics)

news for them ive got news for you who are you to keep a straight face and a brand new start you can laugh it off i can be your last day as a cocaine cowboy cowboy feat akon we the best dj khaled we global we the greatest lets go ayo joc got us akon konvict music this is survival used up and burnt out like an animal midnight coming for you tough im coming for you anyway so dont be blind to your game i feel some hesitation before we say goodnight take me in his arms dont even skip a beat wanna feel your body promise me the love that you dont like a big commotion im a demon for slow motion or such why should i care for you when i left your side but weve gotta go we got to keep the mystery to keep the home fires burning sure there are happy days and its not me talking you got me dizzy floating high up in the sky for the sacred in my winged water feathered river dirty rich soil strong and fertile the then shallow she earth as we know it lets really


In [47]:
genre_to_generate = 'Country'
new_lyrics = generate_lyrics(genre_models[genre_to_generate], ngram_size)
print(new_lyrics)

go told her i was just standin in line every heart has to do cause this feelin i keep feelin about you sue im gonna leave these blues behind for some other breed cause life aint about the money ill meet you in the eye where did you come from


In [49]:
genre_to_generate = 'Rock'
new_lyrics = generate_lyrics(genre_models[genre_to_generate], ngram_size)
print(new_lyrics)

at his wife he says how and you say that you have ever done this is the sound of it another day in the same old dirt from up in heaven and you look like a woman like her can tease me with a feeling and she starts bawlin im


In [50]:
genre_to_generate = 'R&B'
new_lyrics = generate_lyrics(genre_models[genre_to_generate], ngram_size)
print(new_lyrics)

here to you i promise well start a little ponzi scheme whoaaaa oh oh lets take a 7 month staycation whoaaaa oh oh lets take a 7 month staycation whoaaaa oh oh now kiss me through my current occupation ill move these rocks trying to stack this cash on the


In [51]:
genre_to_generate = 'Jazz'
new_lyrics = generate_lyrics(genre_models[genre_to_generate], ngram_size)
print(new_lyrics)

the way up to the skies but their attempts at love were only imitations of my old flame i cant even think of his name but its funny now and then it happened to me and although hes nothing in the end my tears give me away without you dear


In [52]:
genre_to_generate = 'Metal'
new_lyrics = generate_lyrics(genre_models[genre_to_generate], ngram_size, 200)
print(new_lyrics)

hand was only a fiction and my creation


In [6]:
df['word_count'] = df['lyrics'].apply(lambda x: len(x.split()))
genre_stats = df.groupby('genre')['word_count'].agg(['mean', 'std'])

In [12]:
num_lyrics_per_genre = 1000
lyrics_data = []

for genre, model in genre_models.items():
    mean_word_count = genre_stats.loc[genre, 'mean']
    std_word_count = genre_stats.loc[genre, 'std']
    for _ in range(num_lyrics_per_genre):
        num_words = max(int(np.random.normal(mean_word_count, std_word_count)), 1) 
        lyrics = generate_lyrics(model, ngram_size)
        lyrics_data.append({'genre': genre, 'lyrics': lyrics, 'artist': f'n-gram {ngram_size}'})


lyrics_df = pd.DataFrame(lyrics_data)
lyrics_df.to_csv('generated_lyrics.csv', index=False)