In [1]:
import re
import markovify
from sentence_transformers import SentenceTransformer, util
import json
import pyphen
import pyphonetics as phono
import pronouncing
import praatio

In [2]:
# Load the raw lyrics corpus. The file is read as bytes and decoded explicitly
# so that line endings reach the rest of the notebook exactly as stored on disk.
# The `with` block guarantees the handle is closed once the read is done.
with open('lyrics/eminem.txt', 'rb') as lyrics_file:
    text = lyrics_file.read().decode(encoding='utf-8')

# Strip bracketed annotations. `.` does not match newlines, so each match stays
# on one line, but the match is greedy: it runs from the first '[' to the last
# ']' on that line.
text = re.sub(r'\[.*\]', '', text)
print(text[:500])

Look, I was gonna go easy on you not to hurt your feelings
But I'm only going to get this one chance
(Six minutes, six minutes)
Something's wrong, I can feel it
(Six minutes, six minutes, Slim Shady, you're on)
Just a feeling I've got
Like something's about to happen
But I don't know what
If that means, what I think it means, we're in trouble
Big trouble. And if he is as bananas as you say
I'm not taking any chances
You are just what the doc ordered

I'm beginning to feel like a Rap God, Rap God


In [3]:
# Tokenise the corpus per line ("bar"): split on single spaces and strip every
# non-alphanumeric character from each token.
# NOTE(review): empty tokens are kept here (a blank line yields [''], double
# spaces yield ''), so they count towards the bar lengths below — confirm that
# this is intended, since `words` below does drop them.
bars = [
    [re.sub(r'[^a-zA-Z0-9]', '', word) for word in bar.split(' ')]
    for bar in text.split('\n')
]

# Flat word stream over the whole corpus, with empty tokens removed.
word_text = re.sub('\n', ' ', text)
words = [re.sub(r'[^a-zA-Z0-9]', '', word) for word in word_text.split(' ')]
words = [word for word in words if word != '']

bar_lengths = [len(bar) for bar in bars]

# Use a dedicated name instead of rebinding the builtin `max`, which would
# break every later `max(...)` call in this kernel.
longest_bar = max(bar_lengths)
print('Longest bar: ', longest_bar)

print('average bar length: ', round(sum(bar_lengths)/len(bar_lengths), 1), ' words')

# Share of bars strictly shorter than `cutoff` tokens.
cutoff = 15
short_bar_count = sum(1 for length in bar_lengths if length < cutoff)
print(f'prop of bars less then {cutoff} words long: ', round(short_bar_count / len(bar_lengths), 2))

Longest bar:  69
average bar length:  7.7  words
prop of bars less then 15 words long:  0.97


In [4]:
#unique, frequency = np.unique(np.array(words), return_counts=True)

# Count how often each cleaned word occurs in the corpus.
freq_dict = {}
for token in words:
    freq_dict[token] = freq_dict.get(token, 0) + 1

# Alphabetically sorted list of the distinct words.
vocab = sorted(freq_dict)

# The 40 most frequent (word, count) pairs, most frequent first.
by_count_desc = sorted(freq_dict.items(), key=lambda entry: entry[1], reverse=True)
top = by_count_desc[:40]


In [5]:
# Fit a markovify model on the cleaned lyrics. `NewlineText` is used so that
# the corpus is handled line by line, and `state_size=3` is passed as the
# chain's state size (presumably the number of preceding words each
# transition is conditioned on — confirm against the markovify docs).
markov_text = markovify.NewlineText(text, state_size=3)

In [6]:
# Print ten sample bars from the plain Markov model. `make_sentence` may
# return None, so keep asking until an actual sentence comes back.
for i in range(10):
    next_bar = markov_text.make_sentence(tries = 20)
    while next_bar is None:
        next_bar = markov_text.make_sentence(tries = 20)
    print(next_bar)

I was gonna go easy on you not to let me play with it, eh?
Turned around and got shot
Bloodsucking succubuses, what the fuck I was white,
But I still rap like I'm on the clock punching this time card
She puts the lotion in the bucket, it puts the lotion in the bucket, it puts the lotion on the skin
And what do I know?
And raise it, you better never let it go to whoever's holding the most current beef on their shoulders
Is critics never ask me how I'm doing
I'm better than I ever could of asked
Then he aimed at his own face, mosh now or die


In [7]:
# Load the 'stsb-roberta-large' SentenceTransformer. This module-level `model`
# is what `similarity()` uses to encode text.
model = SentenceTransformer('stsb-roberta-large')

In [8]:
# Load the English stop-word list; the `with` block closes the file handle
# as soon as the JSON has been parsed.
with open('utils/stop_words_english.json', encoding='utf-8') as stop_words_file:
    stop_words = json.load(stop_words_file)

# Also treat the '__BEGIN__' / '__END__' tokens as stop words.
# NOTE(review): these look like markovify's chain boundary markers — confirm.
stop_words.extend(['__BEGIN__', '__END__'])

In [9]:
def similarity(first, second):
    """Return the cosine similarity between the embeddings of two texts.

    Each argument is encoded on its own with the module-level `model`
    (as a tensor); the resulting similarity is unwrapped with `.item()`
    and returned as a plain Python number.
    """
    embeddings = [
        model.encode(piece, convert_to_tensor=True)
        for piece in (first, second)
    ]
    score = util.pytorch_cos_sim(embeddings[0], embeddings[1])
    return score.item()

In [10]:
# total_keywords = len(markov_text.chain.model)
# ctr = 0
# for keywords, next_words in markov_text.chain.model.items():

#     print(f'{ctr}/{total_keywords}', end = '\r')
#     ctr = ctr + 1
    
#     semantic_words = []

#     for keyword in keywords:
#         if not re.sub(r'[^a-zA-Z0-9]', '', keyword).lower() in stop_words:
#             semantic_words.append(keyword)

#     if len(semantic_words) > 0:
#         key_string = ' '.join(semantic_words)

#         for word, freq in next_words.items():
#             if not re.sub(r'[^a-zA-Z0-9]', '', word).lower() in stop_words:
#                 next_words[word] = round(next_words[word] * similarity(key_string, word) * 1000)
#             else: next_words[word] = next_words[word] * 750

In [11]:
# model_json = markov_text.to_json()
# with open('markov_models/eminem.json', 'w') as f:
#     json.dump(model_json, f)

In [12]:
# Restore the previously exported model. The file is parsed with `json.load`
# first and the result is then handed to `from_json`, so the file is expected
# to hold a JSON-encoded string of the model's own JSON export.
# The `with` block makes sure the file handle is closed after loading.
with open('markov_models/semantic_eminem.json') as model_file:
    saved_model = markovify.Text.from_json(json.load(model_file))

In [20]:
# Phonetic encoders used as a "near rhyme" fallback when the pronouncing
# dictionary lists no exact rhyme for a word pair.
metaphone = phono.Metaphone()
fuzzy_soundex = phono.FuzzySoundex()

NUM_BARS = 4          # number of bars to generate
MIN_SIMILARITY = .3   # minimum semantic similarity between consecutive bars


def _strip_stop_words(bar):
    """Return `bar` without its stop words.

    A word is dropped when its lower-cased, alphanumeric-only form is in
    `stop_words`; the surviving words keep their original spelling.
    """
    return ' '.join(
        word for word in bar.split(' ')
        if re.sub(r'[^a-zA-Z0-9]', '', word).lower() not in stop_words
    )


def _bars_rhyme(next_semantic_bar, prev_semantic_bar):
    """Return True if any word of the new bar rhymes with any word of the previous bar.

    Two words rhyme when `pronouncing.rhymes` lists the new word for the
    previous one, or when their Metaphone or Fuzzy Soundex encodings are at
    Levenshtein distance exactly 1.
    """
    prev_words = prev_semantic_bar.split(' ')
    # Look up the rhyme list once per previous word instead of once per pair.
    exact_rhymes = {
        prev_word: set(pronouncing.rhymes(prev_word))
        for prev_word in prev_words
    }

    for next_word in next_semantic_bar.split(' '):
        for prev_word in prev_words:
            if next_word in exact_rhymes[prev_word]:
                return True
            try:
                if metaphone.distance(next_word, prev_word, metric='levenshtein') == 1 or \
                        fuzzy_soundex.distance(next_word, prev_word, metric='levenshtein') == 1:
                    return True
            except Exception:
                # Best effort: if the encoders raise for this pair, treat it
                # as "no phonetic match". Unlike a bare `except`, this still
                # lets KeyboardInterrupt stop a long-running search.
                continue
    return False


# Seed bar: the first generated bar must relate to (and rhyme with) this text.
next_semantic_bar = 'Hi my name is eminem'
bars = []
ctr = 0  # total number of generation attempts across all bars

for i in range(NUM_BARS):

    prev_semantic_bar = next_semantic_bar
    next_bar = None
    corr = 0
    rhyme = False

    # Keep sampling until a bar is produced that both rhymes with the previous
    # bar and is semantically close enough to it.
    # NOTE: there is no attempt limit, so this can run for a long time.
    while next_bar is None or not rhyme or corr < MIN_SIMILARITY:

        rhyme = False

        next_bar = saved_model.make_sentence(tries=100)
        ctr = ctr + 1
        if next_bar is None:
            continue

        next_semantic_bar = _strip_stop_words(next_bar)
        corr = similarity(prev_semantic_bar, next_semantic_bar)
        if corr < MIN_SIMILARITY:
            # Already rejected on similarity; skip the costly rhyme scan.
            continue

        rhyme = _bars_rhyme(next_semantic_bar, prev_semantic_bar)

    bars.append(next_bar)
    print(next_bar)

It seems that there is not an egomaniac that's not his motto
Now whether you're black, white or a albino yeah
A beacon of hope, put a B-I-R-D in the air that's making me high
Awful, every time I think of all of the whoopdy whoop,


In [14]:
# Word pairs that are expected to rhyme; used to compare the phonetic encoders.
rhyme_test = [
    ['too', 'you'],
    ['heavy', 'spaghetti'],
    ['summit', 'was it'],
    ['attack', 'back'],
    ['chance', 'fans'],
    ['gaping', 'taking'],
    ['pout', 'doubt'],
    ['shout', 'out'],
    ['squelched', 'belched'],
    ['smooth', 'soothe'],
    ['agile', 'fragile'],
    ['therein', 'wherein'],
    ['barrage', 'garage'],
    ['extinguish', 'distinguish'],
    ['crucial', 'fiducial'],
    ['diminished', 'finished'],
    ['brilliant', 'resilient'],
    ['behaviour', 'saviour'],
    ['annihilated', 'violated'],
]

soundex = phono.Soundex()
metaphone = phono.Metaphone()
refined_soundex = phono.RefinedSoundex()
fuzzy_soundex = phono.FuzzySoundex()
lein = phono.Lein()
matching_rating = phono.MatchingRatingApproach()

# Display name -> encoder instance, in the order the totals are reported.
encoders = {
    'Soundex': soundex,
    'Metaphone': metaphone,
    'Refined Soundex': refined_soundex,
    'Fuzzy Soundex': fuzzy_soundex,
    'Lein': lein,
    'Matching Rating Approach': matching_rating,
}

# For each encoder, sum the Levenshtein distance between the encodings of
# every pair in `rhyme_test`.
algos = {label: 0 for label in encoders}

for pair in rhyme_test:
    for label, encoder in encoders.items():
        algos[label] = algos[label] + encoder.distance(pair[0], pair[1], metric='levenshtein')

algos

{'Soundex': 38,
 'Metaphone': 35,
 'Refined Soundex': 57,
 'Fuzzy Soundex': 32,
 'Lein': 40,
 'Matching Rating Approach': 39}

In [15]:
# Hyphenation dictionary for US English.
dic = pyphen.Pyphen(lang='en_US')

# Hyphenate every word of every generated bar. `bars` must be the list of
# generated bar strings here (each bar is split on spaces), not per-bar
# token lists.
phonetic_bars = [
    ' '.join(dic.inserted(word) for word in bar.split(' '))
    for bar in bars
]

print(phonetic_bars)

# Save the hyphenated bars as a JSON array.
with open('phonetic_bars.json', 'w') as f:
    json.dump(phonetic_bars, f)

["I'll nev-er get it back", "I bet-ter do some-thing quick if I'-ma be ac-cused, might as well come out now"]
