In [1]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import json
import glob
import tensorflow as tf
import time

#Add running from cmd.

SEQ_LEN = 40
BATCH_SIZE = 128


In [2]:
def get_urls(query, pages=1, get_top_only=True):
    urls = []
    for page in range(1, pages+1):
        url = f'https://www.ultimate-guitar.com/search.php?page={page}&title={query.replace(" ", "%20")}&type=200'
        r = requests.get(url)
        if r.status_code == 404: return urls #finished iterating over all pages.
        soup = BeautifulSoup(r.text)
        j = soup.find("div", {"class": "js-store"})['data-content']
        tabs = json.loads(j)['store']['page']['data']['results']

        song_id = 0
        for tab in tabs:
            if 'marketing_type' in tab: continue #ignore paid tabs.
            if not get_top_only: urls.append(tab['tab_url']); continue
            if song_id == 0: song_id, rating, url = tab['song_id'], tab['rating'], tab['tab_url']
                
            if song_id != tab['song_id']:
                if rating > 3: #don't append bad tabs.
                    urls.append(url)
                    print(f'Best: {rating} - {url}')
                song_id = tab['song_id']
                rating = tab['rating']
                url = tab['tab_url']

            if tab['rating'] > rating:
                rating = tab['rating']
                url = tab['tab_url']
                
    return urls

def get_urls_top(): #gets top 100 tabs.
    urls = []
    url = 'https://www.ultimate-guitar.com/top/tabs?order=hitsdailygroup_desc&type=tab'
    r = requests.get(url)
    soup = BeautifulSoup(r.text)
    j = soup.find("div", {"class": "js-store"})['data-content']
    tabs = json.loads(j)['store']['page']['data']['tabs']#['results']
    print(tabs)

    for tab in tabs:
        if 'marketing_type' in tab: continue #paid tabs.
        urls.append(tab['tab_url'])
    return urls

queries = ['pink floyd', 'beatles', 'jimi hendrix' , 'arctic monkeys', 'radiohead', 'fall out boy', 'black sabbath']
#urls = get_urls('pink floyd', 10)
#urls = get_urls_top(2)

urls = []
#urls

In [3]:

def get_tab(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    j = soup.body.find_all("div")[2]['data-content']
    text = json.loads(j)['store']['page']['data']["tab_view"]['wiki_tab']['content']
    return text

for url in urls:
    time.sleep(2) #spamming requests blocks you.
    text = get_tab(url)
    name = url.split('/')[-1]
    with open(f"tabs/{name}", 'w') as f:
        try:
            f.write(text)
        except: continue
    print(f'{name} downloaded.')

In [4]:
import struct
import os


tab_dict = {'-': 1, '1': 2, '2': 3, '0': 4, '7': 5, '5': 6, '4': 7, '3': 8, '9': 9, 'x': 10, '6': 11, 'b': 12, '(': 13, ')': 14, '8': 15, '/': 16, '~': 17, 'p': 18, 'h': 19, '=': 20, '\\': 21, 's': 22, "'": 23, 'r': 24}
seq_dict = {0: 'O', 1: '-', 2: '1', 3: '2', 4: '0', 5: '7', 6: '5', 7: '4', 8: '3', 9: '9', 10: 'x', 11: '6', 12: 'b', 13: '(', 14: ')', 15: '8', 16: '/', 17: '~', 18: 'p', 19: 'h', 20: '=', 21: '\\', 22: 's', 23: "'", 24: 'r'}
CHAR_NUM = len(tab_dict) + 1

def one_hot(arr):
    encoded = np.zeros((len(arr), len(tab_dict)+1), dtype='bool')
    for i, n in enumerate(arr):
        encoded[i][n] = 1
    return encoded


class TabGenerator(tf.keras.utils.Sequence):
    def __init__(self, data_path='data', batch_size=BATCH_SIZE, shuffle=True):
        self.timesteps = os.path.getsize('data') // 6 #number of bytes in dataset/6 = timesteps.
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.indices = np.arange(self.timesteps - SEQ_LEN)
        if self.shuffle: np.random.shuffle(self.indices)
        
    def __len__(self):
        return (self.timesteps - SEQ_LEN)//self.batch_size

    def __getitem__(self, index): #get batch
        indices = self.indices[index*self.batch_size:(index+1)*self.batch_size]
        
        x = [] #x batch/seq/6*chars
        #y 6/batch/chars
        y = []; y2 = []; y3 = []; y4 = []; y5 = []; y6 = [];
        for i in indices:
            ix, iy = self.read_seq(i)
            x.append(ix)
            #y.append(iy) + return x,y doesn't work for some reason (for multi output generator/keras)
            y.append(iy[0]); y2.append(iy[1]); y3.append(iy[2]) ;y4.append(iy[3]); y5.append(iy[4]); y6.append(iy[5])
        return np.array(x), {'output1': np.array(y), 'output2': np.array(y2), 'output3': np.array(y3), 'output4': np.array(y4), 'output5': np.array(y5), 'output6': np.array(y6)} 

                    
    def read_seq(self, index): #reads, unpacks and one-hots.
        with open('data', 'rb') as f:
            f.seek(index)
            d = f.read(6*SEQ_LEN)
            unpacked = struct.unpack(f'{6*SEQ_LEN}B', d)
            seq = one_hot(unpacked) #6*SEQ_LEN/chars > SEQ_LEN/6*
            seq = seq.reshape(SEQ_LEN, 6*CHAR_NUM)
            x = seq[:-1]
            y = seq[-1].reshape(6, -1)
            return x, y


    def on_epoch_end(self):
        if self.shuffle: np.random.shuffle(self.indices)
        
#Seperates tab from other notation
def seperate_tabs(text):
    tabs = []
    count = 0
    text = text.replace('[tab]', '').replace('[/tab]', '').replace(' ', '')
    text_split = text.split('\n')
    for i, l in enumerate(text_split):
        if len(l) < 10: count = 0; continue
        if l[0] == '|' and l[1] in '-123456789(': #if string is not specified, label the correct string (assuming EADGBe)
            if count == 6: return #Tabs are for 7+ strings. Disregard the tab.
            string = 'eBGDAE'[count]
            count += 1
            l = string + l
            
        else: count = 0
        
        if l[0].lower() in 'ebgda' and l[1] == '|':
            if l[0] == 'E' and i>0 and len(text_split[i-1])>0 and text_split[i-1][0] not in 'A|': l = 'e' + l[1:] #sometimes people label high e as E. lowercase it.
            stripped_line = l.replace('|', '').replace('\r', '').replace(' ', '')
            tabs.append(stripped_line)
    return tabs


def pad_tabs(tabs, end_padding=0):
    longest = max([len(l) for l in tabs])
    tabs = [l + '-' * (longest-len(l)+end_padding) for l in tabs]
    return tabs



#Concatenates tabs by common string.
def join_tabs(tabs):
    strings = {}
    for l in tabs:
        if l[0] not in strings:
            strings[l[0]] = ''
            continue
        strings[l[0]] += l[1:]
    tab_list = list(strings.values())
    tab_list = pad_tabs(tab_list, SEQ_LEN)

    return tab_list

text_to_seq = lambda text: [tab_dict.get(s.lower()) or 0 for s in text] #0 is out of vocab.
seq_to_text = lambda seq: [seq_dict[n] for n in seq]

def tab_to_seq(tab):
    for i, s in enumerate(tab):
        tab[i] = text_to_seq(s)
    return tab


def write_tab_binary(tab_list): #custom file format: byte = tab-char index, 6bytes = timestep.
    with open('data', 'ab') as f:
        for i in range(len(tab_list[0])):
            binary = struct.pack('6B', tab_list[0][i], tab_list[1][i], tab_list[2][i], tab_list[3][i], tab_list[4][i], tab_list[5][i])
            f.write(binary)
        

tab_lists = []
for filename in glob.glob('tabs/*'):
    with open(filename, 'r') as f:
        try:
            text = f.read()
        except: continue
    tabs = seperate_tabs(text)
    if not tabs: continue
    tab_list = join_tabs(tabs)
    if len(tab_list) != 6 or len(tab_list[0]) < SEQ_LEN+1: continue #if string_num isn't 6 or tab length is shorter than 1 sequence__lem, skip tab
    tab_list = tab_to_seq(tab_list)
    write_tab_binary(tab_list)


In [18]:
from tensorflow.keras import layers


model_input = layers.Input((SEQ_LEN-1, 6*(len(tab_dict)+1)))

m = layers.LSTM(256, return_sequences=True)(model_input)
m = layers.LSTM(256)(m)
m = layers.Dense(512)(m)
m = layers.LeakyReLU(0.3)(m)
m = layers.Dense(512)(m)
m = layers.LeakyReLU(0.3)(m)
out = layers.Dense((len(tab_dict)+1), activation='softmax', name='output1')(m)
out2 = layers.Dense((len(tab_dict)+1), activation='softmax', name='output2')(m)
out3 = layers.Dense((len(tab_dict)+1), activation='softmax', name='output3')(m)
out4 = layers.Dense((len(tab_dict)+1), activation='softmax', name='output4')(m)
out5 = layers.Dense((len(tab_dict)+1), activation='softmax', name='output5')(m)
out6 = layers.Dense((len(tab_dict)+1), activation='softmax', name='output6')(m)


model = tf.keras.Model(inputs=model_input, outputs=[out,out2,out3,out4,out5,out6], name="tab_generator")
model.summary()

gen = TabGenerator()

class_weights = {} #set class weights:
for i in range(CHAR_NUM): class_weights[i] = 1
class_weights[1] = 0.05
#Class weights are not supported for multiple outputs. Will need to make a custom loss function.

opt = tf.keras.optimizers.Adam(learning_rate=1e-3)
model.compile(optimizer=opt, loss='categorical_crossentropy')
model.fit(gen, epochs=20)#, validation_data=(x_test,y_test))

Model: "tab_generator"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_14 (InputLayer)           [(None, 39, 150)]    0                                            
__________________________________________________________________________________________________
lstm_25 (LSTM)                  (None, 39, 256)      416768      input_14[0][0]                   
__________________________________________________________________________________________________
lstm_26 (LSTM)                  (None, 256)          525312      lstm_25[0][0]                    
__________________________________________________________________________________________________
dense_24 (Dense)                (None, 512)          131584      lstm_26[0][0]                    
______________________________________________________________________________________

Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x25023423f70>

In [25]:
model.fit(gen, epochs=25)#, validation_data=(x_test,y_test))

predictions = []
data = gen[18][0][5]
for i in range(36):
    prediction = model.predict(data.reshape(1, SEQ_LEN-1, 6*CHAR_NUM)) #prediction shape: (6, 1 ,chars) 6* [[chars]]
    text_prediction = [seq_dict[np.argmax(s)] for s in prediction]
    predictions.append(text_prediction)
    prediction = np.array(prediction)
    prediction = prediction.reshape(1, -1) #(1,6, chars) 
    data[:-1] = data[1:]; data[-1] = prediction

predictions = np.array(predictions)
for i in range(6):
    print(''.join(predictions[:, i]))

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
--05-/~315~5x145xO6)23)2O2-7OxOxO1~O
--1----5-33-7~25/56O-3O3--1x(~-3717-
--6-3-1--31--024-33043)3--0-0-53737-
-60-0----3---1-4-5---3-0--3-1-x35-4~
-40------5-5-----~--------3---~0---r
-)------/----1---~--------5---~-~-s3


In [28]:
model.save('checkpoints/model1')



INFO:tensorflow:Assets written to: model1\assets


INFO:tensorflow:Assets written to: model1\assets
