In [2]:
import pandas as pd
import numpy as np

from pprint import pprint

In [137]:
import logging
import argparse
import torch
import os

from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup, GPT2Model
from torch.utils.data import (DataLoader, RandomSampler, TensorDataset)

# Gensim for Topic Modelling
import gensim
from gensim.utils import simple_preprocess
import gensim.corpora as corpora

# Preprocessing
import re
import nltk
from nltk.corpus import stopwords
import pickle
from tqdm import tqdm

In [44]:
import pickle

In [59]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [56]:
def get_device(logger):
    """
    Get device model will be run on (GPU or CPU)
    :param logger: Logger object to note the device
    :return: device type, num_of_gpus
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))
    return device, n_gpu

In [37]:
np.random.seed(101)

In [8]:
data = pd.read_csv('sample_clean_100_15k.csv')
# Stopwords
eng_stopwords = stopwords.words('english')
def stopwords_removing(text):
     return ' '.join([word for word in text.split() if not(word in eng_stopwords)])

In [50]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1100 entries, 0 to 1099
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   artist               1100 non-null   object
 1   song_name            1100 non-null   object
 2   closest_genre        1100 non-null   object
 3   lyric                1100 non-null   object
 4   length_lyric         1100 non-null   int64 
 5   actual_lyric_length  1100 non-null   int64 
 6   lyric_processed      1100 non-null   object
dtypes: int64(2), object(5)
memory usage: 60.3+ KB


In [21]:
enc = GPT2Tokenizer.from_pretrained('distilgpt2')
enc2 = GPT2Tokenizer.from_pretrained('distilgpt2')
head = GPT2LMHeadModel.from_pretrained('distilgpt2')

In [20]:
special_tokens_dict = {
        "additional_special_tokens": [
            '[s:genre]', '[s:song_name]', '[s:lyrics]',
            '[e:genre]', '[e:song_name]', '[e:lyrics]'
        ]
    }

In [22]:
enc.add_special_tokens(special_tokens_dict)
enc.added_tokens_encoder

{'[s:genre]': 50257,
 '[s:song_name]': 50258,
 '[s:lyrics]': 50259,
 '[e:genre]': 50260,
 '[e:song_name]': 50261,
 '[e:lyrics]': 50262}

In [12]:

data['lyric_processed'] = data.lyric.map(lambda x: re.sub('[(),\/.!?]', ' ', x))
data['lyric_processed'] = data.lyric_processed.map(lambda x: x.lower())
data['lyric_processed'] = data.lyric_processed.map(lambda x: stopwords_removing(x))
data.lyric_processed.head()

0    simon says glad today surely simon say youl li...
1    brian holland lamont dozier edward holland jr ...
2    like love see i'm always thinking oh oh oh tre...
3    mother mother there's many crying brother brot...
4    looking back little nappy headed boy worry chr...
Name: lyric_processed, dtype: object

In [52]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1100 entries, 0 to 1099
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   artist               1100 non-null   object
 1   song_name            1100 non-null   object
 2   closest_genre        1100 non-null   object
 3   lyric                1100 non-null   object
 4   length_lyric         1100 non-null   int64 
 5   actual_lyric_length  1100 non-null   int64 
 6   lyric_processed      1100 non-null   object
dtypes: int64(2), object(5)
memory usage: 60.3+ KB


In [13]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(simple_preprocess(str(sentence), deacc=True))

In [14]:
data_words = list(sent_to_words(data))

In [17]:
id2word = corpora.Dictionary(data_words)

texts = data_words

corpus = [id2word.doc2bow(text) for text in texts]

In [18]:
# model LDA
num_topics = 25

lda_model = gensim.models.LdaMulticore(corpus=corpus, id2word=id2word, num_topics=num_topics)

pprint(lda_model.print_topics())

[(6,
  '0.167*"artist" + 0.167*"song_name" + 0.167*"closest_genre" + 0.167*"lyric" '
  '+ 0.167*"length_lyric" + 0.167*"lyric_processed"'),
 (12,
  '0.839*"closest_genre" + 0.032*"artist" + 0.032*"song_name" + 0.032*"lyric" '
  '+ 0.032*"length_lyric" + 0.032*"lyric_processed"'),
 (14,
  '0.167*"artist" + 0.167*"song_name" + 0.167*"closest_genre" + 0.167*"lyric" '
  '+ 0.167*"length_lyric" + 0.167*"lyric_processed"'),
 (20,
  '0.167*"artist" + 0.167*"song_name" + 0.167*"closest_genre" + 0.167*"lyric" '
  '+ 0.167*"length_lyric" + 0.167*"lyric_processed"'),
 (24,
  '0.167*"artist" + 0.167*"song_name" + 0.167*"closest_genre" + 0.167*"lyric" '
  '+ 0.167*"length_lyric" + 0.167*"lyric_processed"'),
 (10,
  '0.839*"song_name" + 0.032*"artist" + 0.032*"closest_genre" + 0.032*"lyric" '
  '+ 0.032*"length_lyric" + 0.032*"lyric_processed"'),
 (22,
  '0.167*"artist" + 0.167*"song_name" + 0.167*"closest_genre" + 0.167*"lyric" '
  '+ 0.167*"length_lyric" + 0.167*"lyric_processed"'),
 (2,
  '0.167*

In [43]:
id2token = id2word.id2token

In [46]:
# Create psi matrix
psi_matrix = np.zeros((num_topics, enc.vocab_size))
lda_topics = lda_model.get_topics()
for i in range(len(id2token)):
    j = enc.convert_tokens_to_ids(id2token[i])
    psi_matrix[:, j] = lda_topics[:, i]
pickle.dump(psi_matrix, open('psi_matrix.pkl', 'wb'))

In [54]:
# Create theta matrix
num_corpora = len(corpus)
theta_matrix = np.zeros((num_corpora, num_topics))
for i, c in enumerate(corpus):
    for j, p in lda_model.get_document_topics(c):
        theta_matrix[i, j] = p
pickle.dump(psi_matrix, open('theta_matrix.pkl', 'wb'))

In [134]:
topic_words = lda_model.show_topics(num_topics=num_topics, formatted=False, num_words=len(id2word))

In [55]:
# Save all topic tokens
tokens_id = [lda_model.get_topic_terms(i, topn=10) for i in range(num_topics)]
all_topic_tokens = [[(id2token[i], p) for i, p in tokens_id_topic] for tokens_id_topic in tokens_id]
pickle.dump(all_topic_tokens, open('all_topic_tokens.pkl', 'wb'))

In [19]:
# data with token
temp = data.iloc[0]
temp

artist                                                    Memphis Minnie
song_name                    He's In the Ring (Doin' the Same Old Thing)
closest_genre                                                      Blues
lyric                  If Simon says be glad today\nI surely do what ...
length_lyric                                                         162
actual_lyric_length                                                   32
lyric_processed        simon says glad today surely simon say youl li...
Name: 0, dtype: object

In [29]:
ops = enc.added_tokens_encoder

In [123]:
encoded_data = []
for i, song in data.iterrows():
    genre = [ops['[s:genre]']]+enc.encode(song.closest_genre)+[ops['[e:genre]']] 
    song_name = [ops['[s:song_name]']]+enc.encode(song.song_name)+[ops['[e:song_name]']] 
    lyrics = [ops['[s:lyrics]']]+enc.encode(song.lyric)+[ops['[e:lyrics]']] 
    encoded_data.append((genre, song_name, lyrics))
    

In [124]:
# OPTIONAL: randomly dropped
selected_sentences = []
max_input_len = enc.model_max_length
for genre, song_name, lyric in encoded_data:
    genre_token = list([1] * len(genre))
    song_token = list([2] * len(song_name))
    lyric_token = list([3] * len(lyric))

    if np.random.rand() <= 0.25:
        position_ids = list(np.arange(0, len(lyric)))
        current_input = {
            'token_ids': lyric,
            'token_type_ids':lyric_token,
            'position_ids':position_ids
        }
    else:
        tokens_subset = []
        segment_subset = []

        if np.random.rand() > 0.2:
            tokens_subset += genre
            segment_subset += genre_token

        if np.random.rand() > 0.2:
            tokens_subset += song_name
            segment_subset += song_token

        tokens_subset += lyric
        segment_subset += lyric_token
        position_ids = list(np.arange(0, len(tokens_subset)))

        current_input = {
            'token_ids': tokens_subset,
            'token_type_ids':segment_subset,
            'position_ids':position_ids
        }
    if len(current_input['token_ids']) >= max_input_len:
        continue

    # Add padding to make the input max_input_len
    len_before_padding = len(current_input["token_ids"])
    padding = max_input_len - len_before_padding

    current_input["token_ids"] += list([0] * padding)
    current_input["token_type_ids"] += list([0] * padding)
    current_input["position_ids"] += list([0] * padding)

    # 4) Language Modelling Labels -> this is input_copy with padding assigned to -1,
    #    the position shifting is done in the library code.
    lm_labels = np.copy(current_input["token_ids"])
    lm_labels[np.where(lm_labels == 0)] = -1

    # Attention Mask, 1 = unmasked, 0 = masked
    attention_mask = list([1] * len_before_padding) + list([0] * padding)

    selected_sentences.append((
        current_input["token_ids"], current_input["token_type_ids"], current_input["position_ids"], attention_mask, lm_labels
    ))

In [125]:
inputs_mat = map(list, zip(*selected_sentences))

In [126]:
torch_dataset = [torch.tensor(t, device=torch.device(device)).unsqueeze(1) for t in inputs_mat]

In [130]:
train_data = TensorDataset(*torch_dataset)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=1)

In [133]:
head.to(device)
head.train()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): Laye

In [None]:
gradient_accumulation_steps = 1
max_grad_norm = 1.

In [155]:
past = None
for step, batch in enumerate(tqdm(train_dataloader, desc="Training")):
    tok_ids, tok_type_ids, pos_ids, att_mask, lm_labels = batch
    label_condition = torch.where(lm_labels==-1, torch.tensor(0, dtype=torch.int32).to(device), torch.tensor(lm_labels))
    outputs = head(
                    input_ids=tok_ids, 
                    past_key_values=past, 
                    attention_mask=att_mask, 
                    token_type_ids=tok_type_ids,
                    position_ids=pos_ids, 
                    topic_word_matrix=topic_words,
                    labels=label_condition
                )
    loss = outputs[0]
    # predicted_scores = outputs[1]
    # past = outputs[2]

    # Log the loss to TensorBoardX
    global_step = (1 * len(train_data_loader)) + (step + 1)

    # Normalise the loss (Simulates average of a batch)
    loss = loss / args.gradient_accumulation_steps
    loss.backward(retain_graph=True)

    if (step + 1) % gradient_accumulation_steps == 0:
        torch.nn.utils.clip_grad_norm_(head.parameters(), args.max_grad_norm)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

  label_condition = torch.where(lm_labels==-1, torch.tensor(0, dtype=torch.int32).to(device), torch.tensor(lm_labels))
Training:   0%|          | 0/1100 [00:00<?, ?it/s]


TypeError: forward() got an unexpected keyword argument 'topic_word_matrix'