In [13]:
%%capture
pip install pretty_midi

In [14]:
import os
import pickle
import pretty_midi
from tqdm import tqdm
from torchtext.data import get_tokenizer
import pandas as pd
import numpy as np
import sys
import torch
import sklearn as sk
import zipfile
import csv
import string
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
# Root folder of the data; every other location below is derived from it.
DATA_PATH = 'ass3_file'

# CSV files holding the lyrics of the train and test songs.
TRAIN_PATH = os.path.join(DATA_PATH, 'lyrics_train_set.csv')
TEST_PATH = os.path.join(DATA_PATH, 'lyrics_test_set.csv')

# Folder with the MIDI files. The trailing slash is kept so that plain
# `MIDI_PATH + name` string concatenation also yields a valid path.
MIDI_PATH = os.path.join(DATA_PATH, 'midi_files/')

# Location used for pickled artefacts.
PICK_PATH = os.path.join(DATA_PATH, 'pickle_file')

In [6]:
# Rename every file in the MIDI folder to its lower-case name.
# Names that are already lower-case are skipped, so re-running this cell is a
# cheap no-op, and os.path.join is used so the result does not depend on
# MIDI_PATH ending with a path separator.
for midi_name in os.listdir(MIDI_PATH):
    lowered_name = midi_name.lower()
    if lowered_name != midi_name:
        os.rename(os.path.join(MIDI_PATH, midi_name),
                  os.path.join(MIDI_PATH, lowered_name))

In [7]:
# Both CSV files are read without a header row, so pandas labels the columns
# 0, 1, 2, ...; the first three get readable names.
train_df = pd.read_csv(TRAIN_PATH, header=None)
train_df = train_df.rename(columns={0: 'artist', 1: 'song', 2: 'lyrics'})
# Columns 3-6 of the training file are discarded.
train_df = train_df.drop(columns=[3, 4, 5, 6])

test_df = pd.read_csv(TEST_PATH, header=None)
test_df = test_df.rename(columns={0: 'artist', 1: 'song', 2: 'lyrics'})

# Preprocessing

In [2]:
def clean_lyrics(lyrics, word2vec):
    """
    Remove or replace a fixed set of symbols in a lyrics string.

    :param lyrics: the raw lyrics text.
    :param word2vec: not used by this function.
    :return: the lyrics after all substitutions were applied.
    """
    # The substitutions run one after the other in exactly this order; the
    # order matters (e.g. '--' becomes one space, a single '-' becomes one too).
    substitutions = (
        ('&', ''),
        ('  ', ' '),
        ('\'', ''),
        ('--', ' '),
        ('[', ''),
        (']', ''),
        ('-', ' '),
    )

    cleaned = lyrics
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return cleaned

In [3]:
def all_preprocessing(df, word2vec):
    """
    Clean and tokenize the lyrics and attach the matching MIDI object to every song.

    The columns 'lyrics', 'tokens', 'file_name' and 'pretty_format' are written
    on ``df`` itself; afterwards the rows whose MIDI file could not be loaded
    are filtered out of the returned frame.

    :param df: data frame with the columns 'artist', 'song' and 'lyrics'.
    :param word2vec: container of known words; tokens that are not ``in`` it are dropped.
    :return: an independent copy of ``df`` that holds only the songs with a loaded MIDI file.
    """
    tokenizer = get_tokenizer("basic_english")

    df['lyrics'] = df['lyrics'].apply(lambda lyrics: clean_lyrics(lyrics, word2vec))
    # Tokenize, then keep only the tokens known to word2vec.
    df['tokens'] = df['lyrics'].apply(
        lambda text: [token for token in tokenizer(text) if token in word2vec])

    # File names follow "<artist>_-_<song>.mid" with spaces replaced by underscores.
    # The columns are addressed by label: positional access such as row[0] on a
    # label-indexed row is deprecated in pandas and depends on the column order.
    # NOTE(review): the names keep the letter case found in the CSV - confirm it
    # matches the names of the files on disk.
    df['file_name'] = [
        artist.replace(" ", "_") + "_-_" + song.replace(" ", "_") + ".mid"
        for artist, song in zip(df['artist'], df['song'])
    ]
    df['pretty_format'] = df['file_name'].apply(convert_pretty_format)

    # .copy() detaches the result from ``df`` so callers can add columns to it
    # without triggering pandas' SettingWithCopyWarning.
    return df[df['pretty_format'].notna()].copy()

# Extract melody features

In [11]:
def convert_pretty_format(file_name):
    """
    Load one MIDI file from MIDI_PATH as a ``pretty_midi.PrettyMIDI`` object.

    :param file_name: name of the file inside MIDI_PATH.
    :return: the loaded PrettyMIDI object, or None when loading fails for any reason.
    """
    try:
        file_path = os.path.join(MIDI_PATH, file_name)
        return pretty_midi.PrettyMIDI(file_path)
    except Exception:
        # Best effort: a song without a usable MIDI file is reported as None.
        # Catching Exception instead of using a bare ``except`` lets
        # KeyboardInterrupt and SystemExit still stop a long-running load.
        return None

def extract_mellidies_feature_method1(df, feature_version=1):
    """
    Build per-word melody features for every song in ``df``.

    The duration of each song is divided evenly between its tokens and one
    feature vector is extracted per token.

    :param df: data frame with the columns 'pretty_format' (PrettyMIDI object)
               and 'tokens' (list of tokens).
    :param feature_version: 1 uses get_instructor_beats, any other value uses
                            extract_piano_feature.
    :return: list with one numpy array per song (one row per token, in token
             order); a song without tokens yields an empty array.
    """
    all_songs_feature = []

    # total= lets tqdm show a real progress bar; iterrows() has no length.
    for _, row in tqdm(df.iterrows(), total=len(df)):
        midi_file = row['pretty_format']
        num_words = len(row['tokens'])
        midi_file.remove_invalid_notes()

        if num_words == 0:
            # Nothing to align the melody with; this also avoids the division
            # by zero below.
            all_songs_feature.append(np.array([]))
            continue

        # Mean time per word in this song.
        mean_time_word = midi_file.get_end_time() / num_words

        if feature_version == 1:
            song_feature = [
                get_instructor_beats(midi_file, index_word, mean_time_word)
                for index_word in range(num_words)
            ]
        else:
            song_feature = [
                extract_piano_feature(midi_file, index_word, mean_time_word, num_words)
                for index_word in range(num_words)
            ]

        all_songs_feature.append(np.array(song_feature))

    return all_songs_feature

def get_instructor_beats(midi_file, idx_word, mean_time):
    """
    Count the beats and the notes that fall inside the time window of one word.

    The window starts at ``idx_word * mean_time`` and lasts ``mean_time``.

    :param midi_file: object exposing ``get_beats()`` and ``instruments``
                      (each instrument exposing ``notes`` with ``start`` / ``end``).
    :param idx_word: index of the word, used to locate its time window.
    :param mean_time: the mean time per word in the song.
    :return: array [number of beats, number of notes] for the window.
    """
    window_start = idx_word * mean_time
    window_end = window_start + mean_time

    # A beat counts when it lies inside the window, both ends included.
    beat_count = sum(
        1 for beat in midi_file.get_beats()
        if window_start <= beat <= window_end
    )

    # A note counts only when it both starts and ends inside the window;
    # the notes of all instruments are pooled together.
    note_count = sum(
        1
        for instrument in midi_file.instruments
        for note in instrument.notes
        if window_start <= note.start and note.end <= window_end
    )

    return np.array([beat_count, note_count])

def extract_piano_feature(midi_file, idx_word, mean_time, num_of_words):
    """
    Sum the piano-roll columns that belong to one word.

    The piano roll is cut into ``num_of_words`` consecutive slices of equal
    width (columns left over by the integer division are ignored) and the
    slice of word ``idx_word`` is summed over its columns.

    :param midi_file: object exposing ``get_piano_roll()`` returning a 2-D array.
    :param idx_word: zero-based index of the word.
    :param mean_time: not used; kept so existing callers keep working.
    :param num_of_words: number of words the song is divided into.
    :return: 1-D array with one summed value per piano-roll row.
    """
    piano_roll = midi_file.get_piano_roll()
    notes_per_word = piano_roll.shape[1] // num_of_words

    # The window of a word advances by the per-word width. The previous code
    # used idx_word itself as the width (idx_word * idx_word), which gave an
    # empty slice for the first word and wrong columns for all the others.
    start_index = idx_word * notes_per_word
    end_index = start_index + notes_per_word

    return np.sum(piano_roll[:, start_index:end_index], axis=1)

In [9]:
def text2index(lst_tokens, word2index):
    """
    Translate every token into its index.

    :param lst_tokens: iterable of tokens.
    :param word2index: mapping from token to index; a missing token raises KeyError.
    :return: list of indices, in the order of the tokens.
    """
    indices = []
    for token in lst_tokens:
        indices.append(word2index[token])
    return indices

def create_sequences(encoded_lyrics_list, features_list, total_words, seq_length, word2vec, vector_size, tokens_tf, test=False):
    """
    Cut every song into consecutive, non-overlapping windows of ``seq_length`` items.

    For a window that starts at position ``s`` the input is
    ``song[s:s + seq_length]`` and the target is the same window shifted by one
    position, ``song[s + 1:s + seq_length + 1]``. A window is produced only
    while a complete shifted target exists.

    :param encoded_lyrics_list: one sequence per song; its items are used as
                                keys of ``word2vec`` and ``tokens_tf``.
    :param features_list: one per-word feature sequence per song, in the same
                          order as ``encoded_lyrics_list``.
    :param total_words: not used by this function.
    :param seq_length: number of items in every window.
    :param word2vec: mapping from an item to its embedding vector.
    :param vector_size: length of an embedding vector.
    :param tokens_tf: mapping from an item to the value collected for it in the
                      third return value.
    :param test: when True only the first window of each song is kept and no
                 ``tokens_tf`` values are collected.
    :return: (1) array of the input windows,
             (2) float32 array (windows, seq_length, vector_size) with the
                 embedding of every target item,
             (3) array with the ``tokens_tf`` value of every target item
                 (empty when ``test`` is True),
             (4) array of the feature windows that match the input windows.
    """
    input_sequences = []
    lst_features = []
    next_words = []
    next_tf = []

    for song_idx, song in enumerate(encoded_lyrics_list):
        song_features = features_list[song_idx]

        # Window starts: 0, seq_length, 2 * seq_length, ... as long as one more
        # item exists after the window to complete the shifted target.
        for start in range(0, len(song) - seq_length, seq_length):
            stop = start + seq_length
            target = song[start + 1:stop + 1]

            input_sequences.append(song[start:stop])
            lst_features.append(song_features[start:stop])
            next_words.append(target)

            if test:
                break
            next_tf.append([tokens_tf[item] for item in target])

    word2vec_next_word = convert_to_word2vec(word2vec, next_words, vector_size, seq_length)
    return (
        np.array(input_sequences),
        word2vec_next_word,
        np.array(next_tf),
        np.array(lst_features),
    )

def convert_to_one_hot_encoding(input_sequences, next_words, total_words):
    """
    Build a one-hot matrix for the given target words.

    :param input_sequences: only its length is used (number of rows).
    :param next_words: for every row, the column index that is set to 1.
    :param total_words: number of columns.
    :return: int8 array of shape (len(input_sequences), total_words).
    """
    shape = (len(input_sequences), total_words)
    encoded = np.zeros(shape, dtype=np.int8)

    for row, target in enumerate(next_words):
        encoded[row, target] = 1

    return encoded

def convert_to_word2vec(word2vec, next_word, vector_size, seq_length):
    """
    Replace every word of every sequence by its embedding vector.

    :param word2vec: mapping from a word to its vector.
    :param next_word: list of sequences of words.
    :param vector_size: length of an embedding vector.
    :param seq_length: number of positions reserved per sequence; positions a
                       shorter sequence does not fill stay zero.
    :return: float32 array of shape (len(next_word), seq_length, vector_size).
    """
    embedded = np.zeros((len(next_word), seq_length, vector_size), dtype=np.float32)

    for row, words in enumerate(next_word):
        for col, word in enumerate(words):
            embedded[row, col] = word2vec[word]

    return embedded

# Word2Vec

In [None]:
def get_word2vec(word2vec_path, pre_trained, vector_size, encoding='utf-8') -> dict:
    """
    Return a dictionary that maps a word to its embedding vector.

    A pickle cache at ``word2vec_path`` is used when it exists; otherwise the
    pre-trained embedding file is parsed and the result is cached there.

    :param word2vec_path: path of the pickle cache file
    :param pre_trained: path of the pre-trained embedding text file
    :param vector_size: the vector size for each word
    :param encoding: the encoding of the pre_trained file
    :return: dictionary that maps a word to a vector
    """
    # NOTE: the cache is a pickle file and unpickling can execute arbitrary
    # code, so word2vec_path must point to a file this project wrote itself.
    word2vec = _read_pickle_if_exists(word2vec_path)
    if word2vec is not None:
        return word2vec

    # No cache yet: parse the pre-trained vectors. The open file is handed over
    # directly so its lines are consumed one at a time instead of first copying
    # the whole (potentially very large) file into a list.
    with open(pre_trained, 'r', encoding=encoding) as f:
        word2vec = _iterate_over_glove_list(list_of_lines=f, vector_size=vector_size)

    _save_pickle(pickle_path=word2vec_path, content=word2vec)  # cache for the next run
    return word2vec

def get_word2vec_matrix(total_words, index2word, word2vec, vector_size):
    """
    Build the embedding matrix: row ``i`` holds the vector of word ``index2word[i]``.

    Words without an entry in ``word2vec`` are reported with a printed message
    and their row stays all zeros.

    :param total_words: number of rows of the matrix.
    :param index2word: dictionary that maps an index to a word
    :param word2vec: dictionary that maps a word to a vector
    :param vector_size: number of columns of the matrix.
    :return: array of shape (total_words, vector_size)
    """
    matrix = np.zeros((total_words, vector_size))

    for position, word in index2word.items():
        if word in word2vec:
            matrix[position] = word2vec[word]
        else:
            print(f'Can not find the word "{word}" in the word2vec dictionary')

    return matrix

def _iterate_over_glove_list(list_of_lines, vector_size):
    """
    Parse embedding lines of the form "<word> <v1> <v2> ..." into a dictionary.

    Blank lines are skipped, as are lines whose word is a substring of
    ``string.punctuation``.

    :param list_of_lines: iterable of text lines (a list or an open file)
    :param vector_size: the expected number of values per word
    :return: dictionary that maps a word to a float32 vector
    :raises Warning: when a line does not hold exactly ``vector_size`` values
    """
    word2vec = {}
    punctuation = string.punctuation
    for line in list_of_lines:
        # Trailing newline / spaces would otherwise end up in the last value.
        line = line.rstrip()
        if not line:
            # An empty line (e.g. at the end of the file) carries no vector and
            # must not abort the whole load.
            continue

        values = line.split(' ')
        word = values[0]
        # Substring test against the punctuation string (behaviour kept as is).
        if word in punctuation:
            continue

        vec = np.asarray(values[1:], "float32")
        if len(vec) != vector_size:
            raise Warning(f"Vector size is different than {vector_size}")
        word2vec[word] = vec
    return word2vec


def _save_pickle(pickle_path, content):
    """
    Serialize ``content`` with pickle and write it to ``pickle_path``.

    The file is opened in binary write mode, so an existing file is replaced.

    :param pickle_path: path of the pickle file to write
    :param content: the value to store
    :return: Nothing
    """
    with open(pickle_path, mode='wb') as target:
        pickle.dump(content, target)


def _read_pickle_if_exists(pickle_path):
    """
    Load the value stored in a pickle file, if that file exists.

    NOTE: unpickling can execute arbitrary code; only use this on files that
    come from a trusted source.

    :param pickle_path: path of the pickle file
    :return: the unpickled value, or None when ``pickle_path`` does not exist
    """
    if not os.path.exists(pickle_path):
        return None

    with open(pickle_path, 'rb') as source:
        return pickle.load(source)

# Full preprocessing pipeline

In [1]:
def preprocess(train_df, test_df, word2vec):
    """
    Run the whole preprocessing on both splits and pickle the results.

    Each split is cleaned and tokenized, its melody features (method 1) are
    extracted, the columns that are no longer needed are dropped, and the frame
    is written to PICK_PATH as 'train_df.pkl' / 'test_df.pkl'.

    :param train_df: raw training data frame ('artist', 'song', 'lyrics').
    :param test_df: raw test data frame ('artist', 'song', 'lyrics').
    :param word2vec: container of known words, used to filter the tokens.
    :return: Nothing
    """
    train_features = _build_feature_frame(train_df, word2vec)
    test_features = _build_feature_frame(test_df, word2vec)

    # to_pickle does not create missing folders.
    os.makedirs(PICK_PATH, exist_ok=True)
    train_features.to_pickle(os.path.join(PICK_PATH, 'train_df.pkl'))
    test_features.to_pickle(os.path.join(PICK_PATH, 'test_df.pkl'))


def _build_feature_frame(df, word2vec):
    """Preprocess one split, add its 'feature_method1' column and drop the helper columns."""
    processed = all_preprocessing(df, word2vec)
    # assign() works on a fresh copy, so adding the column never writes into a
    # slice of another frame. Only method 1 is stored; feature_version=2 of
    # extract_mellidies_feature_method1 is not used here.
    processed = processed.assign(
        feature_method1=extract_mellidies_feature_method1(processed))
    return processed.drop(
        columns=['artist', 'song', 'lyrics', 'file_name', 'pretty_format'])