In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from imblearn.over_sampling import SMOTE
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score, cross_val_predict,train_test_split, StratifiedKFold 
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dropout, Dense, Concatenate, concatenate, Embedding
from tensorflow.keras.layers import MaxPooling1D, Conv1D, GlobalMaxPooling1D, Flatten
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.metrics import Precision, Recall
from tensorflow.keras.regularizers import l2
from tensorflow.keras.initializers import he_normal, glorot_normal
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Nadam, Adagrad, Adamax
import regex as re
import nltk
from nltk import ne_chunk, pos_tag, word_tokenize
#nltk.download("punkt")
#nltk.download('averaged_perceptron_tagger')
#nltk.download('maxent_ne_chunker')
#nltk.download('words')

np.random.seed(42)
tf.random.set_seed(42)

2024-06-20 14:25:27.118184: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# nlp = spacy.load("en_core_web_lg")

DATA_PATH = '/home/paulo/anaconda3/envs/studies/TMCI_Project-master/lyrics.csv' 
df_full = pd.read_csv(DATA_PATH)
df_full = df_full.dropna()
df_full = df_full.drop(df_full.loc[(df_full['year'] < 1900) | (df_full['year'] > 2024)].index)
df_full = df_full.drop(df_full[(df_full['genre'] == 'Not Available') | (df_full['genre'] == 'Other')].index)

In [3]:
df = df_full[['lyrics', 'genre']]

def clean_lyrics(text):
    text = re.sub(r'[^A-Za-z ]', ' ', text) # Remover caracteres especiais
    
    words = text.split()  # Remover letras com menos de 3 palavras
    if len(words) < 3:
        return ""
    text = ' '.join(words)
    
    return text.strip()  
 
    
df_full['clean_lyrics'] = df_full['lyrics'].apply(clean_lyrics)

df_clean = df_full[df_full['clean_lyrics'] != ""] # Excluir linhas com menos de 3 palavras

In [4]:
genre_frequency = df_clean['genre'].value_counts()

In [5]:
def balance_classes(df, target_column, max_instances):
    balanced_df = pd.DataFrame() # Ordena o grupo pelo número de palavras e seleciona os primeiros max_instances
    for class_label, group in df.groupby(target_column):
        if len(group) > max_instances:
            group = group.assign(word_count=group[target_column].apply(lambda x: len(str(x).split())))
            group = group.sort_values(by='word_count', ascending=False).head(max_instances)
        balanced_df = pd.concat([balanced_df, group])
    return balanced_df

In [6]:
max_instances_per_class = 3390
balanced_df = balance_classes(df_clean, 'genre',max_instances_per_class)

# Preprocessing Pipeline:

In [7]:
def remove_tabs_new_line(lyrics):
    string = re.sub(r"\s+"," ",lyrics)
    string = re.sub(r"[/]"," ",string)
    return string

def decontractions(lyrics):
    decontracted = re.sub(r"won\'t", "will not", lyrics)
    decontracted = re.sub(r"can\'t", "can not", decontracted)
    decontracted = re.sub(r"n\'t", " not", decontracted)
    decontracted = re.sub(r"\'re", " are", decontracted)
    decontracted = re.sub(r"\'s", " is", decontracted)
    decontracted = re.sub(r"\'d", " would", decontracted)
    decontracted = re.sub(r"\'ll", " will", decontracted)
    decontracted = re.sub(r"\'t", " not", decontracted)
    decontracted = re.sub(r"\'ve", " have", decontracted)
    decontracted = re.sub(r"\'m", " am", decontracted)
    decontracted = re.sub(r"ain\'t", "is not", decontracted)
    decontracted = re.sub(r"\'cause", "because", decontracted)
    decontracted = re.sub(r"y\'all", "you all", decontracted)
    decontracted = re.sub(r"ma\'am", "madam", decontracted)
    decontracted = re.sub(r"o\'clock", "of the clock", decontracted)
    decontracted = re.sub(r"gonna", "going to", decontracted)
    decontracted = re.sub(r"wanna", "want to", decontracted)
    decontracted = re.sub(r"gotta", "got to", decontracted)
    decontracted = re.sub(r"let\'s", "let us", decontracted)
    decontracted = re.sub(r"how\'d", "how did", decontracted)
    decontracted = re.sub(r"how\'ll", "how will", decontracted)
    decontracted = re.sub(r"how\'s", "how is", decontracted)
    decontracted = re.sub(r"what\'d", "what did", decontracted)
    decontracted = re.sub(r"what\'ll", "what will", decontracted)
    decontracted = re.sub(r"what\'re", "what are", decontracted)
    decontracted = re.sub(r"what\'s", "what is", decontracted)
    decontracted = re.sub(r"what\'ve", "what have", decontracted)
    decontracted = re.sub(r"where\'d", "where did", decontracted)
    decontracted = re.sub(r"where\'ll", "where will", decontracted)
    decontracted = re.sub(r"where\'re", "where are", decontracted)
    decontracted = re.sub(r"where\'s", "where is", decontracted)
    decontracted = re.sub(r"where\'ve", "where have", decontracted)
    decontracted = re.sub(r"who\'d", "who did", decontracted)
    decontracted = re.sub(r"who\'ll", "who will", decontracted)
    decontracted = re.sub(r"who\'re", "who are", decontracted)
    decontracted = re.sub(r"who\'s", "who is", decontracted)
    decontracted = re.sub(r"who\'ve", "who have", decontracted)
    decontracted = re.sub(r"why\'d", "why did", decontracted)
    decontracted = re.sub(r"why\'ll", "why will", decontracted)
    decontracted = re.sub(r"why\'re", "why are", decontracted)
    decontracted = re.sub(r"why\'s", "why is", decontracted)
    decontracted = re.sub(r"why\'ve", "why have", decontracted)
    decontracted = re.sub(r"that\'d", "that would", decontracted)
    decontracted = re.sub(r"that\'ll", "that will", decontracted)
    decontracted = re.sub(r"that\'re", "that are", decontracted)
    decontracted = re.sub(r"that\'s", "that is", decontracted)
    decontracted = re.sub(r"that\'ve", "that have", decontracted)
    return decontracted

def lower_case(lyrics):
    return lyrics.lower() 

def is_english(lyrics):
    try:
        detect(lyrics) == 'en'
        return lyrics
    except:
        return ''
    
def remove_stopwords(tokens):  
    tokens_without_stopwords = [word for word in tokens if word not in STOP_WORDS]
    return tokens_without_stopwords

def remove_short_words(tokens, N):
    filtered_lyrics = [word for word in tokens if len(word) > N]  # Filtra palavras com tamanho >= N
    return filtered_lyrics

def remove_repeated_words(tokens):
    unique_words = []
    seen_words = set()
    for word in tokens:
        if word not in seen_words:
            unique_words.append(word)
            seen_words.add(word)
    return unique_words

def final_processing(lyrics):  
    string = re.sub(r'[^a-z\s]', '', lyrics)
    string = re.sub(r'\bverse\b', '', string)
    string = re.sub(r'\bchorus\b', '', string)
    string = re.sub(r'\s+', ' ', string).strip()
    return string

In [8]:
def preprocess_pipeline(lyrics):
    lyrics = remove_tabs_new_line(lyrics)
    lyrics = lower_case(lyrics)
    #lyrics = is_english(lyrics)
    lyrics = decontractions(lyrics) # Decontractions like: I've -> I have
    tokens = word_tokenize(lyrics) 
    tokens = remove_stopwords(tokens)
    tokens = remove_short_words(tokens, N=2)
    #tokens = remove_repeated_words(tokens)
    lyrics = ' '.join(tokens)
    lyrics = final_processing(lyrics)
    
    return lyrics

In [9]:
balanced_df['processed_lyrics'] = balanced_df['lyrics'].apply(preprocess_pipeline)

In [10]:
def words_frequency_treshhold(word_count, N):
    N_most_common_words = []
    for word, count in word_count.items():
        if count > N:
            N_most_common_words.append(word)
    return N_most_common_words

def remove_words(text, words_to_remove):
    tokens = text.split()
    filtered_tokens = [word for word in tokens if word not in words_to_remove]
    return ' '.join(filtered_tokens)

In [11]:
max_frequency = 1900
embeddings_dim = 300

In [12]:
all_lyrics = ' '.join(balanced_df['processed_lyrics'])
words = all_lyrics.split()

word_count = Counter(words)

most_common_words = words_frequency_treshhold(word_count, max_frequency)


balanced_df['processed_lyrics'] = balanced_df['processed_lyrics'].apply(lambda lyrics: remove_words(lyrics, most_common_words))

In [13]:
embeddings_index = dict()

'''f = open('./glove.6B/glove.6B.'+ str(embeddings_dim)  +'d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype=np.float16)
    embeddings_index[word] = coefs
f.close()'''

with open('unbalanced_vectors'+str(max_frequency)+'.txt', 'r') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype=np.float32)
        embeddings_index[word] = coefs

In [14]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(balanced_df['processed_lyrics'])
vocab_size = len(tokenizer.word_index) + 1


In [15]:
embedding_matrix = np.zeros((vocab_size, embeddings_dim))

for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [16]:
encoded_docs = tokenizer.texts_to_sequences(balanced_df['processed_lyrics'])

In [17]:
balanced_df['encoded_docs'] = encoded_docs
balanced_df['encoded_docs'].shape

(32316,)

In [18]:
df_filtred = balanced_df[balanced_df['encoded_docs'].apply(len) >= 14]
df_filtred['genre'].value_counts()

genre
Hip-Hop       3312
Pop           3190
Metal         3162
Country       3093
Rock          3060
R&B           3036
Jazz          3012
Electronic    2897
Indie         2839
Folk          1999
Name: count, dtype: int64

In [19]:
#treshhold = min(df_filtred['genre'].value_counts().tolist())
#true_balanced_df = balance_classes(df_filtred, 'genre', treshhold)

In [20]:
#true_balanced_df.info()

In [21]:
max_length = 170
X = pad_sequences(df_filtred['encoded_docs'], maxlen=max_length, padding='post')

In [22]:
X.shape

(29600, 170)

In [24]:
max_length_for_padding=[] 
for i in df_filtred['encoded_docs']:  
  max_length_for_padding.append(len(i))  
print(sorted(max_length_for_padding))

[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,

In [25]:
one_hot = pd.get_dummies(df_filtred['genre'])
y = one_hot.to_numpy()

In [26]:
def create_model():
    input_layer =  Input(shape=(max_length),dtype='int32')
    e = Embedding(vocab_size, 300, weights=[embedding_matrix], input_length=max_length, trainable=True)(input_layer)
    
    x1=Conv1D(3,7,kernel_initializer='glorot_uniform',kernel_regularizer=l2(),activation='relu')(e)
    x2=Conv1D(3,8,kernel_initializer='glorot_uniform',kernel_regularizer=l2(),activation='relu')(e)
    x3=Conv1D(3,6,kernel_initializer='glorot_uniform',kernel_regularizer=l2(),activation='relu')(e)

    concatted1= concatenate([x1,x2,x3],axis=1)
    
    max_pool1=MaxPooling1D(3)(concatted1)

    y1=Conv1D(3,9,kernel_initializer='glorot_uniform',activation='relu')(max_pool1)
    y2=Conv1D(3,4,kernel_initializer='glorot_uniform',activation='relu')(max_pool1)
    y3=Conv1D(3,5,kernel_initializer='glorot_uniform',activation='relu')(max_pool1)

    concatted2 =concatenate([y1,y2,y3],axis=1)
    max_pool2=MaxPooling1D(3)(concatted2)
    
    drop_out=Dropout(0.5)(max_pool1)
    max_pool3=MaxPooling1D(3)(drop_out)

    conv_layer1=Conv1D(3,12, activation='relu')(max_pool3)

    flatten=Flatten()(conv_layer1)
    
    drop_out2=Dropout(0.5)(flatten)
    dense_layer1=Dense(16, activation='relu')(drop_out2)
    output_layer=Dense(10, activation='softmax')(dense_layer1)

    model = Model(inputs=input_layer,outputs=output_layer)
    model.compile(optimizer='adam',loss='categorical_crossentropy', metrics=['accuracy'])

    return model

In [None]:
num_folds = 5
kfold = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)
f1_per_fold = []
loss_per_fold = []

for fold_idx, (train_idx, val_idx) in enumerate(kfold.split(X, np.argmax(y, axis=1))):
    
    print(f'Fold {fold_idx + 1}/{num_folds}')
    
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]
    
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
    
    model = create_model()
    early_stopping = EarlyStopping(patience=4, restore_best_weights=True)
    history = model.fit(X_resampled,y_resampled, batch_size=32, epochs=30, validation_data=(X_val, y_val), callbacks=[early_stopping])

    y_pred = model.predict(X_val)  # Validation Test
    y_pred_classes = np.argmax(y_pred, axis=1)
    
    f1 = f1_score(np.argmax(y_val, axis=1), y_pred_classes, average='weighted') # Metrics
    f1_per_fold.append(f1)
    scores = model.evaluate(X_val, y_val) 
    loss_per_fold.append(scores[0])
    
    print(f'F1 Score do fold {fold_idx + 1}: {f1}')

print(f'\nF1 Score médio nos {num_folds} folds: {np.mean(f1_per_fold)} (+/- {np.std(f1_per_fold)})')

Fold 1/5


2024-06-20 14:32:09.155194: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30

In [None]:
loss = history.history['loss']
val_loss = history.history['val_loss']
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
epochs_range = range(1, len(history.epoch) + 1)

plt.figure(figsize=(15,5))

plt.subplot(1, 2, 1)
plt.plot(epochs_range, acc, label='Train Set')
plt.plot(epochs_range, val_acc, label='Val Set')
plt.legend(loc="best")
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.title('Model Accuracy')

plt.subplot(1, 2, 2)
plt.plot(epochs_range, loss, label='Train Set')
plt.plot(epochs_range, val_loss, label='Val Set')
plt.legend(loc="best")
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Model Loss')

plt.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Criação da matriz de confusão
y_val_classes = np.argmax(y_val.astype(int), axis=1)
conf_matrix = confusion_matrix(y_val_classes, y_pred_classes)

# Plotagem da matriz de confusão com Seaborn
plt.figure(figsize=(10, 8))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=np.unique(balanced_df['genre']), yticklabels=np.unique(balanced_df['genre']))
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()