# Analysis of Musical and Lyrical Trends at Intelligent Interactive Systems (MIIS)

#### Oktay Ozan Güner   -  ID : OZAN_ID
#### Juan Miguel Alfonso Habana   -  ID : MIGUEL_ID

# Introduction

Since 1990, the way people interact with music has evolved significantly. This period marks a transition from the tangible, physical media of CDs and vinyl to the intangible, yet infinitely accessible world of digital music. 
* How digitalization and streaming have influenced listeners? 
* How the listening habits have changed over time?

We'll examine how our music listening habits have been affected from duration of the songs to the way sentiment of the lyrics.


In [None]:
# Importing related packages
import pandas as pd
import numpy as np

In [2]:
# Loading final version of data
data = pd.read_csv("Final_Data_20240309.csv")

In [4]:
# Flagging the lyrics whether they contain chorus part for topic modeling.
data["Flag_Chorus"] = data["Translated_Lyrics_2"].apply(lambda x: r"[Chorus" in x).astype(int)

In [5]:
# Comparing the number of songs that contain chorus part with all songs.
data["Flag_Chorus"].sum(), len(data)

(9988, 12263)

In [6]:
# Selecting the songs that contain chorus part.
df = data.loc[data["Flag_Chorus"]==1].reset_index(drop=True)

In [9]:
import re

def extract_chorus(lyrics):
    """
    Removes unrelated characters from the texts.

    Parameters:
    - lyrics (str): The raw text that wanted to be extracted chorus part.

    Returns:
    string: representing chorus part of the raw text.
    """

    
    # Regex pattern explanation:
    # \[Chorus - Matches the literal string "[Chorus"
    # [\s\S]*? - Non-greedy match for any character including new lines, allowing for any content (e.g., " 2", " : Kanye") within the brackets
    # \] - Matches the closing bracket ']'
    # (.*?) - Non-greedy match for any characters, capturing them until the next pattern
    # (?=\n\[|\Z) - Positive lookahead for either the start of a new section (newline + '[') or end of string (\Z), without consuming characters
    pattern = r'\[Chorus[\s\S]*?\](.*?)(?=\n\[)'
    
    # Find all non-overlapping matches of the pattern
    matches = re.findall(pattern, lyrics, flags=re.DOTALL)
    
    # Join matches with a newline, trimming leading/trailing whitespace from each match
    chorus_parts = "\n".join(match.strip() for match in matches)
    return chorus_parts

# Example usage:
lyrics = """
[Chorus]
Big girls don't cry (They don't cry)
Big girls don't cry (Who said they don't cry?)
My girl said goodbye (My-oh-my)
My girl didn't cry (I wonder why)

[Verse 1]
(Silly boy) Told my girl we had to break up
(Silly boy) Thought that she would call my bluff
(Silly boy) But she said to my surprise
"Big girls don't cry"

[Chorus 2]
Big girls don't cry (They don't cry)
Big girls don't cry (Who said they don't cry?)
My girl said goodbye (My-oh-my)
My girl didn't cry (I wonder why)

[Chorus : Kanye]
Big girls don't cry (They don't cry)
Big girls don't cry (Who said they don't cry?)
My girl said goodbye (My-oh-my)
My girl didn't cry (I wonder why)
"""

chorus = extract_chorus(lyrics)
print(chorus)


Big girls don't cry (They don't cry)
Big girls don't cry (Who said they don't cry?)
My girl said goodbye (My-oh-my)
My girl didn't cry (I wonder why)
Big girls don't cry (They don't cry)
Big girls don't cry (Who said they don't cry?)
My girl said goodbye (My-oh-my)
My girl didn't cry (I wonder why)


In [10]:
# Getting chorus parts of the songs.
df["Chorus"] = df["Translated_Lyrics_2"].apply(lambda x: "\n".join(list(set(extract_chorus(x).split("\n")))))

In [11]:
# Getting chorus length.
df["Chorus_Length"] = df["Chorus"].apply(lambda x: len(x.split()))

In [14]:
# Checking chorus length distribution
df["Chorus_Length"].describe([0.01,0.05,0.1])

count    9988.000000
mean       83.831398
std        60.903900
min         0.000000
1%          4.870000
5%         18.000000
10%        27.000000
50%        68.000000
max       497.000000
Name: Chorus_Length, dtype: float64

In [15]:
# Removing choruses that the chorus length equal to zero.
df2 = df[df["Chorus_Length"]!=0].reset_index(drop=True)

In [16]:
def clean_lyrics(raw_lyrics:str):
  """
  Removes unrelated characters from the texts.

  Parameters:
  - raw_lyrics (str): The raw text that wanted to be removed unrelated characters.

  Returns:
  string: representing clean version of the raw text.
  """

    # Regex pattern to match section markers
    pattern = r'\[.+]'

    # Replace section markers with an empty string
    cleaned_lyrics = re.sub(pattern, '', raw_lyrics)

    # Remove leading and trailing whitespace and newlines
    cleaned_lyrics = cleaned_lyrics.strip()

    return cleaned_lyrics


def text_to_embeds(model, text:str, model_sequence_length:int, overlap:int):
  """
  Converts texts into chunks regarding to model sequence length. Then gets embedding representations of them by using average pooling.

  Parameters:
  - text (str): The texts that wanted to be represented as embedding.
  - model_sequence_length (int): The maximum sequence length of the model used.
  - overlap (int): The number of words at the end of one text chunk that will be included at the beginning of the next chunk when splitting the input text into segments for embedding. This parameter ensures continuity and contextual connection between consecutive chunks, especially important for maintaining semantic integrity in natural language processing tasks.
  
  Returns:
  numpy array: representing texts as embeddings.
  """  
  
  splitted_text = text.split()
  chunks = []
  if len(splitted_text) > model_sequence_length:
    for i in range(0, len(splitted_text), model_sequence_length-overlap):
      chunk = splitted_text[i:i+model_sequence_length]
      chunks.append(chunk)

      chunk_texts = [" ".join(c) for c in chunks]

      embeddings = [model.encode(chunk_text, convert_to_tensor=True) for chunk_text in chunk_texts]
      embeddings = np.mean(np.stack(embeddings), axis=0)

  else:
    embeddings = model.encode(text, convert_to_tensor=True).numpy()

  return embeddings

def outlier_cutoff(dataframe:pd.DataFrame, col:str, low_quantile:float, high_quantile:float):
  """
  Filters the data based on the quartiles of given column.

  Parameters:
  - dataframe (pd.DataFrame): The dataframe that includes column which wanted to be filtered.
  - col (str): The column that wanted to be filtered based on the quartiles.
  - low_quantile (float): Low limit of the quartile of given column.
  - high_quantile (float): Up limit of the quartile of given column.

  Returns:
  pandas dataframe: representing remaining values between the specified quartiles.
  """

  return dataframe[(dataframe[col]<dataframe[col].quantile(high_quantile)) & (dataframe[col]>dataframe[col].quantile(low_quantile))]

In [17]:
# Data Cleaning
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'[a-zA-Z]+')


df2["Tokenized_Chorus"] = df2["Chorus"].apply(lambda text: clean_lyrics(text) if pd.notna(text) else text)
df2["Tokenized_Chorus"] = df2["Tokenized_Chorus"].apply(lambda x: x.replace("*vocalizations*", "").strip() if pd.notna(x) else x)
df2["Tokenized_Chorus"] = df2["Tokenized_Chorus"].str.rstrip("Embed")
df2["Tokenized_Chorus"] = df2["Tokenized_Chorus"].apply(lambda text: re.sub(r'Contributors|Lyrics|Contributor', '', text) if pd.notna(text) else text)
df2["Tokenized_Chorus"] = df2["Tokenized_Chorus"].apply(lambda text: " ".join(tokenizer.tokenize(text.lower())) if pd.notna(text) else text)    # Regex Tokenization

# CHORUS EMBEDDINGS

In [21]:
# Loading pretrained sentence transformer embedding model. 'all-mpnet-base-v2' is used for Sentiment Analysis, while 'all-MiniLM-L6-v2' is used for Topic Modeling.
from sentence_transformers import SentenceTransformer

embed_model = SentenceTransformer("all-MiniLM-L6-v2")
max_seq_length = embed_model.max_seq_length
max_seq_length      # maximum sequence length of pretrained model.

256

In [23]:
# Getting embeddings of all choruses.
from tqdm import tqdm
import time

start_time = time.time()
overlap=5
model_embeds_dimension = embed_model.get_sentence_embedding_dimension()
embeds_arr = np.empty(shape=(len(df2), model_embeds_dimension))

for i in tqdm(range(len(df2["Tokenized_Chorus"])), desc="Embedding Texts"):
  text_embedding = text_to_embeds(embed_model, df2["Tokenized_Chorus"][i], max_seq_length, overlap)

  embeds_arr[i] = text_embedding

end_time = time.time()

print(f"Process Time : {(end_time-start_time)/60} minutes")

Embedding Texts: 100%|██████████| 9903/9903 [08:26<00:00, 19.57it/s]

Process Time : 8.43565754890442 minutes





In [27]:
# Saving chorus embeddings and the final version of chorus data.
np.save('Embeds_Chorus_20240312.npy', embeds_arr)
df2.to_csv("Chorus_Data_20240312.csv")