### This notebook builds a CSV file containing the embeddings of our users.

In [2]:
import re
import ast
import warnings

def extract_genres(s):
    """Return the individual genre names found inside `{...}` groups of `s`.

    Each brace group is split on commas, and every piece is trimmed of
    surrounding whitespace and of surrounding single/double quotes, so both
    "{'Pop', 'Rock'}" and '{"Hip Hop",Jazz}' yield one entry per genre.
    Without the split, a multi-genre group came back as a single string such
    as "'Pop', 'Rock'", which can never equal a single genre name.

    Args:
        s: Raw genres string containing zero or more `{...}` groups.

    Returns:
        A list of genre names in order of appearance; empty if `s` has no
        non-empty brace group.
    """
    genres = []
    for group in re.findall(r'\{([^}]+)\}', s):
        for piece in group.split(','):
            # Trim whitespace, then wrapping quotes, then any whitespace
            # that was inside the quotes.
            name = piece.strip().strip("'\"").strip()
            if name:
                genres.append(name)
    return genres

In [3]:
def one_hot_encode_genre(genre, unique_genres):
    """Build a one-hot list marking where `genre` sits in `unique_genres`.

    Args:
        genre: The genre name to encode.
        unique_genres: Ordered list of genre names; its order fixes the slot
            of every genre in the result.

    Returns:
        A list of ints as long as `unique_genres`, with a single 1 at the
        first position holding `genre` and 0 everywhere else.

    Raises:
        ValueError: If `genre` does not occur in `unique_genres`.
    """
    hot_slot = unique_genres.index(genre)
    return [int(slot == hot_slot) for slot in range(len(unique_genres))]

In [4]:
# Genres that are recognised by name when scanning a track's genre list.
specified_genres = ['Hip Hop', 'Pop', 'Rock', 'Electronic', 'Jazz']

# Slot order of the one-hot genre vector; "Others" is the last slot.
unique_genres = ["Pop", "Rock", "Electronic", "Hip Hop", "Jazz", "Others"]

# One-hot code for each of the four six-hour windows of the day,
# keyed by a "start-end" interval string.
time_day = {
    "00:00:00-6:00:00": [1, 0, 0, 0],
    "6:00:00-12:00:00": [0, 1, 0, 0],
    "12:00:00-18:00:00": [0, 0, 1, 0],
    "18:00:00-00:00:00": [0, 0, 0, 1],
}

In [5]:
# Function to encode time_day with one-hot vectors
def encode_time_day(time_string):
    """Return the one-hot vector of the six-hour window containing a time.

    The four windows are the ones keyed in `time_day`, taken as half-open
    ranges: [00:00, 06:00), [06:00, 12:00), [12:00, 18:00), [18:00, 24:00).

    The window is chosen from the numeric hour. Comparing the raw strings
    against bounds such as "6:00:00" is lexicographic, and every zero-padded
    "%H:%M:%S" value starts with '0', '1' or '2', all of which sort below
    '6' -- so every time fell into the first window.

    Args:
        time_string: Time of day formatted as "HH:MM:SS" (hour 0-23, with or
            without zero padding).

    Returns:
        A new 4-element list of ints with a single 1 at the window's index.

    Raises:
        ValueError: If the hour cannot be parsed or is outside 0-23.
    """
    hour_text = str(time_string).strip().split(":", 1)[0]
    try:
        hour = int(hour_text)
    except ValueError:
        raise ValueError(f"Cannot parse a time of day from {time_string!r}") from None
    if not 0 <= hour <= 23:
        raise ValueError(f"Hour out of range in {time_string!r}")

    encoding = [0, 0, 0, 0]
    encoding[hour // 6] = 1  # six hours per window
    return encoding

In [None]:
from transformers import RobertaTokenizer, RobertaModel
import string
import torch

# Suppress all warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Checkpoint shared by the tokenizer and the encoder.
MODEL_NAME = "roberta-base"

tokenizer, model = (
    RobertaTokenizer.from_pretrained(MODEL_NAME),
    RobertaModel.from_pretrained(MODEL_NAME),
)


def preprocess_lyrics(lyrics):
    """Lower-case `lyrics` and delete every ASCII punctuation character.

    Args:
        lyrics: The lyric text to normalise.

    Returns:
        The lower-cased text with all characters from `string.punctuation`
        removed; whitespace and other characters are left untouched.
    """
    drop_punctuation = str.maketrans('', '', string.punctuation)
    return lyrics.lower().translate(drop_punctuation)


lyrics_table = "musics"
lyrics_column = "cleaned_lyric"

# Keep only rows that have both lyrics and genres; 'NA' is treated as a
# missing-lyrics marker. The lyrics column is referenced through
# `lyrics_column` everywhere so the filter cannot drift from the SELECT.
query = (
    f"SELECT {lyrics_column}, author_id, disorder, created_at, timezone_offset, genres "
    f"FROM {lyrics_table} "
    f"WHERE {lyrics_column} IS NOT NULL AND genres IS NOT NULL AND {lyrics_column} != 'NA'"
)

# NOTE(review): `pd` and `conn` are not defined in any cell visible in this
# notebook -- presumably they come from a setup cell that is missing here;
# confirm before running on a fresh kernel.
lyrics_data = pd.read_sql_query(query, conn)

# Shift each timestamp by its row's offset (interpreted as seconds) and keep
# the resulting time of day as a zero-padded "HH:MM:SS" string.
lyrics_data['created_at'] = pd.to_datetime(lyrics_data['created_at'])
lyrics_data['adjusted_created_at'] = (
    lyrics_data['created_at'] + pd.to_timedelta(lyrics_data['timezone_offset'], unit='s')
)
lyrics_data['time'] = pd.to_datetime(lyrics_data['adjusted_created_at']).dt.strftime('%H:%M:%S')

# Sort by adjusted_created_at within each user group
sorted_data = lyrics_data.sort_values(by=['author_id', 'adjusted_created_at'])

# `tokenizer` and `model` are already loaded from MODEL_NAME earlier in this
# cell; loading the same "roberta-base" checkpoint a second time only cost
# time and memory, so the existing objects are reused.
model_name = MODEL_NAME

# Column-wise accumulator for the output frame: one list entry per track.
final_data = {
    "author_id": [],
    "disorder": [],
    "music_embedding": [],
    "genre_embedding": [],
}

def embed_lyrics(lyrics):
    """Embed lyrics as the mean of the encoder's last-layer token vectors.

    The text is tokenized with padding and truncation enabled, run through
    the module-level `model` without gradient tracking, and the token
    vectors are averaged per sequence.

    The average is weighted by the attention mask, so padding tokens do not
    dilute the result when a list of texts of different lengths is passed.
    For a single string nothing is padded and this equals a plain mean over
    the sequence dimension.

    Args:
        lyrics: A string, or a list of strings, to embed.

    Returns:
        A tensor with one row per input text (a single string still yields
        a leading dimension of size 1).
    """
    inputs = tokenizer(lyrics, return_tensors="pt", padding=True, truncation=True)

    with torch.no_grad():
        outputs = model(**inputs)

    hidden_states = outputs.last_hidden_state
    # 1.0 for real tokens, 0.0 for padding, broadcast over the hidden size.
    token_mask = inputs["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
    # clamp guards against division by zero for an all-padding row.
    token_counts = token_mask.sum(dim=1).clamp(min=1)
    return (hidden_states * token_mask).sum(dim=1) / token_counts


def _parse_genre_names(raw_genres):
    """Split one raw `genres` value into a list of cleaned genre names.

    Brace-wrapped values are passed through `extract_genres`; anything else
    is taken as is. Every resulting chunk is split on commas, and each piece
    has its quotes removed and surrounding whitespace trimmed, so that
    "Pop, 'Rock'" yields ["Pop", "Rock"] rather than ["Pop", " 'Rock"].
    """
    if raw_genres.startswith('{') and raw_genres.endswith('}'):
        chunks = extract_genres(raw_genres)
    else:
        chunks = [raw_genres]

    names = []
    for chunk in chunks:
        for piece in chunk.split(','):
            name = piece.replace("'", "").replace('"', '').strip()
            if name:
                names.append(name)
    return names


for index, users in sorted_data.iterrows():

    # Embed the lyrics
    music_lyric = users["cleaned_lyric"]
    embedded_lyric = embed_lyrics(music_lyric)

    # Encode the genre: the first listed genre that is in `specified_genres`,
    # otherwise "Others".
    cleaned_genres = _parse_genre_names(users["genres"])
    matching_genres = [genre for genre in cleaned_genres if genre in specified_genres]
    genre = matching_genres[0] if matching_genres else "Others"
    encoded_genre = one_hot_encode_genre(genre, unique_genres)

    # Encode the time of day the track was listened to
    time_encoded = encode_time_day(users['time'])
    if time_encoded is None:
        # Fail with a readable message instead of a TypeError from extend().
        raise ValueError(
            f"No time-of-day window matched {users['time']!r} "
            f"(author_id={users['author_id']})"
        )

    final_data["author_id"].append(users["author_id"])
    final_data["disorder"].append(users["disorder"])

    # `embedded_lyric` carries a leading batch dimension, so tolist() gives a
    # list whose first element is the lyric vector; the genre and time flags
    # are appended after it. This nested layout is kept on purpose: the later
    # cell that writes 'music_embedding_2.tsv' flattens it.
    music_embedding = embedded_lyric.tolist()
    music_embedding.extend(encoded_genre)
    music_embedding.extend(time_encoded)

    final_data["music_embedding"].append(music_embedding)
    final_data["genre_embedding"].append(encoded_genre)


df = pd.DataFrame(final_data)

# Written with the default comma separator despite the .tsv extension; the
# later cells read it back with pd.read_csv defaults, so this must not change.
df.to_csv('music_embedding.tsv', index=False)


In [8]:
# Reload the flattened file and turn each serialized embedding back into a
# Python list. `genre_embedding` is left as the raw string read from disk.
df = pd.read_csv('music_embedding_2.tsv')
df['music_embedding'] = df['music_embedding'].map(ast.literal_eval)
df.head()

Unnamed: 0,author_id,disorder,music_embedding,genre_embedding
0,1000026915590189062,control,"[-0.030979907140135765, 0.01254077535122633, 0...","[0, 0, 0, 0, 0, 1]"
1,1000123379356233729,control,"[0.007308566011488438, -0.03472823277115822, 0...","[1, 0, 0, 0, 0, 0]"
2,1000123379356233729,control,"[-0.017335733398795128, 0.05415014922618866, 0...","[0, 0, 0, 1, 0, 0]"
3,1000123379356233729,control,"[0.06611811369657516, 0.045194048434495926, 0....","[0, 0, 0, 0, 1, 0]"
4,1000123379356233729,control,"[0.005866596009582281, 0.06480590999126434, 0....","[0, 0, 0, 0, 0, 1]"


In [12]:
# NOTE(review): this measures a one-row slice of the column (so it is 1 for
# any non-empty frame), not the length of an embedding vector -- confirm
# whether `len(df.iloc[0]["music_embedding"])` was intended.
len(df["music_embedding"].iloc[:1])

1

In [7]:
def _flatten_embedding(serialized):
    """Parse one serialized embedding and return it as a single flat list.

    The first pass writes each embedding as `[[lyric values...], flag, flag,
    ...]`; this returns the lyric values followed by the trailing flags. A
    value that is already flat (its first element is not a list) is returned
    unchanged, so re-running this cell on flattened data does not fail.
    """
    parsed = ast.literal_eval(serialized)
    if parsed and isinstance(parsed[0], list):
        return parsed[0] + parsed[1:]
    return parsed


df = pd.read_csv('music_embedding.tsv')

# Column-wise transform instead of iterrows() plus per-cell .at assignment.
df['music_embedding'] = df['music_embedding'].map(_flatten_embedding)

df.to_csv('music_embedding_2.tsv', index=False)