### This notebook builds a CSV file containing the embeddings of our users.

In [7]:
import ast
import json
import re
import warnings

import pandas as pd

def extract_genres(s):
    """Return the text enclosed by each ``{...}`` group of ``s``, in order.

    Only non-empty groups are reported; the braces themselves are dropped.
    An input without any ``{...}`` group yields an empty list.
    """
    brace_group = re.compile(r'\{([^}]+)\}')
    return [found.group(1) for found in brace_group.finditer(s)]

In [8]:
def one_hot_encode_genre(genre, unique_genres):
    """Build a one-hot list marking where ``genre`` sits in ``unique_genres``.

    The result has one entry per element of ``unique_genres``: 1 at the
    position of the first occurrence of ``genre`` and 0 everywhere else.

    Raises:
        ValueError: if ``genre`` is not present in ``unique_genres``.
    """
    hot_position = unique_genres.index(genre)
    return [
        1 if position == hot_position else 0
        for position in range(len(unique_genres))
    ]

In [9]:
# Genres that are kept as their own category; any other genre is encoded as "Others".
specified_genres = ['Hip Hop', 'Pop', 'Rock']
# Category order used for the genre one-hot vector (position == index in this list).
unique_genres = ["Pop", "Rock", "Hip Hop", "Others"]

# Maps a "start-end" clock interval to the one-hot vector of its time-of-day bucket.
# NOTE: the bounds are not zero-padded ("6:00:00"), so they are not safe to compare
# lexicographically against '%H:%M:%S' strings such as "07:15:00"; the last interval
# also wraps past midnight (its end is earlier than its start).
time_day = {"00:00:00-6:00:00": [1, 0, 0, 0], "6:00:00-12:00:00": [0, 1, 0, 0],
            "12:00:00-18:00:00": [0, 0, 1, 0], "18:00:00-00:00:00": [0, 0, 0, 1]}

In [10]:
# Function to encode time_day with one-hot vectors
def encode_time_day(time_string):
    """Return the one-hot vector of the ``time_day`` interval containing a time.

    The clock strings are converted to seconds since midnight before they are
    compared. Comparing them as text is wrong because the interval bounds are
    not zero-padded ("6:00:00"), which made every "%H:%M:%S" value land in the
    first interval.

    Intervals are treated as half-open, ``start <= t < end``, so a boundary
    such as "06:00:00" belongs to the interval that starts there. An interval
    whose end is not later than its start (e.g. "18:00:00-00:00:00") is taken
    to wrap past midnight.

    Args:
        time_string: Clock time as "H:M:S" text, e.g. "07:15:00".

    Returns:
        A copy of the one-hot list of the first matching interval, or ``None``
        when no interval in ``time_day`` contains the time.

    Raises:
        ValueError: if ``time_string`` or an interval bound is not "H:M:S".
    """
    seconds_per_day = 24 * 60 * 60

    def _to_seconds(clock):
        # "H:M:S" -> seconds since midnight; padding of the fields is irrelevant.
        hours, minutes, seconds = (int(part) for part in clock.strip().split(":"))
        return hours * 3600 + minutes * 60 + seconds

    moment = _to_seconds(time_string)

    for interval, encoding in time_day.items():
        start, end = (_to_seconds(bound) for bound in interval.split("-"))
        if end <= start:
            # The interval crosses midnight: push its end into the next day.
            end += seconds_per_day
        # The second test covers the after-midnight part of a wrapping interval.
        if start <= moment < end or start <= moment + seconds_per_day < end:
            # Copy so callers cannot mutate the shared lookup table.
            return list(encoding)

    return None

In [None]:
from transformers import RobertaTokenizer, RobertaModel
import string
import torch

# Suppress every Python warning from here on (affects the whole kernel session).
warnings.filterwarnings('ignore')

# Checkpoint identifier passed to from_pretrained for both the tokenizer and the model.
MODEL_NAME = "roberta-base"

# Load the pretrained RoBERTa tokenizer and encoder used to embed the lyrics.
tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME)
model = RobertaModel.from_pretrained(MODEL_NAME)


def preprocess_lyrics(lyrics):
    """Normalise lyric text: lower-case it and drop ASCII punctuation.

    Every character listed in ``string.punctuation`` is deleted; all other
    characters (including whitespace) are kept as they are.
    """
    punctuation_remover = str.maketrans('', '', string.punctuation)
    return lyrics.lower().translate(punctuation_remover)


# Table and column that hold the lyric text.
lyrics_table = "musics"
lyrics_column = "cleaned_lyric"

# Keep only rows that have lyrics and genres. The 'NA' filter now goes through
# lyrics_column as well, so changing the column name cannot leave the WHERE
# clause pointing at the old column.
query = (
    f"SELECT {lyrics_column}, author_id, disorder, created_at, timezone_offset, genres "
    f"FROM {lyrics_table} "
    f"WHERE {lyrics_column} IS NOT NULL AND genres IS NOT NULL AND {lyrics_column} != 'NA'"
)

# `conn` is the open database connection; it is not created in this cell.
lyrics_data = pd.read_sql_query(query, conn)

# Shift the creation time by the per-row offset (interpreted as seconds) and
# derive the two string views used downstream:
#   time            -> 'HH:MM:SS', consumed by encode_time_day
#   music_date_time -> 'YYYY-MM-DD HH:MM:SS', the key into music_session.json
lyrics_data['created_at'] = pd.to_datetime(lyrics_data['created_at'])
lyrics_data['adjusted_created_at'] = (lyrics_data['created_at'] + pd.to_timedelta(lyrics_data['timezone_offset'], unit='s'))
lyrics_data['time'] = pd.to_datetime(lyrics_data['adjusted_created_at']).dt.strftime('%H:%M:%S')
lyrics_data['music_date_time'] = pd.to_datetime(lyrics_data['adjusted_created_at']).dt.strftime('%Y-%m-%d %H:%M:%S')

# Sort by adjusted_created_at within each user group
sorted_data = lyrics_data.sort_values(by=['author_id', 'adjusted_created_at'])

# The tokenizer and model were already loaded from MODEL_NAME earlier in this
# cell; loading the same checkpoint a second time only cost time and memory.
# The lower-case name is kept as an alias in case other cells refer to it.
model_name = MODEL_NAME

# Column-oriented accumulator: one entry is appended to every list per song.
final_data = {
    "author_id": [],
    "disorder": [],
    "music_embedding": [],
    "music_session": [],
    "genre_embedding": [],
}

def embed_lyrics(lyrics):
    """Embed lyric text as the mean of the model's last hidden states.

    The mean is taken over real tokens only, using the tokenizer's attention
    mask. A plain ``mean(dim=1)`` would also average the padding positions
    when several texts of different lengths are embedded together, skewing
    the shorter ones. For a single text there is no padding, so the result
    matches the unmasked mean.

    Args:
        lyrics: A string, or a list of strings to embed as one batch.

    Returns:
        A tensor with the sequence dimension averaged out, i.e. one embedding
        row per input text (a single string still yields a leading dimension
        of size 1).
    """
    inputs = tokenizer(lyrics, return_tensors="pt", padding=True, truncation=True)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**inputs)

    hidden_states = outputs.last_hidden_state
    # 1 for real tokens, 0 for padding; add a trailing axis so it broadcasts
    # across the hidden dimension.
    token_mask = inputs["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
    summed = (hidden_states * token_mask).sum(dim=1)
    # clamp guards against a division by zero for an all-padding row.
    token_counts = token_mask.sum(dim=1).clamp(min=1)
    return summed / token_counts


def _parse_genres(raw_genres):
    """Split the raw ``genres`` cell into a list of plain genre names.

    Two layouts are handled: text wrapped in braces (one or more ``{...}``
    groups) and a plain comma-separated string. Inside a brace group the
    content is additionally split on commas, so a value such as
    ``{'Pop','Rock'}`` yields ``['Pop', 'Rock']`` instead of the single
    unmatched name ``'Pop,Rock'``. Surrounding whitespace is removed so that
    ``"Pop, Rock"`` yields ``'Rock'`` rather than ``' Rock'``.
    """
    if raw_genres.startswith('{') and raw_genres.endswith('}'):
        return [
            piece.replace("'", "").replace('"', '').strip()
            for group in extract_genres(raw_genres)
            for piece in group.split(',')
        ]
    return [piece.strip().strip("'\"") for piece in raw_genres.split(',')]


for index, row in sorted_data.iterrows():

    # Lyric embedding. tolist() keeps the leading batch dimension, so the
    # vector is added to music_embedding below as ONE nested list; the
    # post-processing cell that writes music_embedding_3.tsv un-nests it.
    embedded_lyric_list = embed_lyrics(row["cleaned_lyric"]).tolist()

    # Genre: the first genre of the song that is one of specified_genres,
    # otherwise "Others".
    cleaned_genres = _parse_genres(row["genres"])
    matching_genres = [genre for genre in cleaned_genres if genre in specified_genres]
    genre = matching_genres[0] if matching_genres else "Others"
    encoded_genre = one_hot_encode_genre(genre, unique_genres)

    # Encode the time of day at which the music was listened to.
    time_encoded = encode_time_day(row['time'])

    disorder = row['disorder']
    author_id = row["author_id"]
    adjusted_created_at = row['music_date_time']

    # Per-user JSON file mapping 'YYYY-MM-DD HH:MM:SS' -> music session id.
    json_file_path = f"{disorder}/{author_id}/music_session.json"

    # -1 marks "session could not be resolved".
    session = -1
    try:
        with open(json_file_path, 'r') as json_file:
            music_session_dict = json.load(json_file)
        session = music_session_dict[adjusted_created_at]
    except (OSError, json.JSONDecodeError, KeyError) as exc:
        # Only the expected failures are tolerated (missing/unreadable file,
        # invalid JSON, timestamp not in the mapping); anything else, such as
        # KeyboardInterrupt, now propagates instead of being silently swallowed.
        print(f"Error ! no music session for {json_file_path} at {adjusted_created_at}: {exc!r}")

    # Stress after the music: use the '30m' tweets if there are any, otherwise
    # the '1h' tweets, otherwise the '1.5h' tweets.
    # NOTE(review): session and author_id are interpolated straight into the SQL
    # text. Prefer pd.read_sql_query(..., params=...) with the placeholder style
    # of the driver behind `conn` (not visible here) if these values can ever be
    # untrusted.
    query_after = f"""SELECT * FROM tweets_after_musics WHERE music_session = {session} AND author_id = '{author_id}' AND interval = '30m'
            UNION ALL
            SELECT *
            FROM tweets_after_musics
            WHERE music_session = {session} AND author_id = '{author_id}' AND interval = '1h'
              AND NOT EXISTS (
                SELECT * FROM tweets_after_musics
                WHERE music_session = {session} AND author_id = '{author_id}' AND interval = '30m'
              )
            UNION ALL
            SELECT *
            FROM tweets_after_musics
            WHERE music_session = {session} AND author_id = '{author_id}' AND interval = '1.5h'
              AND NOT EXISTS (
                SELECT * FROM tweets_after_musics
                WHERE music_session = {session} AND author_id = '{author_id}' AND (interval = '30m' OR interval = '1h')
              ); """

    author_data_after = pd.read_sql_query(query_after, conn)

    # -100 marks "no tweets found after this music session".
    stress_after = -100
    if len(author_data_after) > 0:
        stress_levels = [int(level) for level in author_data_after["stress_level"]]
        mean_stress = sum(stress_levels) / len(stress_levels)
        # presumably rescales the stress scale to start at 0 -- TODO confirm
        stress_after = abs(mean_stress) - 1

    # Flag for the user group: 1 for the control group, 0 otherwise.
    is_control = 1 if disorder == 'control' else 0

    # Layout: [is_control, <nested lyric embedding>, genre one-hot..., time one-hot..., stress_after]
    music_embedding = [is_control]
    music_embedding.extend(embedded_lyric_list)
    music_embedding.extend(encoded_genre)
    music_embedding.extend(time_encoded)
    music_embedding.append(stress_after)

    # Append to every column in one place so the lists can never end up with
    # different lengths if an earlier step of this iteration raises.
    final_data["author_id"].append(author_id)
    final_data["disorder"].append(disorder)
    final_data["music_embedding"].append(music_embedding)
    final_data["music_session"].append(session)
    final_data["genre_embedding"].append(encoded_genre)


df = pd.DataFrame(final_data)

# Despite the .tsv extension the file is comma-separated (to_csv default);
# later cells read it back with pd.read_csv's default separator.
df.to_csv('music_embedding.tsv', index=False)


In [18]:
# Load the flattened embeddings. 'music_embedding_3.tsv' is written by the
# flattening cell that appears further down in this notebook, so that cell has
# to be run before this one.
df = pd.read_csv('music_embedding_3.tsv')
# The column is read back as text; parse each value into a Python list.
df['music_embedding'] = df['music_embedding'].apply(ast.literal_eval)
df.head()

Unnamed: 0,author_id,disorder,music_embedding,music_session,genre_embedding
0,1000026915590189062,control,"[1, -0.030979907140135765, 0.01254077535122633...",2,"[0, 0, 0, 1]"
1,1000123379356233729,control,"[1, 0.007308566011488438, -0.03472823277115822...",15,"[1, 0, 0, 0]"
2,1000123379356233729,control,"[1, -0.017335733398795128, 0.05415014922618866...",7,"[0, 0, 1, 0]"
3,1000123379356233729,control,"[1, 0.06611811369657516, 0.045194048434495926,...",10,"[0, 0, 0, 1]"
4,1000123379356233729,control,"[1, 0.005866596009582281, 0.06480590999126434,...",3,"[0, 0, 0, 1]"


In [16]:
# Inspect the music_embedding value of the first row only.
df.iloc[:1]["music_embedding"]

0    [1, [-0.030979907140135765, 0.0125407753512263...
Name: music_embedding, dtype: object

In [6]:
df = pd.read_csv('music_embedding.tsv')


def _flatten_embedding(serialized):
    """Parse a stringified embedding and flatten one level of nesting.

    The embeddings written to 'music_embedding.tsv' contain the lyric vector
    as a nested list, e.g. ``[1, [0.1, 0.2], 0, 1, -100]``. Every element that
    is itself a list is spliced in place, giving ``[1, 0.1, 0.2, 0, 1, -100]``.
    An embedding that is already flat is returned unchanged, so re-running
    this cell on flattened data no longer raises a TypeError.
    """
    flat = []
    for item in ast.literal_eval(serialized):
        if isinstance(item, (list, tuple)):
            flat.extend(item)
        else:
            flat.append(item)
    return flat


# Column-wise apply instead of assigning into the frame while iterating over it.
df['music_embedding'] = df['music_embedding'].apply(_flatten_embedding)

df.to_csv('music_embedding_3.tsv', index=False)