# 開雲端空間

In [None]:
# Mount Google Drive so the notebook can read and write the project folder
# under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# 在colab用Git的方法

In [None]:
import os
# Switch into the project checkout on Drive, then stage every change with git
# (the leading `!` is the notebook shell escape).
os.chdir('/content/drive/MyDrive/Datagame-2023')
!git add .

# 方法1
專門處理聽的歌曲數<=5的session_id。  
處理聽的歌曲數>5的session_id為方法三。

## 聽過的總歌曲數(重複聽的記為一次)<=5

### 賦予變數

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display

# Paths of the Parquet data files on Drive
train_source_file_path = '/content/drive/MyDrive/Datagame-2023/data/label_train_source.parquet'
train_target_file_path = '/content/drive/MyDrive/Datagame-2023/data/label_train_target.parquet'
test_source_file_path = '/content/drive/MyDrive/Datagame-2023/data/label_test_source.parquet'
meta_song_file_path = '/content/drive/MyDrive/Datagame-2023/data/meta_song.parquet'
song_composer_file_path = '/content/drive/MyDrive/Datagame-2023/data/meta_song_composer.parquet'
song_genre_file_path = '/content/drive/MyDrive/Datagame-2023/data/meta_song_genre.parquet'
song_lyricist_file_path = '/content/drive/MyDrive/Datagame-2023/data/meta_song_lyricist.parquet'
song_producer_file_path = '/content/drive/MyDrive/Datagame-2023/data/meta_song_producer.parquet'
song_titletext_file_path = '/content/drive/MyDrive/Datagame-2023/data/meta_song_titletext.parquet'

# Load the training-source play records.
train_df = pd.read_parquet(train_source_file_path)

In [None]:
# Load the song metadata; a missing album/artist id becomes the integer 0.
meta_song_df = pd.read_parquet(meta_song_file_path)
for id_column in ('album_id', 'artist_id'):
    meta_song_df[id_column] = meta_song_df[id_column].fillna(0).astype(int)

# Attach artist_id / album_id to every play record through song_id.
train_meta_df = train_df.merge(
    meta_song_df[['song_id', 'artist_id', 'album_id']],
    on='song_id',
    how='left',
)

# Play counts of each song within its album, within its artist, and overall.
album_song_count = train_meta_df.groupby('album_id')['song_id'].value_counts()
artist_song_count = train_meta_df.groupby('artist_id')['song_id'].value_counts()
song_count = train_meta_df['song_id'].value_counts()

# song_id -> (album_id, artist_id)
song_id_mapping = {
    song: (album, artist)
    for song, album, artist in zip(
        meta_song_df['song_id'], meta_song_df['album_id'], meta_song_df['artist_id']
    )
}

# The raw frames are no longer needed; release them.
del train_df, meta_song_df

### 做出每個session_id對應的album_id跟artist_id的播放數 (session_album_artist)

#### 快速版的

In [None]:
# Keep only the sessions that played at most 5 distinct songs.
unique_counts = train_meta_df.groupby('session_id')['song_id'].nunique()
under_5_counts = unique_counts[unique_counts <= 5].index.tolist()
selected_rows = train_meta_df[train_meta_df['session_id'].isin(under_5_counts)]

# The metadata cell replaces missing album/artist ids with 0 before
# song_id_mapping is built, so a NaN test alone can never fire here: 0 is the
# "unknown" marker and must not be counted as a real album or artist.
MISSING_ID = 0

# session_id -> {'albums': {album_id: plays}, 'artists': {artist_id: plays}}
session_album_artist = {}
for session_id, song_id in zip(selected_rows['session_id'], selected_rows['song_id']):
    # Register every session, even when none of its songs has metadata, so
    # later lookups by session_id never raise KeyError.
    session_counts = session_album_artist.setdefault(session_id, {'albums': {}, 'artists': {}})
    if song_id not in song_id_mapping:
        continue

    album_id, artist_id = song_id_mapping[song_id]

    # Count the play for the album, unless the album is unknown.
    if not np.isnan(album_id) and album_id != MISSING_ID:
        album_id = int(album_id)
        session_counts['albums'][album_id] = session_counts['albums'].get(album_id, 0) + 1

    # Count the play for the artist, unless the artist is unknown.
    if not np.isnan(artist_id) and artist_id != MISSING_ID:
        artist_id = int(artist_id)
        session_counts['artists'][artist_id] = session_counts['artists'].get(artist_id, 0) + 1

# Order each counter from most to least played. Sorting once after the loop
# replaces the original re-sort on every row; ties keep first-seen order.
for session_counts in session_album_artist.values():
    for group_name in ('albums', 'artists'):
        session_counts[group_name] = dict(
            sorted(session_counts[group_name].items(), key=lambda item: item[1], reverse=True)
        )



#### 慢速版的

In [None]:
# result_dict = {}

# # 遍历每个session_id的song_id
# for session_id, song_id in zip(selected_rows['session_id'][2000:2100], selected_rows['song_id'][2000:2100]):
#     if song_id not in meta_song_df['song_id'].values:
#       # 可以选择忽略该条记录或者采取其他处理方式
#       continue

#     # 找到对应的专辑和艺术家
#     album_id = meta_song_df[meta_song_df['song_id']==song_id]['album_id'].values[0]
#     artist_id = meta_song_df[meta_song_df['song_id']==song_id]['artist_id'].values[0]

#     # 更新字典
#     if session_id not in result_dict:
#         result_dict[session_id] = {'albums': {}, 'artists': {}}

#     # album_id非nan
#     if not np.isnan(album_id):
#       album_id = album_id.astype(int)
#       # 统计专辑数量
#       if album_id not in result_dict[session_id]['albums']:

#           result_dict[session_id]['albums'][album_id] = 1
#       else:
#           result_dict[session_id]['albums'][album_id] += 1

#     # artist_id非nan
#     if not np.isnan(artist_id):
#       artist_id = artist_id.astype(int)
#       # 统计艺术家数量
#       if artist_id not in result_dict[session_id]['artists'] and not np.isnan(artist_id):
#           result_dict[session_id]['artists'][artist_id] = 1
#       else:
#           result_dict[session_id]['artists'][artist_id] += 1

# # 输出结果
# print(result_dict)


### 把所有播放過的歌曲放進去 (session_dict)

In [None]:
# Order the plays chronologically inside each session, then keep only the
# last play of every (session, song) pair.
selected_rows_sorted = selected_rows.sort_values(['session_id', 'unix_played_at'])
final_selected_rows = selected_rows_sorted.drop_duplicates(['session_id', 'song_id'], keep='last')[['session_id','song_id']]

# session_id -> list of its distinct songs, in the row order produced above
session_dict = {}
for session_id, song_id in zip(final_selected_rows['session_id'], final_selected_rows['song_id']):
    session_dict.setdefault(session_id, []).append(song_id)
    # 创建一个字典，将 song_id 映射到专辑和艺术家

### 填補沒放完5個的空缺

In [None]:
# Top up every session list to 5 songs in three stages: songs from albums the
# session played often, then from artists it played often, then the globally
# most-played songs.
for session_id in session_dict:
  # Stage 1 runs only if the session still has open slots
  if len(session_dict[session_id])<5:
    if not session_album_artist[session_id]['albums']=={}:
      albums = session_album_artist[session_id]['albums']
      for album in albums:
        # An album must have been played more than 3 times to count as
        # "this user is listening to this album"
        if albums[album]>3:
          # Walk the album's songs in album_song_count order (value_counts
          # lists the most-played song first)
          for song_id in album_song_count.loc[album].index.get_level_values('song_id').tolist():
            if song_id not in session_dict[session_id]:
              session_dict[session_id].append(song_id)
            if len(session_dict[session_id])>=5:
              break
        else:
          # The counts are sorted descending, so no later album can qualify
          break
        if len(session_dict[session_id])>=5:
          break
  else:
    # Already 5 songs: nothing to fill for this session
    continue
  # Stage 2: still open slots, try the session's artists
  if len(session_dict[session_id])<5:
    if not session_album_artist[session_id]['artists']=={}:
      artists = session_album_artist[session_id]['artists']
      for artist in artists:
        # An artist must have been played more than 3 times to count as
        # "this user is listening to this artist"
        if artists[artist]>3:
          # Walk the artist's songs in artist_song_count order (most-played first)
          for song_id in artist_song_count.loc[artist].index.get_level_values('song_id').tolist():
            if song_id not in session_dict[session_id]:
              session_dict[session_id].append(song_id)
            if len(session_dict[session_id])>=5:
              break
        else:
          # Sorted descending, so no later artist can qualify
          break
        if len(session_dict[session_id])>=5:
          break
  else:
    continue
  # Stage 3: fall back to the overall most-played songs
  if len(session_dict[session_id])<5:
    for song_id in song_count.index:
      if song_id not in session_dict[session_id]:
        session_dict[session_id].append(song_id)
      if len(session_dict[session_id])>=5:
        break

In [None]:
# Sanity check: every session must now hold exactly 5 songs.
distinct_lengths = {len(songs) for songs in session_dict.values()}
all_lengths_are_5 = distinct_lengths <= {5}

# Report the outcome
print("所有列表的长度都为5。" if all_lengths_are_5 else "有一个或多个列表的长度不为5。")


所有列表的长度都为5。


## 做出submission

In [None]:
# Build the submission table: one row per session with its top1..top5 songs.
top_columns = ['top1', 'top2', 'top3', 'top4', 'top5']
submission_data = {'session_id': list(session_dict.keys())}
for rank, column in enumerate(top_columns):
    submission_data[column] = [songs[rank] for songs in session_dict.values()]

submission = pd.DataFrame(submission_data)

# Every other session gets the same meaningless placeholder for all five ranks.
# train_df was deleted once train_meta_df had been built, so the full list of
# session ids is read from train_meta_df (the left merge kept every play row).
PLACEHOLDER_SONG = '00000000000000000000000000000000'
session_ids_to_replace = set(submission['session_id'])
all_session_ids = train_meta_df['session_id']
session_id_list = all_session_ids[~all_session_ids.isin(session_ids_to_replace)].drop_duplicates().values

new_data = {'session_id': session_id_list}
for column in top_columns:
    new_data[column] = [PLACEHOLDER_SONG] * len(session_id_list)
new_df = pd.DataFrame(new_data)

# Append the placeholder rows to the real predictions and write the file.
submission = pd.concat([submission, new_df], ignore_index=True)
submission['session_id'] = submission['session_id'].astype(int)

submission.to_csv('/content/drive/MyDrive/Datagame-2023/test/submission.csv',index=False)

# 方法2
認為有一部分的歌曲播放不是來自於session_id自己點選的，而是來自系統自動播放的。所以先填補session_id自己想聽的歌曲，再找出系統自動播放的循環歌曲填補。但運算速度太慢就放棄。最後是用方法四完成

### 做出每個session_id對應的album_id跟artist_id的播放數 (session_album_artist)



#### 快速版的

In [None]:
# The metadata cell replaces missing album/artist ids with 0 before
# song_id_mapping is built, so a NaN test alone can never fire here: 0 is the
# "unknown" marker and must not be counted as a real album or artist.
MISSING_ID = 0

# session_id -> {'albums': {album_id: plays}, 'artists': {artist_id: plays}}
session_album_artist = {}
for session_id, song_id in zip(selected_rows['session_id'], selected_rows['song_id']):
    # Register every session, even when none of its songs has metadata, so
    # later lookups by session_id never raise KeyError.
    session_counts = session_album_artist.setdefault(session_id, {'albums': {}, 'artists': {}})
    if song_id not in song_id_mapping:
        continue

    album_id, artist_id = song_id_mapping[song_id]

    # Count the play for the album, unless the album is unknown.
    if not np.isnan(album_id) and album_id != MISSING_ID:
        album_id = int(album_id)
        session_counts['albums'][album_id] = session_counts['albums'].get(album_id, 0) + 1

    # Count the play for the artist, unless the artist is unknown.
    if not np.isnan(artist_id) and artist_id != MISSING_ID:
        artist_id = int(artist_id)
        session_counts['artists'][artist_id] = session_counts['artists'].get(artist_id, 0) + 1

# Order each counter from most to least played. Sorting once after the loop
# replaces the original re-sort on every row; ties keep first-seen order.
for session_counts in session_album_artist.values():
    for group_name in ('albums', 'artists'):
        session_counts[group_name] = dict(
            sorted(session_counts[group_name].items(), key=lambda item: item[1], reverse=True)
        )

#### 慢速版的

In [None]:
# result_dict = {}

# # 遍历每个session_id的song_id
# for session_id, song_id in zip(selected_rows['session_id'][2000:2100], selected_rows['song_id'][2000:2100]):
#     if song_id not in meta_song_df['song_id'].values:
#       # 可以选择忽略该条记录或者采取其他处理方式
#       continue

#     # 找到对应的专辑和艺术家
#     album_id = meta_song_df[meta_song_df['song_id']==song_id]['album_id'].values[0]
#     artist_id = meta_song_df[meta_song_df['song_id']==song_id]['artist_id'].values[0]

#     # 更新字典
#     if session_id not in result_dict:
#         result_dict[session_id] = {'albums': {}, 'artists': {}}

#     # album_id非nan
#     if not np.isnan(album_id):
#       album_id = album_id.astype(int)
#       # 统计专辑数量
#       if album_id not in result_dict[session_id]['albums']:

#           result_dict[session_id]['albums'][album_id] = 1
#       else:
#           result_dict[session_id]['albums'][album_id] += 1

#     # artist_id非nan
#     if not np.isnan(artist_id):
#       artist_id = artist_id.astype(int)
#       # 统计艺术家数量
#       if artist_id not in result_dict[session_id]['artists'] and not np.isnan(artist_id):
#           result_dict[session_id]['artists'][artist_id] = 1
#       else:
#           result_dict[session_id]['artists'][artist_id] += 1

# # 输出结果
# print(result_dict)


### 找循環以及填補5個剩餘的空缺

In [None]:
# Method 2: look for an auto-play loop at the end of each session, use it to
# predict the next songs, then top the list up to 5 with the fallbacks
# (songs replayed >= 3 times, album, artist, overall popularity).
# Grouping once replaces the per-session boolean filter over the whole frame.
for session_id, session_rows in train_meta_df.groupby('session_id', sort=False):
  # session_dict only holds the sessions prepared by the earlier cells; skip
  # the others instead of raising KeyError.
  if session_id not in session_dict:
    continue
  predictions = session_dict[session_id]
  session_counts = session_album_artist.get(session_id, {})

  # Full play sequence in chronological order. Repeats are kept on purpose:
  # once duplicates are dropped no song can occur twice, so the loop search
  # below could never match anything.
  song_20 = session_rows.sort_values('unix_played_at')['song_id'].values

  # Walk backwards from the last song, collecting distinct songs until the
  # last song shows up again: cycle_song is one candidate loop, newest first.
  cycle_song = [song_20[-1]]
  for song_id in reversed(song_20[:-1]):
    if song_id not in cycle_song:
      cycle_song.append(song_id)
    elif song_id == cycle_song[0]:
      break

  # Keep walking backwards and find the longest run that replays cycle_song.
  max_match = []
  b = len(song_20)-1-len(cycle_song)  # cursor into song_20, moving backwards
  while b > -1:
    current_match = []
    a = 0  # cursor into cycle_song
    # Check both bounds before indexing (the original indexed cycle_song[a] first).
    while b > -1 and a < len(cycle_song) and song_20[b] == cycle_song[a]:
      current_match.append(song_20[b])
      b -= 1
      a += 1
    if len(current_match)>len(max_match):
      max_match = current_match
    if not current_match:
      # No progress at this position: step over the non-matching song.
      b -= 1

  # A repeated run of 2+ songs is treated as an auto-play loop: predict the
  # loop in its original play order.
  # NOTE(review): these songs are appended without a "not already present"
  # check, as in the original; confirm whether duplicates are acceptable here.
  if len(max_match)>1:
    for song_id in reversed(cycle_song):
      if len(predictions)<5:
        predictions.append(song_id)
      else:
        break

  # Fallback 1: songs this session replayed at least 3 times
  # (value_counts lists the most-played song first).
  for song_id, count in session_rows['song_id'].value_counts().items():
    if len(predictions)==5 or count<3:
      break
    predictions.append(song_id)

  # Fallback 2: songs from albums the session played more than 3 times.
  if len(predictions)>=5:
    continue
  albums = session_counts.get('albums', {})
  for album in albums:
    # Counts are sorted descending, so stop at the first album that fails.
    if albums[album]<=3:
      break
    for song_id in album_song_count.loc[album].index.get_level_values('song_id').tolist():
      if song_id not in predictions:
        predictions.append(song_id)
      if len(predictions)>=5:
        break
    if len(predictions)>=5:
      break

  # Fallback 3: songs from artists the session played more than 3 times.
  if len(predictions)>=5:
    continue
  artists = session_counts.get('artists', {})
  for artist in artists:
    if artists[artist]<=3:
      break
    for song_id in artist_song_count.loc[artist].index.get_level_values('song_id').tolist():
      if song_id not in predictions:
        predictions.append(song_id)
      if len(predictions)>=5:
        break
    if len(predictions)>=5:
      break

  # Fallback 4: the overall most-played songs.
  if len(predictions)<5:
    for song_id in song_count.index:
      if song_id not in predictions:
        predictions.append(song_id)
      if len(predictions)>=5:
        break


In [None]:
# Build the submission table: one row per session with its top1..top5 songs.
top_columns = ['top1', 'top2', 'top3', 'top4', 'top5']
submission_data = {'session_id': list(session_dict.keys())}
for rank, column in enumerate(top_columns):
    submission_data[column] = [songs[rank] for songs in session_dict.values()]

submission = pd.DataFrame(submission_data)

# Every other session gets the same meaningless placeholder for all five ranks.
# train_df was deleted once train_meta_df had been built, so the full list of
# session ids is read from train_meta_df (the left merge kept every play row).
PLACEHOLDER_SONG = '00000000000000000000000000000000'
session_ids_to_replace = set(submission['session_id'])
all_session_ids = train_meta_df['session_id']
session_id_list = all_session_ids[~all_session_ids.isin(session_ids_to_replace)].drop_duplicates().values

new_data = {'session_id': session_id_list}
for column in top_columns:
    new_data[column] = [PLACEHOLDER_SONG] * len(session_id_list)
new_df = pd.DataFrame(new_data)

# Append the placeholder rows to the real predictions and write the file.
submission = pd.concat([submission, new_df], ignore_index=True)
submission['session_id'] = submission['session_id'].astype(int)

submission.to_csv('/content/drive/MyDrive/Datagame-2023/test/submission.csv',index=False)

## 播放過的歌曲數量為5


In [None]:
# NOTE: despite its name, train_df holds the TEST-source play records from here on.
train_df = pd.read_parquet(test_source_file_path)
# Full slice: keeps every row (narrow the slice to work on a subset).
train_df = train_df[:]

In [None]:
# Number of distinct songs played in each session.
unique_counts = train_df.groupby('session_id')['song_id'].nunique()

# Sessions that played exactly 5 distinct songs.
has_five_songs = unique_counts == 5
unique_5_counts = unique_counts.loc[has_five_songs].index.tolist()
in_five_song_session = train_df['session_id'].isin(unique_5_counts)
selected_rows = train_df[in_five_song_session]

# Order plays chronologically inside each session, then keep only the last
# play of every (session, song) pair.
selected_rows_sorted = selected_rows.sort_values(['session_id', 'unix_played_at'])
last_plays = selected_rows_sorted.drop_duplicates(['session_id', 'song_id'], keep='last')
final_selected_rows = last_plays[['session_id','song_id']]

In [None]:
# session_id -> list of its distinct songs, in final_selected_rows row order
session_dict = {}
for session_id, song_id in zip(final_selected_rows['session_id'], final_selected_rows['song_id']):
    session_dict.setdefault(session_id, []).append(song_id)

# One submission row per session: session_id followed by top1..top5.
rank_columns = ['top1', 'top2', 'top3', 'top4', 'top5']
submission_data = {'session_id': list(session_dict.keys())}
for position, column in enumerate(rank_columns):
    submission_data[column] = [songs[position] for songs in session_dict.values()]

submission = pd.DataFrame(submission_data)

# Every session without a prediction gets the same meaningless placeholder
# string in all five ranks.
session_ids_to_replace = set(submission['session_id'])
not_yet_ranked = ~train_df['session_id'].isin(session_ids_to_replace)
session_id_list = train_df.loc[not_yet_ranked]['session_id'].drop_duplicates().values

new_data = {'session_id': session_id_list}
for column in rank_columns:
    new_data[column] = ['00000000000000000000000000000000'] * len(session_id_list)
new_df = pd.DataFrame(new_data)

# Append the placeholder rows to the real predictions and write the file.
submission = pd.concat([submission, new_df], ignore_index=True)
submission['session_id'] = submission['session_id'].astype(int)

submission.to_csv('/content/drive/MyDrive/Datagame-2023/test/submission.csv',index=False)

## 找循環

In [None]:
import pandas as pd
import numpy as np

# Toy sequence that repeats with period 3.
A = [1, 2, 3, 1, 2, 3, 1, 2, 3]

df = pd.DataFrame({'value': A})

# Positions whose value shows up again exactly three steps later.
values = df['value']
repeats_three_later = values == values.shift(-3)
duplicate_indices = df.loc[repeats_three_later].index

# Gaps between consecutive such positions.
distances = np.diff(duplicate_indices)

print(distances)


[1 1 1 1 1]


In [None]:
import pandas as pd

# Toy sequence that repeats with period 3.
A = [1, 2, 3, 1, 2, 3, 1, 2, 3]

df = pd.DataFrame({'value': A})

# Position of the previous occurrence of the final value.
last_value = df['value'].iloc[-1]
same_as_last = df['value'] == last_value
occurrences = df.loc[same_as_last].index
last_value_index = occurrences[-2]

print(last_value_index)


5


# 方法3
處理聽的歌曲數 >5 的session_id。分成低、中、高三種密度群體，然後每一個群體都訓練一個模型。輸入為20個str list，輸出為5個str list，輸入輸出都是song_id。

### 分割高、低密度用戶群

In [None]:
# Drop the sessions already covered by selected_rows (<= 5 distinct songs).
# NOTE(review): selected_rows is reassigned by the "exactly 5 songs" cell
# earlier in the notebook; confirm it still refers to the training sessions here.
train_meta_df = train_meta_df[~train_meta_df['session_id'].isin(selected_rows['session_id'])]

# Plays of each song inside each session (most-played song first per session).
song_id_counts = train_meta_df.groupby('session_id')['song_id'].value_counts()

# High density: the five most-played songs of the session total more than 12 plays.
top_5_song_total = song_id_counts.groupby('session_id').head(5).groupby('session_id').sum()
selected_rows_high_density_list = top_5_song_total[top_5_song_total>12].index.to_list()
selected_rows_high_density = train_meta_df[train_meta_df['session_id'].isin(selected_rows_high_density_list)]

# Medium density: at least 3 songs were played 2+ times, and the session is
# not already in the high-density group. The original stored this result in
# the high-density variables, which silently discarded the rule above.
count_greater_than_two = song_id_counts[song_id_counts >= 2]
song_counts = count_greater_than_two.groupby('session_id').count()
high_density_ids = set(selected_rows_high_density_list)
selected_rows_medium_density_list = [
    session_id
    for session_id in song_counts[song_counts > 2].index
    if session_id not in high_density_ids
]
selected_rows_medium_density = train_meta_df[train_meta_df['session_id'].isin(selected_rows_medium_density_list)]

# Low density: every session that is neither high nor medium.
assigned_ids = high_density_ids.union(selected_rows_medium_density_list)
selected_rows_low_density = train_meta_df[~train_meta_df['session_id'].isin(assigned_ids)]

print(len(selected_rows_high_density))
print(len(selected_rows_medium_density))
print(len(selected_rows_low_density))
# unique_counts = train_meta_df.groupby('session_id')['album_id'].nunique()
# under_4_counts = unique_counts[unique_counts <= 4].index.tolist()
# selected_rows = train_meta_df[train_meta_df['session_id'].isin(under_4_counts)]
# selected_rows_sorted = selected_rows.sort_values(['session_id', 'unix_played_at'])
# # 使用 drop_duplicates 保留每個 session_id 中相同 song_id 的最後一行
# final_selected_rows = selected_rows_sorted.drop_duplicates(['session_id', 'song_id'], keep='last')[['session_id','song_id']]
# session_dict = {}

# # 遍历数据框的每一行
# for index, row in final_selected_rows.iterrows():
#     session_id = row['session_id']
#     song_id = row['song_id']

#     # 如果session_id不在字典中，创建一个新的列表
#     if session_id not in session_dict:
#         session_dict[session_id] = []

#     # 将歌曲添加到session_id对应的列表中
#     session_dict[session_id].append(song_id)
#     # 创建一个字典，将 song_id 映射到专辑和艺术家

2319080
8649020


## 預測模型

## 使用 wandb 繪製訓練圖

In [None]:
!pip install wandb
%env WANDB_LOG_MODEL=true
import wandb
wandb.login()
#127b81750f2af55c121c057c14a44d8254de404f

env: WANDB_LOG_MODEL=true


[34m[1mwandb[0m: Currently logged in as: [33mnrnmnrn[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

## 創建新的或使用製作好的 tokenizer (後來發現不用做)

In [None]:
# #製作新的tokenizer
# train_df = pd.read_parquet(train_source_file_path)
# target_df = pd.read_parquet(train_target_file_path)

# i=0
# text=""
# while i<len(train_df):
#   for j in range(20):
#     text += train_df['song_id'][i+j]+" "
#   text += '\n'
#   i+=20

# i=0
# while i<len(target_df[:]):
#   for j in range(5):
#     text += target_df['song_id'][i+j]+" "
#   text += '\n'
#   i+=5

# file_path = "/content/drive/MyDrive/Datagame-2023/data/source_and_target_songs.txt"
# with open(file_path, "w", encoding="utf-8") as file:
#     file.write(text)

# from tokenizers import (
#     decoders,
#     models,
#     normalizers,
#     pre_tokenizers,
#     processors,
#     trainers,
#     Tokenizer,
# )

# def get_training_corpus():
#   with open(file_path, "r", encoding="utf-8") as file:
#     for line in file:
#       yield line

# tokenizer = Tokenizer(models.BPE())
# tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
# print(tokenizer.pre_tokenizer.pre_tokenize_str("Let's test pre-tokenization!"))
# trainer = trainers.BpeTrainer(vocab_size=660000, special_tokens=["<|endoftext|>"]) #sorce跟target的歌加起來不重複的歌共650000多首
# tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)

# tokenizer.model = models.BPE()
# tokenizer.train(["/content/drive/MyDrive/Datagame-2023/data/source_and_target_songs.txt"], trainer=trainer)
# encoding = tokenizer.encode("Let's test this tokenizer.")
# print(encoding.tokens)

# tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)
# sentence = "Let's test this tokenizer."
# encoding = tokenizer.encode(sentence)
# start, end = encoding.offsets[4]
# print(sentence[start:end])

# tokenizer.decoder = decoders.ByteLevel()
# print(tokenizer.decode(encoding.ids))

# tokenizer.save("/content/drive/MyDrive/Datagame-2023/data/source_and_target_songs_tokenizer.json")

# #取用存好的tokenizer使用
# tokenizer = Tokenizer.from_file("/content/drive/MyDrive/Datagame-2023/data/source_and_target_songs_tokenizer.json")
# encoding = tokenizer.encode(df['input'][0])
# tokenizer.decoder = decoders.WordPiece(prefix="##")
# print(tokenizer.decode(encoding))

SyntaxError: ignored

## 載入資料

In [None]:
# Load the training targets and collect, per session, the list of its target
# song_ids (one row per session, list stored in the 'song_id' column).
target_df = pd.read_parquet(train_target_file_path)
grouped_target = target_df.groupby('session_id')['song_id'].apply(list).reset_index()
# The raw frame is no longer needed.
del target_df

def make_input_and_label(df):
  """Build one (input, label) text pair per session.

  Args:
    df: play records with at least the columns 'session_id' and 'song_id'.

  Returns:
    A DataFrame with two string columns: 'input' (the session's song_ids from
    df joined by spaces, in row order) and 'label' (the session's target
    song_ids from the module-level grouped_target, joined by spaces).
    Sessions that have no entry in grouped_target are dropped, because a
    training pair without a label cannot be built.
  """
  grouped_songs = df.groupby('session_id')['song_id'].apply(list).reset_index(name='input')

  # Attach each session's target list and call it 'label'.
  grouped_songs = grouped_songs.merge(grouped_target, how='left', on='session_id')
  grouped_songs = grouped_songs.rename(columns={'song_id': 'label'})

  # The left merge leaves NaN for sessions without targets; joining NaN would
  # raise TypeError, so drop those rows first.
  grouped_songs = grouped_songs.dropna(subset=['label']).reset_index(drop=True)

  grouped_songs['input'] = grouped_songs['input'].apply(' '.join)
  grouped_songs['label'] = grouped_songs['label'].apply(' '.join)

  return grouped_songs.drop('session_id', axis=1)

In [None]:
# Build the (input, label) training pairs from the high-density sessions.
df = make_input_and_label(selected_rows_high_density)

### 自製index to song_id，後面發現tokenizer可以解決沒看過的字串的問題

In [None]:
# import pandas as pd
# import torch
# # input_20_songs_sequence = ['song_id_13 song_id_16 song_id_17 song_id_6 song_id_17 song_id_2 song_id_17 song_id_14 song_id_13 song_id_2 song_id_11 song_id_18 song_id_13 song_id_12 song_id_16 song_id_1 song_id_6 song_id_15 song_id_10 song_id_16', 'song_id_18 song_id_19 song_id_5 song_id_7 song_id_17 song_id_12 song_id_6 song_id_15 song_id_19 song_id_1 song_id_20 song_id_19 song_id_17 song_id_19 song_id_19 song_id_8 song_id_8 song_id_5 song_id_3 song_id_7']
# # output_5_songs_sequence = ['song_id_7 song_id_12 song_id_18 song_id_5 song_id_18', 'song_id_6 song_id_19 song_id_8 song_id_15 song_id_14']
# # data = {'input': input_20_songs_sequence, 'label': output_5_songs_sequence}
# # df = pd.DataFrame(data)
# # song_to_int = {f'song_id_{i}': i+1 for i in range(21)}
# # int_to_song = {i+1: f'song_id_{i+1}' for i in range(21)}
# # int_to_song[0] = '[PAD]'

# train_df = pd.read_parquet(train_source_file_path)
# meta_song_df = pd.read_parquet(meta_song_file_path)
# total_song_id = list(set(train_df['song_id'].to_list() + meta_song_df['song_id'].to_list()))
# total_song_size = len(total_song_id)
# del meta_song_df
# del train_df

# song_to_int = {total_song_id[i]: i+1 for i in range(total_song_size)}
# int_to_song = {i+1: total_song_id[i] for i in range(total_song_size)}
# int_to_song[0] = '[PAD]'

## 描述

In [None]:
#使用Huggingface的生成式模型，並訓練模型
#輸入為使用者聽的前20首歌，預測接下來會聽的5首歌
#預測的5首歌都要不同。若預測的5首歌有重複，則視為預測錯誤
#輸入的20首歌中，有可能有重複的歌曲

## 載入模型

In [None]:
# Load the pretrained tokenizer and causal language model.
from transformers import AutoTokenizer, AutoModelForCausalLM

checkpoint = "sshleifer/tiny-gpt2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint)

## 載入訓練資料

In [None]:
#載入訓練資料
from torch.utils.data import Dataset
from datetime import datetime
from sklearn.model_selection import train_test_split

class SongDataset(Dataset):
    """Training pairs for a causal language model.

    Each item is ``(input_ids, labels)``, two tensors of length ``max_len``:

    * ``input_ids`` is the tokenised input songs followed by the tokenised
      target songs, right-padded with 0;
    * ``labels`` repeats the target tokens at the same positions and holds
      ``IGNORE_INDEX`` everywhere else, so the loss is computed only on the
      songs the model has to predict.

    The original returned prompt and target as two separate sequences aligned
    position by position, which trains a causal model on unrelated token pairs.
    """

    # Label value that Hugging Face causal-LM heads exclude from the loss.
    IGNORE_INDEX = -100
    # Token id used to right-pad input_ids (same value the original used).
    PAD_ID = 0

    def __init__(self, df, tokenizer):
        """Store the frame (columns 'input' and 'label') and the tokenizer."""
        self.df = df
        self.tokenizer = tokenizer
        # Fixed length of every returned tensor.
        self.max_len = 500

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        # Use the tokenizer handed to the constructor, not the notebook global.
        prompt_ids = list(self.tokenizer.encode(row['input']))
        # The leading space makes the target tokenise the way it would as a
        # continuation of the prompt (the songs are space-separated).
        target_ids = list(self.tokenizer.encode(' ' + row['label']))[:self.max_len]

        # If the pair is too long, keep the target and the most recent part
        # of the prompt.
        room = self.max_len - len(target_ids)
        prompt_ids = prompt_ids[-room:] if room > 0 else []

        n_pad = self.max_len - len(prompt_ids) - len(target_ids)
        input_ids = prompt_ids + target_ids + [self.PAD_ID] * n_pad
        labels = (
            [self.IGNORE_INDEX] * len(prompt_ids)
            + target_ids
            + [self.IGNORE_INDEX] * n_pad
        )
        return torch.tensor(input_ids), torch.tensor(labels)

# Shuffle the pairs and split them 80 % / 20 % with a fixed seed, then wrap
# both parts in SongDataset (the names are reused for the wrapped datasets).
train_ratio = 0.8
train_dataset, eval_dataset = train_test_split(df, test_size=(1 - train_ratio), random_state=42, shuffle=True)
# train_dataset = SongDataset(df, tokenizer, song_to_int)
train_dataset = SongDataset(train_dataset, tokenizer)
eval_dataset = SongDataset(eval_dataset, tokenizer)

## 訓練模型


In [None]:
#訓練模型
from transformers import Trainer, TrainingArguments, TrainerCallback
from transformers import default_data_collator
import torch

def my_data_collator(features):
    """Batch ``(input_ids, labels)`` tensor pairs for the Trainer.

    Args:
        features: sequence of ``(input_ids, labels)`` pairs of 1-D tensors.

    Returns:
        dict with 'input_ids' and 'labels', each of shape
        (batch, longest sequence in the batch). Shorter input_ids are
        right-padded with 0; shorter labels are right-padded with -100 so the
        padded positions are left out of the loss (0 is a real token id and
        would be scored as a prediction target).
    """
    input_20_songs_sequence, output_5_songs_sequence = zip(*features)

    # Right-pad every sequence to the longest one in the batch.
    input_ids = torch.nn.utils.rnn.pad_sequence(
        input_20_songs_sequence, batch_first=True, padding_value=0
    )
    labels = torch.nn.utils.rnn.pad_sequence(
        output_5_songs_sequence, batch_first=True, padding_value=-100
    )

    return {
        'input_ids': input_ids,
        'labels': labels,
    }

class EarlyStoppingCallback(TrainerCallback):
    """Stop training when eval_loss has not improved for `patience` evaluations.

    The check runs in ``on_evaluate`` because the Trainer hands the fresh
    metrics to that event. The original read ``state.log_metrics`` in
    ``on_epoch_end``, but TrainerState has no such attribute, and evaluation
    here is scheduled by steps rather than by epoch.
    """

    def __init__(self, patience=3):
        # Evaluations without improvement tolerated before stopping.
        self.patience = patience
        # Consecutive evaluations without improvement so far.
        self.counter = 0
        self.best_loss = float('inf')

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        current_loss = (metrics or {}).get("eval_loss")
        if current_loss is None:
            # This evaluation reported no eval_loss: nothing to compare.
            return

        if current_loss < self.best_loss:
            self.best_loss = current_loss
            self.counter = 0
        else:
            self.counter += 1

        if self.counter >= self.patience:
            control.should_training_stop = True

# Training configuration: a single epoch with batch size 1, evaluation and
# checkpointing every 500 steps, the checkpoint with the lowest eval_loss
# restored at the end, and metrics reported to Weights & Biases.
training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/Datagame-2023/test/model',
    num_train_epochs=1,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=500,
    save_total_limit=1,
    evaluation_strategy='steps',
    eval_steps=500,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    greater_is_better=False,
    report_to="wandb"
)

# Update the Trainer instantiation to include the callback
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=my_data_collator,
    callbacks=[EarlyStoppingCallback(patience=5)]
)

# Run the training loop.
trainer.train()

## 使用範例

In [None]:
# NOTE(review): song_to_int / int_to_song appear only in a commented-out cell,
# and input_20_songs_sequence / max_length are not defined in any active cell
# visible here, so this example presumably cannot run as written; confirm.

# Convert the generated songs into a list of song ids
generated_song_ids = [song_to_int[song] for song in input_20_songs]

# Make sure the generated songs are all distinct
unique_generated_song_ids = list(set(generated_song_ids))

# If songs were repeated, generate replacements until there are 5 distinct ones
while len(unique_generated_song_ids) < 5:
    # Generate as many new sequences as there are missing songs
    missing_count = 5 - len(unique_generated_song_ids)
    missing_songs = model.generate(
        input_ids=input_20_songs_sequence,
        max_length=max_length,
        num_beams=5,
        no_repeat_ngram_size=2,
        early_stopping=True,
        num_return_sequences=missing_count
    ).tolist()

    # Add the newly generated songs to the list
    for song_seq in missing_songs:
        song_id = song_seq[-1]  # take the last element of each generated sequence
        unique_generated_song_ids.append(song_id)

    # Remove duplicates again
    unique_generated_song_ids = list(set(unique_generated_song_ids))

# Map the ids back through int_to_song
unique_generated_songs = [int_to_song[song_id] for song_id in unique_generated_song_ids]

print(unique_generated_songs)


In [None]:
# Prompt: the 20 song ids of one session, separated by spaces.
input_20_songs = '6027767fad949f3ca5e772df04924949 041547bddb0a3e730f32db84c65868ca 041547bddb0a3e730f32db84c65868ca 041547bddb0a3e730f32db84c65868ca 8b32f88104ecf859be934d9b45f30cd1 e4a125e3163e4c1bd40060614c79bd53 8b32f88104ecf859be934d9b45f30cd1 5ef6718f4517d2d3c316fc45226f41dc e4a125e3163e4c1bd40060614c79bd53 041547bddb0a3e730f32db84c65868ca e7efab54028017e35a35d1b1637e210c 3f8e8cbe4b5d55f07ba4c7ddfab624b7 3f8e8cbe4b5d55f07ba4c7ddfab624b7 3f8e8cbe4b5d55f07ba4c7ddfab624b7 3f8e8cbe4b5d55f07ba4c7ddfab624b7 3f8e8cbe4b5d55f07ba4c7ddfab624b7 a97177f0f37a2bae91d8e67831949392 6027767fad949f3ca5e772df04924949 6027767fad949f3ca5e772df04924949 6027767fad949f3ca5e772df04924949' #20 songs
# input_20_songs = [song_to_int[song] for song in input_20_songs.split(' ')]
# Tokenise the prompt, add a batch dimension and move it to the model's device.
input_20_songs = tokenizer.encode(input_20_songs)
input_20_songs = torch.tensor(input_20_songs)
input_20_songs = input_20_songs.unsqueeze(0)
input_20_songs = input_20_songs.to(model.device)

# Beam search returning 5 sequences; only the first one is decoded below.
# NOTE(review): the printed text appears to start with the prompt itself, so
# the generated songs would need to be sliced off after it; confirm.
output_5_songs = model.generate(input_ids=input_20_songs, max_length=500, num_beams=5, no_repeat_ngram_size=2, num_return_sequences=5, early_stopping=True)
output_5_songs = output_5_songs.tolist()[0]
# output_5_songs = [int_to_song[song] for song in output_5_songs]
output_5_songs = tokenizer.decode(output_5_songs)
print(output_5_songs)

6027767fad949f3ca5e772df04924949 041547bddb0a3e730f32db84c65868ca 041547bddb0a3e730f32db84c65868ca 041547bddb0a3e730f32db84c65868ca 8b32f88104ecf859be934d9b45f30cd1 e4a125e3163e4c1bd40060614c79bd53 8b32f88104ecf859be934d9b45f30cd1 5ef6718f4517d2d3c316fc45226f41dc e4a125e3163e4c1bd40060614c79bd53 041547bddb0a3e730f32db84c65868ca e7efab54028017e35a35d1b1637e210c 3f8e8cbe4b5d55f07ba4c7ddfab624b7 3f8e8cbe4b5d55f07ba4c7ddfab624b7 3f8e8cbe4b5d55f07ba4c7ddfab624b7 3f8e8cbe4b5d55f07ba4c7ddfab624b7 3f8e8cbe4b5d55f07ba4c7ddfab624b7 a97177f0f37a2bae91d8e67831949392 6027767fad949f3ca5e772df04924949 6027767fad949f3ca5e772df04924949 6027767fad949f3ca5e772df049249494950606061 61a7e7f9e6e99f5f6f61a6c6b6d6a61b61 6a62a63a64a1a5a65a66a6767666667 66666565666867 6767686869676969687069707071 71a71a72a73b71b73a74a75a76a77b77


In [None]:
# Number of space-separated pieces in the decoded text.
len(output_5_songs.split(" "))

25

#方法4 - 找規則劃分全部

##概念:
1. 該 session_id 經常重複聽的歌優先排在前面。
2. 若 session_id 的最後一首歌只聽過一次（在前 20 首之內），我認為接下來聽的歌來自系統推薦的歌單。因此找出其他 session_id 中同樣只播放過一次這首歌的紀錄（代表是系統推薦的，而不是該 session_id 自己想重複聽的），用來預測系統接下來會推薦的歌。

## 賦予變數

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display

# Locations of the Parquet files on the mounted Google Drive:
# train source/target sessions, test source sessions, and song metadata tables.
train_source_file_path = '/content/drive/MyDrive/Datagame-2023/data/label_train_source.parquet'
train_target_file_path = '/content/drive/MyDrive/Datagame-2023/data/label_train_target.parquet'
test_source_file_path = '/content/drive/MyDrive/Datagame-2023/data/label_test_source.parquet'
meta_song_file_path = '/content/drive/MyDrive/Datagame-2023/data/meta_song.parquet'
song_composer_file_path = '/content/drive/MyDrive/Datagame-2023/data/meta_song_composer.parquet'
song_genre_file_path = '/content/drive/MyDrive/Datagame-2023/data/meta_song_genre.parquet'
song_lyricist_file_path = '/content/drive/MyDrive/Datagame-2023/data/meta_song_lyricist.parquet'
song_producer_file_path = '/content/drive/MyDrive/Datagame-2023/data/meta_song_producer.parquet'
song_titletext_file_path = '/content/drive/MyDrive/Datagame-2023/data/meta_song_titletext.parquet'

In [None]:
# Stack the train source rows and the train target rows into one frame, then
# release the two inputs to save memory.
train_df = pd.read_parquet(train_source_file_path)
target_df = pd.read_parquet(train_target_file_path)
train_target = pd.concat([train_df, target_df], ignore_index=True)
del train_df, target_df

# Test source rows, also appended after the train rows.
test_df = pd.read_parquet(test_source_file_path)
train_target_test = pd.concat([train_target, test_df], ignore_index=True)

# Song metadata: a missing album/artist id becomes 0 and both ids become ints.
meta_song_df = pd.read_parquet(meta_song_file_path).assign(
    album_id=lambda meta: meta['album_id'].fillna(0).astype(int),
    artist_id=lambda meta: meta['artist_id'].fillna(0).astype(int),
)

# Attach artist_id / album_id to every play via song_id (left join keeps
# plays whose song has no metadata row).
train_target_test_meta = train_target_test.merge(
    meta_song_df[['song_id', 'artist_id', 'album_id']], on='song_id', how='left'
)
test_meta = test_df.merge(
    meta_song_df[['song_id', 'artist_id', 'album_id']], on='song_id', how='left'
)

del meta_song_df

In [None]:
# song_id_counts: 在session_id內，每個song_id出現的次數寫在count這個column裡
# Add a 'count' column: how many times each song_id occurs within its
# session_id. Done first for the test plays only ...
song_id_counts = (
    test_meta
    .groupby('session_id')['song_id']
    .value_counts()
    .reset_index(name='count')
)
test_meta = test_meta.merge(
    song_id_counts, how='left', on=['session_id', 'song_id']
)

# ... then for the combined train + target + test plays.
song_id_counts = (
    train_target_test_meta
    .groupby('session_id')['song_id']
    .value_counts()
    .reset_index(name='count')
)
train_target_test_meta = train_target_test_meta.merge(
    song_id_counts, how='left', on=['session_id', 'song_id']
)

In [None]:
# sorted_test_meta: for every test session, its replayed songs ordered from
# most to least played.
sorted_test_meta = test_meta.sort_values(
    by=['session_id', 'count'], ascending=[True, False]
)
# One row per (session_id, song_id) ...
sorted_test_meta = sorted_test_meta.drop_duplicates(subset=['session_id', 'song_id'])
# ... at most the first five songs of each session ...
sorted_test_meta = sorted_test_meta.groupby(['session_id']).head(5)
# ... and of those, only songs played more than once.
sorted_test_meta = sorted_test_meta[sorted_test_meta['count'] > 1]

# Number of retained songs per session (at most 5 because of head(5)), and
# the sessions that have at least one retained song.
test_session_count = sorted_test_meta['session_id'].value_counts()
test_session_list = test_session_count.index.to_list()

submission_data = {
    'session_id': [],
    'top1': [],
    'top2': [],
    'top3': [],
    'top4': [],
    'top5': [],
}

In [None]:
# submission_3 = pd.read_csv('/content/drive/MyDrive/Datagame-2023/test/submission.csv')
# not_included_test = test_df[~test_df['session_id'].isin(submission_3['session_id'])]
# not_included_test_list = not_included_test['session_id'].unique()

## 開始給 test 的每個 session_id 分配五首 song_id

In [None]:
def add_song_from_same_artist_or_album(artist_or_album):
  """Append follow-up songs for the current seed song to the session's list.

  The function works entirely on module-level state set by the driver loop:

  * ``test_lastone_song``        -- the seed song_id.
  * ``session_song_list``        -- rows of the current test session.
  * ``last_count_one_loc``       -- position of the seed song in those rows.
  * ``each_session_s_song_list`` -- recommendations collected so far
                                    (appended to in place).
  * ``empty_count``              -- number of slots still to fill
                                    (decremented once per appended song).
  * ``song_to_following_song``   -- cache: seed song_id -> songs found before.
  * ``train_target_test_meta``   -- all plays with artist_id/album_id/count.

  Steps:
    1. If the seed song is cached, reuse the cached songs, skipping any the
       session list already contains.
    2. Otherwise, or if slots remain, select plays that share the seed song's
       artist (or album) and were played exactly once in their session,
       restricted to sessions that contain the seed song among those plays.
       From every occurrence of the seed song, walk forward over consecutive
       row indices of the same session and add each song not yet in the list.

  Args:
    artist_or_album: ``'artist'`` to match on ``artist_id``; any other value
      matches on ``album_id``.

  Returns:
    None. Results are delivered through the module-level state above.
  """
  global empty_count  # shared with the driver loop
  print("1:",empty_count,'-------------------------------------------------')

  # Step 1: reuse cached follow-ups. Iterate over a copy because the driver
  # may have cached the very list we append to, and skip songs that are
  # already present so a song is never recommended twice.
  if test_lastone_song in song_to_following_song:
    for song in list(song_to_following_song[test_lastone_song]):
      if empty_count <= 0:
        break
      if song not in each_session_s_song_list:
        each_session_s_song_list.append(song)
        empty_count -= 1
  print("2:",empty_count)
  if empty_count <= 0:
    return

  # Step 2: search plays sharing the seed song's artist/album.
  group_column = "artist_id" if artist_or_album == 'artist' else "album_id"
  seed_group_id = session_song_list.iloc[last_count_one_loc][group_column]
  artist_result = train_target_test_meta[
      (train_target_test_meta[group_column] == seed_group_id) &
      (train_target_test_meta['count'] == 1)
  ]
  sessions_with_seed = artist_result[artist_result['song_id'] == test_lastone_song]['session_id'].to_list()
  artist_result = artist_result[artist_result['session_id'].isin(sessions_with_seed)]
  seed_indices = artist_result.index[artist_result['song_id'] == test_lastone_song]

  # Dict lookups give O(1) "is this row index part of the selection?" checks.
  song_by_index = artist_result['song_id'].to_dict()
  session_by_index = artist_result['session_id'].to_dict()

  for seed_index in seed_indices:
    seed_session = session_by_index[seed_index]
    temp_loc = seed_index + 1
    # Stop at the first row that is not selected or belongs to a different
    # session: such a row is not a consecutive play after the seed song.
    while temp_loc in song_by_index and session_by_index[temp_loc] == seed_session:
      candidate = song_by_index[temp_loc]
      if candidate not in each_session_s_song_list:
        each_session_s_song_list.append(candidate)
        empty_count -= 1
        if empty_count == 0:
          return
      temp_loc += 1

In [None]:
# song_to_following_song: cache from a seed ("last") song to the follow-up
# songs found for it, so a seed song that was already handled is not
# searched again.
song_to_following_song = {}

# famous_songs: every song_id, most played first (over train + target + test).
famous_songs = train_target_test_meta['song_id'].value_counts().index.to_list()

# Distinct test sessions, in order of first appearance.
test_df_session_list = test_df['session_id'].unique()

# Empty submission frame with the expected columns.
submission = pd.DataFrame(
    {column: {} for column in ('session_id', 'top1', 'top2', 'top3', 'top4', 'top5')}
)

In [None]:
# Give every test session five recommended songs.
#
# A session starts with its own replayed songs (sorted_test_meta). While
# slots remain (empty_count != 0), its plays are scanned from the last one
# backwards; only songs played exactly once in the session are used as seed
# songs (the author's assumption: a song played once came from the system's
# recommendation rather than from the listener's own taste). For each seed
# song, follow-ups are searched by album first, then by artist. Slots that
# are still empty afterwards are filled with the most popular songs.
#
# add_song_from_same_artist_or_album() communicates through module-level
# names, so the names it reads (each_session_s_song_list, empty_count,
# session_song_list, last_count_one_loc, test_lastone_song) are kept as-is.


def _drop_duplicate_recommendations():
  """Remove repeated songs from the session list and recompute empty_count."""
  global each_session_s_song_list, empty_count
  each_session_s_song_list = list(dict.fromkeys(each_session_s_song_list))
  empty_count = 5 - len(each_session_s_song_list)


# Group once instead of scanning the full frames for every session.
repeated_songs_by_session = sorted_test_meta.groupby('session_id')['song_id'].agg(list).to_dict()
test_rows_by_session = test_meta.groupby('session_id', sort=False)

# Rows are collected here and appended to `submission` in a single concat;
# concatenating inside the loop copies the whole frame once per session.
new_submission_rows = []

try:
  for ith, session_id in enumerate(test_df_session_list[:]):
    each_session_s_song_list = list(repeated_songs_by_session.get(session_id, []))
    empty_count = 5 - len(each_session_s_song_list)

    if empty_count != 0:
      # Select rows from test_meta itself (not via a mask built on test_df).
      session_song_list = test_rows_by_session.get_group(session_id)
      play_counts = session_song_list['count'].to_list()

      # Walk backwards over the session's actual rows, visiting each
      # single-play song exactly once.
      for last_count_one_loc in range(len(play_counts) - 1, -1, -1):
        if play_counts[last_count_one_loc] != 1:
          continue
        seed_row = session_song_list.iloc[last_count_one_loc]
        test_lastone_song = seed_row["song_id"]
        test_lastone_song_s_album = seed_row["album_id"]
        test_lastone_song_s_artist = seed_row["artist_id"]
        first_new_song = len(each_session_s_song_list)

        if test_lastone_song_s_album != 0:  # 0 means the song has no album
          add_song_from_same_artist_or_album("album")
          _drop_duplicate_recommendations()
        # Artist search: always when there is no album, otherwise only when
        # the song has an artist and slots remain.
        if empty_count > 0 and (test_lastone_song_s_album == 0 or test_lastone_song_s_artist != 0):
          add_song_from_same_artist_or_album("artist")
          _drop_duplicate_recommendations()

        # Cache only the songs found for this seed song, as an independent
        # list, and never replace a longer cached result with a shorter one.
        found_songs = each_session_s_song_list[first_new_song:]
        if len(found_songs) > len(song_to_following_song.get(test_lastone_song, ())):
          song_to_following_song[test_lastone_song] = found_songs

        if empty_count <= 0:
          break

      # Fallback: most popular songs not yet recommended. A single pass is
      # enough; looping again over an exhausted list would never terminate.
      for song in famous_songs:
        if empty_count <= 0:
          break
        if song not in each_session_s_song_list:
          each_session_s_song_list.append(song)
          empty_count -= 1

    # NOTE(review): the columns are rotated on purpose or by accident -- top1
    # takes list position 1 and top5 takes position 0, although the notebook
    # states that replayed songs should rank first. Kept as in the original;
    # confirm the intended order.
    new_submission_rows.append({
        'session_id': session_id,
        'top1': each_session_s_song_list[1],
        'top2': each_session_s_song_list[2],
        'top3': each_session_s_song_list[3],
        'top4': each_session_s_song_list[4],
        'top5': each_session_s_song_list[0],
    })

    print("ith:", ith, "/", each_session_s_song_list)
finally:
  # Runs on interruption too, so finished sessions are not lost.
  if new_submission_rows:
    submission = pd.concat([submission, pd.DataFrame(new_submission_rows)], ignore_index=True)

[1;30;43m串流輸出內容已截斷至最後 5000 行。[0m
2: 5
1: 5 -------------------------------------------------
2: 5
1: 5 -------------------------------------------------
2: 0
ith: 141589 / ['9d86e8ea4ee237d367bf8bb93323fae3', '95b14a765bbead300fc5f72c4e84563f', '2bf28572fe4ad26b065b3f9ba75e7cd5', 'dcdcb4508a13d7b667b40fe70fab914f', '45734e0b2998e7fa58fbd90088c1d4f2']
1: 4 -------------------------------------------------
2: 0
ith: 141590 / ['0aeab5535ef92caa360d8050c93ca6a3', 'af894be6f2cec95b9de30b0a128c83bd', 'f6e2cc63a61523660e397db2b55992fb', '6041c01127a16d6f7d9c626737725564', 'af894be6f2cec95b9de30b0a128c83bd']
ith: 141591 / ['8611f69900b00ba2417ac8fe5c47719d', '7d8733fb992fa81f648a47f7adf49aff', '6138f023aefdb0f2d4416a1897e8de35', '91a859d0601bd33ed394885de59e63fc', 'c678c08bbe49704e0c374068147efe74']
1: 5 -------------------------------------------------
2: 5
ith: 141592 / ['f81a55fe56976ea66e689a4eabacae1a', 'b85986ccfb199457059011c2f0e0f132', 'a809c9d2210f925f90ad99f659597856', 'c0affbbe6a0

In [None]:
submission['session_id'] = submission['session_id'].astype(int) #從float轉成int
submission.to_csv('/content/drive/MyDrive/Datagame-2023/test/submission.csv',index=False)