In [1]:
from sentence_transformers import SentenceTransformer
from config.common import cfg
import os
from os.path import join
import polars as pl
import pandas as pd
from tqdm.notebook import tqdm
import numpy as np

  return torch._C._cuda_getDeviceCount() > 0


In [5]:
news = pl.read_parquet(join(cfg['data_path'], 'news.parquet'))

In [6]:
train=pl.read_parquet(join(cfg['train_data_path'], 'behaviors.parquet'))
dev=pl.read_parquet(join(cfg['dev_data_path'], 'behaviors.parquet'))

In [7]:
news.columns

['news_id',
 'category',
 'subcategory',
 'title',
 'abstract',
 'url',
 'title_entities',
 'abstract_entities']

In [8]:
userID_mapping = {}
itemID_mapping = {}

for line in train.iter_rows(named=True):
    userID = line['user_id']
    history = line['history']
    for itemID in history:
        if userID not in userID_mapping:
            userID_mapping[userID] = len(userID_mapping) + 1

        if itemID not in itemID_mapping:
            itemID_mapping[itemID] = len(itemID_mapping) + 1
for line in dev.iter_rows(named=True):
    userID = line['user_id']
    history = line['history']
    for itemID in history:
        if userID not in userID_mapping:
            userID_mapping[userID] = len(userID_mapping) + 1

        if itemID not in itemID_mapping:
            itemID_mapping[itemID] = len(itemID_mapping) + 1
for itemID in news.iter_rows(named=True):
    itemID=itemID['news_id']
    if itemID not in itemID_mapping:
            itemID_mapping[itemID] = len(itemID_mapping) + 1

np.save(cfg['user_dict'], userID_mapping)
print("user_num:", len(userID_mapping))
print("the first five userID mapping:", list(userID_mapping.items())[:5])
np.save(cfg['item_dict'], itemID_mapping)
print("item_num:", len(itemID_mapping))
print("the first five itemID mapping:", list(itemID_mapping.items())[:5])


user_num: 94057
the first five userID mapping: [('U13740', 1), ('U91836', 2), ('U73700', 3), ('U34670', 4), ('U8125', 5)]
item_num: 65239
the first five itemID mapping: [('N55189', 1), ('N42782', 2), ('N34694', 3), ('N45794', 4), ('N18445', 5)]


In [12]:
data = pl.read_parquet(os.path.join(cfg['dev_data_path'], "behaviors_1.parquet"))

data_sorted = data.sort(["user_id", "time"])

# 对每个用户取最后一条记录
data = data_sorted.group_by("user_id").tail(1)

data.shape

ColumnNotFoundError: unable to find column "time"; valid columns: ["user_id", "history", "target"]

In [14]:
data.head()

user_id,history,target
str,list[i64],list[i64]
"""U78223""","[4570, 350, … 16034]",[16034]
"""U5885""","[1965, 5607, … 58593]",[58593]
"""U78886""","[1824, 1187, … 49992]",[49992]
"""U35301""","[1065, 4512, … 60644]",[60644]
"""U32668""","[5809, 3580, … 48776]",[48776]


In [15]:
def prepare_data(data_dict):
    "数据保存polar形式"
    rows = []
    for userID, item_sequenec in data_dict.items():
        history = item_sequenec[:-1]
        target = item_sequenec[-1]
        rows.append({'user_id': userID, 'history': history, 'target': target})
    return pl.DataFrame(rows)


"划分训练集，测试集，验证集"
train_data = {}
val_data = {}
test_data = {}

for row in data.iter_rows(named=True):
    userID = row["user_id"]
    item_sequence = [itemID_mapping[t] for t in  row["history"]]
    if len(item_sequence) > 2:
        train_data[userID] = item_sequence[:-2]
        val_data[userID] = item_sequence[:-1]
        test_data[userID] = item_sequence

print("training data:", list(train_data.items())[:5])
print("validation data:", list(val_data.items())[:5])
print("testing data:", list(test_data.items())[:5])

train_df = prepare_data(train_data)
print("\nTraining data shape:", train_df.shape)
print("the first 3 rows of training data:\n", train_df.head(3))
test_df = prepare_data(test_data)
print("\nTesting data shape:", test_df.shape)
print("the first 3 rows of testing data:\n", test_df.head(3))
val_df = prepare_data(val_data)
print("\nValidation data shape:", val_df.shape)
print("the first 3 rows of validation data:\n", val_df.head(3))

test_df.shape

KeyError: 4570

In [11]:
print("Data saved to parquet files.")
train_df.write_parquet(os.path.join(cfg['dev_data_path'], "train_df.parquet"))
test_df.write_parquet(os.path.join(cfg['dev_data_path'], "test_df.parquet"))
val_df.write_parquet(os.path.join(cfg['dev_data_path'], "valid_df.parquet"))

Data saved to parquet files.


In [7]:
model_path = cfg['emb_model_path']

# 加载本地模型
model = SentenceTransformer(model_path, device='cuda')

Loading weights:   0%|          | 0/99 [00:00<?, ?it/s]

In [8]:
item_embeddings = []
for row in tqdm(news.iter_rows(named=True),total=news.shape[0]):
    itemID = itemID_mapping[row['news_id']]

    semantics = f"Title: {row.get('title','')}\n"

    embedding = model.encode(semantics)
    item_embeddings.append({'ItemID': itemID, 'embedding': embedding.tolist()})

# Convert to DataFrame
item_emb_df = pd.DataFrame(item_embeddings)

print("\nItem embeddings DataFrame shape:", item_emb_df.shape)
print("The first 3 rows of item embeddings DataFrame:\n", item_emb_df.head(3))

# Save to parquet file
item_emb_df.to_parquet(join(cfg['embed_path'],'item_emb_title.parquet'), index=False)

  0%|          | 0/65238 [00:00<?, ?it/s]

KeyboardInterrupt: 