In [None]:
pip install tensorflow-recommenders

Collecting tensorflow-recommenders
  Downloading tensorflow_recommenders-0.7.3-py3-none-any.whl.metadata (4.6 kB)
Downloading tensorflow_recommenders-0.7.3-py3-none-any.whl (96 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.2/96.2 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tensorflow-recommenders
Successfully installed tensorflow-recommenders-0.7.3


In [None]:
# pip uninstall -y tensorflow

Found existing installation: tensorflow 2.18.0
Uninstalling tensorflow-2.18.0:
  Successfully uninstalled tensorflow-2.18.0


In [None]:
# pip install tensorflow=="2.15.0"

Collecting tensorflow==2.15.0
  Downloading tensorflow-2.15.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting ml-dtypes~=0.2.0 (from tensorflow==2.15.0)
  Downloading ml_dtypes-0.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)
Collecting numpy<2.0.0,>=1.23.5 (from tensorflow==2.15.0)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Collecting protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3 (from tensorflow==2.15.0)
  Downloading protobuf-4.25.8-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Collecting wrapt<1.15,>=1.11.0 (from tensorflow==2.15.0)
  Downloading wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting tenso

In [None]:
# `os` was previously only imported in a later cell, so this cell raised
# NameError on a fresh kernel. The flag is set here, ahead of the first
# `import tensorflow` in the notebook.
import os

os.environ["TF_USE_LEGACY_KERAS"] = "1"

In [None]:
# Import the Google Drive helper package
from google.colab import drive
# Mount the user's own Google Drive at /content/gdrive/
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [None]:
import tensorflow as tf

# Report how many GPU devices TensorFlow can list in this runtime.
visible_gpus = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(visible_gpus))


Num GPUs Available:  1


In [None]:
import tensorflow as tf

# Restrict TensorFlow to the first listed GPU with memory growth enabled;
# a RuntimeError from either config call is printed instead of raised.
gpus = tf.config.list_physical_devices('GPU')
if not gpus:
    print('No GPU found')
else:
    try:
        first_gpu = gpus[0]
        tf.config.experimental.set_memory_growth(first_gpu, True)
        tf.config.set_visible_devices(first_gpu, 'GPU')
        print('Using GPU')
    except RuntimeError as e:
        print(e)


Using GPU


gdrive/My Drive/recommendation/

In [None]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_recommenders as tfrs
from sklearn.decomposition import PCA
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import MultiHeadAttention, LayerNormalization

# ---------------------------------------------------------------------
# 1. Load and prepare the data
# ---------------------------------------------------------------------
# --- Load data ---
df_project = pd.read_parquet("gdrive/My Drive/recommendation/recommend_project.parquet")
df_subscribe = pd.read_csv("gdrive/My Drive/recommendation/recommend_subscribe.csv")
df_member = pd.read_csv("gdrive/My Drive/recommendation/recommend_member.csv")
df_member_tag = pd.read_csv('gdrive/My Drive/recommendation/recommend_member_tag.csv')
df_project_sub_category = pd.read_csv("gdrive/My Drive/recommendation/recommend_project_sub_category.csv")
df_review = pd.read_csv('gdrive/My Drive/recommendation/recommend_review.csv')
df_follower = pd.read_csv('gdrive/My Drive/recommendation/recommend_follow.csv')
df_member_click_6_months = pd.read_csv("gdrive/My Drive/recommendation/recommend_click.csv")


# --- member features ---
df_member["gender"] = df_member["gender"].fillna("Unknown")
df_member["gender_encoded"] = df_member["gender"].astype("category").cat.codes

# Birthdays before 1900 or in the future are treated as missing; missing ages
# are imputed with the median age.
today = pd.Timestamp.today()
df_member['birthday'] = pd.to_datetime(df_member['birthday'], errors='coerce')
valid_bday = (df_member['birthday'] >= pd.Timestamp('1900-01-01')) & (df_member['birthday'] <= today)
df_member.loc[~valid_bday, 'birthday'] = pd.NaT
df_member['age'] = (today - df_member['birthday']).dt.days / 365.25
df_member['age'] = df_member['age'].fillna(df_member['age'].median())

# The last bin is open-ended. With an upper edge of 100, members aged 100+
# (birthdays back to 1900 are accepted above) fell outside every bin, became
# NaN and were then filled into bucket 0, i.e. the youngest bucket.
df_member['age_bucket'] = pd.cut(
    df_member['age'], bins=[0, 18, 25, 35, 50, np.inf], labels=False, right=False)
df_member['age_bucket'] = df_member['age_bucket'].fillna(0).astype(int)

# Attach each member's declared tag ids as a list of strings; members without
# any tag get the placeholder list ['0'].
user_tags = df_member_tag.groupby('member_id')['type_id'].agg(list).reset_index()
df_member = df_member.merge(user_tags, on='member_id', how='left')
df_member = df_member.rename(columns={'type_id': 'user_tag_ids'})
df_member['user_tag_ids'] = df_member['user_tag_ids'].apply(
    lambda tags: [str(t) for t in tags] if isinstance(tags, list) else ['0'])
count_before = df_member['user_tag_ids'].apply(lambda tags: tags == ['0']).sum()
print(f"修改前，有 {count_before} 位用戶的 user_tag_ids 是 ['0']")

# ---------------------------------------------------------------------
# --- Infer implicit interest tags from each user's purchase history ---
# ---------------------------------------------------------------------
print("\n--- 開始從購買歷史推斷用戶興趣標籤 ---")
# One list of sub-categories per project; projects without any get [].
item_sub_cate = df_project_sub_category.groupby('project_id')['sub_category'].agg(list).reset_index()
df_project = df_project.merge(item_sub_cate, on='project_id', how='left')
df_project = df_project.rename(columns={'sub_category': 'sub_categories'})
df_project['sub_categories'] = df_project['sub_categories'].apply(lambda x: x if isinstance(x, list) else [])

# Attach the purchased project's sub-categories to every subscription row.
df_project_for_tags = df_project[['project_id', 'sub_categories']]
df_subscribe_with_cate = pd.merge(df_subscribe, df_project_for_tags, on='project_id', how='left')

def flatten_and_unique(series_of_lists):
    """Return the distinct elements found across all lists in a Series.

    Missing entries are dropped before flattening. The result is a plain list
    with duplicates removed; its order is whatever set iteration yields.
    """
    distinct = set(
        element
        for bucket in series_of_lists.dropna()
        for element in bucket
    )
    return list(distinct)

# Per member: the distinct sub-categories of everything they subscribed to.
inferred_tags = (
    df_subscribe_with_cate
    .groupby('member_id')['sub_categories']
    .apply(flatten_and_unique)
    .reset_index()
    .rename(columns={'sub_categories': 'inferred_user_tags'})
)

# Members without any subscription end up with an empty list instead of NaN.
df_member = df_member.merge(inferred_tags, on='member_id', how='left')
df_member['inferred_user_tags'] = [
    tags if isinstance(tags, list) else []
    for tags in df_member['inferred_user_tags']
]

def update_tags(row):
    """Combine a member's declared tags with tags inferred from purchases.

    Args:
        row: mapping with 'user_tag_ids' (list of tag ids, ['0'] meaning
            "no declared tag") and 'inferred_user_tags' (possibly empty list).

    Returns:
        - the original 'user_tag_ids' when nothing was inferred;
        - the inferred tags alone when the member only had the ['0'] placeholder;
        - otherwise the union of both.
        Whenever inferred tags are used, the result is a sorted list of unique
        strings.
    """
    inferred = [str(tag) for tag in row['inferred_user_tags']]
    if not inferred:
        return row['user_tag_ids']

    if row['user_tag_ids'] == ['0']:
        return sorted(set(inferred))

    # Stringify BEFORE de-duplicating: otherwise an inferred 5 and a declared
    # '5' both survive the set and turn into a duplicate '5' once the caller
    # casts everything to str. Sorting makes the output deterministic.
    declared = [str(tag) for tag in row['user_tag_ids']]
    return sorted(set(inferred) | set(declared))

df_member['user_tag_ids'] = df_member.apply(update_tags, axis=1)
df_member['user_tag_ids'] = df_member['user_tag_ids'].apply(lambda tags: [str(t) for t in tags])
count_after = df_member['user_tag_ids'].apply(lambda tags: tags == ['0']).sum()
print(f"修改後，剩下 {count_after} 位用戶的 user_tag_ids 是 ['0']")
print("--- 用戶興趣標籤推斷完成 ---")

# --- project features ---
# Reload the raw project table so the feature pipeline below starts from the
# file contents rather than from the frame modified during tag inference.
df_project = pd.read_parquet("gdrive/My Drive/recommendation/recommend_project.parquet")

# Stack the per-project vectors into one 2-D matrix (much cheaper than
# expanding every row through pd.Series) and reduce it to 256 dimensions.
embedding_matrix = np.stack(df_project['content_embedding'].to_numpy())
pca = PCA(n_components=256, random_state=42)  # seeded so re-runs give the same projection
reduced_embeddings = pca.fit_transform(embedding_matrix)

# Store the reduced vectors back, one float32 array per project.
df_project['content_embedding'] = reduced_embeddings.tolist()
df_project['content_embedding'] = df_project['content_embedding'].apply(lambda x: np.asarray(x, dtype=np.float32))

# Sub-categories as a list of strings per project. Projects without any get the
# placeholder ['0'] rather than []: these lists are later embedded and average-
# pooled as ragged rows, and averaging an empty row is 0/0.
item_sub_cate = df_project_sub_category.groupby('project_id')['sub_category'].agg(list).reset_index()
df_project = df_project.merge(item_sub_cate, on='project_id', how='left')
df_project = df_project.rename(columns={'sub_category': 'sub_categories'})
df_project['sub_categories'] = df_project['sub_categories'].apply(
    lambda cats: [str(c) for c in cats] if isinstance(cats, list) else ['0'])

# Days since launch; missing start dates are imputed with the median.
df_project['start_date'] = pd.to_datetime(df_project['start_date'])
df_project['launch_age'] = (today - df_project['start_date']).dt.days
df_project['launch_age'] = df_project['launch_age'].fillna(df_project['launch_age'].median())

review_count = df_review.groupby('project_id').size().rename('num_reviews')
follower_count = df_follower.groupby('project_id').size().rename('total_followers')
df_project = df_project.merge(review_count, on='project_id', how='left')
df_project = df_project.merge(follower_count, on='project_id', how='left')

stat_cols = ['purchase_cnt', 'rating', 'num_reviews', 'total_followers','launch_age']
df_project[stat_cols] = df_project[stat_cols].fillna(0)
for col in stat_cols:
    df_project[col] = pd.to_numeric(df_project[col], errors='coerce').fillna(0)
    # Clip at 0 before the log: launch_age is negative for projects whose
    # start_date lies in the future, and log1p of a value <= -1 yields NaN/-inf
    # (a RuntimeWarning was raised here), which would poison training.
    df_project['log_' + col] = np.log1p(df_project[col].clip(lower=0))

df_project = df_project[["project_id", "content_embedding","main_category","log_launch_age","log_purchase_cnt","log_rating","log_num_reviews","log_total_followers","sub_categories"]]

# ---------------------------------------------------------------------
# 2. Click history and cold-start features
# ---------------------------------------------------------------------
print("\n--- 處理點擊歷史與冷啟動特徵 ---")

# Parse the two event timestamps used for ordering later on.
df_subscribe['subscribe_date'] = pd.to_datetime(df_subscribe['subscribe_date'])
df_member_click_6_months['record_date'] = pd.to_datetime(df_member_click_6_months['record_date'])

# member_id -> set of project_ids that member has subscribed to.
purchased_by_member = df_subscribe.groupby('member_id')['project_id'].apply(set)
purchased_projects = purchased_by_member.to_dict()
def filter_purchased_clicks(row):
    """Tell whether a click row should be kept.

    A click is kept (True) unless the clicking member appears in
    `purchased_projects` and the clicked project is among their purchases.
    """
    already_bought = purchased_projects.get(row['member_id'])
    if already_bought is None:
        return True
    return row['project_id'] not in already_bought
# Drop clicks on projects the member ended up purchasing.
df_click_filtered = df_member_click_6_months[df_member_click_6_months.apply(filter_purchased_clicks, axis=1)].copy()

# Keep each member's most recent MAX_SEQ_LENGTH clicks, oldest first.
# The frame is sorted once and `groupby().tail()` preserves that row order, so
# no per-group DataFrame apply is needed (that form raised a pandas warning
# about operating on the grouping columns).
df_click_filtered = df_click_filtered.sort_values(['member_id', 'record_date'])
MAX_SEQ_LENGTH = 50
recent_clicks = df_click_filtered.groupby('member_id').tail(MAX_SEQ_LENGTH)
# Project ids are stored as strings: the sequences are later padded with the
# string "0" and converted to a tf.string tensor, which needs uniform types.
user_click_sequences = (
    recent_clicks.assign(project_id=recent_clicks['project_id'].astype(str))
    .groupby('member_id')['project_id']
    .agg(list)
    .to_dict()
)


df_member['click_sequence'] = df_member['member_id'].map(user_click_sequences)
df_member['click_sequence'] = df_member['click_sequence'].apply(lambda x: x if isinstance(x, list) else [])

# A member with no remaining clicks is flagged as a cold-start user.
df_member['num_clicks'] = df_member['click_sequence'].apply(len)
df_member['is_cold_user'] = (df_member['num_clicks'] == 0).astype(np.float32)

print(f"總用戶數: {len(df_member)}")
print(f"平均點擊次數: {df_member['num_clicks'].mean():.2f}")
print(f"冷啟動用戶數 (is_cold_user=1): {(df_member['is_cold_user'] == 1).sum()}")
print(f"暖啟動用戶數 (is_cold_user=0): {(df_member['is_cold_user'] == 0).sum()}")


# --- Build the interaction table ---
df_subscribe = pd.merge(df_member, df_subscribe, on='member_id', how='right')
df_subscribe = pd.merge(df_project, df_subscribe, on='project_id')

# The right join keeps subscriptions whose member is absent from df_member;
# their list-valued member features come back as NaN. Give them the same
# defaults a member without tags/clicks would have.
df_subscribe['user_tag_ids'] = df_subscribe['user_tag_ids'].apply(
    lambda tags: tags if isinstance(tags, list) else ['0'])
df_subscribe['click_sequence'] = df_subscribe['click_sequence'].apply(
    lambda seq: seq if isinstance(seq, list) else [])
df_subscribe['is_cold_user'] = df_subscribe['is_cold_user'].fillna(1.0).astype(np.float32)

df_subscribe = df_subscribe[[
    'member_id', 'project_id', 'gender_encoded', 'main_category', 'content_embedding',
    'age_bucket', 'sub_categories', 'user_tag_ids', 'click_sequence',
    'is_cold_user',
    'log_launch_age', 'log_purchase_cnt', 'log_rating', 'log_num_reviews', 'log_total_followers',
    'subscribe_date'
]]

# One row per (member, project), ordered by time so the later take/skip split
# is chronological. Assignment form avoids mutating a column slice in place.
df_subscribe = df_subscribe.drop_duplicates(subset=["member_id", "project_id"])
df_subscribe = df_subscribe.sort_values("subscribe_date").reset_index(drop=True)

print(f"最終互動資料表 shape: {df_subscribe.shape}")
print(f"互動資料表欄位: {df_subscribe.columns.tolist()}")

# ---------------------------------------------------------------------
# 3. 建立 tf.data 資料集
# ---------------------------------------------------------------------
def pad_sequence(seq, max_length=MAX_SEQ_LENGTH, pad_value="0"):
    """Left-pad (or left-truncate) a click history to exactly `max_length` items.

    Args:
        seq: iterable of project ids; None or a float NaN (what pandas leaves
            for a missing history) is treated as an empty history.
        max_length: length of the returned list.
        pad_value: token inserted on the left when the history is too short.

    Returns:
        A list of `max_length` entries. Ids are converted to `str` so the
        result has one element type alongside the string pad token; the caller
        turns these lists into a tf.string tensor.
    """
    if seq is None or isinstance(seq, float):
        items = []
    else:
        items = [str(item) for item in seq]

    if len(items) >= max_length:
        # Explicit start index: items[-0:] would return the whole list.
        return items[len(items) - max_length:]
    return [pad_value] * (max_length - len(items)) + items

# Pad/truncate every interaction's click history via pad_sequence so the
# sequences can be stored as one rectangular tensor below.
padded_click_sequences = [pad_sequence(seq) for seq in df_subscribe["click_sequence"]]

# One entry per (member, project) interaction. Categorical features are cast to
# str; list-valued features become ragged tensors.
interactions_dict = {
    "member_id"        : df_subscribe["member_id"].astype(str).values,
    "gender_encoded"   : df_subscribe["gender_encoded"].astype(str).values,
    "project_id"       : df_subscribe["project_id"].astype(str).values,
    "main_category"    : df_subscribe["main_category"].astype(str).values,
    "content_embedding": np.stack(df_subscribe["content_embedding"].values),
    "age_bucket"       : df_subscribe["age_bucket"].astype(str).values,
    "sub_categories"   : tf.ragged.constant(df_subscribe["sub_categories"].to_list()),
    "user_tag_ids"     : tf.ragged.constant(df_subscribe["user_tag_ids"].to_list()),
    # NOTE(review): presumably needs every element to be a str — confirm the click ids are strings.
    "click_sequence"   : tf.constant(padded_click_sequences, dtype=tf.string),
    "is_cold_user"     : df_subscribe["is_cold_user"].values,
    "log_launch_age"   : df_subscribe["log_launch_age"].astype(float).values,
    "log_purchase_cnt" : df_subscribe["log_purchase_cnt"].astype(float).values,
    "log_rating"       : df_subscribe["log_rating"].astype(float).values,
    "log_num_reviews"  : df_subscribe["log_num_reviews"].astype(float).values,
    "log_total_followers": df_subscribe["log_total_followers"].astype(float).values,
}

interactions = tf.data.Dataset.from_tensor_slices(interactions_dict)

# Project dataset: one entry per project in df_project (the candidate corpus).
projects_dict = {
    "project_id"       : df_project["project_id"].astype(str).values,
    "main_category"    : df_project["main_category"].astype(str).values,
    "content_embedding": np.stack(df_project["content_embedding"].values),
    "sub_categories": tf.ragged.constant(df_project["sub_categories"].to_list()),
    "log_launch_age"   : df_project["log_launch_age"].astype(float).values,
    "log_purchase_cnt" : df_project["log_purchase_cnt"].astype(float).values,
    "log_rating"       : df_project["log_rating"].astype(float).values,
    "log_num_reviews"  : df_project["log_num_reviews"].astype(float).values,
    "log_total_followers": df_project["log_total_followers"].astype(float).values,
}
projects = tf.data.Dataset.from_tensor_slices(projects_dict)

# ---------------------------------------------------------------------
# 4. Build the vocabularies
# ---------------------------------------------------------------------
# Each vocabulary is collected by iterating the dataset in batches of 1,000.
# Member-side vocabularies come from the interactions, project-side ones from
# the full project corpus.
unique_member_ids = np.unique(np.concatenate(list(interactions.batch(1_000).map(lambda x: x["member_id"]))))
unique_genders = np.unique(np.concatenate(list(interactions.batch(1_000).map(lambda x: x["gender_encoded"]))))
unique_project_ids = np.unique(np.concatenate(list(projects.batch(1_000).map(lambda x: x["project_id"]))))
unique_main_categories = np.unique(np.concatenate(list(projects.batch(1_000).map(lambda x: x["main_category"]))))
unique_age_buckets = np.unique(np.concatenate(list(interactions.batch(1_000).map(lambda x: x["age_bucket"]))))

# Vocabulary for the sequence encoder: every project id seen in a click history
# plus every project id of the corpus, as sorted strings. Ids that come back as
# bytes are decoded first; the empty string is dropped.
all_click_projects = set()
for seq in df_subscribe["click_sequence"]:
    all_click_projects.update(seq)
project_ids_as_strings = [id.decode('utf-8') if isinstance(id, bytes) else str(id) for id in unique_project_ids.tolist()]
all_click_projects.update(project_ids_as_strings)
all_click_projects.discard('')
unique_sequential_project_ids = sorted([str(id) for id in all_click_projects])

# Distinct sub-categories (from the project corpus) and distinct user tag ids
# (from the interactions), taken from the flattened ragged values.
ragged_sub_categories = tf.ragged.constant(df_project["sub_categories"].to_list())
flat_sub_cats = ragged_sub_categories.flat_values
unique_sub_categories = np.unique(flat_sub_cats.numpy())
ragged_user_tag_ids = tf.ragged.constant(df_subscribe["user_tag_ids"].to_list())
flat_user_tag_ids = ragged_user_tag_ids.flat_values
unique_user_tag_ids = np.unique(flat_user_tag_ids.numpy())

# Width of each categorical feature embedding / of the click-sequence encoder,
# and the numeric project columns that ProjectModel stacks into one vector.
EMBEDDING_DIM = 32
SEQ_EMBEDDING_DIM = 64
NUMERIC_FEATURES = ["log_launch_age", "log_purchase_cnt", "log_rating", "log_num_reviews", "log_total_followers"]

# ---------------------------------------------------------------------
# 5. SASRec Encoder
# ---------------------------------------------------------------------
class SASRecEncoder(tf.keras.layers.Layer):
    """Self-attentive encoder that turns a padded click sequence into one vector.

    Input is a (batch, seq_len) tensor of project-id strings where "0" marks
    padding. Items are looked up, embedded, summed with a learned positional
    embedding, passed through `num_blocks` attention + feed-forward blocks, and
    finally mean-pooled over the non-padding positions.

    Args:
        vocab_size: stored on the instance only; the lookup vocabulary itself
            is taken from the module-level `unique_sequential_project_ids`.
        embedding_dim: width of item/positional embeddings and of the output.
        max_seq_length: number of positions in the positional embedding table.
        num_heads: attention heads per block.
        num_blocks: number of stacked attention/feed-forward blocks.
        dropout_rate: dropout applied after attention and after feed-forward.
    """
    def __init__(self, vocab_size, embedding_dim, max_seq_length,
                 num_heads=4, num_blocks=2, dropout_rate=0.1, **kwargs):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.max_seq_length = max_seq_length
        self.num_heads = num_heads
        self.num_blocks = num_blocks
        self.dropout_rate = dropout_rate

        # "0" is the padding token and maps to index 0, which the embedding
        # layer reports as masked (mask_zero=True).
        self.item_lookup = tf.keras.layers.StringLookup(
            vocabulary=unique_sequential_project_ids, mask_token="0", name="sasrec_item_lookup")
        self.item_embedding = tf.keras.layers.Embedding(
            self.item_lookup.vocabulary_size(), embedding_dim, mask_zero=True)
        self.positional_embedding = tf.keras.layers.Embedding(
            max_seq_length, embedding_dim)

        self.attention_blocks = []
        self.feed_forward_blocks = []
        self.layer_norms_1 = []
        self.layer_norms_2 = []
        self.dropouts = []

        for _ in range(num_blocks):
            self.attention_blocks.append(
                MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dim//num_heads)
            )
            self.feed_forward_blocks.append(
                tf.keras.Sequential([
                    tf.keras.layers.Dense(embedding_dim * 4, activation='relu'),
                    tf.keras.layers.Dense(embedding_dim)
                ])
            )
            self.layer_norms_1.append(LayerNormalization())
            self.layer_norms_2.append(LayerNormalization())
            self.dropouts.append(tf.keras.layers.Dropout(dropout_rate))

    def call(self, inputs, training=None):
        """Encode `inputs` (batch, seq_len) of id strings into (batch, embedding_dim)."""
        seq_length = tf.shape(inputs)[1]
        item_ids = self.item_lookup(inputs)
        item_emb = self.item_embedding(item_ids)
        positions = tf.range(seq_length)
        pos_emb = tf.expand_dims(self.positional_embedding(positions), 0)
        sequence_emb = item_emb + pos_emb

        # True where the position holds a real item, False for padding.
        padding_mask = self.item_embedding.compute_mask(item_ids)

        # Attention mask of shape (batch, query, key): a query may attend to a
        # key only if the key is not in the future (causal) AND is not padding.
        # Previously only the causal part was applied, so real items attended
        # to the left-padding positions.
        causal_mask = tf.cast(
            tf.linalg.band_part(tf.ones((seq_length, seq_length)), -1, 0), tf.bool)
        attention_mask = tf.logical_and(
            causal_mask[tf.newaxis, :, :], padding_mask[:, tf.newaxis, :])

        for i in range(self.num_blocks):
            attn_output = self.attention_blocks[i](
                sequence_emb, sequence_emb, attention_mask=attention_mask, training=training)
            attn_output = self.dropouts[i](attn_output, training=training)
            sequence_emb = self.layer_norms_1[i](sequence_emb + attn_output)

            ff_output = self.feed_forward_blocks[i](sequence_emb)
            ff_output = self.dropouts[i](ff_output, training=training)
            sequence_emb = self.layer_norms_2[i](sequence_emb + ff_output)

        # Masked mean over time; the epsilon keeps all-padding rows at zero
        # instead of dividing by zero.
        mask = tf.expand_dims(tf.cast(padding_mask, tf.float32), -1)
        sequence_emb = sequence_emb * mask
        pooled = tf.reduce_sum(sequence_emb, axis=1) / (tf.reduce_sum(mask, axis=1) + 1e-9)

        return pooled

# ---------------------------------------------------------------------
# 6. User Model (動靜態雙模式與門控)
# ---------------------------------------------------------------------
class StaticUserModel(tf.keras.Model):
    """User model that only uses static features; meant for cold-start users."""
    def __init__(self, dynamic_model_output_dim):
        super().__init__()

        def lookup_then_embed(vocabulary, *trailing_layers):
            # StringLookup -> Embedding (+1 row for the OOV index), optionally
            # followed by extra layers such as pooling.
            return tf.keras.Sequential([
                tf.keras.layers.StringLookup(vocabulary=vocabulary, mask_token=None),
                tf.keras.layers.Embedding(len(vocabulary) + 1, EMBEDDING_DIM),
                *trailing_layers,
            ])

        self.user_emb = lookup_then_embed(unique_member_ids)
        self.gender_emb = lookup_then_embed(unique_genders)
        self.age_bucket_emb = lookup_then_embed(unique_age_buckets)
        self.user_tag_ids_emb = lookup_then_embed(
            unique_user_tag_ids, tf.keras.layers.GlobalAveragePooling1D())
        # Projection layer: maps the concatenated static features to the same
        # width as the dynamic model's output.
        self.projection = tf.keras.layers.Dense(
            dynamic_model_output_dim, activation='relu', name='static_projection')

    def call(self, inputs):
        member_vec = self.user_emb(inputs["member_id"])
        gender_vec = self.gender_emb(inputs["gender_encoded"])
        age_vec = self.age_bucket_emb(inputs["age_bucket"])
        tag_vec = self.user_tag_ids_emb(inputs["user_tag_ids"])
        static_features = tf.concat([member_vec, gender_vec, age_vec, tag_vec], axis=1)
        return self.projection(static_features)

class DynamicUserModel(tf.keras.Model):
    """User model combining static and click-sequence features; meant for warm users."""
    def __init__(self):
        super().__init__()

        def lookup_then_embed(vocabulary, *trailing_layers):
            # StringLookup -> Embedding (+1 row for the OOV index), optionally
            # followed by extra layers such as pooling.
            return tf.keras.Sequential([
                tf.keras.layers.StringLookup(vocabulary=vocabulary, mask_token=None),
                tf.keras.layers.Embedding(len(vocabulary) + 1, EMBEDDING_DIM),
                *trailing_layers,
            ])

        # Static feature layers
        self.user_emb = lookup_then_embed(unique_member_ids)
        self.gender_emb = lookup_then_embed(unique_genders)
        self.age_bucket_emb = lookup_then_embed(unique_age_buckets)
        self.user_tag_ids_emb = lookup_then_embed(
            unique_user_tag_ids, tf.keras.layers.GlobalAveragePooling1D())
        # Sequence feature layer
        self.sequential_encoder = SASRecEncoder(
            vocab_size=len(unique_sequential_project_ids),
            embedding_dim=SEQ_EMBEDDING_DIM,
            max_seq_length=MAX_SEQ_LENGTH)

    def call(self, inputs):
        member_vec = self.user_emb(inputs["member_id"])
        gender_vec = self.gender_emb(inputs["gender_encoded"])
        age_vec = self.age_bucket_emb(inputs["age_bucket"])
        tag_vec = self.user_tag_ids_emb(inputs["user_tag_ids"])
        static_features = tf.concat([member_vec, gender_vec, age_vec, tag_vec], axis=1)
        sequential_features = self.sequential_encoder(inputs["click_sequence"])
        return tf.concat([static_features, sequential_features], axis=1)

    @property
    def output_dim(self):
        # Four static embeddings plus the sequence encoder output.
        return SEQ_EMBEDDING_DIM + (4 * EMBEDDING_DIM)

class GatedUserModel(tf.keras.Model):
    """Gated user model: picks the static or dynamic embedding per row via is_cold_user."""
    def __init__(self):
        super().__init__()
        # The dynamic model is built first because the static model projects
        # to its output width.
        self.dynamic_model = DynamicUserModel()
        self.static_model = StaticUserModel(self.dynamic_model.output_dim)

    def call(self, inputs):
        # Boolean gate reshaped to (batch_size, 1) so it broadcasts across the
        # embedding axis inside tf.where.
        cold_gate = tf.reshape(tf.cast(inputs["is_cold_user"], tf.bool), [-1, 1])

        static_embedding = self.static_model(inputs)
        dynamic_embedding = self.dynamic_model(inputs)

        # Routing: rows whose gate is True take the static embedding, all
        # other rows take the dynamic embedding.
        return tf.where(cold_gate, static_embedding, dynamic_embedding)

# ---------------------------------------------------------------------
# 7. Project Model
# ---------------------------------------------------------------------
class ProjectModel(tf.keras.Model):
    """Project feature model.

    Concatenates five EMBEDDING_DIM-wide pieces: project-id embedding, main-
    category embedding, a linear projection of the content embedding, the
    average-pooled sub-category embeddings, and a ReLU projection of the
    numeric features listed in NUMERIC_FEATURES.
    """
    def __init__(self):
        super().__init__()
        lookup = tf.keras.layers.StringLookup
        embed = tf.keras.layers.Embedding

        self.id_emb = tf.keras.Sequential([
            lookup(vocabulary=unique_project_ids, mask_token=None),
            embed(len(unique_project_ids) + 1, EMBEDDING_DIM),
        ])
        self.main_category_emb = tf.keras.Sequential([
            lookup(vocabulary=unique_main_categories, mask_token=None),
            embed(len(unique_main_categories) + 1, EMBEDDING_DIM),
        ])
        self.content_proj = tf.keras.layers.Dense(EMBEDDING_DIM, use_bias=False)
        self.sub_category_embedding = tf.keras.Sequential([
            lookup(vocabulary=unique_sub_categories, mask_token=None),
            embed(len(unique_sub_categories) + 1, EMBEDDING_DIM),
            tf.keras.layers.GlobalAveragePooling1D(),
        ])
        self.numeric_proj = tf.keras.layers.Dense(EMBEDDING_DIM, activation="relu")

    def call(self, inputs):
        # Stack the scalar numeric features into one (batch, n_features) tensor.
        numeric_columns = [tf.cast(inputs[name], tf.float32) for name in NUMERIC_FEATURES]
        numeric = tf.stack(numeric_columns, axis=-1)

        id_vec = self.id_emb(inputs["project_id"])
        category_vec = self.main_category_emb(inputs["main_category"])
        content_vec = self.content_proj(inputs["content_embedding"])
        sub_category_vec = self.sub_category_embedding(inputs["sub_categories"])
        numeric_vec = self.numeric_proj(numeric)
        return tf.concat(
            [id_vec, category_vec, content_vec, sub_category_vec, numeric_vec], axis=1)

# ---------------------------------------------------------------------
# 8. 雙塔模型
# ---------------------------------------------------------------------
class SequentialTwoTower(tfrs.models.Model):
    """Two-tower retrieval model.

    The query tower wraps GatedUserModel and the candidate tower wraps
    ProjectModel; both end in an EMBEDDING_DIM-wide Dense layer. The retrieval
    task scores queries against candidates and reports FactorizedTopK metrics
    over the full `projects` dataset.
    """

    # Feature keys each tower reads from a batch.
    _QUERY_FEATURES = (
        "member_id", "gender_encoded", "age_bucket",
        "user_tag_ids", "click_sequence", "is_cold_user",
    )
    _CANDIDATE_FEATURES = (
        "project_id", "main_category", "content_embedding", "sub_categories",
        "log_launch_age", "log_purchase_cnt", "log_rating",
        "log_num_reviews", "log_total_followers",
    )

    def __init__(self):
        super().__init__()
        l2_regularizer = tf.keras.regularizers.l2(1e-5)

        self.query_model = tf.keras.Sequential([
            GatedUserModel(),
            tf.keras.layers.Dense(256, activation="relu", kernel_regularizer=l2_regularizer),
            tf.keras.layers.Dropout(0.2),
            tf.keras.layers.Dense(128, activation="relu", kernel_regularizer=l2_regularizer),
            tf.keras.layers.Dropout(0.2),
            tf.keras.layers.Dense(EMBEDDING_DIM)
        ])

        self.candidate_model = tf.keras.Sequential([
            ProjectModel(),
            tf.keras.layers.Dense(256, activation="relu", kernel_regularizer=l2_regularizer),
            tf.keras.layers.Dropout(0.2),
            tf.keras.layers.Dense(128, activation="relu", kernel_regularizer=l2_regularizer),
            tf.keras.layers.Dropout(0.2),
            tf.keras.layers.Dense(EMBEDDING_DIM)
        ])

        self.task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates=projects.batch(128).map(self.candidate_model),
            ),
        )

    def compute_loss(self, features, training=False):
        """Return the retrieval loss for one batch of interactions.

        Args:
            features: dict of batched tensors containing at least the keys in
                `_QUERY_FEATURES` and `_CANDIDATE_FEATURES`.
            training: whether the call happens inside a training step.

        Metrics are only computed when not training.
        """
        query_inputs = {name: features[name] for name in self._QUERY_FEATURES}
        candidate_inputs = {name: features[name] for name in self._CANDIDATE_FEATURES}

        # Forward `training` explicitly: compute_loss is not itself a layer
        # call, so without it the towers' Dropout layers (and the dropout
        # inside the sequence encoder) never switch to training mode.
        query_emb = self.query_model(query_inputs, training=training)
        candidate_emb = self.candidate_model(candidate_inputs, training=training)

        return self.task(query_emb, candidate_emb, compute_metrics=not training)


# ---------------------------------------------------------------------
# 9. 訓練與評估
# ---------------------------------------------------------------------
tf.random.set_seed(42)

# Chronological 80/10/10 split: `interactions` follows df_subscribe, which is
# sorted by subscribe_date.
dataset_size = len(df_subscribe)
train_size   = int(dataset_size * 0.8)
val_size     = int(dataset_size * 0.1)

train_ds      = interactions.take(train_size)
validation_ds = interactions.skip(train_size).take(val_size)
test_ds       = interactions.skip(train_size + val_size)

# No .cache() after shuffle/batch for the training set: caching there records
# the first epoch's batches and replays that single order forever, so every
# epoch would see identical batches. The source is already an in-memory
# dataset, so reshuffling each epoch only costs the shuffle itself.
# (The variable name is kept for the code below.)
cached_train = (
    train_ds.shuffle(train_size, seed=42, reshuffle_each_iteration=True)
            .batch(2048)
            .prefetch(tf.data.AUTOTUNE)
)
cached_validation  = (
    validation_ds.batch(2048)
                 .cache()
                 .prefetch(tf.data.AUTOTUNE)
)
cached_test = (
    test_ds.batch(2048)
           .cache()
           .prefetch(tf.data.AUTOTUNE)
)


model = SequentialTwoTower()

# Stop on validation top-100 accuracy (restoring the best weights) and halve
# the learning rate when the validation loss plateaus.
early_stopping = EarlyStopping(
    monitor='val_factorized_top_k/top_100_categorical_accuracy',
    mode='max',
    patience=5,
    restore_best_weights=True
)
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=3
)

model.compile(optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=0.005))
epochs = 50

print("\n--- Training ---")
history = model.fit(
    cached_train,
    epochs=epochs,
    validation_data=cached_validation,
    callbacks=[early_stopping, reduce_lr],
    verbose=1
)

print("\n--- Evaluation ---")
train_metrics = model.evaluate(cached_train, return_dict=True)
validation_metrics = model.evaluate(cached_validation, return_dict=True)
test_metrics = model.evaluate(cached_test, return_dict=True)

print(f"\nTop-100 Accuracy (Train):      {train_metrics['factorized_top_k/top_100_categorical_accuracy']:.4f}")
print(f"Top-100 Accuracy (Validation): {validation_metrics['factorized_top_k/top_100_categorical_accuracy']:.4f}")
print(f"Top-100 Accuracy (Test):       {test_metrics['factorized_top_k/top_100_categorical_accuracy']:.4f}")

# The previous message claimed the model had been saved, but no save call
# exists in this cell. TODO: persist the trained towers/weights if required.
print("\n--- Evaluation complete (model was not saved) ---")

修改前，有 939612 位用戶的 user_tag_ids 是 ['0']

--- 開始從購買歷史推斷用戶興趣標籤 ---
修改後，剩下 774492 位用戶的 user_tag_ids 是 ['0']
--- 用戶興趣標籤推斷完成 ---


  result = getattr(ufunc, method)(*inputs, **kwargs)



--- 處理點擊歷史與冷啟動特徵 ---


  user_click_sequences = df_click_filtered.groupby('member_id').apply(


總用戶數: 1363150
平均點擊次數: 0.35
冷啟動用戶數 (is_cold_user=1): 1249239
暖啟動用戶數 (is_cold_user=0): 113911
最終互動資料表 shape: (704377, 16)
互動資料表欄位: ['member_id', 'project_id', 'gender_encoded', 'main_category', 'content_embedding', 'age_bucket', 'sub_categories', 'user_tag_ids', 'click_sequence', 'is_cold_user', 'log_launch_age', 'log_purchase_cnt', 'log_rating', 'log_num_reviews', 'log_total_followers', 'subscribe_date']
