In [2]:
pip install numpy pandas tensorflow matplotlib scikit-learn transformers torch


Collecting transformers
  Downloading transformers-4.46.3-py3-none-any.whl.metadata (44 kB)
Collecting typing-extensions<4.6.0,>=3.6.6 (from tensorflow-macos==2.13.0->tensorflow)
  Using cached typing_extensions-4.5.0-py3-none-any.whl.metadata (8.5 kB)
Collecting huggingface-hub<1.0,>=0.23.2 (from transformers)
  Downloading huggingface_hub-0.26.5-py3-none-any.whl.metadata (13 kB)
Collecting pyyaml>=5.1 (from transformers)
  Downloading pyyaml-6.0.2.tar.gz (130 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp38-cp38-macosx_11_0_arm64.whl.metadata (40 kB)
Collecting tokenizers<0.21,>=0.20 (from transformers)
  Downloading tokenizers-0.20.3-cp38-cp38-macosx_11_0_arm64.whl.metadata (6.7 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.5-cp38-cp38-mac

In [6]:
pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.7.0-cp38-cp38-macosx_11_0_arm64.whl.metadata (6.6 kB)
Downloading tiktoken-0.7.0-cp38-cp38-macosx_11_0_arm64.whl (906 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m906.8/906.8 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.7.0
Note: you may need to restart the kernel to use updated packages.


In [3]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# 하이퍼파라미터 설정
EMBEDDING_DIM = 384  # MiniLM 임베딩 차원
HIDDEN_DIM = 50     # 은닉층 차원
TEST_SIZE = 0.2
EPOCHS = 500
BATCH_SIZE = 64
REG = 0.0005
LEARNING_RATE = 0.0001

In [9]:
# MIND 데이터셋 로드
news_df = pd.read_csv("data/news.tsv", sep="\t", header=None,
                     names=['newsid', 'category', 'subcategory', 'title', 'abstract', 'url', 'title_entities', 'abstract_entities'])
behaviors_df = pd.read_csv("data/behaviors.tsv", sep="\t", header=None,
                         names=['impression_id', 'user_id', 'time', 'history', 'impressions'])


In [10]:
news_df.head()

Unnamed: 0,newsid,category,subcategory,title,abstract,url,title_entities,abstract_entities
0,N88753,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N45436,news,newsscienceandtechnology,Walmart Slashes Prices on Last-Generation iPads,Apple's new iPad releases bring big deals on l...,https://assets.msn.com/labs/mind/AABmf2I.html,"[{""Label"": ""IPad"", ""Type"": ""J"", ""WikidataId"": ...","[{""Label"": ""IPad"", ""Type"": ""J"", ""WikidataId"": ..."
2,N23144,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
3,N86255,health,medical,Dispose of unwanted prescription drugs during ...,,https://assets.msn.com/labs/mind/AAISxPN.html,"[{""Label"": ""Drug Enforcement Administration"", ...",[]
4,N93187,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."


In [None]:
# 뉴스 임베딩을 위한 사전 학습 모델 로드
tokenizer = AutoTokenizer.from_pretrained("microsoft/Multilingual-MiniLM-L12-H384")
model = AutoModel.from_pretrained("microsoft/Multilingual-MiniLM-L12-H384")

In [None]:
# 뉴스 제목 임베딩 생성
titles = news_df['title'].tolist()
with torch.no_grad():
    inputs = tokenizer(titles, padding=True, truncation=True, return_tensors="pt", max_length=128)
    outputs = model(**inputs)
    title_embeddings = outputs.last_hidden_state[:, 0, :].numpy()  # [CLS] 토큰 임베딩

print("뉴스 제목 임베딩 shape:", title_embeddings.shape)

In [1]:
# 카테고리와 타입 원-핫 인코딩
categories = pd.get_dummies(news_df['category'])
types = pd.get_dummies(news_df['subcategory'])

print("카테고리 특성 수:", len(categories.columns))
print("타입 특성 수:", len(types.columns))

# 입력 특성 결합
features = np.concatenate([title_embeddings, categories, types], axis=1)
print("최종 입력 특성 shape:", features.shape)

# 학습/테스트 데이터 분할
train_features, test_features = train_test_split(features, test_size=TEST_SIZE)

# 신경망 모델 구축
input_dim = features.shape[1]

inputs = keras.Input(shape=(input_dim,))
x = keras.layers.Dense(384, activation='relu')(inputs)
x = keras.layers.Dense(256, activation='relu')(x)
x = keras.layers.Dense(128, activation='relu')(x)
x = keras.layers.Dense(HIDDEN_DIM, activation='tanh')(x)
x = keras.layers.LayerNormalization()(x)
x = keras.layers.Dropout(0.2)(x)
x = keras.layers.Dense(64)(x)
outputs = keras.layers.Dense(1, activation='sigmoid')(x)

model = keras.Model(inputs, outputs)

# 모델 구조 시각화
keras.utils.plot_model(model, show_shapes=True, rankdir="LR")

# 모델 컴파일
model.compile(optimizer=keras.optimizers.RMSprop(learning_rate=LEARNING_RATE),
             loss='binary_crossentropy',
             metrics=['accuracy'])

# 클릭 데이터 생성 (예시)
clicks = np.random.randint(0, 2, size=len(features))
train_clicks, test_clicks = train_test_split(clicks, test_size=TEST_SIZE)

# 모델 학습
history = model.fit(
    train_features, train_clicks,
    validation_data=(test_features, test_clicks),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    callbacks=[
        keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)
    ]
)

# 학습 과정 시각화
plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

plt.tight_layout()
plt.show()

# Embedding 공간 시각화 (t-SNE)
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2)
embeddings_2d = tsne.fit_transform(features)

plt.figure(figsize=(10, 8))
plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], c=clicks, cmap='viridis', alpha=0.5)
plt.colorbar(label='Click/No Click')
plt.title('t-SNE Visualization of News Embeddings')
plt.xlabel('t-SNE dimension 1')
plt.ylabel('t-SNE dimension 2')
plt.show()

# 예측 및 평가
predictions = model.predict(test_features)
print("\n최종 테스트 정확도:", model.evaluate(test_features, test_clicks)[1])

# 상위 추천 뉴스 예시
sample_user_idx = 0
sample_predictions = model.predict(features)
top_news_idx = np.argsort(sample_predictions.flatten())[-5:][::-1]

print("\n상위 5개 추천 뉴스:")
for idx in top_news_idx:
    print(f"제목: {news_df.iloc[idx]['title']}")
    print(f"카테고리: {news_df.iloc[idx]['category']}")
    print(f"예측 클릭 확률: {sample_predictions[idx][0]:.4f}")
    print()

ModuleNotFoundError: No module named 'scipy._lib'