In [78]:
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from langchain.docstore.document import Document
from langchain.chains.summarize import load_summarize_chain
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores.faiss import FAISS
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.prompts import PromptTemplate
from langchain_openai import OpenAI, ChatOpenAI
from langchain.chains import LLMChain
from langchain.chains import RetrievalQA

from collections import defaultdict

import time
import os
import random
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as Data
import math
import requests
import json
import pickle


In [79]:
from dotenv import load_dotenv

load_dotenv()

True

In [80]:
project_path = os.path.abspath(os.getcwd())
data_dir_nm = 'data'
model_dir_nm = 'model'
data_path = f"{project_path}/{data_dir_nm}"
model_path = f"{project_path}/{model_dir_nm}"

# Load data
- MovieLens1M movie info
- MovieLens test set
- LabelEncoder
- model

In [81]:
with open('./data/movielens1m_label_encoders.pkl', 'rb') as f:
    label_encoders = pickle.load(f)

In [85]:
movie_info = pd.read_csv(f"{data_path}/movies.csv", dtype=str)
test_df = pd.read_csv(f"{data_path}/movielens1m_test.csv", dtype=str)
movielens_rcmm_origin = pd.read_csv(f"{data_path}/movielens_rcmm.csv", dtype=str)

In [86]:
movie_info.head()

Unnamed: 0,movie_id,title,movie_decade,genre
0,1,Toy Story,1990s,Animation
1,2,Jumanji,1990s,Adventure
2,3,Grumpier Old Men,1990s,Comedy
3,4,Waiting to Exhale,1990s,Comedy
4,5,Father of the Bride Part II,1990s,Comedy


In [87]:
test_df.head()

Unnamed: 0,user_id,movie_id,movie_decade,movie_year,rating_year,rating_month,rating_decade,genre1,genre2,genre3,gender,age,occupation,zip,label
0,2741,957,7,65,0,7,0,4,6,15,1,2,6,3078,1.0
1,4931,609,8,70,0,5,0,0,14,15,1,2,3,1918,1.0
2,5786,3143,8,73,0,11,0,4,17,15,1,1,15,3397,0.0
3,5917,1741,8,78,0,10,0,4,17,15,1,4,13,417,0.0
4,1339,1009,6,52,0,10,0,0,0,15,1,4,4,1800,1.0


# Set dataset

In [16]:
class MVLensDataset(Dataset):
    def __init__(self, data, u_i_cols, label_col):
        self.n = data.shape[0]
        self.y = data[label_col].astype(np.float32).values.reshape(-1, 1)

        self.u_i_cols = u_i_cols
        
        self.data_v = data[self.u_i_cols].astype(np.int64).values

        self.field_dims = np.max(self.data_v, axis=0) + 1


    def __len__(self):
        return self.n

    def __getitem__(self, idx):
        return [self.data_v[idx], self.y[idx]]
        
u_i_feature = ['user_id', 'movie_id']
label = 'label'
batch_size = 512
test_dataset = MVLensDataset(data=test_df, u_i_cols=u_i_feature, label_col=label)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Set Model (NCF)

In [10]:
class NeuMF(torch.nn.Module):
    def __init__(self, config):
        super(NeuMF, self).__init__()
        # config 
        self.config = config
        self.num_users = config['num_users']
        self.num_items = config['num_items']
        self.latent_dim_mf = config['latent_dim_mf']
        self.latent_dim_mlp = config['latent_dim_mlp']
        # Embedding setting
        self.embedding_user_mlp = torch.nn.Embedding(num_embeddings=self.num_users, embedding_dim=self.latent_dim_mlp)
        self.embedding_item_mlp = torch.nn.Embedding(num_embeddings=self.num_items, embedding_dim=self.latent_dim_mlp)
        self.embedding_user_mf = torch.nn.Embedding(num_embeddings=self.num_users, embedding_dim=self.latent_dim_mf)
        self.embedding_item_mf = torch.nn.Embedding(num_embeddings=self.num_items, embedding_dim=self.latent_dim_mf)
        # MLP layer
        self.fc_layers = torch.nn.ModuleList()
        for idx, (in_size, out_size) in enumerate(zip(config['layers'][:-1], config['layers'][1:])):
            self.fc_layers.append(torch.nn.Linear(in_size, out_size))
        # output layer
        self.affine_output = torch.nn.Linear(in_features=config['layers'][-1] + config['latent_dim_mf'], out_features=1)
        self.logistic = torch.nn.Sigmoid()

    def forward(self, user_indices, item_indices):
        user_embedding_mlp = self.embedding_user_mlp(user_indices)
        item_embedding_mlp = self.embedding_item_mlp(item_indices)
        user_embedding_mf = self.embedding_user_mf(user_indices)
        item_embedding_mf = self.embedding_item_mf(item_indices)
        # MLP, MF
        mlp_vector = torch.cat([user_embedding_mlp, item_embedding_mlp], dim=-1)
        mf_vector =torch.mul(user_embedding_mf, item_embedding_mf)

        # MLP feed
        for idx, _ in enumerate(range(len(self.fc_layers))):
            mlp_vector = self.fc_layers[idx](mlp_vector)
            mlp_vector = torch.nn.ReLU()(mlp_vector)
        # concat MLP & MF
        vector = torch.cat([mlp_vector, mf_vector], dim=-1)
        # prediction
        logits = self.affine_output(vector)
        rating = self.logistic(logits)
        return rating

In [14]:
config = {
    'num_users': 6040,
    'num_items': 3706,
    'latent_dim_mf': 8,
    'latent_dim_mlp': 16,
    'layers': [32, 16, 8]
}
model = NeuMF(config)
model.load_state_dict(torch.load('./model/ncf_mlm'))

<All keys matched successfully>

In [19]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Predict test data

In [32]:
user_pred_info = {}
top = 10

def test_model(model, test_loader):
    # eval mode
    model.eval()
    user_pred_info = defaultdict(list)
    with torch.no_grad():
        with tqdm(test_loader, unit='batch') as tepoch:
            for samples in tepoch:
                user_items, y = samples[0], samples[1]
                user_items, y = user_items.to(device), y.to(device)
                # user=0, item=1
                y_pred = model(user_items[:, 0], user_items[:, 1])
                for user_item, p in zip(user_items, y_pred):
                    # save model predict result
                    user_pred_info[int(user_item[0])].append((int(user_item[1]), float(p)))
    return user_pred_info

In [33]:
ncf_user_pred_info = test_model(model, test_dataloader)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 391/391 [00:04<00:00, 93.74batch/s]


# Get Ranked list

In [34]:
for user, data_info in tqdm(ncf_user_pred_info.items(), total=len(ncf_user_pred_info), position=0, leave=True):
    # sorted by high prop and slice by top(10)
    ranklist = sorted(data_info, key=lambda s : s[1], reverse=True)[:top]
    # to list
    ranklist = list(dict.fromkeys([r[0] for r in ranklist]))
    user_pred_info[str(user)] = ranklist

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6035/6035 [00:00<00:00, 98230.88it/s]


In [88]:
for user, recom_list in user_pred_info.items():
    print(f"사용자 : {user}의 추천 리스트 : {recom_list}")
    break

사용자 : 2741의 추천 리스트 : [3613, 3306, 191, 2855, 957, 996, 2577]


# Sampling random users

In [45]:
random_user_origin = random.sample(list(user_pred_info.keys()), 1)
sample_user_pred_info = user_pred_info[random_user_origin[0]]
random_user = list(map(int, random_user_origin))
random_user = label_encoders['user_id'].inverse_transform(random_user)
random_user = list(map(str, random_user))
random_user

['1662']

In [46]:
sample_user_pred_info_trans = list(map(int, sample_user_pred_info)) 
sample_user_pred_info_trans = label_encoders['movie_id'].inverse_transform(sample_user_pred_info_trans)
sample_user_pred_info_trans

array(['296', '1580', '3943', '3863'], dtype=object)

In [47]:
movie_info[movie_info['movie_id'].isin(sample_user_pred_info_trans)]

Unnamed: 0,movie_id,title,movie_decade,genre
293,296,Pulp Fiction,1990s,Crime
1539,1580,Men in Black,1990s,Action
3793,3863,"Cell, The",2000s,Sci-Fi
3873,3943,Bamboozled,2000s,Comedy


In [48]:
sample_user_history = movielens_rcmm_origin[movielens_rcmm_origin['user_id'] == random_user[0]].fillna('non data')
print(sample_user_history.shape)
sample_user_history.head()

(25, 15)


Unnamed: 0,user_id,movie_id,movie_decade,movie_year,rating_year,rating_month,rating_decade,genre1,genre2,genre3,gender,age,occupation,zip,label
962948,1662,527,1990s,1993,2000,11,2000s,Drama,War,non data,M,25,12,94121,1
962949,1662,2762,1990s,1999,2000,11,2000s,Thriller,non data,non data,M,25,12,94121,1
962950,1662,1259,1980s,1986,2000,11,2000s,Adventure,Comedy,Drama,M,25,12,94121,1
962951,1662,589,1990s,1991,2000,11,2000s,Action,Sci-Fi,Thriller,M,25,12,94121,1
962952,1662,2858,1990s,1999,2000,11,2000s,Comedy,Drama,non data,M,25,12,94121,1


# Set user info by history --> to LLM input

In [49]:
# Recent user info
recent_ratio = int(sample_user_history.shape[0] * 0.1)
user_data = movielens_rcmm_origin[movielens_rcmm_origin['user_id'] == random_user[0]].fillna('non data')[['movie_decade', 'movie_year', 'rating_year', 'rating_decade', 'genre1', 'genre2', 'gender', 'age', 'zip']].values[:recent_ratio]
recent_user_hist_info = "#### Item interaction information\n\n- (item) : metadata information of items \n- (user) : metadata information of users"
for cnt, rows in enumerate(user_data):
    recent_user_hist_info += f"\n\n{cnt+1}th.\n- (Item) Movie Release Decade (ex. 1990s movies): {rows[0]}\n- (Item) Movie Release Year: {rows[1]}\n- (User) Rating Year: {rows[2]}\n- (User) Rating Decade (e.g., 1990s ratings): {rows[3]}\n- (Item) Genre 1: {rows[4]}\n- (Item) Genre 2: {rows[5]}\n- (User) Gender: {rows[6]}\n- (User) Age: {rows[7]}\n- (User) Address Information (zipcode): {rows[8]}\n##### End of {cnt+1}th item interaction information"

In [51]:
# Entire user history information
user_data = movielens_rcmm_origin[movielens_rcmm_origin['user_id'] == random_user[0]].fillna('non data')[['movie_decade', 'movie_year', 'rating_year', 'rating_decade', 'genre1', 'genre2', 'gender', 'age', 'zip']].values
user_all_hist_info = "#### Item interaction information\n\n- (item) : metadata information of items \n- (user) : metadata information of users"
for cnt, rows in enumerate(user_data):
    user_all_hist_info += f"\n\n{cnt+1}th.\n- (Item) Movie Release Decade (ex. 1990s movies): {rows[0]}\n- (Item) Movie Release Year: {rows[1]}\n- (User) Rating Year: {rows[2]}\n- (User) Rating Decade (e.g., 1990s ratings): {rows[3]}\n- (Item) Genre 1: {rows[4]}\n- (Item) Genre 2: {rows[5]}\n- (User) Gender: {rows[6]}\n- (User) Age: {rows[7]}\n- (User) Address Information (zipcode): {rows[8]}\n##### End of {cnt+1}th item interaction information"

In [52]:
print(recent_user_hist_info)

#### Item interaction information

- (item) : metadata information of items 
- (user) : metadata information of users

1th.
- (Item) Movie Release Decade (ex. 1990s movies): 1990s
- (Item) Movie Release Year: 1993
- (User) Rating Year: 2000
- (User) Rating Decade (e.g., 1990s ratings): 2000s
- (Item) Genre 1: Drama
- (Item) Genre 2: War
- (User) Gender: M
- (User) Age: 25
- (User) Address Information (zipcode): 94121
##### End of 1th item interaction information

2th.
- (Item) Movie Release Decade (ex. 1990s movies): 1990s
- (Item) Movie Release Year: 1999
- (User) Rating Year: 2000
- (User) Rating Decade (e.g., 1990s ratings): 2000s
- (Item) Genre 1: Thriller
- (Item) Genre 2: non data
- (User) Gender: M
- (User) Age: 25
- (User) Address Information (zipcode): 94121
##### End of 2th item interaction information


In [55]:
print(user_all_hist_info[:1500])

#### Item interaction information

- (item) : metadata information of items 
- (user) : metadata information of users

1th.
- (Item) Movie Release Decade (ex. 1990s movies): 1990s
- (Item) Movie Release Year: 1993
- (User) Rating Year: 2000
- (User) Rating Decade (e.g., 1990s ratings): 2000s
- (Item) Genre 1: Drama
- (Item) Genre 2: War
- (User) Gender: M
- (User) Age: 25
- (User) Address Information (zipcode): 94121
##### End of 1th item interaction information

2th.
- (Item) Movie Release Decade (ex. 1990s movies): 1990s
- (Item) Movie Release Year: 1999
- (User) Rating Year: 2000
- (User) Rating Decade (e.g., 1990s ratings): 2000s
- (Item) Genre 1: Thriller
- (Item) Genre 2: non data
- (User) Gender: M
- (User) Age: 25
- (User) Address Information (zipcode): 94121
##### End of 2th item interaction information

3th.
- (Item) Movie Release Decade (ex. 1990s movies): 1980s
- (Item) Movie Release Year: 1986
- (User) Rating Year: 2000
- (User) Rating Decade (e.g., 1990s ratings): 2000s
-

# Summay user history

In [56]:
print(len(user_all_hist_info))

9011


In [57]:
docs = []
text_splitter = RecursiveCharacterTextSplitter(chunk_size=550, chunk_overlap=100)
texts = text_splitter.split_text(user_all_hist_info)
docs += [Document(page_content=t) for t in texts]

In [58]:
template = '''Below is the user's past history information. Considering the user's main characteristics, persona, preferences, and meaningful patterns, please summarize the user information within 700 characters.\n\n##### User history information: {text}.'''

prompt = PromptTemplate(template=template, input_variables=['text'])

llm = ChatOpenAI(temperature=0, model='gpt-4o')


In [59]:
chain = load_summarize_chain(llm, 
                             chain_type='map_reduce', 
                             map_prompt=prompt, combine_prompt=prompt,
                             verbose=False)
summary = chain.run(docs)

  warn_deprecated(


In [60]:
summary

'The user is a 25-year-old male residing in the 94121 zip code area. He has a strong preference for movies from the 1990s, with specific interests in various years such as 1990, 1991, 1993, 1994, 1996, 1997, 1998, and 1999. His favorite genres include drama, thriller, comedy, romance, action, adventure, and sci-fi. He tends to rate movies primarily in the early 2000s, suggesting a nostalgic inclination towards films from his formative years. This user enjoys revisiting and evaluating films from the past, reflecting a blend of nostalgia and a methodical approach to his movie-watching habits.'

# Get user persona and characteristics

In [62]:

template = """Below is the user's item interaction history information. Using this data, please derive the user's main characteristics, persona, preferences, and meaningful patterns.

# User history information
{user_hist}

Please output in the following format:

- Main characteristics of the user: string
- User persona: string
- User preferences: string
- Meaningful patterns of the user: string

"""
prompt = PromptTemplate(template=template, input_variables=['user_hist'])


In [63]:
llm = ChatOpenAI(temperature=0, model='gpt-4o')
chain = LLMChain(llm=llm, prompt=prompt)
user_recent_summary = chain.invoke({'user_hist': recent_user_hist_info})

In [64]:
user_recent_summary

{'user_hist': '#### Item interaction information\n\n- (item) : metadata information of items \n- (user) : metadata information of users\n\n1th.\n- (Item) Movie Release Decade (ex. 1990s movies): 1990s\n- (Item) Movie Release Year: 1993\n- (User) Rating Year: 2000\n- (User) Rating Decade (e.g., 1990s ratings): 2000s\n- (Item) Genre 1: Drama\n- (Item) Genre 2: War\n- (User) Gender: M\n- (User) Age: 25\n- (User) Address Information (zipcode): 94121\n##### End of 1th item interaction information\n\n2th.\n- (Item) Movie Release Decade (ex. 1990s movies): 1990s\n- (Item) Movie Release Year: 1999\n- (User) Rating Year: 2000\n- (User) Rating Decade (e.g., 1990s ratings): 2000s\n- (Item) Genre 1: Thriller\n- (Item) Genre 2: non data\n- (User) Gender: M\n- (User) Age: 25\n- (User) Address Information (zipcode): 94121\n##### End of 2th item interaction information',
 'text': "- Main characteristics of the user: The user is a 25-year-old male living in the 94121 zip code area. He rated movies in t

# Explainbilty

In [66]:
user_recom_result = movie_info[movie_info['movie_id'].isin(sample_user_pred_info_trans)]
user_recom_result

Unnamed: 0,movie_id,title,movie_decade,genre
293,296,Pulp Fiction,1990s,Crime
1539,1580,Men in Black,1990s,Action
3793,3863,"Cell, The",2000s,Sci-Fi
3873,3943,Bamboozled,2000s,Comedy


In [67]:
user_recent_summary_info = user_recent_summary['text']
user_entire_summary_info = summary

In [70]:
user_data = user_recom_result[['title', 'movie_decade', 'genre']].values

user_recom_info = "#### User Recommendation List\n\n"
for cnt, rows in enumerate(user_data):
    user_recom_info += f"\n\nRecommendation {cnt+1}:\n- Item Title: {rows[0]}\n- (Item) Movie Release Decade (e.g., 1990s movie): {rows[1]}\n- Item Genre (Category): {rows[2]}\n##### End of Recommendation {cnt+1} Information"


In [71]:
user_recom_info

'#### User Recommendation List\n\n\n\nRecommendation 1:\n- Item Title: Pulp Fiction\n- (Item) Movie Release Decade (e.g., 1990s movie): 1990s\n- Item Genre (Category): Crime\n##### End of Recommendation 1 Information\n\nRecommendation 2:\n- Item Title: Men in Black\n- (Item) Movie Release Decade (e.g., 1990s movie): 1990s\n- Item Genre (Category): Action\n##### End of Recommendation 2 Information\n\nRecommendation 3:\n- Item Title: Cell, The\n- (Item) Movie Release Decade (e.g., 1990s movie): 2000s\n- Item Genre (Category): Sci-Fi\n##### End of Recommendation 3 Information\n\nRecommendation 4:\n- Item Title: Bamboozled\n- (Item) Movie Release Decade (e.g., 1990s movie): 2000s\n- Item Genre (Category): Comedy\n##### End of Recommendation 4 Information'

In [73]:
template = """The data below contains the user's main characteristics, persona, and preference information. There is preference information based on the entire history and also based on the last 10 interactions.

#### Main characteristics based on the entire history
{user_entire_summary_info}

#### Main characteristics based on the last 10 interactions
{user_recent_summary_info}

Below is the item information recommended by the recommendation system for the above user.

#### Recommendation results provided by the recommendation system
{recom_list}

Your role is to write the reason for the recommendation by comparing the user's main characteristics information with the recommendation results provided by the recommendation system.
The recommendation results are a list of items provided by the recommendation system based on the user's past interaction information.
If you determine that the reason for the recommendation is inappropriate, please say, 'It does not seem to be an appropriate recommendation' and also provide the reason why it is not appropriate.

To summarize your role:

- Consider the user information (main characteristics based on the entire history, main characteristics based on the last 10 interactions)
- The recommendation results are a recommendation list provided by the recommendation system based on the user's past interactions
- Write the reason for the recommendation by referring to the recommendation results and user information
- If the reason for the recommendation is inappropriate, say 'It does not seem to be an appropriate recommendation' and explain why it is not appropriate
- Do not include unnecessary words, perform the requested task and respond
- If you are unsure, think it over and if you really don't know, respond with 'I don't know'
"""

prompt = PromptTemplate(template=template, input_variables=['user_entire_summary_info', 'user_recent_summary_info', 'recom_list'])


In [74]:
llm = ChatOpenAI(temperature=0, model='gpt-4o')
chain = LLMChain(llm=llm, prompt=prompt)
recommend_explain = chain.invoke({'user_entire_summary_info': user_entire_summary_info, 'user_recent_summary_info':user_recent_summary_info, 'recom_list':user_recom_info})

In [75]:
print(recommend_explain['text'])

#### Recommendation 1: Pulp Fiction
- **Reason for Recommendation:** "Pulp Fiction" is a 1990s movie, aligning with the user's strong preference for films from that decade. The genre is Crime, which, while not explicitly listed in the user's favorite genres, often overlaps with Drama and Thriller, both of which the user enjoys. The intense and complex narrative of "Pulp Fiction" fits the user's interest in thought-provoking and emotionally engaging content.

#### Recommendation 2: Men in Black
- **Reason for Recommendation:** "Men in Black" is a 1990s movie, which matches the user's preference for that decade. The genre is Action, one of the user's favorite genres. This recommendation aligns well with the user's interest in 1990s films and action-packed narratives.

#### Recommendation 3: The Cell
- **It does not seem to be an appropriate recommendation**
- **Reason:** "The Cell" is a 2000s movie, which does not align with the user's strong preference for 1990s films. Although the genr