In [None]:
import os
# Input files, resolved against the directory the notebook was started from.
# Change the file names here to point the notebook at different data.
pwd = os.getcwd()
train_file = f"{pwd}/data/training_set.csv"
# Fixed: the "/" between `pwd` and "data" was missing, which produced a path
# such as "/home/userdata/public_testset.csv".
public_test_file = f"{pwd}/data/public_testset.csv"
private_test_file = f"{pwd}/data/test_set_private.csv"

In [None]:
import os

# Every prediction file produced by this run is written below this directory;
# it is created up front so later cells can write without checking.
out_path = f"{pwd}/runs/private-test-attempt-final"
os.makedirs(name=out_path, exist_ok=True)

In [None]:
!nvidia-smi

In [None]:
# Choose GPU to use
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# SELFRec Models

In [None]:
%cd $pwd/SELFRec/

In [None]:
import os

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

from base.torch_interface import TorchGraphInterface
from base.graph_recommender import GraphRecommender
from data.loader import FileIO
from util.conf import ModelConf
from util.sampler import next_batch_pairwise
from util.loss_torch import bpr_loss, l2_reg_loss, InfoNCE

from model.graph.LightGCN import *
from model.graph.XSimGCL import *
from model.graph.DirectAU import *
from model.graph.SimGCL import *
from SELFRec import SELFRec

## Get result

In [None]:
# The private test file has no header row: column 0 is named `user_id`, and the
# remaining 1000 columns are named item_id_1 .. item_id_1000.
predict_columns = ['user_id']
predict_columns.extend(f'item_id_{i}' for i in range(1, 1001))
predict_df = pd.read_csv(private_test_file, names=predict_columns)
predict_df.head()

In [None]:
# Pickled SELFRec checkpoints, one per model, stored as runs/<model name>/model.pkl.
# Dict order is the order in which the models are scored.
model_path = {
    name: f"{pwd}/runs/{name}/model.pkl"
    for name in ("LightGCN", "XSimGCL", "DirectAU", "SimGCL")
}

In [None]:
from util.algorithm import find_k_largest
def test(self):
    """Rank each test user's candidate items with a trained SELFRec model.

    Reads the module-level ``predict_df`` (one row per test user: ``user_id``
    followed by that user's candidate item ids).

    Args:
        self: a loaded recommender exposing ``data.training_data``,
            ``data.item``, ``data.user_rated(user)`` and ``predict(user)``.

    Returns:
        dict mapping user id -> list of ``(item, score)`` tuples sorted by
        descending score. Only test users that appear in the model's training
        data get an entry.

    Side effects:
        Sets ``self.data.train_set`` to the training rows of the test users
        (kept from the original implementation) and prints a progress bar.

    NOTE(review): a later cell rebinds the name ``test`` to a DataFrame, so
    this function is no longer callable after that cell has run.
    """
    def process_bar(num, total):
        # An empty user list is reported as 100% instead of dividing by zero.
        rate = float(num) / total if total else 1.0
        ratenum = int(50 * rate)
        print(f'\rProgress: [{"+" * ratenum}{" " * (50 - ratenum)}]{ratenum * 2}%', end='', flush=True)

    # First candidate row per test user, built once instead of filtering
    # `predict_df` inside the loop.
    candidate_items = {}
    for uid, row in zip(predict_df['user_id'].to_numpy(), predict_df.iloc[:, 1:].to_numpy()):
        candidate_items.setdefault(uid, row)

    rec_list = {}
    data_train = pd.DataFrame(self.data.training_data, columns= ['uid', 'iid', 'rating'])
    self.data.train_set = data_train[data_train['uid'].isin(list(candidate_items))].values.tolist()

    # The filtered training set has one row per interaction. Score each user
    # once (first-appearance order) rather than once per interaction.
    users = list(dict.fromkeys(row[0] for row in self.data.train_set))
    user_count = len(users)

    for i, user in enumerate(users):
        candidates = self.predict(user)
        rated_list, _ = self.data.user_rated(user)
        # Push already-rated items to the bottom of the ranking.
        for item in rated_list:
            candidates[self.data.item[item]] = -10e8
        item_names = candidate_items[user]
        scores = []
        for item in item_names:
            try:
                scores.append(candidates[self.data.item[item]])
            except (KeyError, IndexError, TypeError):
                # Item unknown to the model (cold start): neutral score.
                # TODO: improve the cold-start recommendation.
                scores.append(0)

        # sorted() is stable, so equal scores keep the candidate-file order.
        rec_list[user] = sorted(zip(item_names, scores), key=lambda x: x[1], reverse=True)
        if i % 1000 == 0:
            process_bar(i, user_count)
    process_bar(user_count, user_count)
    print('')
    return rec_list

In [None]:
import pickle
import numba
import gc
import torch
# Score the private test set with every SELFRec checkpoint and write one
# ranked-candidate CSV per model as "<out_path>/<model>_predict.csv".

# Defined here so this cell works on a fresh kernel; it used to rely on a
# `test_user_id` created much later in the notebook.
test_user_id = predict_df['user_id'].values

# Candidate list per user in test-file order. It is the fallback for users the
# model cannot score because they have no training interactions.
default_candidates = {}
for uid, row in zip(test_user_id, predict_df.iloc[:, 1:].to_numpy()):
    default_candidates.setdefault(uid, list(row))

for model, path in model_path.items():
    # NOTE(security): pickle.load can execute arbitrary code; only load
    # checkpoints produced by this project.
    with open(path, "rb") as f:
        rec = pickle.load(f)

    rec_list = test(rec)

    data = []
    for user_id in test_user_id:
        if user_id in rec_list:
            ranked_items = [i[0] for i in rec_list[user_id]]
        else:
            # Previously a KeyError: keep the given candidate order instead.
            ranked_items = default_candidates[user_id]
        data.append([user_id] + ranked_items)

    pd.DataFrame(data).to_csv(f'{out_path}/{model}_predict.csv', index = False, header=False)

    # Release the model before loading the next one.
    del rec
    gc.collect()  # collecting garbage
    torch.cuda.empty_cache()  # cleaning GPU cache

# RecVAE

In [None]:
%cd $pwd

In [None]:
# Directory holding the RecVAE checkpoint, vocabulary and inference outputs.
# Fixed: "{{pwd}}" inside an f-string is an escaped literal "{pwd}", so every
# path built from it pointed at a directory literally named "{pwd}".
vae_checkpoint = f"{pwd}/runs/RecVAE"

In [None]:
df = pd.read_csv(f"{vae_checkpoint}/data.csv")

In [None]:
# Private test set read without column names, so columns are the integers
# 0..N (column 0 is matched against UserId in the following cells).
# NOTE(review): this rebinds `test`, which an earlier cell defined as the
# SELFRec scoring function; that function cannot be called after this cell.
test = pd.read_csv(f"{private_test_file}", header=None)

In [None]:
id2item_df = pd.read_csv(f"{vae_checkpoint}/unique_sid.txt", header=None).rename(columns={0: "ItemId"}).reset_index()

In [None]:
# Keep only the interactions of users that appear in the test file.
test_tr = df[df["UserId"].isin(test[0].values)]
# Attach each item's row position in unique_sid.txt as `sid`; the inner join
# drops interactions whose item is not in that vocabulary.
test_tr = test_tr.merge(id2item_df, how="inner", on="ItemId").rename(columns={"index": "sid"})
# Number the remaining users 0..n-1 in order of first appearance (`uid`).
user_test = pd.DataFrame(test_tr["UserId"].unique(), columns=["UserId"]).reset_index().rename(columns={"index" : "uid"})
test_tr = pd.merge(test_tr, user_test, on="UserId")
# (uid, sid) pairs are the input passed to recvae/infer.py via --infer_data.
test_tr[["uid", "sid"]].to_csv(f"{vae_checkpoint}/testset_recvae.csv", index=False)

In [None]:
!python recvae/infer.py --dataset $vae_checkpoint --hidden-dim 3072 --latent-dim 2048 --infer_data $vae_checkpoint/testset_recvae.csv --model_path $vae_checkpoint/model.pt 

In [None]:
import pickle
# Load the pickle read back from the RecVAE checkpoint directory.
# NOTE(review): presumably written by recvae/infer.py and indexed as
# result[uid][sid] (that is how the next cell uses it) — confirm.
# NOTE(security): pickle.load can execute arbitrary code; trusted files only.
with open(f"{vae_checkpoint}/result_csp.pkl", "rb") as f:
    result = pickle.load(f)

# user_test columns are (uid, UserId), so this maps uid -> UserId.
id2profile = dict(user_test.values)
profile2id = {value: key for key, value in id2profile.items()}
# id2item_df columns are (index, ItemId), so this maps row index -> ItemId.
id2item = dict(id2item_df.values)
item2id = {value: key for key, value in id2item.items()}

In [None]:
import numpy as np
from tqdm.notebook import tqdm
# Re-rank every test user's candidate list with the RecVAE scores.
# Each output row is [user id, item, item, ...]; rows may be shorter than 1000
# items because candidates outside the RecVAE vocabulary are dropped (a later
# cell pads the frame).
return_list = []
merged_rows = test.merge(user_test, left_on=0, right_on="UserId", how="left").values
for r in tqdm(merged_rows):
    userid = r[0]
    list_item = r[1:1001]
    uid = r[1001]  # RecVAE row number; NaN when the left join found no match

    # User unknown to RecVAE: keep the candidate list in its given order.
    if np.isnan(uid):
        return_list.append([userid, *list_item])
        continue

    # (vocabulary index, item id) for every candidate RecVAE can score.
    # A membership test replaces the former bare `except: pass`.
    item_indexs = [(item2id[item], item) for item in list_item if item in item2id]

    # No candidate is scorable. The unpacking below used to raise ValueError
    # here; fall back to the given order, as for unknown users.
    if not item_indexs:
        return_list.append([userid, *list_item])
        continue

    indexes = [index for index, _ in item_indexs]
    itemids = [item for _, item in item_indexs]
    scored_item = sorted(
        zip(itemids, result[int(uid)][indexes]),
        key=lambda x: x[1],
        reverse=True,
    )

    # Append to the return list used to build the submission.
    return_list.append([userid, *(item for item, _ in scored_item)])

In [None]:
return_df = pd.DataFrame(return_list)

In [None]:
return_df

In [None]:
# Add all-NaN columns until the frame has 1001 columns (user id + 1000 items)
# so the submission file has the expected width. Does nothing if the frame is
# already that wide.
for i in range(len(return_df.columns), 1001):
    return_df[i] = np.nan

In [None]:
return_df.fillna("0").to_csv(f"{out_path}/predict_RecVAE.csv", header=None, index=False)

# ALS

In [None]:
als_checkpoint = f"{pwd}/runs/ALS/"

In [None]:
import pickle
   
# Load the ALS artifacts from the checkpoint directory.
# NOTE(security): pickle.load can execute arbitrary code; trusted files only.
with open(f"{als_checkpoint}/model.pkl", "rb") as f:
    model = pickle.load(f)

# NOTE(review): presumably raw user id -> matrix row index (it is applied to
# the test file's user_id column further down) — confirm.
with open(f"{als_checkpoint}/usermap.pkl", "rb") as f:
    user_map = pickle.load(f)

# NOTE(review): presumably raw item id -> matrix column index — confirm.
with open(f"{als_checkpoint}/itemmap.pkl", "rb") as f:
    item_map = pickle.load(f)

with open(f"{als_checkpoint}/csr_train.pkl", "rb") as f:
    csr_train = pickle.load(f)

# Inverse lookups: value -> key of the two maps above.
user_ids = {v:k for k, v in user_map.items()}
item_ids = {v:k for k, v in item_map.items()}

In [None]:
import os; os.environ['OPENBLAS_NUM_THREADS']='1'
import numpy as np
import pandas as pd
import implicit
from scipy.sparse import coo_matrix
from implicit.evaluation import mean_average_precision_at_k
from implicit.evaluation import ndcg_at_k
from implicit.gpu import matrix_factorization_base

In [None]:
def submit(model, csr_train, test_user_id, item_names, submission_name="predict_BPR.csv"):
    """Produce a 1000-item recommendation row for every known test user.

    Reads the module-level lookups ``user_map`` (raw user id -> matrix index),
    ``user_ids`` and ``item_ids`` (matrix index -> raw id).

    Args:
        model: fitted model exposing
            ``recommend(userids, user_items, N, filter_already_liked_items)``.
        csr_train: user-item matrix, indexable by an array of user indices.
        test_user_id: iterable of raw user ids to predict for. Ids missing
            from ``user_map`` are skipped (they used to end up as NaN indices).
        item_names: dict raw user id -> collection of that user's candidate
            item ids.
        submission_name: path of the CSV written (no header, no index).

    Returns:
        DataFrame with ``customer_id`` followed by ``item_0`` .. ``item_999``.
        For each user, recommended items from the candidate list come first in
        model order, then the remaining recommended items. Rows the model
        cannot fill are padded with NaN.
    """
    preds = []
    batch_size = 2000
    top_n = 1000
    # Only predict for users the model knows. The index array is built from
    # the `test_user_id` argument rather than from a global.
    to_generate = np.array(
        [user_map[user_id] for user_id in test_user_id if user_id in user_map],
        dtype=np.int64,
    )

    for startidx in range(0, len(to_generate), batch_size):
        batch = to_generate[startidx:startidx + batch_size]
        ids, scores = model.recommend(batch, csr_train[batch], N=top_n, filter_already_liked_items=True)

        for i, userid in enumerate(batch):
            customer_id = user_ids[userid]
            # A set gives O(1) membership instead of scanning an array per item.
            allowed = set(item_names[customer_id])
            recommended = [item_ids[item] for item in ids[i]]

            in_candidates = [item for item in recommended if item in allowed]
            not_in_candidates = [item for item in recommended if item not in allowed]

            # Candidate items first, then fill up with the other recommendations.
            article_ids = (in_candidates + not_in_candidates)[:top_n]
            # Pad so every row has exactly `top_n` items and matches the header.
            article_ids += [np.nan] * (top_n - len(article_ids))

            preds.append([customer_id] + article_ids)

    # Create the DataFrame for submission
    df_preds = pd.DataFrame(preds, columns=['customer_id'] + [f'item_{i}' for i in range(top_n)])
    df_preds.to_csv(submission_name, index=False, header=False)

    display(df_preds.head())
    print(df_preds.shape)

    return df_preds


In [None]:
# Private test set for ALS: user id plus 1000 candidate item columns.
als_item_columns = [f'item_id_{i}' for i in range(1, 1001)]
test = pd.read_csv(private_test_file, names=['user_id'] + als_item_columns)
test_user_id = test['user_id'].values

# Candidate items per user (first row wins for a repeated user id). Built from
# the item columns only: the old per-user slice `values[0][1:]` ran after
# `user_index` had been added, so that index leaked into every candidate list.
# It also filtered the whole frame once per user.
item_names = {}
for user_id, candidate_row in zip(test_user_id, test[als_item_columns].to_numpy()):
    item_names.setdefault(user_id, candidate_row)

# Matrix row index of each test user; NaN for users missing from `user_map`.
test['user_index'] = test['user_id'].map(user_map)
test_user_index = test['user_index'].values

In [None]:
%%time

df_preds = submit(model, csr_train, test_user_id, item_names);

In [None]:
df_preds.fillna("0").to_csv(f"{out_path}/predict_ALS.csv", header=None, index=False)

# SAR

In [None]:
%cd $pwd

In [None]:
# Private test set for SAR. Every column is converted to the pandas `category`
# dtype, and the user id is duplicated under the name `UserId`.
sar_columns = ['user_id'] + [f'item_id_{i}' for i in range(1,1001)]
test_df = pd.read_csv('data/test_set_private.csv', names=sar_columns)
# Taken before the dtype conversion, so this holds the raw ids.
test_user_id = test_df['user_id'].values

test_df = test_df.astype({column: 'category' for column in sar_columns})
test_df['UserId'] = test_df['user_id']

In [None]:
import pickle
# Load the pickled SAR model (this rebinds `model`, previously the ALS model).
# Fixed: the `with` statement was missing its trailing colon (SyntaxError).
# NOTE(security): pickle.load can execute arbitrary code; trusted files only.
with open("runs/SAR/model.pkl", "rb") as f:
    model = pickle.load(f)

In [None]:
def batch_generator(data, batch_size):
    """Yield consecutive slices of `data`, each holding at most `batch_size` elements."""
    total = len(data)
    yield from (
        data[offset:min(offset + batch_size, total)]
        for offset in range(0, total, batch_size)
    )

batch_size = 1000  # Adjust batch size as needed

# Ask SAR for the top 1200 unseen items per user, one batch of users at a
# time, then stack the per-batch frames into a single DataFrame.
sar_users = test_df['UserId'].unique()
all_recommendations = [
    model.recommend_k_items(
        test_df[test_df['UserId'].isin(user_batch)],
        top_k=1200,
        remove_seen=True,
    )
    for user_batch in batch_generator(sar_users, batch_size)
]
all_recommendations = pd.concat(all_recommendations)

In [None]:
all_recommendations.to_csv('DS/data/SAR_recommendations_no_timestamp_tail19.csv', index=False)

In [None]:
import numpy as np
from tqdm import tqdm

preds = []  # one row per test user: [user_id, ranked candidate items...]

# Index both frames once. Filtering `all_recommendations` and `predict_df`
# inside the loop scanned every row for every user.
rec_item_ids = all_recommendations['ItemId'].to_numpy()
rec_scores = all_recommendations['prediction'].to_numpy()
# user id -> positional row numbers of that user's recommendations
rows_by_user = all_recommendations.groupby('UserId', observed=True).indices

candidate_lists = {}
for uid, row in zip(predict_df['user_id'].to_numpy(), predict_df.iloc[:, 1:].to_numpy()):
    candidate_lists.setdefault(uid, row)

for user_id in tqdm(test_user_id):
    item_names = candidate_lists[user_id]
    positions = rows_by_user.get(user_id)
    if positions is None:
        candidates = {}  # SAR returned nothing for this user
    else:
        candidates = dict(zip(rec_item_ids[positions], rec_scores[positions]))

    # Candidates SAR did not score get a small default score.
    scores = [candidates.get(item, 1e-8) for item in item_names]

    # Sort by descending score; sorted() is stable, so ties keep file order.
    sorted_list = sorted(zip(item_names, scores), key=lambda x: x[1], reverse=True)

    preds.append([user_id] + [i[0] for i in sorted_list])

In [None]:
pd.DataFrame(preds).to_csv(f'{out_path}/predict_SAR.csv', index = False, header=False)

# Ensemble Phase 1

In [None]:
!/opt/conda/envs/rapids-24.10/bin/ipython Rerank.ipynb

In [None]:
!cp $pwd/runs/reranking/predict.csv $out_path/predict_RRK.csv

In [None]:
# Phase-1 ensemble inputs: the seven per-model rankings written earlier in this
# notebook. The list position is the index of the `prediction<k>` column and of
# the matching weight in `cust_blend`.
# Fixed: the names now match the files actually written. SELFRec models are
# saved as "<model>_predict.csv"; RecVAE, ALS and SAR as "predict_<model>.csv".
phase1_item_columns = [f'item_id_{i}' for i in range(1, 1001)]
phase1_files = [
    'XSimGCL_predict.csv',
    'predict_RecVAE.csv',
    'SimGCL_predict.csv',
    'LightGCN_predict.csv',
    'DirectAU_predict.csv',
    'predict_ALS.csv',
    'predict_SAR.csv',
]

sub = pd.DataFrame()
for position, file_name in enumerate(phase1_files):
    ranking = pd.read_csv(f'{out_path}/{file_name}', names=['user_id'] + phase1_item_columns)
    if position == 0:
        # User ids come from the first file; the other files are combined by
        # row position, so all files must list users in the same order.
        sub['user_id'] = ranking['user_id']
    # One space-separated string of ranked items per user.
    sub[f'prediction{position}'] = ranking.apply(
        lambda row: ' '.join(str(row[column]) for column in phase1_item_columns), axis=1
    )


In [None]:
# Ensemble based on RRF (reciprocal-rank weighting)
def cust_blend(dt, W = [2, 2, 1.2, 1.5, 1.5, 1, 1]):
    """Blend several ranked lists into one by weighted reciprocal rank.

    Args:
        dt: mapping (for example a DataFrame row) with keys ``prediction0`` ..
            ``prediction<len(W)-1>``, each a space-separated ranked item list.
        W: one global weight per model; ``W[k]`` applies to ``prediction<k>``.

    Returns:
        Space-separated string of at most 1000 items, ordered by descending
        blended score. Ties keep the order in which items were first seen.

    Raises:
        KeyError: if ``dt`` lacks one of the ``prediction<k>`` entries.
    """
    # Fixed: the lists are derived from the number of weights. The function
    # used to read prediction0..prediction8 although phase 1 provides only
    # seven rankings and seven weights.
    REC = [dt[f'prediction{m}'].split() for m in range(len(W))]

    # An item at 1-based rank r in model m contributes W[m] / r.
    res = {}
    for M, ranking in enumerate(REC):
        for n, v in enumerate(ranking):
            res[v] = res.get(v, 0) + W[M] / (n + 1)

    # Highest blended score first; sorted() is stable, so ties keep first-seen order.
    ordered = sorted(res, key=lambda item: -res[item])

    # Return the top 1000 items only
    return ' '.join(ordered[:1000])

sub['prediction'] = sub.apply(lambda x: cust_blend(x), axis=1)
sub.head()

In [None]:
sub = sub[['user_id', 'prediction']]
value_lists = sub['prediction'].str.split(" ")

In [None]:
final = pd.DataFrame(value_lists.tolist(), index=sub['user_id']).reset_index()
final.to_csv(f'{out_path}/predict_ensemble_7_file.csv', index = False, header = False)

In [None]:
!cp $out_path/predict_ensemble_7_file.csv $pwd/submission/

# Co-Visitation-Matrix

In [None]:
!cp $pwd/runs/co-visitation-matrix/predict.csv $out_path/predict_CVM.csv

# ReRanking

In [None]:
!/opt/conda/envs/rapids-24.10/bin/ipython Rerank.ipynb

In [None]:
!cp $pwd/runs/reranking/predict.csv $out_path/predict_RRK.csv

# Get Low Agreement user and User Cluster

## Low Agreement

In [None]:
import pandas as pd

# Per-model rankings, read without a header (column 0 = user id).
# Fixed: RecVAE, ALS and SAR are written as "predict_<model>.csv" by their
# cells; only the SELFRec models use "<model>_predict.csv".
recvae = pd.read_csv(f"{out_path}/predict_RecVAE.csv", header=None)
als = pd.read_csv(f"{out_path}/predict_ALS.csv", header=None)
lightgcn = pd.read_csv(f"{out_path}/LightGCN_predict.csv", header=None)
sar = pd.read_csv(f"{out_path}/predict_SAR.csv", header=None)
xsimgcl = pd.read_csv(f"{out_path}/XSimGCL_predict.csv", header=None)
directau = pd.read_csv(f"{out_path}/DirectAU_predict.csv", header=None)

In [None]:
# Phase-1 ensemble output.
# Fixed: `outpat` was a NameError, and the file is written as
# "predict_ensemble_7_file.csv" (with an underscore before "file").
df = pd.read_csv(f"{out_path}/predict_ensemble_7_file.csv", header=None)

In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict

def load_and_process_recommendations(file_paths, top_k=3):
    """Read several models' recommendation files and keep each user's top items.

    Args:
        file_paths: dict model name -> path of a header-less CSV whose first
            column is the user id and whose following columns are ranked items.
        top_k: number of leading recommendations kept per user (default 3,
            the value that used to be hard-coded).

    Returns:
        dict model name -> dict ``str(user_id)`` -> list of the first
        ``top_k`` recommended items.
    """
    model_predictions = {}
    wanted_columns = list(range(top_k + 1))
    for model_name, file_path in file_paths.items():
        df = pd.read_csv(file_path, header=None)[wanted_columns]
        # Iterate column-wise. DataFrame.iterrows() casts each row to a single
        # dtype, which turned integer user ids into keys such as '1.0'
        # whenever an item column was float.
        item_rows = zip(*(df[column] for column in wanted_columns[1:]))
        model_predictions[model_name] = {
            str(user_id): list(items) for user_id, items in zip(df[0], item_rows)
        }
    return model_predictions

def calculate_jaccard_similarity(set1, set2):
    """Return the Jaccard similarity |A ∩ B| / |A ∪ B| of two collections.

    Both arguments are converted to sets first. Returns 0 when both are empty.
    """
    left, right = set(set1), set(set2)
    union_size = len(left | right)
    if union_size == 0:
        return 0
    return len(left & right) / union_size

def analyze_model_agreement(model_predictions):
    """Measure, per user, how much the models' recommendations overlap.

    For every user of the first model, the Jaccard similarity is computed for
    each unordered pair of models, and the mean over all pairs is stored.

    Args:
        model_predictions: dict model name -> dict user id -> recommended items.

    Returns:
        defaultdict mapping user id -> mean pairwise Jaccard similarity.
    """
    names = list(model_predictions.keys())
    model_count = len(names)
    user_similarities = defaultdict(list)

    for user_id in model_predictions[names[0]].keys():
        # One similarity per unordered pair of models, in (i, j) order with i < j.
        pair_scores = [
            calculate_jaccard_similarity(
                model_predictions[names[first]][user_id],
                model_predictions[names[second]][user_id],
            )
            for first in range(model_count)
            for second in range(first + 1, model_count)
        ]
        user_similarities[user_id] = np.mean(pair_scores)

    return user_similarities

def get_extreme_cases(user_similarities, threshold_high=0.5, threshold_low=0.15 ):
    """Split users into high-agreement and low-agreement groups.

    Returns:
        (high_agreement, low_agreement): dicts user -> similarity holding the
        users with similarity >= threshold_high and <= threshold_low.
    """
    high_agreement, low_agreement = {}, {}
    for user, score in user_similarities.items():
        # Two independent checks: a user lands in both groups if the
        # thresholds overlap.
        if score >= threshold_high:
            high_agreement[user] = score
        if score <= threshold_low:
            low_agreement[user] = score
    return high_agreement, low_agreement

def analyze_recommendations():
    """Run the agreement analysis over five models' prediction files.

    Reads the prediction CSVs below ``out_path``, computes each user's mean
    pairwise Jaccard similarity, prints summary counts plus up to five
    examples of each group, and returns the groups.

    Returns:
        (high_agreement, low_agreement, user_similarities)
    """
    # Paths of the prediction files.
    # Fixed: RecVAE, ALS and SAR are written as "predict_<model>.csv"; the
    # former "<model>_predict.csv" names for them do not exist.
    file_paths = {
        'RecVAE': f"{out_path}/predict_RecVAE.csv",
        'ALS': f"{out_path}/predict_ALS.csv",
        'LightGCN': f"{out_path}/LightGCN_predict.csv",
        'SAR': f"{out_path}/predict_SAR.csv",
        'XSimGCL': f"{out_path}/XSimGCL_predict.csv"
    }

    # Read and process the data
    model_predictions = load_and_process_recommendations(file_paths)

    # Analyse the agreement between models
    user_similarities = analyze_model_agreement(model_predictions)

    # Extract the high- and low-agreement users
    high_agreement, low_agreement = get_extreme_cases(user_similarities)

    # Print the analysis summary
    print(f"Tổng số người dùng: {len(user_similarities)}")
    print(f"Số người dùng có độ đồng thuận cao: {len(high_agreement)}")
    print(f"Số người dùng có độ đồng thuận thấp: {len(low_agreement)}")

    # Print a few examples of each group
    print("\nVí dụ về người dùng có độ đồng thuận cao:")
    for user_id, similarity in list(high_agreement.items())[:5]:
        print(f"User {user_id}: {similarity:.3f}")

    print("\nVí dụ về người dùng có độ đồng thuận thấp:")
    for user_id, similarity in list(low_agreement.items())[:5]:
        print(f"User {user_id}: {similarity:.3f}")

    return high_agreement, low_agreement, user_similarities

# Chạy phân tích
high_agreement, low_agreement, user_similarities = analyze_recommendations()

In [None]:
# Persist the low-agreement user ids, one per line, for the injection step.
low_agreement_user = low_agreement.keys()

with open(f"{out_path}/low_agreement_user.txt", "w") as f:
    f.writelines(key + "\n" for key in low_agreement_user)


## New Item with Cluster

In [None]:
import pickle
import numpy as np
from sklearn.cluster import DBSCAN, KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import plotly.express as px
import matplotlib.pyplot as plt

In [None]:
import pickle
import numpy as np
from sklearn.manifold import TSNE  # You can also use UMAP from cuML or sklearn
import plotly.express as px

# Load the user embeddings
# NOTE(review): the directory is spelled 'runs/lightgcn' here but the model
# checkpoint above lives under 'runs/LightGCN' — confirm on a case-sensitive
# file system.
embedding_file_path = 'runs/lightgcn/user_embedding.pkl'

# NOTE(security): pickle.load can execute arbitrary code; trusted files only.
with open(embedding_file_path, 'rb') as f:
    user_embeddings = pickle.load(f)

# Convert the embeddings into a NumPy array (make sure it's in the right shape)
# `user_embeddings` is used as a mapping: row i of the matrix belongs to the
# i-th key of the dict.
embedding_matrix = np.array(list(user_embeddings.values()))

# Dimensionality Reduction using t-SNE
# random_state is fixed so the projection is repeatable between runs.
tsne = TSNE(n_components=2, random_state=42, init='pca', metric='euclidean', method="barnes_hut")
X_tsne = tsne.fit_transform(embedding_matrix)

# Optional: Get the t-SNE divergence (useful for diagnostic purposes)
print(f"t-SNE KL Divergence: {tsne.kl_divergence_}")

# Visualize the results using Plotly
# Create a DataFrame for easy plotting
# NOTE(review): this rebinds `df`, which earlier cells used for other frames.
import pandas as pd
df = pd.DataFrame(X_tsne, columns=["TSNE Component 1", "TSNE Component 2"])

# If you have any labels (e.g., cluster IDs or user types), you can add them to the DataFrame
# Example: df['Cluster'] = cluster_labels  # If you have cluster labels

# Create the Plotly scatter plot
fig = px.scatter(df, x="TSNE Component 1", y="TSNE Component 2",
                 title="User Latent Space (t-SNE)",
                 labels={"TSNE Component 1": "Dimension 1", "TSNE Component 2": "Dimension 2"},
                 template="plotly_dark")  # Optional: use dark theme

# Update layout for better readability
# (this replaces the title and axis labels passed to px.scatter above)
fig.update_layout(
    title="User Latent Space",
    xaxis_title="t-SNE Component 1",
    yaxis_title="t-SNE Component 2",
    showlegend=False  # Set to True if you want to display legends (e.g., for clusters)
)

# Show the interactive plot
fig.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors

# Compute the k-nearest neighbors distances
# The curve plotted below is meant to help pick DBSCAN's `eps`.
min_samples = 5  # Typically, this is a small integer, like 5 or 10
neighbors = NearestNeighbors(n_neighbors=min_samples)
neighbors_fit = neighbors.fit(X_tsne)  # Use your 2D reduced data (e.g., t-SNE embeddings)
distances, indices = neighbors_fit.kneighbors(X_tsne)

# Sort the distances
# Only the last neighbour column is kept. The query points are the training
# points themselves, so each point's neighbour list presumably starts with
# itself at distance 0 — TODO confirm.
distances = np.sort(distances[:, min_samples-1], axis=0)

# Plot the k-distance graph (elbow method)
plt.figure(figsize=(10, 6))
plt.plot(distances)
plt.title(f'k-distance Graph (k={min_samples})')
plt.xlabel('Data points sorted by distance to k-th neighbor')
plt.ylabel(f'{min_samples}-th Nearest Neighbor Distance')
plt.grid(True)
plt.show()


In [None]:
# Cluster the 2-D t-SNE projection (not the original embeddings).
dbscan = DBSCAN(eps=1.9, min_samples=5, metric='euclidean')  # You may need to adjust these hyperparameters
cluster_labels = dbscan.fit_predict(X_tsne)

# Add cluster labels to the DataFrame
# Rows are in the same order as the rows of X_tsne.
df = pd.DataFrame(X_tsne, columns=["TSNE Component 1", "TSNE Component 2"])
df['Cluster'] = cluster_labels

# Visualize the result using Plotly
fig = px.scatter(df, x="TSNE Component 1", y="TSNE Component 2", color="Cluster",
                 title="User Latent Space with DBSCAN Clusters",
                 labels={"TSNE Component 1": "Dimension 1", "TSNE Component 2": "Dimension 2"},
                 template="plotly_dark")  # Optional: use "plotly_dark" or other templates

# Update layout for better readability
fig.update_layout(
    title="User Latent Space (DBSCAN Clusters)",
    xaxis_title="t-SNE Component 1",
    yaxis_title="t-SNE Component 2",
    showlegend=True  # Show the legend to display the cluster colors
)

# Show the interactive plot
fig.show()

In [None]:
user_ids = list(user_embeddings.keys())

In [None]:
# Save the cluster assignment together with the user ids.
# Fixed: the frame held only the t-SNE components and the cluster label, but
# the next cell indexes the saved file by a 'User ID' column. `user_ids` lists
# the embedding keys in the same order as the rows of `df`.
df.assign(**{'User ID': user_ids}).to_csv('runs/lightgcn/user_clusters.csv', index=False)

## Append New Item by Cluster and Low Agreement

In [None]:
# Attach each test user's cluster label to the test frame.
# Expects user_clusters.csv to contain a 'User ID' column, used as the join
# key — verify that the cell writing the file provides it.
cluster = pd.read_csv('runs/lightgcn/user_clusters.csv')
cluster.head()

# Index-on-index left join: test users missing from the cluster file get NaN
# in the columns coming from `cluster`.
user_df = test.copy()
user_df = user_df.set_index('user_id')
cluster = cluster.set_index('User ID')
user_df = user_df.join(cluster)


In [None]:
from collections import Counter
def get_most_common_items_for_cluster(user_df, cluster_column='Cluster', item_columns=None):
    """Count how often each item occurs among the users of every cluster.

    Args:
        user_df: DataFrame with one row per user.
        cluster_column: name of the column holding the cluster label.
        item_columns: columns of ``user_df`` that hold item ids.

    Returns:
        dict cluster label -> list of ``(item, count)`` pairs ordered from
        most to least frequent (``Counter.most_common()``); missing values
        are not counted.
    """
    labels = user_df[cluster_column]
    counts_by_cluster = {}

    for label in labels.unique():
        # Every item cell of the users in this cluster, flattened row by row.
        member_items = user_df[labels == label][item_columns].values.flatten()
        tally = Counter(value for value in member_items if pd.notna(value))
        counts_by_cluster[label] = tally.most_common()

    return counts_by_cluster

item_columns = [f'item_id_{i+1}' for i in range(1000)] 
cluster_item_counts = get_most_common_items_for_cluster(user_df, item_columns=item_columns)

In [None]:
# "New" items are candidate items of the test set that never occur in the
# training interactions.
train = pd.read_csv("data/training_set.csv")
list_all_train_item = train["ItemId"].unique()
items_df = test[item_columns]
all_test_items = set(items_df.values.flatten())
old_items = all_test_items & set(list_all_train_item)
new_items = all_test_items - old_items

In [None]:
# user id -> that user's candidate items that are new (absent from training),
# in candidate order; missing and empty cells are skipped.
new_items_dict = {}

for user_idx, user_id in enumerate(test["user_id"]):
    row_items = items_df.iloc[user_idx].values.tolist()
    new_items_dict[user_id] = [
        item
        for item in row_items
        if pd.notna(item) and item != '' and item in new_items
    ]


In [None]:
def recommend_items_for_user(user_id, cluster_item_counts, new_items_dict, user_df, item_columns, top_n=1000):
    """Rank a user's new items by their popularity inside the user's cluster.

    Args:
        user_id: index label of the user in ``user_df``.
        cluster_item_counts: dict cluster label -> list of ``(item, count)``.
        new_items_dict: dict user id -> list of that user's new items.
        user_df: DataFrame indexed by user id with a ``Cluster`` column.
        item_columns: unused; kept for backward compatibility.
        top_n: exact length of the returned list.

    Returns:
        List of ``top_n`` entries: the user's new items that were counted in
        the cluster, most frequent first, padded with the string ``"0"``.
    """
    # Get the user's cluster
    user_cluster = user_df.loc[user_id, 'Cluster']
    # Users without a cluster carry NaN here, and NaN is not a usable dict
    # key, so fall back to "no counts" instead of raising KeyError.
    cluster_counts = cluster_item_counts.get(user_cluster, [])

    # A set gives O(1) membership; no per-user DataFrame is needed.
    wanted = set(new_items_dict.get(user_id, []))

    # Stable sort: items with equal counts keep their incoming order.
    by_popularity = sorted(cluster_counts, key=lambda pair: pair[1], reverse=True)
    recommended_items = [item for item, _ in by_popularity if item in wanted]

    # Return exactly `top_n` entries: truncate, then pad with "0".
    recommended_items = recommended_items[:top_n]
    recommended_items += ["0"] * (top_n - len(recommended_items))
    return recommended_items

# Create a list to hold all recommendations
# NOTE(review): this rebinds `recommendations`, used earlier for the SAR batches.
recommendations = []

# Generate recommendations for each user
# Each row is [user_id, item_1, ..., item_1000]; unused slots hold "0".
for user_id in user_df.index:
    recommended_items = recommend_items_for_user(user_id, cluster_item_counts, new_items_dict, user_df, item_columns)
    recommendations.append([user_id] + recommended_items)
# Convert the recommendations into a DataFrame
recommendations_df = pd.DataFrame(recommendations, columns=['user_id'] + [f'item_{i+1}' for i in range(1000)])

In [None]:
import pandas as pd

# Count the users that received at least one real (not "0") new-item
# recommendation among the 1000 item columns.
last_1000_columns = recommendations_df.iloc[:, -1000:]
non_zero_rows = last_1000_columns.ne("0").any(axis=1)
count_non_zero_rows = non_zero_rows.sum()

print("Number of rows with non-zero values in the last 1000 columns:", count_non_zero_rows)


In [None]:
recommendations_df.to_csv('submission/predict_new.csv', header=None, index=None)

In [None]:
best = pd.read_csv(f'{out_path}/LightGCN_predict.csv', names=['user_id'] + [f'item_id_{i}' for i in range(1,1001)])

In [None]:
# Read keys from a file into a list
with open(f"{out_path}/low_agreement_user.txt", "r") as f:
    keys = [line.strip() for line in f.readlines()]
len(keys)


In [None]:
# For every low-agreement user, overwrite slot 10 of the LightGCN ranking with
# that user's most popular new item, when one exists.
key_set = set(keys)  # user ids as text, as read from low_agreement_user.txt

# First `item_1` per user, built once instead of filtering the frame per row.
first_new_item = {}
for rec_user, rec_item in zip(recommendations_df['user_id'], recommendations_df['item_1']):
    first_new_item.setdefault(rec_user, rec_item)

count = 0
# Iterating the column keeps each id's own type; iterrows() would cast the
# whole row to a single dtype.
for idx, user_id in best['user_id'].items():
    # The keys are strings, so a numeric id must also be compared as text.
    if user_id not in key_set and str(user_id) not in key_set:
        continue
    # The old test `if recommendation and ...` relied on the truth value of a
    # NumPy array; check for a real item explicitly instead.
    new_item = first_new_item.get(user_id, '0')
    if new_item != '0':
        count += 1
        best.loc[idx, 'item_id_10'] = new_item
print(count)

In [None]:
# Fixed: the f-string was missing the opening brace of `{out_path}`, which is
# a SyntaxError ("single '}' is not allowed").
best.to_csv(f'{out_path}/LightGCN_new_cluster_top10.csv', header=None, index=None)


# Ensemble Final

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [None]:
# Final ensemble inputs. The list position is the index of the `prediction<k>`
# column and of the matching weight in `cust_blend`.
# Fixed: the names now match the files actually written. SELFRec models are
# saved as "<model>_predict.csv", and the phase-1 ensemble as
# "predict_ensemble_7_file.csv".
final_item_columns = [f'item_id_{i}' for i in range(1, 1001)]
final_files = [
    'XSimGCL_predict.csv',
    'predict_RecVAE.csv',
    'SimGCL_predict.csv',
    'LightGCN_new_cluster_top10.csv',
    'DirectAU_predict.csv',
    'predict_ALS.csv',
    'predict_SAR.csv',
    'predict_RRK.csv',
    'predict_ensemble_7_file.csv',
]

sub = pd.DataFrame()
for position, file_name in enumerate(final_files):
    ranking = pd.read_csv(f'{out_path}/{file_name}', names=['user_id'] + final_item_columns)
    if position == 0:
        # User ids come from the first file; the other files are combined by
        # row position, so all files must list users in the same order.
        sub['user_id'] = ranking['user_id']
    # One space-separated string of ranked items per user.
    sub[f'prediction{position}'] = ranking.apply(
        lambda row: ' '.join(str(row[column]) for column in final_item_columns), axis=1
    )


In [None]:
# Ensemble based on RRF (reciprocal-rank weighting)
def cust_blend(dt, W = [2, 2, 1.2, 1.5, 1.5, 1, 1, 1, 2]):
    """Blend the nine model rankings of one user by weighted reciprocal rank.

    Args:
        dt: mapping (for example a DataFrame row) with keys ``prediction0`` ..
            ``prediction8``, each a space-separated ranked item list.
        W: global weight per model; ``W[k]`` applies to ``prediction<k>``.

    Returns:
        Space-separated string of at most 1000 items ordered by descending
        blended score. Ties keep the order in which items were first seen.
    """
    # Split every model's ranking into a list of item tokens.
    rankings = [dt[f'prediction{model}'].split() for model in range(9)]

    # An item at 1-based rank r in model m contributes W[m] / r.
    blended = {}
    for model, ranking in enumerate(rankings):
        for rank, item in enumerate(ranking, start=1):
            blended[item] = blended.get(item, 0) + W[model] / rank

    # Highest blended score first (stable sort).
    ordered = sorted(blended, key=lambda item: -blended[item])

    # Keep the top 1000 items only.
    return ' '.join(ordered[:1000])

sub['prediction'] = sub.apply(lambda x: cust_blend(x), axis=1)
sub.head()

In [None]:
sub = sub[['user_id', 'prediction']]
value_lists = sub['prediction'].str.split(" ")

In [None]:
final = pd.DataFrame(value_lists.tolist(), index=sub['user_id']).reset_index()
final.to_csv('submission/predict.csv', index = False, header = False)

In [None]:
!cd submission && zip CHAMPION_FINAL_SUBMISSION.zip predict.csv