In [2]:
import pandas as pd

In [4]:
# Load each model's prediction file. header=None makes pandas treat the first
# row as data and label the columns 0, 1, 2, ...
recvae = pd.read_csv("runs/private-test-attempt-1/RecVAE_0581.csv", header=None)
als = pd.read_csv("runs/private-test-attempt-1/predict_ALS.csv", header=None)
lightgcn = pd.read_csv("runs/private-test-attempt-1/LightGCN_predict.csv", header=None)
sar = pd.read_csv("runs/private-test-attempt-1/SAR_std_data.csv", header=None)
# Unlike the other files, this one is read from submission/ rather than the run directory.
xsimgcl = pd.read_csv("submission/predict_ensemble_XSim_multiview.csv", header=None)
# NOTE(review): the variable is called cl4rec but the file is DirectAU_predict.csv —
# confirm which model this frame actually holds.
cl4rec = pd.read_csv("runs/private-test-attempt-1/DirectAU_predict.csv", header=None)

In [8]:
import pandas as pd
import numpy as np
from collections import defaultdict

def load_and_process_recommendations(file_paths, top_k=3):
    """
    Read the prediction files of several models and keep each user's top-k items.

    Every CSV is read without a header row. Column 0 is used as the user id and
    columns 1..top_k as that user's recommendations.

    Parameters
    ----------
    file_paths : dict
        Maps a model name to the path of its prediction CSV.
    top_k : int, default 3
        Number of leading recommendation columns to keep per user.

    Returns
    -------
    dict
        ``{model_name: {user_id (str): [rec_1, ..., rec_top_k]}}``. If a user id
        occurs more than once in a file, the last row wins.

    Raises
    ------
    ValueError
        If ``top_k`` is smaller than 1.
    KeyError
        If a file has fewer than ``top_k + 1`` columns.
    """
    if top_k < 1:
        raise ValueError("top_k must be at least 1")

    wanted_columns = list(range(top_k + 1))
    model_predictions = {}
    for model_name, file_path in file_paths.items():
        df = pd.read_csv(file_path, header=None)[wanted_columns]
        # Work column by column instead of df.iterrows(): iterrows builds one
        # Series per row and upcasts mixed numeric columns to a common dtype,
        # which would turn an integer user id such as 7 into the key '7.0'.
        user_ids = [str(user_id) for user_id in df[0].tolist()]
        rec_columns = [df[column].tolist() for column in wanted_columns[1:]]
        model_predictions[model_name] = {
            user_id: list(recs)
            for user_id, recs in zip(user_ids, zip(*rec_columns))
        }
    return model_predictions

def calculate_jaccard_similarity(set1, set2):
    """
    Jaccard similarity of two collections: size of the intersection divided by
    the size of the union, after removing duplicates.

    Parameters
    ----------
    set1, set2 : iterable of hashable items
        Any iterables; each one is consumed exactly once, so one-shot
        iterators and generators are handled correctly.

    Returns
    -------
    float
        A value in [0, 1]; 0.0 when both inputs are empty.
    """
    # Convert once: converting each argument twice would exhaust a one-shot
    # iterator on the first pass and make the union look empty.
    first, second = set(set1), set(set2)
    union_size = len(first | second)
    if union_size == 0:
        return 0.0
    return len(first & second) / union_size

def analyze_model_agreement(model_predictions):
    """
    Compute, for every user, the mean pairwise Jaccard similarity between the
    recommendation lists produced by the different models.

    Parameters
    ----------
    model_predictions : dict
        ``{model_name: {user_id: recommendations}}``, e.g. the output of
        ``load_and_process_recommendations``.

    Returns
    -------
    dict
        Plain ``{user_id: mean similarity}`` mapping, in the user order of the
        first model. Only users for which every model has predictions are
        included; skipped users are reported through a warning. With a single
        model there are no pairs to compare and every value is NaN. An empty
        input yields an empty dict.
    """
    # Function-scope imports keep this definition self-contained.
    from itertools import combinations
    import warnings

    model_names = list(model_predictions)
    if not model_names:
        return {}

    # combinations() yields the same (i, j) with i < j pairs, in the same
    # order, as a nested index loop over the model names.
    model_pairs = list(combinations(model_names, 2))
    reference_users = model_predictions[model_names[0]]
    other_models = [model_predictions[name] for name in model_names[1:]]

    user_similarities = {}
    skipped = 0
    for user_id in reference_users:
        # A user missing from any model cannot be compared across all pairs;
        # skip instead of failing with KeyError halfway through the analysis.
        if any(user_id not in predictions for predictions in other_models):
            skipped += 1
            continue

        if not model_pairs:
            # Single model: nothing to average (avoids np.mean([]) and its warning).
            user_similarities[user_id] = np.nan
            continue

        similarities = [
            calculate_jaccard_similarity(
                model_predictions[first][user_id],
                model_predictions[second][user_id],
            )
            for first, second in model_pairs
        ]
        user_similarities[user_id] = np.mean(similarities)

    if skipped:
        warnings.warn(
            f"{skipped} user(s) skipped because at least one model has no predictions for them",
            stacklevel=2,
        )
    return user_similarities

def get_extreme_cases(user_similarities, threshold_high=0.5, threshold_low=0.15 ):
    """
    Split users into a high-agreement and a low-agreement group.

    A user is "high" when the score is >= threshold_high and "low" when it is
    <= threshold_low. The two checks are independent, so with overlapping
    thresholds a user can appear in both groups. Returns the pair
    ``(high_agreement, low_agreement)``, each a dict in input order.
    """
    high_agreement, low_agreement = {}, {}
    for user_id, score in user_similarities.items():
        if score >= threshold_high:
            high_agreement[user_id] = score
        if score <= threshold_low:
            low_agreement[user_id] = score
    return high_agreement, low_agreement

def analyze_recommendations(file_paths=None, threshold_high=0.5, threshold_low=0.15):
    """
    Run the full agreement analysis and print a short summary.

    Parameters
    ----------
    file_paths : dict, optional
        ``{model_name: csv_path}``. Defaults to the five prediction files of
        the current run (RecVAE, ALS, LightGCN, SAR, XSimGCL).
    threshold_high, threshold_low : float
        Cut-offs forwarded to ``get_extreme_cases``.

    Returns
    -------
    tuple
        ``(high_agreement, low_agreement, user_similarities)``.
    """
    if file_paths is None:
        # Default set of model outputs to compare.
        file_paths = {
            'RecVAE': "runs/private-test-attempt-1/RecVAE_0581.csv",
            'ALS': "runs/private-test-attempt-1/predict_ALS.csv",
            'LightGCN': "runs/private-test-attempt-1/LightGCN_predict.csv",
            'SAR': "runs/private-test-attempt-1/SAR_std_data.csv",
            'XSimGCL': "submission/predict_ensemble_XSim_multiview.csv"
        }

    # Load the predictions, score the per-user agreement, then pick the extremes.
    model_predictions = load_and_process_recommendations(file_paths)
    user_similarities = analyze_model_agreement(model_predictions)
    high_agreement, low_agreement = get_extreme_cases(
        user_similarities, threshold_high, threshold_low
    )

    # Summary counts.
    print(f"Tổng số người dùng: {len(user_similarities)}")
    print(f"Số người dùng có độ đồng thuận cao: {len(high_agreement)}")
    print(f"Số người dùng có độ đồng thuận thấp: {len(low_agreement)}")

    # Up to five example users per group.
    example_groups = (
        ("\nVí dụ về người dùng có độ đồng thuận cao:", high_agreement),
        ("\nVí dụ về người dùng có độ đồng thuận thấp:", low_agreement),
    )
    for heading, group in example_groups:
        print(heading)
        for user_id, similarity in list(group.items())[:5]:
            print(f"User {user_id}: {similarity:.3f}")

    return high_agreement, low_agreement, user_similarities

# Run the analysis; high_agreement and low_agreement are reused by later cells.
high_agreement, low_agreement, user_similarities = analyze_recommendations()

Tổng số người dùng: 4225
Số người dùng có độ đồng thuận cao: 813
Số người dùng có độ đồng thuận thấp: 790

Ví dụ về người dùng có độ đồng thuận cao:
User 0a9Fr9I8Gr: 0.550
User gvOsLAX6V1: 0.550
User nWK4jN2q4O: 0.550
User AerQWfLe0S: 0.520
User rdJ2LyMYp4: 0.650

Ví dụ về người dùng có độ đồng thuận thấp:
User Uw0zXc8Sr8: 0.020
User NV1iw3vjY7: 0.040
User hGFPdBbiOm: 0.150
User PJ69bBhw4R: 0.140
User PXm9NYa9L1: 0.140


In [9]:
# Prediction file whose rows the later cells iterate over to build full_rec
# (presumably the 7-file ensemble output, going by the file name — confirm).
df = pd.read_csv("runs/private-test-attempt-1/predict_ensemble_7file.csv", header=None)

In [10]:
# Build the submission rows: low-agreement users get a list of 1000 zeros,
# everyone else keeps their (up to) 1000 predictions. cnt counts the replaced users.
cnt = 0
full_rec = []
# cl4rec is zipped in only for the commented-out alternative below.
# NOTE(review): zip() stops at the shorter of the two frames, so rows of df
# beyond len(cl4rec) would be dropped silently.
for i, c in zip(df.values, cl4rec.values):
    userid = i[0]
    # Explicit membership test instead of try / bare except: the bare except
    # also swallowed unrelated errors (e.g. NameError when low_agreement is not
    # defined yet) and silently treated every user as "not low agreement".
    # The keys of low_agreement were built with str(), so look up the same form.
    if str(userid) in low_agreement:
        rec_list = [0] * 1000
        cnt += 1
        # rec_list = c[1:1001]
    else:
        rec_list = i[1:1001]

    full_rec.append([userid, *rec_list])

cnt

790

In [11]:
!pwd

/workspace/viettel-ai-challenge-track-ds


In [12]:
# Persist the ids of all low-agreement users, one id per line.
low_agreement_user = low_agreement.keys()

with open("data/low_agreement_user.txt", "w") as out_file:
    out_file.writelines(user_id + "\n" for user_id in low_agreement_user)


In [302]:
# Ids of low-agreement users whose summed Purchase value is at least 1.
# NOTE(review): low_user is defined in a cell that appears further down in the
# notebook, so this cell only works after that one has been run.
low_purchase_user = (
    low_user.groupby("UserId")["Purchase"]
    .sum()
    .loc[lambda purchases: purchases >= 1]
    .reset_index()["UserId"]
    .unique()
)

In [372]:
len(low_purchase_user)

634

In [304]:
# Ids of low-agreement users whose summed Click value is at most 15.
# NOTE(review): low_user is defined in a cell that appears further down in the
# notebook, so this cell only works after that one has been run.
short_term_user = (
    low_user.groupby("UserId")["Click"]
    .sum()
    .loc[lambda clicks: clicks <= 15]
    .reset_index()["UserId"]
    .unique()
)

In [30]:
# Read the public test set and assign the column names here:
# user_id followed by item_id_1 ... item_id_1000.
test = pd.read_csv(
    'data/public_testset.csv',
    names=['user_id', *(f'item_id_{i}' for i in range(1, 1001))],
)

In [32]:
# Independent copy of the test frame indexed by user id (set_index returns a
# new frame, so `test` itself is left untouched).
user_df = test.set_index('user_id')

In [34]:
# Split the item ids found in the test frame into items that also occur in the
# training set (old_items) and items that do not (new_items).
train = pd.read_csv("data/training_set.csv")
# test = pd.read_csv("data/public_testset.csv")
list_all_train_item = train["ItemId"].unique()
item_columns = [f'item_id_{i+1}' for i in range(1000)]
items_df = test[item_columns]
# Deduplicate first, then drop missing values: a row with fewer than 1000 items
# would otherwise put NaN into the set, and NaN would be counted as a "new" item.
all_test_items = {
    item for item in pd.unique(items_df.to_numpy().ravel()) if pd.notna(item)
}
old_items = all_test_items.intersection(set(list_all_train_item))
new_items = all_test_items - old_items

In [41]:
new_items.intersection([])

set()

In [52]:
# Build the submission rows: a user's list is replaced by 1000 zeros only when
# the user is in longtail_user AND in low_agreement AND the list contains at
# least one item from new_items. Everyone else keeps their predictions.
# cnt counts the replaced users.
# Hoisted set: `in` on the longtail_user array would rescan it for every row.
longtail_user_ids = set(longtail_user)
full_rec = []
cnt = 0
for i in df.values:
    userid = i[0]
    rec_list = i[1:1001]
    # The keys of low_agreement were built with str(), so look up the same form.
    if userid in longtail_user_ids and str(userid) in low_agreement:
        if not new_items.isdisjoint(rec_list):
            cnt += 1
            rec_list = [0] * 1000

    full_rec.append([userid, *rec_list])

cnt

45

In [45]:
# full_rec = []
# cnt = 0
# for i in (df.values):
#     userid = i[0]
#     if userid in short_term_user and userid in low_agreement.keys(): 
#         cnt+=1
#         rec_list = [0 for _ in range(1000)]
#     else:
#         rec_list = i[1:1001]

#     full_rec.append([userid, *rec_list])

# cnt

In [53]:
# Write the submission file without index and without a header row.
# NOTE(review): full_rec is built by two different cells earlier in the
# notebook; whichever one ran last determines what is written here.
pd.DataFrame(full_rec).to_csv("submission/predict.csv", index=False, header=None)

In [54]:
!cd submission && zip predict_579_longtail20_new_item23.zip predict.csv

updating: predict.csv (deflated 27%)


In [316]:
pd.DataFrame(full_rec)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,991,992,993,994,995,996,997,998,999,1000
0,NcVPv81tzp,mwOc6vrVPi,oNn8dJXhM8,NZ6ZRExaU5,3tljNC7rIT,av2irHMO5f,IzLVUKs7uQ,wvulby9Vbm,o9yXO6Ak9I,q4bgY20S3L,...,wuxyl7VhE9,3AogAjK3kY,vy9wUsasAg,38YRAfjTC4,brGYGGFX4e,ZAsaiLlrZq,zjiExpN9uw,CthVhk4IIv,cFfT0AEdOb,FqyjUN4Rhl
1,C5ihgxLcrb,6jy1DwPo7L,u0KbwAhFgw,pUUgxbyvTC,41uJKXzbXd,f1dROAFg11,7NudKUsc5y,WjZUWFn3s9,kk3smPWhZf,O5nMvWF9dA,...,kp3MxtQ2Bj,B3oqKT10l0,qOg25LQeVV,vf96p77kez,bAtq3ygsLw,061nstrHvn,e404cyyJnR,UZLFYQnmym,d7MfqB64c3,huMnOiuSv1
2,ljMBWT9UXW,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,WZXCQtOWJt,xbfyEc4bwt,dQoIYlX2wJ,V6S3fAvyI9,0CpYs5b1cE,SzPkpQ35GK,yDU5yTDtxq,rGAG9phqUn,joDpuUf5SE,10MawcxCHm,...,TUOyX6Edy4,d69h3eVoNt,p5obG12NGj,wA2r6w1Z3F,AUMgblChwY,DUzL3jV5WR,kKfwgMRbeU,h8CYmT2joP,mAfEd2XxBR,4pYokzc1Ph
4,xlB2rvpQKu,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2812,51ANpugrpL,S4xaXwp7iP,bGMCLeABTX,xXY4iqSs37,9lTet1LDLe,wKxM43YVzt,TnqcEffbIJ,tGaEpRTh2s,iiLeV3DSP7,2zvu6ViuVZ,...,ZBdjT3PHzS,WaDzrvuPRb,7HO3tf28K1,3MkfN5HPet,R8W5S80njP,olP0Ak73wG,qOJUEIOsG6,uGpFICb6Nn,87zXiaBAvS,5O28cEDfvg
2813,PpBVsb2nMx,EYbGS5M7z4,pAqFb3i2ZL,1RI5B7JsfJ,HLFzBYKhhQ,ZpBhcOh6i5,VldjdED29h,Mrc8cAik5O,JA1LqZSMx0,H0AfknTqI1,...,gZuszMRTES,I3KoXH07rw,76DOysXkVr,qbTVHZab5Z,bcHDOfcXF0,yxt3NhA033,Si5wCvYVCs,qVcX7ETfmm,pc85TVkpUx,odr1y83ZBE
2814,EkulSxnt42,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2815,eflbPkaBp9,QgcWlvxmht,QFwCT7ZYBs,LjtJh1Fyiy,Kg4f6NDLIi,lTthJz1zWD,loVD6FyD34,kNEK8EM56h,NECs6b672k,fVnwN1GPIu,...,xoDRbh3QJz,jb8LikQQZF,95fHCmeGPx,hI539tdri5,uN9CftRDCh,i89OirGsEp,wRmRleu7zR,g9TW5kbI1a,17luc0vqPa,oFhv4bnyWp


In [10]:
# Training interactions. The same file is also loaded by the cell that computes new_items.
train = pd.read_csv("data/training_set.csv")

In [11]:
# Training rows belonging to low-agreement and high-agreement users respectively.
low_user = train.loc[train["UserId"].isin(list(low_agreement))]
high_user = train.loc[train["UserId"].isin(list(high_agreement))]

In [51]:
# Ids of users whose summed Click value in the training set exceeds 40.
longtail_user = (
    train.groupby("UserId")["Click"]
    .sum()
    .loc[lambda clicks: clicks > 40]
    .reset_index()["UserId"]
    .unique()
)
len(longtail_user)

588

In [326]:
train.groupby("UserId")["Click"].sum().describe()

count    36751.000000
mean        10.609861
std         39.119152
min          4.000000
25%          5.000000
50%          6.000000
75%          9.000000
max       3284.000000
Name: Click, dtype: float64

In [358]:
train.groupby("ItemId")["Click"].sum().describe()

count    83102.000000
mean         4.692101
std          8.814933
min          1.000000
25%          1.000000
50%          2.000000
75%          5.000000
max        318.000000
Name: Click, dtype: float64

In [370]:
train[train["ItemId"].isin(high_user["ItemId"].unique())]

Unnamed: 0,UserId,ItemId,Click,Purchase
1,tyviMi4b8Q,t3fB9Nq1VY,1,0
12,tyviMi4b8Q,lTYJaTlXyN,1,0
21,tyviMi4b8Q,zdIsHppepJ,1,0
28,PGcvoaV6Gn,qwhGdJ5ene,1,0
36,Sr1MiLP6VX,Sr1MiLP6VX,1,0
...,...,...,...,...
389896,PeFgD5uz6C,CBMrbcfqZn,1,0
389910,yWLJZiFVTI,zRc319plpU,1,0
389914,Uo2X1ah0bf,hRapAMLApl,1,0
389916,Uo2X1ah0bf,56em7Cl1uk,1,0


In [371]:
train[train["ItemId"].isin(low_user["ItemId"].unique())].groupby("ItemId")["Click"].sum()

ItemId
004POG3wLH    29
03qlefZg2e    11
06RPMFPiAy     9
07L6UFyWU3     8
07Tgy5RFQJ     1
              ..
zwvJdO5XjB    13
zxAOTuumc7    21
zySZ2Ho2Xb    42
zyrAuwQgjo    10
zzyfHckCgU    51
Name: Click, Length: 3132, dtype: int64

In [367]:
low_user

Unnamed: 0,UserId,ItemId,Click,Purchase
1400,Ymt7uwVeqU,j7UpWv5fqR,1,0
1401,Ymt7uwVeqU,CkX0wVME9I,1,0
1402,Ymt7uwVeqU,SpWGTzTwNe,1,0
1403,Ymt7uwVeqU,ASJ5HVYIM6,1,0
1404,Ymt7uwVeqU,CG7VYoDbcT,1,0
...,...,...,...,...
385869,y0lpxPO808,NI8zyOsvLu,1,0
385870,y0lpxPO808,DLjXPUKphu,1,0
385871,y0lpxPO808,mCmj0isT77,1,1
385872,y0lpxPO808,8FyKx0DUV2,1,1


In [355]:
len(train["ItemId"].unique())

83102

In [342]:
low_user.groupby("ItemId")["Click"].sum().describe()

count    15748.000000
mean         1.639383
std          1.481476
min          1.000000
25%          1.000000
50%          1.000000
75%          2.000000
max         37.000000
Name: Click, dtype: float64

In [341]:
low_user.groupby("UserId")["Click"].sum().describe()

count     579.000000
mean       44.588946
std       143.838073
min         4.000000
25%         5.000000
50%         8.000000
75%        17.000000
max      1749.000000
Name: Click, dtype: float64

In [328]:
high_user.groupby("UserId")["Click"].sum().describe()

count    427.000000
mean       8.772834
std        6.781112
min        4.000000
25%        5.000000
50%        6.000000
75%       10.000000
max       65.000000
Name: Click, dtype: float64

In [329]:
train.groupby("UserId")["Purchase"].sum().describe()

count    36751.000000
mean         0.502027
std          6.026286
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max        625.000000
Name: Purchase, dtype: float64

In [330]:
low_user.groupby("UserId")["Purchase"].sum().describe()

count    280.000000
mean       8.860714
std       32.150830
min        0.000000
25%        0.000000
50%        1.000000
75%        4.000000
max      363.000000
Name: Purchase, dtype: float64

In [331]:
train.groupby("UserId")["Purchase"].sum().where(lambda x : x >= 1).dropna()

UserId
016XSFFwy3     6.0
01mJsHlp6k     4.0
02C5uT9nyV     8.0
02GI5dVzbr    17.0
02zHgoN5Lu     1.0
              ... 
zvrvJhy0QH     2.0
zw0gw5D425     2.0
zxRPsj2hr4     1.0
zzReTyUNBb    10.0
zzkr2xyT3t     1.0
Name: Purchase, Length: 3486, dtype: float64

In [246]:
low_user.groupby("UserId")["Purchase"].sum().where(lambda x : x >= 1).dropna()

UserId
016XSFFwy3     6.0
02GI5dVzbr    17.0
06eM9m1yP8     7.0
0RM5hrGBcJ    22.0
0VPm4UWisk     4.0
              ... 
zaIdgaNzWm     1.0
zasreOz0e2     1.0
znIu7UJeiR     3.0
zpwkzDVsT7     2.0
zvrvJhy0QH     2.0
Name: Purchase, Length: 634, dtype: float64

In [237]:
high_user.groupby("UserId")["Purchase"].sum().describe()

count    427.000000
mean       0.955504
std        1.872807
min        0.000000
25%        0.000000
50%        0.000000
75%        1.000000
max       13.000000
Name: Purchase, dtype: float64

In [156]:
# Count low-agreement users with and without any purchase in the training data.
# The per-user totals are computed once with the groupby's own sum(); passing
# the builtin `sum` to agg() is deprecated in recent pandas and emits a warning.
purchases_per_low_user = low_user.groupby("UserId")["Purchase"].sum()
print("Length user with purchase", int((purchases_per_low_user >= 1).sum()))
print("Length user without purchase", int((purchases_per_low_user < 1).sum()))

Length user with purchase 311
Length user without purchase 268


  print("Length user with purchase", len(low_user.groupby("UserId").agg({"Purchase": sum}).where(lambda x : x["Purchase"]>=1).dropna()))
  print("Length user without purchase", len(low_user.groupby("UserId").agg({"Purchase": sum}).where(lambda x : x["Purchase"]<1).dropna()))
