In [51]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
import io
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
def load_dst(file_path):
    """
    Parses a DST file and returns two DataFrames:
    1. df_interactions: User-Item visits (C and V lines)
    2. df_attributes: Page metadata (A lines)
    """
    interactions = []
    attributes = []
    
    current_user_id = None
    
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
                
            parts = line.strip().split(',')
            prefix = parts[0]
            
            # 1. Process Attributes (Metadata)
            if prefix == 'A':
                # Format: A, attr_id, ignore, title, url
                attributes.append({
                    'attr_id': int(parts[1]),
                    'title': parts[3].strip('"'),
                    'url': parts[4].strip('"')
                })
                
            # 2. Process Cases (User IDs)
            elif prefix == 'C':
                # Format: C, "user_id_str", user_id_int
                current_user_id = int(parts[2])
                
            # 3. Process Votes (The Interaction)
            elif prefix == 'V' and current_user_id is not None:
                # Format: V, attr_id, ignore
                interactions.append({
                    'case_id': current_user_id,
                    'attr_id': int(parts[1]),
                })
                
    # Convert lists to DataFrames
    df_interactions = pd.DataFrame(interactions)
    df_attributes = pd.DataFrame(attributes)
    
    return df_interactions, df_attributes

import pandas as pd

def convert_to_wide_dataframe(df, user_col='case_id', item_col='attr_id'):
    wide_df = pd.crosstab(df[user_col], df[item_col])
    return wide_df



In [None]:
df, attr_df = load_dst('anonymous-msweb.data')

In [47]:
df.head()

Unnamed: 0,case_id,attr_id
0,10001,1000
1,10001,1001
2,10001,1002
3,10002,1001
4,10002,1003


In [48]:
df = convert_to_wide_dataframe(df)

In [49]:
df

attr_id,1000,1001,1002,1003,1004,1005,1006,1007,1008,1009,...,1276,1277,1278,1279,1280,1281,1282,1283,1284,1295
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10001,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10002,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10003,0,1,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10004,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10005,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42707,0,0,0,0,1,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
42708,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
42709,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
42710,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
user_similarity = cosine_similarity(df.values)
np.fill_diagonal(user_similarity, 0)



In [60]:
def recommend_ubcf(user_idx, k=20, top_n=10):
    sim = user_similarity[user_idx]
    neighbors = np.argsort(sim)[-k:]
    scores = sum(sim[n] * df.values[n] for n in neighbors)
    scores[df.values[user_idx] == 1] = -np.inf
    return df.columns[np.argsort(scores)[-top_n:][::-1]]
    
user_idx = 6
print(f"User: {df.index[user_idx]}")
print(f"\nUser-Based CF: {list(recommend_ubcf(user_idx, top_n=5))}")

User: 10007

User-Based CF: [1000, 1295, 1284, 1283, 1282]
