In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics.pairwise import cosine_similarity


In [2]:
# Load user-artist interactions
user_artists = pd.read_csv(
    "/content/user_artists.dat",
    sep="\t",
    names=["user_id", "artist_id", "playcount"],
    header=0
)

# Load artist metadata
artists = pd.read_csv(
    "/content/artists.dat",
    sep="\t",
    names=["artist_id", "name", "url", "picture_url"],
    header=0
)

print("User-artist rows:", len(user_artists))
print("Artists:", len(artists))


User-artist rows: 92834
Artists: 17632


In [3]:
def get_user_profile(user_id, top_k=10):
    df = user_artists[user_artists["user_id"] == user_id]
    df = df.merge(artists, on="artist_id")
    df = df.sort_values("playcount", ascending=False)
    return df[["artist_id", "name", "playcount"]].head(top_k)

# Pick a user to inspect
sample_user = user_artists["user_id"].iloc[0]
get_user_profile(sample_user)


Unnamed: 0,artist_id,name,playcount
0,51,Duran Duran,13883
1,52,Morcheeba,11690
2,53,Air,11351
3,54,Hooverphonic,10300
4,55,Kylie Minogue,8983
5,56,Daft Punk,6152
6,57,Thievery Corporation,5955
7,58,Goldfrapp,4616
8,59,New Order,4337
9,60,Matt Bianco,4147


In [4]:
all_artist_ids = user_artists["artist_id"].unique()
artist_index = {a: i for i, a in enumerate(all_artist_ids)}

def build_identity_vector(user_id):
    vec = np.zeros(len(all_artist_ids))
    df = user_artists[user_artists["user_id"] == user_id]

    total = df["playcount"].sum()
    for _, row in df.iterrows():
        vec[artist_index[row["artist_id"]]] = row["playcount"] / total

    return vec


In [5]:
users = user_artists["user_id"].unique()
user_index = {u: i for i, u in enumerate(users)}

def build_user_matrix():
    mat = np.zeros((len(users), len(all_artist_ids)))

    for _, row in user_artists.iterrows():
        ui = user_index[row["user_id"]]
        ai = artist_index[row["artist_id"]]
        mat[ui, ai] = row["playcount"]

    mat = mat / (mat.sum(axis=1, keepdims=True) + 1e-8)
    return mat

user_matrix = build_user_matrix()


In [6]:
def recommend_artists(user_id, top_k=10):
    u_idx = user_index[user_id]
    sims = cosine_similarity([user_matrix[u_idx]], user_matrix)[0]

    scores = sims @ user_matrix
    scores[u_idx] = 0  # don't recommend self

    top_indices = scores.argsort()[-top_k:][::-1]
    return all_artist_ids[top_indices]


In [7]:
def identity_drift(vec_before, vec_after):
    return 1 - cosine_similarity([vec_before], [vec_after])[0][0]

def recommendation_overlap(rec_before, rec_after):
    return len(set(rec_before).intersection(set(rec_after)))


In [10]:
def inject_relative_playcount(user_id, artist_id, fraction=0.2):
    global user_artists

    user_total = user_artists[
        user_artists["user_id"] == user_id
    ]["playcount"].sum()

    injection = int(user_total * fraction)

    mask = (
        (user_artists["user_id"] == user_id) &
        (user_artists["artist_id"] == artist_id)
    )

    if mask.any():
        user_artists.loc[mask, "playcount"] += injection
    else:
        user_artists = pd.concat(
            [user_artists, pd.DataFrame([{
                "user_id": user_id,
                "artist_id": artist_id,
                "playcount": injection
            }])],
            ignore_index=True
        )

    return injection


In [11]:
# Pick victim user
victim = sample_user

# Pick a target artist the user barely listens to
target_artist = user_artists[
    user_artists["user_id"] != victim
]["artist_id"].iloc[0]

# Baseline
id_before = build_identity_vector(victim)
rec_before = recommend_artists(victim)

# Attack (20% of user's total listening!)
injected = inject_relative_playcount(victim, target_artist, fraction=0.2)

# Recompute
user_matrix = build_user_matrix()
id_after = build_identity_vector(victim)
rec_after = recommend_artists(victim)

print("Injected plays:", injected)
print("Identity drift:", identity_drift(id_before, id_after))
print("Recommendation overlap:", recommendation_overlap(rec_before, rec_after))


Injected plays: 33751
Identity drift: 0.31740819242212726
Recommendation overlap: 9


In [12]:
def find_boundary_artists(user_id, top_n=5):
    u_idx = user_index[user_id]
    sims = cosine_similarity([user_matrix[u_idx]], user_matrix)[0]

    # Top similar users (excluding self)
    similar_users = sims.argsort()[-20:-1]

    candidate_scores = {}

    for su in similar_users:
        listened = user_matrix[su]
        for ai, val in enumerate(listened):
            if val > 0 and user_matrix[u_idx][ai] == 0:
                candidate_scores[ai] = candidate_scores.get(ai, 0) + val

    top_candidates = sorted(
        candidate_scores.items(),
        key=lambda x: x[1],
        reverse=True
    )[:top_n]

    return [all_artist_ids[i] for i, _ in top_candidates]


In [14]:
def boundary_injection_attack(user_id, injection_per_artist=50):
    candidates = find_boundary_artists(user_id)

    for artist_id in candidates:
        inject_relative_playcount(user_id, artist_id, injection_per_artist)

    return candidates


In [15]:
# Baseline
id_before = build_identity_vector(victim)
rec_before = recommend_artists(victim)

# Boundary attack
boundary_artists = boundary_injection_attack(victim, injection_per_artist=50)

# Recompute
user_matrix = build_user_matrix()
id_after = build_identity_vector(victim)
rec_after = recommend_artists(victim)

print("Boundary artists injected:", boundary_artists)
print("Identity drift:", identity_drift(id_before, id_after))
print("Recommendation overlap:", recommendation_overlap(rec_before, rec_after))


Boundary artists injected: [np.int64(2562), np.int64(511), np.int64(1001), np.int64(599), np.int64(159)]
Identity drift: 0.9999999993259195
Recommendation overlap: 2


In [16]:
print("Total injected plays:",
      5 * 50)  # num boundary artists Ã— injection_per_artist

print("Original total plays:",
      user_artists[user_artists["user_id"] == victim]["playcount"].sum())


Total injected plays: 250
Original total plays: 69870373529508
