# Finding users with the same taste in music

In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the Last.fm listening data; the preview below shows user, artist, sex and country columns.
df = pd.read_csv('lastfm.csv')
df

Unnamed: 0,user,artist,sex,country
0,1,red hot chili peppers,f,Germany
1,1,the black dahlia murder,f,Germany
2,1,goldfrapp,f,Germany
3,1,dropkick murphys,f,Germany
4,1,le tigre,f,Germany
...,...,...,...,...
289950,19718,bob dylan,f,Canada
289951,19718,pixies,f,Canada
289952,19718,the clash,f,Canada
289953,19718,a tribe called quest,f,Canada


In [3]:
# user-artist matrix: one row per user, one column per artist,
# each cell counting how often that (user, artist) pair occurs in df
user_artist_matrix = pd.crosstab(index=df['user'], columns=df['artist'])

# pairwise cosine similarity between the user rows of the matrix
user_similarity = cosine_similarity(user_artist_matrix)

# label both axes with the user ids so rows/columns can be looked up by user
user_sim_df = pd.DataFrame(
    data=user_similarity,
    index=user_artist_matrix.index,
    columns=user_artist_matrix.index,
)

In [4]:
# Inspect the user-by-user cosine similarity table built in the previous cell.
user_sim_df

user,1,3,4,5,6,7,9,12,13,14,...,19708,19709,19710,19711,19712,19713,19714,19715,19717,19718
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.046424,0.048113,0.000000,0.000000,0.000000,0.000000,0.091287,0.000000,0.000000,...,0.069338,0.118585,0.0,0.041100,0.000000,0.000000,0.072169,0.000000,0.000000,0.000000
3,0.046424,1.000000,0.035737,0.000000,0.000000,0.000000,0.000000,0.000000,0.070186,0.000000,...,0.051503,0.000000,0.0,0.030528,0.000000,0.000000,0.000000,0.000000,0.000000,0.067806
4,0.048113,0.035737,1.000000,0.000000,0.120386,0.123091,0.000000,0.105409,0.072739,0.000000,...,0.053376,0.060858,0.0,0.126554,0.064150,0.055556,0.055556,0.000000,0.000000,0.105409
5,0.000000,0.000000,0.000000,1.000000,0.188608,0.128565,0.138343,0.110096,0.000000,0.000000,...,0.000000,0.047673,0.0,0.000000,0.000000,0.000000,0.087039,0.000000,0.000000,0.000000
6,0.000000,0.000000,0.120386,0.188608,1.000000,0.177822,0.047836,0.152277,0.000000,0.000000,...,0.057831,0.164845,0.0,0.068559,0.000000,0.000000,0.060193,0.161515,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19713,0.000000,0.000000,0.055556,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.160128,0.045644,0.0,0.094916,0.096225,1.000000,0.000000,0.074536,0.072169,0.000000
19714,0.072169,0.000000,0.055556,0.087039,0.060193,0.123091,0.132453,0.052705,0.000000,0.000000,...,0.000000,0.045644,0.0,0.047458,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000
19715,0.000000,0.000000,0.000000,0.000000,0.161515,0.000000,0.000000,0.047140,0.000000,0.000000,...,0.143223,0.040825,0.0,0.042448,0.172133,0.074536,0.000000,1.000000,0.129099,0.047140
19717,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.208013,0.118585,0.0,0.000000,0.083333,0.072169,0.000000,0.129099,1.000000,0.000000


In [None]:
def analyze_similar_demographics(target_user_id, top_n=30, *, top_countries=4,
                                 sim_df=None, users_df=None):
    """Summarise the sex and country mix of the users most similar to one user.

    Parameters
    ----------
    target_user_id
        Label of the user to analyse; it must appear in the index of the
        similarity table.
    top_n : int, default 30
        Number of most similar users (the target itself excluded) to keep.
    top_countries : int, default 4
        Number of most frequent countries reported individually. When more
        countries than this are present, the remaining ones are summed into a
        single ``'Others'`` entry. Expected to be non-negative.
    sim_df : pandas.DataFrame, optional
        User-by-user similarity table indexed and labelled by user id.
        Defaults to the notebook-level ``user_sim_df``.
    users_df : pandas.DataFrame, optional
        Table holding ``'user'``, ``'sex'`` and ``'country'`` columns.
        Defaults to the notebook-level ``df``.

    Returns
    -------
    dict
        ``{"sex_distribution": Series, "country_distribution": Series}`` where
        both Series hold proportions, or ``{"error": <message>}`` when the
        target user is unknown or no similar user could be matched.
    """
    # Fall back to the notebook globals only when no explicit table is passed,
    # so existing calls keep working while the function stays usable on its own.
    sim_table = user_sim_df if sim_df is None else sim_df
    user_table = df if users_df is None else users_df

    if target_user_id not in sim_table.index:
        return {"error": "User not found"}

    # get IDs of most similar users sorted by cosine similarity;
    # the target is dropped by label so it never counts as its own neighbour
    sim_scores = (
        sim_table.loc[target_user_id]
        .sort_values(ascending=False)
        .drop(target_user_id)
    )
    top_similar_ids = sim_scores.head(top_n).index

    # extract demographic info: one row per similar user
    # (the source table may repeat a user on several rows)
    similar_users_info = (
        user_table.loc[user_table['user'].isin(top_similar_ids), ['user', 'sex', 'country']]
        .drop_duplicates('user')
    )

    if similar_users_info.empty:
        return {"error": "No similar users found"}

    # sex distribution (proportions)
    sex_dist = similar_users_info['sex'].value_counts(normalize=True)

    # country full distribution (proportions, most frequent first)
    full_country_dist = similar_users_info['country'].value_counts(normalize=True)

    if len(full_country_dist) > top_countries:
        leading = full_country_dist.iloc[:top_countries]

        # sum the remaining countries into "Others"
        others_score = full_country_dist.iloc[top_countries:].sum()
        others = pd.Series({'Others': others_score})

        country_dist = pd.concat([leading, others])
    else:
        country_dist = full_country_dist

    return {
        "sex_distribution": sex_dist,
        "country_distribution": country_dist,
    }

In [48]:
# Example: demographic breakdown of the default 30 users most similar to user 3.
analyze_similar_demographics(target_user_id=3)

{'sex_distribution': sex
 m    0.833333
 f    0.166667
 Name: proportion, dtype: float64,
 'country_distribution': United States         0.166667
 Russian Federation    0.100000
 Sweden                0.100000
 Belgium               0.100000
 Others                0.533333
 dtype: float64}

In [None]:
# Distinct user ids, in order of first appearance in df; these drive the batch run below.
unique_users = df['user'].unique()
unique_users

array([    1,     3,     4, ..., 19715, 19717, 19718], shape=(15000,))

In [None]:
# Run the demographic analysis for every user and flatten each result
# into a single wide row: target id, sex_* shares, country_* shares.
all_results = []

for user_id in unique_users:
    result = analyze_similar_demographics(user_id)

    # only successful analyses contribute a row; error results are skipped
    if "error" not in result:
        row_data = {
            'target_user_id': user_id,
            **{f"sex_{label}": share
               for label, share in result['sex_distribution'].items()},
            **{f"country_{label}": share
               for label, share in result['country_distribution'].items()},
        }
        all_results.append(row_data)

# Rows carry different country_* keys, so the frame is the union of all keys.
# NOTE(review): after fillna(0), a 0 in a country_* column means the country
# was not listed individually for that user (it may have been folded into
# "Others"), not necessarily that its share was 0%.
final_df = pd.DataFrame(all_results).fillna(0)

output_filename = 'similar_users_analysis.csv'
final_df.to_csv(output_filename, index=False)