In [1]:
# Subgroup Discovery on Genre Networks
# By Gabriel P. Oliveira and Iago A. D. Vaz

In [2]:
import pandas as pd
import pysubgroup as ps
import numpy as np

In [3]:
# Relative path to the dataset directory; the CSV paths read in later cells are built from it.
dataset_path = '../1. Collaboration Profiles in Music Genre Networks [@ ISMIR 2020]/dataset'

In [4]:
# Analysis parameters: both values are interpolated into the input file paths below.
market = 'us'
year = 2017

In [5]:
# Load the genre collaboration network (one row per Source/Target edge)
network_csv = f'{dataset_path}/reduced_genre_network/{market}/{market}-reduced_genre_network-{year}.csv'

# Avg_Popularity is not used in this analysis, so it is dropped right after loading.
network = (
    pd.read_csv(network_csv, sep=',', encoding='utf-8')
    .drop(columns=['Avg_Popularity'])
)
network.head()

Unnamed: 0,Source,Target,Weight,Avg_Streams
0,hip hop,rap,850,20477020.0
1,rap,trap,744,22316180.0
2,pop rap,rap,743,19394010.0
3,hip hop,trap,683,18930900.0
4,hip hop,pop rap,668,18494350.0


In [6]:
# Getting collaboration profiles for each edge
profiles_csv = f'{dataset_path}/reduced_genre_network/{market}/db_clusters/{market}_cluster_{year}.csv'

# Cluster id -> collaboration profile name. Ids missing from this mapping are
# left unchanged by `replace`.
# NOTE(review): id 3 is not mapped while 4 is -- confirm this matches the cluster files.
cluster_names = {0: 'solid', 1: 'regular', 2: 'bridge', 4: 'emerging'}

# Built as a single non-mutating chain: calling `replace(..., inplace=True)` on a
# column selected from a column-subset frame is chained in-place assignment, which
# newer pandas versions warn about and which does not update the frame under
# copy-on-write.
profiles = (
    pd.read_csv(profiles_csv, sep='\t', encoding='utf-8')
    [['Source', 'Target', 'final.cluster']]
    .replace({'final.cluster': cluster_names})
    .rename(columns={'final.cluster': 'profile'})
    .set_index(['Source', 'Target'])
)
profiles.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,profile
Source,Target,Unnamed: 2_level_1
hip hop,rap,solid
hip hop,trap,solid
hip hop,pop rap,solid
hip hop,hip hop,solid
hip hop,pop,solid


In [7]:
# Adding collaboration profile in genre network
# `profiles` is indexed by (Source, Target); collect it once into a plain dict so
# each edge is resolved with a constant-time lookup instead of a per-row `.loc`.
# If an edge appears more than once in `profiles`, the last occurrence is used.
profile_by_edge = dict(zip(profiles.index, profiles['profile']))

# An edge may be stored in either direction, so try (Source, Target) first and
# fall back to (Target, Source). Edges found in neither direction get NaN.
# Assigning the whole column at once avoids creating a float NaN column and then
# writing strings into it cell by cell (incompatible-dtype setting).
network['profile'] = [
    profile_by_edge.get((source, target), profile_by_edge.get((target, source), np.nan))
    for source, target in zip(network['Source'], network['Target'])
]

network.head()

Unnamed: 0,Source,Target,Weight,Avg_Streams,profile
0,hip hop,rap,850,20477020.0,solid
1,rap,trap,744,22316180.0,solid
2,pop rap,rap,743,19394010.0,solid
3,hip hop,trap,683,18930900.0,solid
4,hip hop,pop rap,668,18494350.0,solid


In [8]:
# Number of edges that did not receive a collaboration profile
network['profile'].isnull().sum()

0

In [9]:
# Load the per-genre (node) metrics table; the file is tab-separated
node_metrics_csv = f'{dataset_path}/reduced_genre_network/{market}/reduced/nodes-{year}.csv'
node_metrics = pd.read_csv(node_metrics_csv, sep='\t', encoding='utf-8')
node_metrics.head()

Unnamed: 0,Id,Label,degree,weighted degree,Eccentricity,closnesscentrality,harmonicclosnesscentrality,betweenesscentrality,Authority,Hub,modularity_class,Clustering Coefficient,pageranks,componentnumber,clustering,triangles,eigencentrality
0,hip hop,hip hop,50,3237,3,0.728155,0.826667,0.107151,0.245812,0.245811,1,0.302041,0.119936,0,0.302041,370,0.947084
1,rap,rap,46,3375,3,0.700935,0.8,0.092395,0.238849,0.238848,1,0.340097,0.123201,0,0.340097,352,0.918782
2,trap,trap,38,2786,3,0.652174,0.746667,0.033134,0.22381,0.223809,1,0.456615,0.09732,0,0.456615,321,0.859459
3,pop rap,pop rap,44,3004,3,0.688073,0.786667,0.060031,0.237055,0.237054,1,0.368922,0.108106,0,0.368922,349,0.911238
4,pop,pop,58,2289,2,0.815217,0.886667,0.270704,0.258786,0.258784,0,0.242589,0.102472,0,0.242589,401,1.0


In [10]:
# GENRE/NODE ATTRIBUTES kept for the analysis
# Degree
# Weighted degree
# Closeness
# Betweenness
# Authority
# Pagerank
# Clustering coefficient (read from the 'clustering' column)
# Note: the Hub column is not selected here.

kept_columns = ['Label', 'degree', 'weighted degree', 'closnesscentrality',
                'betweenesscentrality', 'Authority', 'pageranks', 'clustering']

# Short, consistently spelled names for the columns used downstream.
short_names = {'Label': 'genre',
               'closnesscentrality': 'closeness',
               'betweenesscentrality': 'betweenness',
               'Authority': 'authority',
               'pageranks': 'pagerank'}

node_metrics = node_metrics[kept_columns].rename(columns=short_names)

In [11]:
# Min / median / max of every numeric node metric.
# `numeric_only=True` is required because `node_metrics` still holds the string
# `genre` column at this point; without it, pandas >= 2.0 raises instead of
# silently skipping non-numeric columns.
node_metrics.quantile([0, 0.5, 1], numeric_only=True)

Unnamed: 0,degree,weighted degree,closeness,betweenness,authority,pagerank,clustering
0.0,1.0,1.0,0.328947,0.0,0.001401,0.002005,0.0
0.5,9.0,18.5,0.5,0.00029,0.074609,0.003654,0.824124
1.0,58.0,3375.0,0.815217,0.270704,0.258786,0.123201,1.0


In [12]:
# Putting node metrics into bins
attributes = ['degree', 'weighted degree', 'closeness', 'betweenness', 'authority', 'pagerank', 'clustering']

# Median split: values up to the 0.5 quantile are labelled 'low', the rest 'high'.
binned_columns = {
    att: pd.qcut(node_metrics[att], q=[0, 0.5, 1], labels=['low', 'high'])
    for att in attributes
}

# Replace the numeric columns by their binned versions and index the table by genre,
# so it can be joined onto the edge list by genre name.
node_metrics = node_metrics.assign(**binned_columns).set_index('genre')
node_metrics.head()

Unnamed: 0_level_0,degree,weighted degree,closeness,betweenness,authority,pagerank,clustering
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
hip hop,high,high,high,high,high,high,low
rap,high,high,high,high,high,high,low
trap,high,high,high,high,high,high,low
pop rap,high,high,high,high,high,high,low
pop,high,high,high,high,high,high,low


In [13]:
# Getting node metrics for node 1 (the Source genre of each edge)
# Columns are renamed *before* the join: 'weighted degree' becomes 'wdegree' and
# every metric gets the '_1' suffix. Doing it in this order means that executing
# this cell a second time raises on the overlapping column names instead of
# silently appending a duplicate set of '*_1' columns.
# (No 'hub' column exists in `node_metrics`, so no hub mapping is needed.)
source_metrics = node_metrics.rename(columns={'weighted degree': 'wdegree'})
source_metrics.columns = [f'{name}_1' for name in source_metrics.columns]

network = network.join(source_metrics, on='Source', how='left')
network.head()

Unnamed: 0,Source,Target,Weight,Avg_Streams,profile,degree_1,wdegree_1,closeness_1,betweenness_1,authority_1,pagerank_1,clustering_1
0,hip hop,rap,850,20477020.0,solid,high,high,high,high,high,high,low
1,rap,trap,744,22316180.0,solid,high,high,high,high,high,high,low
2,pop rap,rap,743,19394010.0,solid,high,high,high,high,high,high,low
3,hip hop,trap,683,18930900.0,solid,high,high,high,high,high,high,low
4,hip hop,pop rap,668,18494350.0,solid,high,high,high,high,high,high,low


In [14]:
# Getting node metrics for node 2 (the Target genre of each edge)
# Columns are renamed *before* the join: 'weighted degree' becomes 'wdegree' and
# every metric gets the '_2' suffix. Doing it in this order means that executing
# this cell a second time raises on the overlapping column names instead of
# silently appending a duplicate set of '*_2' columns.
# (No 'hub' column exists in `node_metrics`, so no hub mapping is needed.)
target_metrics = node_metrics.rename(columns={'weighted degree': 'wdegree'})
target_metrics.columns = [f'{name}_2' for name in target_metrics.columns]

network = network.join(target_metrics, on='Target', how='left')
network.head()

Unnamed: 0,Source,Target,Weight,Avg_Streams,profile,degree_1,wdegree_1,closeness_1,betweenness_1,authority_1,pagerank_1,clustering_1,degree_2,wdegree_2,closeness_2,betweenness_2,authority_2,pagerank_2,clustering_2
0,hip hop,rap,850,20477020.0,solid,high,high,high,high,high,high,low,high,high,high,high,high,high,low
1,rap,trap,744,22316180.0,solid,high,high,high,high,high,high,low,high,high,high,high,high,high,low
2,pop rap,rap,743,19394010.0,solid,high,high,high,high,high,high,low,high,high,high,high,high,high,low
3,hip hop,trap,683,18930900.0,solid,high,high,high,high,high,high,low,high,high,high,high,high,high,low
4,hip hop,pop rap,668,18494350.0,solid,high,high,high,high,high,high,low,high,high,high,high,high,high,low


In [15]:
# Running Subgroup Discovery

# Columns that must not become selectors: edge identifiers, the edge weight,
# the numeric target itself and the weighted-degree bins.
ignored_columns = ['Source', 'Target', 'Weight', 'Avg_Streams', 'wdegree_1', 'wdegree_2']

# Numeric target: subgroups are evaluated on the Avg_Streams column.
target = ps.NumericTarget('Avg_Streams')
searchspace = ps.create_selectors(network, ignore=ignored_columns)
quality_function = ps.StandardQFNumeric(a=1.0)

task = ps.SubgroupDiscoveryTask(
    network,
    target,
    searchspace,
    result_set_size=5,  # Number of subgroups returned
    depth=15,  # Maximum amount of attributes
    qf=quality_function,
)

result = ps.BeamSearch().execute(task)
result_df = result.to_dataframe()

In [16]:
# Keep only the subgroups whose description involves attributes of both nodes
# (at least one '_1' attribute and at least one '_2' attribute), then print them.
subgroups = [
    description
    for description in result_df['subgroup']
    if '_1' in description and '_2' in description
]

for description in subgroups:
    print(f'{description}')

closeness_2=='high' AND degree_1=='high'
authority_1=='high' AND closeness_2=='high' AND degree_1=='high'
authority_1=='high' AND closeness_2=='high'
closeness_1=='high' AND closeness_2=='high'
closeness_2=='high' AND degree_1=='high' AND degree_2=='high'


In [17]:
# Quality and descriptive statistics of the subgroups selected above
is_selected = result_df['subgroup'].isin(subgroups)
result_df.loc[is_selected]

Unnamed: 0,quality,subgroup,size_sg,size_dataset,mean_sg,mean_dataset,std_sg,std_dataset,median_sg,median_dataset,max_sg,max_dataset,min_sg,min_dataset,mean_lift,median_lift
0,1226480000.0,closeness_2=='high' AND degree_1=='high',381.0,542.0,20616040.0,17396930.0,16638310.0,17402950.0,19757140.0,13376893.5,99862408.0,156295891.0,1161968.0,1161968.0,1.185039,1.476961
1,1226480000.0,authority_1=='high' AND closeness_2=='high' AN...,381.0,542.0,20616040.0,17396930.0,16638310.0,17402950.0,19757140.0,13376893.5,99862408.0,156295891.0,1161968.0,1161968.0,1.185039,1.476961
2,1200762000.0,authority_1=='high' AND closeness_2=='high',385.0,542.0,20515800.0,17396930.0,16580540.0,17402950.0,19729950.0,13376893.5,99862408.0,156295891.0,1161968.0,1161968.0,1.179277,1.474928
3,1064290000.0,closeness_1=='high' AND closeness_2=='high',355.0,542.0,20394930.0,17396930.0,17293830.0,17402950.0,18971970.0,13376893.5,99862408.0,156295891.0,1161968.0,1161968.0,1.172329,1.418264
4,1063071000.0,closeness_2=='high' AND degree_1=='high' AND d...,374.0,542.0,20239370.0,17396930.0,16211560.0,17402950.0,19757140.0,13376893.5,99862408.0,156295891.0,1161968.0,1161968.0,1.163387,1.476961


In [18]:
# Checking each subgroup
# Edges covered by the subgroup closeness_2=='high' AND degree_1=='high',
# ordered from the heaviest / most streamed edge downwards.
in_subgroup = network['closeness_2'].eq('high') & network['degree_1'].eq('high')
df = network.loc[in_subgroup].sort_values(by=['Weight', 'Avg_Streams'], ascending=False)

# Distinct genres appearing at either end of the covered edges.
covered_genres = set(df['Source']).union(df['Target'])

print('Nodes:', len(covered_genres))
print('Edges:', len(df))

Nodes: 38
Edges: 381


In [19]:
# How many edges of the subgroup fall into each collaboration profile
df.loc[:, 'profile'].value_counts()

regular    338
solid       43
Name: profile, dtype: int64

In [20]:
# Display the edges of the subgroup (already sorted by Weight and Avg_Streams, descending)
df

Unnamed: 0,Source,Target,Weight,Avg_Streams,profile,degree_1,wdegree_1,closeness_1,betweenness_1,authority_1,pagerank_1,clustering_1,degree_2,wdegree_2,closeness_2,betweenness_2,authority_2,pagerank_2,clustering_2
0,hip hop,rap,850,2.047702e+07,solid,high,high,high,high,high,high,low,high,high,high,high,high,high,low
1,rap,trap,744,2.231618e+07,solid,high,high,high,high,high,high,low,high,high,high,high,high,high,low
2,pop rap,rap,743,1.939401e+07,solid,high,high,high,high,high,high,low,high,high,high,high,high,high,low
3,hip hop,trap,683,1.893090e+07,solid,high,high,high,high,high,high,low,high,high,high,high,high,high,low
4,hip hop,pop rap,668,1.849435e+07,solid,high,high,high,high,high,high,low,high,high,high,high,high,high,low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
401,dubstep,electropop,1,1.267522e+06,regular,high,high,high,low,high,low,high,high,high,high,high,high,high,low
374,dance,indie,1,1.174890e+06,regular,high,high,high,high,high,high,high,high,high,high,high,high,high,low
413,electro,new wave,1,1.161968e+06,regular,high,high,high,high,high,high,low,high,high,high,high,high,high,low
414,electro house,new wave,1,1.161968e+06,regular,high,high,high,high,high,high,low,high,high,high,high,high,high,low
