In [None]:
import json
from pathlib import Path
from collections import Counter, defaultdict
from itertools import combinations
import pandas as pd
import networkx as nx
import community as community_louvain

# --- Output location and the decade grid used for the per-decade exports ----
OUTPUT_DIR = Path('public/data')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
DECADES = [1960 + 10 * step for step in range(7)]   # 1960, 1970, ..., 2020

# --- Songs: one row per recording, with the earliest known release year -----
songs = pd.read_csv(
    'datadump/songs_no_dublicates.csv',
    usecols=['recording_mbid','first_release_year']
)
songs = songs.dropna(subset=['first_release_year'])
# Sorting by year first means keep='first' retains the earliest year per recording.
songs = songs.sort_values('first_release_year')
songs = songs.drop_duplicates('recording_mbid', keep='first')
# Floor each release year to the start of its decade (1987 -> 1980).
songs['decade'] = songs['first_release_year'].floordiv(10).mul(10)

# --- Writer credits: (recording, writer) pairs ------------------------------
writes = pd.read_csv(
    'datadump/writerships.csv',
    usecols=['recording_mbid','writer_id']
)

# --- Artist id -> display name lookup ----------------------------------------
artists_all = pd.read_csv(
    'datadump/artists_all.csv',
    usecols=['artist_mbid','name'],
    dtype=str
)
writer2name = artists_all.set_index('artist_mbid')['name'].to_dict()

# --- Assign every writer to the decade holding most of their credits --------
# Left merge: credits whose recording is missing from `songs` get a NaN decade
# and are dropped by the groupby below.
wr = writes.merge(
    songs[['recording_mbid','decade']],
    on='recording_mbid', how='left'
)
wcnt = wr.groupby(['writer_id','decade']).size().reset_index(name='count')
# Row label of the highest credit count per writer; idxmax returns the first
# occurrence when several decades tie.
idx = wcnt.groupby('writer_id')['count'].idxmax()
writer_decade_map = wcnt.loc[idx].set_index('writer_id')['decade'].to_dict()

def build_ww_graph_assigned_decade(decade_start):
    """Build the weighted co-writing graph for one decade.

    Only recordings whose ``first_release_year`` lies in
    ``[decade_start, decade_start + 9]`` are considered, and only writers
    whose entry in ``writer_decade_map`` equals ``decade_start`` take part.
    Two writers are linked when they are both credited on the same recording;
    the edge ``weight`` is the number of such shared recordings.

    Reads the module-level ``songs``, ``writes`` and ``writer_decade_map``.

    Parameters
    ----------
    decade_start : int
        First year of the decade (e.g. 1980).

    Returns
    -------
    networkx.Graph
        Undirected graph with a ``weight`` attribute on every edge. Writers
        without any qualifying co-writer do not appear as nodes.
    """
    decade_end = decade_start + 9

    released_in_decade = songs.first_release_year.between(decade_start, decade_end)
    decade_recordings = set(songs.loc[released_in_decade, 'recording_mbid'])
    decade_writers = {
        writer
        for writer, assigned_decade in writer_decade_map.items()
        if assigned_decade == decade_start
    }

    on_decade_recording = writes.recording_mbid.isin(decade_recordings)
    by_decade_writer = writes.writer_id.isin(decade_writers)
    credits = writes[on_decade_recording & by_decade_writer]

    # Sorting the writers gives every pair a canonical (low, high) key, so the
    # same two writers always hit the same counter entry. Recordings with
    # fewer than two writers yield no pairs and contribute nothing.
    pair_counts = Counter()
    for _, credit_rows in credits.groupby('recording_mbid'):
        cowriters = sorted(credit_rows['writer_id'].unique())
        pair_counts.update(combinations(cowriters, 2))

    graph = nx.Graph()
    graph.add_weighted_edges_from(
        (first, second, shared) for (first, second), shared in pair_counts.items()
    )
    return graph

# Louvain visits nodes in a random order; pinning the seed keeps community ids
# (and therefore the exported JSON) identical across Restart & Run All.
LOUVAIN_SEED = 42


def _display_name(wid):
    """Return the artist name for ``wid``, or ``wid`` itself when unusable.

    ``writer2name`` is built from a CSV read with ``dtype=str``, so a missing
    name is a float NaN. ``json.dump`` would write that as a bare ``NaN``
    token, which is not valid JSON, hence the fallback to the id.
    """
    name = writer2name.get(wid)
    return name if isinstance(name, str) else wid


# For every decade: build the co-writing graph, detect communities, print a
# short summary and export a {"nodes": [...], "links": [...]} JSON file.
for dec in DECADES:
    label = f"{dec}s"
    G = build_ww_graph_assigned_decade(dec)

    n_nodes = G.number_of_nodes()
    n_edges = G.number_of_edges()
    density = nx.density(G)

    # Community detection is skipped for edgeless graphs; every node then
    # falls back to community 0 in the export below.
    partition = {}
    if n_edges > 0:
        partition = community_louvain.best_partition(
            G, weight='weight', random_state=LOUVAIN_SEED
        )

    comm2nodes = defaultdict(list)
    for wid, cid in partition.items():
        comm2nodes[cid].append(wid)

    print(f"\n=== Writer–Writer {label} ===")
    print(f"Nodes: {n_nodes}, Edges: {n_edges}, Density: {density:.4f}")
    print(f"Communities: {len(comm2nodes)}")
    for cid, members in comm2nodes.items():
        # show first 10 names
        names = [ _display_name(w) for w in members ]
        sample = names if len(names) <= 10 else names[:10] + ['…']
        print(f" • Community {cid} ({len(names)} writers): {sample}")

    nodes = []
    for wid in G.nodes():
        nodes.append({
            "id":        wid,
            "name":      _display_name(wid),
            # weighted degree = total number of co-writing credits
            "value":     G.degree(wid, weight='weight'),
            "community": partition.get(wid, 0),
            "genre":     []   # placeholder
        })

    links = [
        { "source": u, "target": v, "value": d["weight"] }
        for u, v, d in G.edges(data=True)
    ]

    out = {"nodes": nodes, "links": links}
    fn = OUTPUT_DIR / f"writer-network-assigned-{label}.json"
    with open(fn, 'w', encoding='utf-8') as fp:
        json.dump(out, fp, indent=2)

    print(f"Wrote JSON → {fn} ({len(nodes)} nodes, {len(links)} links)")


1960s: 4851 noder  |   9476 kanter
1970s: 3443 noder  |   6682 kanter
1980s: 3047 noder  |   6615 kanter
1990s: 6174 noder  |  27335 kanter
2000s: 4700 noder  |  20965 kanter
2010s: 2584 noder  |  10215 kanter
2020s:  127 noder  |    194 kanter


In [None]:
import networkx as nx
import numpy as np
import pandas as pd
from collections import Counter

def gini(array):
    """Gini coefficient for inequality of degree distribution.

    Parameters
    ----------
    array : array-like of numbers
        Values to measure (e.g. node degrees). Multi-dimensional input is
        flattened so that every value counts once.

    Returns
    -------
    float
        0 when all values are equal, growing towards 1 as the total
        concentrates in a few entries. NaN when the input is empty or when
        the values sum to zero (the coefficient is undefined there).
    """
    values = np.sort(np.array(array, dtype=float).ravel())
    n = values.size
    if n == 0:
        return np.nan
    cum = np.cumsum(values)
    total = cum[-1]
    if total == 0:
        # Avoid the 0/0 division (and its RuntimeWarning) for all-zero input.
        return np.nan
    return (n + 1 - 2 * cum.sum() / total) / n

def safe_metric(func, *args, **kwargs):
    """Call ``func(*args, **kwargs)`` and return NaN when the metric is undefined.

    Catches ``ZeroDivisionError`` and any ``nx.NetworkXException``. The base
    class is used on purpose: it covers ``NetworkXError`` (e.g. disconnected
    graphs) as well as sibling exceptions such as ``NetworkXPointlessConcept``,
    which networkx raises for the null graph. Any other exception propagates.

    Parameters
    ----------
    func : callable
        The metric function to evaluate.
    *args, **kwargs
        Forwarded to ``func`` unchanged.

    Returns
    -------
    object
        Whatever ``func`` returns, or ``np.nan`` if one of the exceptions
        above was raised.
    """
    try:
        return func(*args, **kwargs)
    except (nx.NetworkXException, ZeroDivisionError):
        return np.nan

def decade_metrics(G, top_n=5):
    """
    Compute a dictionary of graph statistics that are easy to interpret
    in a social-science write-up.

    Parameters
    ----------
    G : networkx.Graph
        Undirected graph; edge attribute ``weight`` is used for the weighted
        degree in ``top_writers``.
    top_n : int, default 5
        Number of highest weighted-degree nodes listed in ``top_writers``.

    Returns
    -------
    dict
        Keys, in order: nodes, edges, avg_degree, median_degree, degree_gini,
        density, avg_clustering, assortativity_deg, avg_path_length, diameter,
        n_communities, modularity, top_writers. Metrics that are undefined for
        the given graph (empty or edgeless) are reported as NaN instead of
        raising.
    """
    n = G.number_of_nodes()
    m = G.number_of_edges()
    deg_w = dict(G.degree(weight="weight"))
    degrees = np.array([d for _, d in G.degree()])

    # Path-based metrics are computed on the largest connected component only;
    # they are undefined on a graph without nodes.
    if n > 0:
        GCC = max(nx.connected_components(G), key=len)
        sub = G.subgraph(GCC)
        avg_path_length = safe_metric(nx.average_shortest_path_length, sub)
        diameter = safe_metric(nx.diameter, sub)
    else:
        avg_path_length = np.nan
        diameter = np.nan

    # Community structure. Modularity divides by the total degree, so it is
    # undefined without edges; each node is then reported as its own community.
    # NOTE(review): greedy_modularity_communities is called without a weight
    # (unweighted), while modularity() uses its default weight="weight" —
    # confirm that this mix is intended.
    if m > 0:
        comms = list(nx.algorithms.community.greedy_modularity_communities(G))
        modularity = nx.algorithms.community.modularity(G, comms)
    else:
        comms = [{node} for node in G]
        modularity = np.nan

    top_writers = ", ".join(
        f"{wid}:{int(deg_w[wid])}"
        for wid, _ in Counter(deg_w).most_common(top_n)
    )

    return {
        "nodes"             : n,
        "edges"             : m,
        "avg_degree"        : degrees.mean()          if n else np.nan,
        "median_degree"     : np.median(degrees)      if n else np.nan,
        "degree_gini"       : gini(degrees),          # inequality measure
        "density"           : nx.density(G),
        "avg_clustering"    : safe_metric(nx.average_clustering, G, weight=None),
        # Degree correlation across edges is undefined when there are no edges.
        "assortativity_deg" : safe_metric(nx.degree_assortativity_coefficient, G) if m else np.nan,
        "avg_path_length"   : avg_path_length,
        "diameter"          : diameter,
        "n_communities"     : len(comms),
        "modularity"        : modularity,
        "top_writers"       : top_writers,
    }

# One metrics row per decade graph, each tagged with its decade key.
records = []
for decade, G in writer_graphs.items():
    rec = {**decade_metrics(G), "decade": decade}
    records.append(rec)

# Assemble the summary table: ordered by decade, indexed by decade, and
# rounded to three decimals for display and export.
metrics_df = pd.DataFrame(records)
metrics_df = metrics_df.sort_values("decade")
metrics_df = metrics_df.set_index("decade")
metrics_df = metrics_df.round(3)

display(metrics_df)
metrics_df.to_csv("decade_network_metrics_2.csv")

Unnamed: 0_level_0,nodes,edges,avg_degree,median_degree,degree_gini,density,avg_clustering,assortativity_deg,avg_path_length,diameter,n_communities,modularity,top_writers
decade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1960s,279,188,1.348,1.0,0.204,0.005,0.231,0.792,1.333,2,122,0.979,"50986f3f-f7f3-42bb-b14f-17e8c6bab222:19, 9b142..."
1970s,425,347,1.633,1.0,0.308,0.004,0.272,0.844,2.475,5,169,0.95,"227c88c7-ea23-49fd-8cc7-f737754ad26d:47, 4f079..."
1980s,497,453,1.823,1.0,0.306,0.004,0.406,0.811,1.5,2,186,0.952,"266cd37e-4198-4b33-a20b-37994a25b2d8:84, 2b09f..."
1990s,2455,3512,2.861,2.0,0.465,0.001,0.421,0.283,7.668,21,581,0.879,"7f347782-eb14-40c3-98e2-17b6e1bfe56c:3876, a94..."
2000s,4236,9590,4.528,2.0,0.55,0.001,0.511,0.022,5.564,16,597,0.812,"4d5447d7-c61c-4120-ba1b-d7f471d385b9:3826, ba5..."
2010s,5072,17092,6.74,3.0,0.583,0.001,0.615,-0.016,4.2,13,609,0.638,"6622a50e-5206-44d0-9ad5-96047a726dcf:2622, 186..."
2020s,2479,8259,6.663,4.0,0.519,0.003,0.71,-0.003,4.171,11,203,0.656,"9032140b-c6ba-448d-8f05-5ac16e3aa165:1015, 115..."
