In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import scipy.stats as stt
import random
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import os

In [2]:
def make_dummy_coded_df(df_nodes):
    """Efficient dummy encoding for a dataframe of categorical values."""
    df = df_nodes.copy()

    # Store all dummy columns in a dict first (avoids fragmentation)
    dummy_dict = {}

    for col in df.columns:
        values = df[col].unique()
        for value in values:
            name = f"{col}:{value}"
            dummy_dict[name] = (df[col] == value).astype(int)  # store as 0/1 ints

    # Concatenate once ‚Üí no fragmentation
    df_dummy = pd.concat(dummy_dict, axis=1)

    return df_dummy

def phi_(n11,n00,n10,n01):
    n1p = n11+n10
    n0p = n01+n00
    np1 = n01+n11
    np0 = n10+n00
    
    num = n11*n00-n10*n01
    den_ = n1p*n0p*np0*np1
    
    if den_==0:
        phi_=np.nan
    else:
        phi_ = num/np.sqrt(den_)
    return phi_

def p_val(r,L):
    den = np.sqrt(1-r**2)
    deg_free = L-2
    if den==0:
        p = 0
    else:
        num = r*np.sqrt(deg_free)
        t = num/den
        p = stt.t.sf(abs(t), df=deg_free)*2
    return p

def phi(x,y,get_p=False):
    
    m_eq = x==y
    m_diff = np.logical_not(m_eq)
    
    n11 = float(np.sum(x[m_eq]==True))
    n00 = float(np.sum(x[m_eq]==False))
    
    n10 = float(np.sum(x[m_diff]==True))
    n01 = float(np.sum(y[m_diff]==True))
    
    phi_val = phi_(n11,n00,n10,n01)
    
    if get_p:
        p = p_val(phi_val,len(x))
        return phi_val, p
    else:
        return phi_val


def make_graph_(df, list_of_nodes, alpha=0.05, get_p=True, remove_nan=False, remove_non_significant=False, exclude_same_question=True, print_=False):
    
    if get_p==False and remove_non_significant==True:
        print("Warning: Setting remove_non_significant to False as get_p is False!")
        remove_non_significant=False
    
    G = nx.Graph()
    
    count = 0
    # only select 30 nodes
    list_of_nodes = list_of_nodes # here you could select a subset of the nodes
    for i, node_i in enumerate(list_of_nodes):
        for j, node_j in enumerate(list_of_nodes):
            
            if j <= i: # do not run the same couple twice
                continue
            
            if print_:
                count += 1
                l = len(list_of_nodes)
                n_tot = l*(l-1)/2
                print(count,"/",n_tot, " = ", np.round(count/n_tot,decimals=2)*100, '%')
                
            basename1 = node_i.split(sep=':')[0]
            basename2 = node_j.split(sep=':')[0]
            
            if exclude_same_question:
                if basename1 == basename2: # if they belong to the same item
                    continue

            # Get the two columns
            c1 = df[node_i]
            c2 = df[node_j]
            
            if remove_nan:
                if ("Ref" in node_i) or ("Ref" in node_j):
                    continue
                
                c1_n = df[basename1+":nan"] # get the refused values of each item
                c2_n = df[basename2+":nan"]
                
                mask = np.logical_not(np.logical_or(c1_n, c2_n)) # get a mask of the refused values
                
                c1 = c1[mask] # select only the non-nan element
                c2 = c2[mask]
            
            if get_p:
                (r,p) = phi(c1,c2, get_p=True)
            else:
                r = phi(c1,c2, get_p=False)
            
            # Check if there are the conditions for drawing a node
            if remove_non_significant: 
                condition = r>0 and p<alpha
            else:
                condition = r>0

            if condition:
                G.add_weighted_edges_from([(node_i,node_j,r)],weight='weight')
                if get_p:
                    G.add_weighted_edges_from([(node_i,node_j,p)],weight='p')
                    sig = float(p<alpha) # Boolean are not accepted as edge weight
                    G.add_weighted_edges_from([(node_i,node_j,sig)],weight='sig')
    return G

def make_thermo_rep(df0):
    # 1Ô∏è‚É£  Detect the party/group column in df0
    party_candidates = [
        "member.group.short_label", "member.group.label", "member.group.code",
        "party_family", "party", "group_short", "group_label", "group_code",
    ]
    party_col = next((c for c in party_candidates if c in df0.columns), None)
    assert party_col is not None, "‚ùå No party/group column found in df0."

    # 2Ô∏è‚É£  Define ideology scores (‚àí1 = left ... +1 = right)
    party_axis = {
        # Far Left
        "GUE/NGL": -1.0,
        "GUE_NGL": -1.0,
        "Confederal Group of the European United Left - Nordic Green Left": -1.0,
        "The Left": -1.0,

        # Green Left
        "Greens/EFA": -0.8,
        "GREEN_EFA": -0.8,
        "Group of the Greens/European Free Alliance": -0.8,
        "Greens/European Free Alliance": -0.8,

        # Centre-Left
        "S&D": -0.6,
        "SD": -0.6,
        "Group of the Progressive Alliance of Socialists and Democrats in the European Parliament": -0.6,
        "Socialist Group in the European Parliament": -0.6,

        # Liberal / Centrist
        "Renew": -0.2,
        "RENEW": -0.2,
        "Renew Europe": -0.2,
        "Group of the Alliance of Liberals and Democrats for Europe": -0.2,
        "ALDE": -0.2,

        # Centre-Right (Christian Democrats)
        "EPP": 0.4,
        "European People‚Äôs Party": 0.4,
        "European People\u2019s Party": 0.4,
        "Group of the European People's Party (Christian Democrats)": 0.4,
        "Group of the European People's Party (Christian Democrats) and European Democrats": 0.4,
        "EPP-ED": 0.4,

        # Conservative / Right-wing
        "ECR": 0.7,
        "European Conservatives and Reformists": 0.7,
        "European Conservatives and Reformists Group": 0.7,

        # Right-wing populist / Nationalist
        "PFE": 0.9,
        "Patriots for Europe": 0.9,
        "ESN": 0.8,
        "Europe of Sovereign Nations": 0.8,

        # Far-right / Eurosceptic
        "ID": 1.0,
        "Identity & Democracy": 1.0,
        "Europe of Nations and Freedom Group": 1.0,
        "Europe of Freedom and Direct Democracy Group": 1.0,
        "Europe of freedom and democracy Group": 1.0,
        "Independence/Democracy Group": 1.0,
        "Union for Europe of the Nations Group": 0.8,

        # Non-attached
        "NI": 0.0,
        "Non-attached": 0.0,
        "Non-attached Members": 0.0,

        # Handle missing
        np.nan: 0.0
    }

    # 3Ô∏è‚É£  Align df0 with df_dummy (same MEP order)
    if df0.index.name == df_dummy.index.name and df0.index.isin(df_dummy.index).any():
        mep_party = df0[party_col]
    elif "mep_id" in df0.columns and df_dummy.index.name == "mep_id":
        mep_party = df0.set_index("mep_id")[party_col]
    else:
        mep_party = df0[party_col]

    common_ids = df_dummy.index.intersection(mep_party.index)
    X = df_dummy.loc[common_ids].fillna(False).astype(bool)
    parties = mep_party.loc[common_ids].astype(str)

    # 4Ô∏è‚É£  Build MEP-level heat vector (ideology score per MEP)
    mep_heat = parties.map(party_axis).astype(float)
    mep_heat = mep_heat.where(~mep_heat.isna(), 0.0)  # treat unknown as neutral (0.0)

    # 5Ô∏è‚É£  Define correlation helper (ignore NaNs)
    def corr_nan(x, y):
        """Compute Pearson r, ignoring NaN values."""
        x = np.asarray(x, dtype=float)
        y = np.asarray(y, dtype=float)
        mask = ~np.isnan(x) & ~np.isnan(y)
        if mask.sum() < 3:
            return (0.0, 1.0)
        r, p = stt.pearsonr(x[mask], y[mask])
        return (r, p)

    # 6Ô∏è‚É£  Compute correlation of each node‚Äôs endorsement pattern with ideology
    dic_r = {}
    type_ = "standard"  # "standard" for real r, "sign" for ¬±1 only

    for node in X.columns:
        col_node = X[node].astype(float)  # 0/1 per MEP
        r, p = corr_nan(col_node, mep_heat)
        dic_r[node] = np.sign(r) if type_ == "sign" else r

    # 7Ô∏è‚É£  Attach as node attribute in your network
    nx.set_node_attributes(G, dic_r, "ThermoRep_mean")

    # 8Ô∏è‚É£  Optional: inspect or save
    df_heat = pd.DataFrame.from_dict(dic_r, orient="index", columns=["ThermoRep"])
    #print(df_heat.head())

    # # # 9Ô∏è‚É£  Optional: visualize (red = left, blue = right)
    # import matplotlib.pyplot as plt
    # import matplotlib.colors as mcolors

    # norm = mcolors.Normalize(vmin=-1, vmax=1)
    # cmap = plt.cm.coolwarm
    # colors = [cmap(norm(df_heat.loc[n, "ThermoRep"])) if n in df_heat.index else (0.8, 0.8, 0.8) for n in G.nodes()]

    # plt.figure(figsize=(8, 8))
    # nx.draw(G, node_color=colors, node_size=40, with_labels=False)
    # plt.title("Left‚ÄìRight Heat (Red=Left, Blue=Right)")
    # plt.axis("off")
    # plt.show()

def get_x_y_coordinates(G):
    pos = nx.spring_layout(G,iterations=5000) # Get the positions with the spring layout

    # Restructure the data type
    pos2 = [[],[]]
    key_list = [] # ordered list of the nodes
    for key in pos:
        pos2[0].append(pos[key][0])
        pos2[1].append(pos[key][1])
        key_list.append(key)

    # Use PCA to rotate the network in such a way that the x-axis is the main one
    pos3 = []
    for key in pos:
        pos3.append([pos[key][0],pos[key][1]])

    pca = PCA(n_components=2)
    pca.fit(pos3)
    x_pca = pca.transform(pos3)

    # Get the x and y position of each node
    xx = x_pca[:,0]
    yy = x_pca[:,1]

    dict_term = nx.get_node_attributes(G,"ThermoRep_mean") # get the feeling thermo
    thermo = [dict_term[key] for key in key_list]
    stt.spearmanr(xx,thermo)
    return xx, yy

In [3]:
def build_party_networks(df0, df_dummy,
                         alpha=0.05,
                         get_p=True,
                         remove_non_significant=False,
                         remove_nan=False,
                         max_questions=200):
    """
    Builds and plots a network for EACH PARTY:
    - nodes = MEPs from that party
    - edges computed from dummy-coded votes (phi correlation)
    - node positions via spring_layout + PCA
    """

    # 1Ô∏è‚É£ Detect party column
    party_candidates = [
        "member.group.short_label", "member.group.label", "member.group.code",
        "party_family", "party", "group_short", "group_label", "group_code",
    ]
    party_col = next((c for c in party_candidates if c in df0.columns), None)
    assert party_col is not None, "‚ùå No party column found."

    # 2Ô∏è‚É£ Get list of parties
    parties = df0[party_col].dropna().unique()

    print(f"üé≠ Found {len(parties)} parties:", parties)

    # 3Ô∏è‚É£ Select vote items to reduce computation
    valid_suffixes = (":FOR", ":AGAINST", ":ABSTENTION")
    filtered_cols = [col for col in df_dummy.columns if col.endswith(valid_suffixes)]
    if len(filtered_cols) > max_questions:
        filtered_cols = random.sample(filtered_cols, max_questions)
    df_dummy = df_dummy[filtered_cols]

    # 4Ô∏è‚É£ Loop through parties
    for party in parties:
        print(f"\n=========================\nüü¶ Party: {party}\n=========================")

        # Select MEP IDs of this party
        party_ids = df0[df0[party_col] == party].index
        if len(party_ids) < 3:
            print("‚ö†Ô∏è Party too small, skipping.")
            continue

        # Restrict dummy votes to this party's MEPs
        df_dummy_party = df_dummy.loc[df_dummy.index.intersection(party_ids)]
        df0_party = df0.loc[df_dummy_party.index]

        print(f"‚Ä¢ MEPs in party: {len(df_dummy_party)}")

        # 5Ô∏è‚É£ Build the network for this party
        G = make_graph_(
            df=df_dummy_party,
            list_of_nodes=df_dummy_party.columns,
            alpha=alpha,
            get_p=get_p,
            remove_non_significant=remove_non_significant,
            remove_nan=remove_nan,
            exclude_same_question=False,
            print_=False
        )

        print("‚Ä¢ Graph built!")

        # 6Ô∏è‚É£ Compute ideological heat values for colouring
        #make_thermo_rep(df0_party)
        #print("‚Ä¢ Thermo completed!")

        # 7Ô∏è‚É£ Layout
        xx, yy = get_x_y_coordinates(G)

        # 8Ô∏è‚É£ Plot
        plt.figure(figsize=(9, 9))
        #thermo = nx.get_node_attributes(G, "ThermoRep_mean")
        #node_colors = [thermo.get(n, 0.0) for n in G.nodes()]

        plt.scatter(xx, yy, cmap="coolwarm", s=50, alpha=0.9)
        plt.title(f"Party Network ‚Äì {party}", fontsize=14, fontweight="bold")
        plt.axis("off")
        plt.tight_layout()
        plt.show()

        print(f"‚úîÔ∏è Party network plotted for {party}")

In [8]:
import pandas as pd
import numpy as np
import networkx as nx
import random
import matplotlib.pyplot as plt
import os

# YOUR SCHEMA
SCHEMA = {
    6: {"vote_id": "Vote ID", "policy": "main_policy_name", "member_id": "member.id", "country": "member.country.label"},
    7: {"vote_id": "Vote ID",      "policy": "De",               "member_id": "member.id", "country": "member.country.label"},
    8: {"vote_id": "Vote ID",      "policy": "De",               "member_id": "member.id", "country": "member.country.label"},
    9: {"vote_id": "id",           "policy": "policy_area",       "member_id": "member.id", "country": "member.country.code"},
    10:{"vote_id": "id",           "policy": "policy_area",       "member_id": "member.id", "country": "member.country.code"},
}


# ============================================================
# LOOP THROUGH ALL EPs
# ============================================================
for EP in [9,10]:

    print(f"\n\n============================")
    print(f"====== PROCESSING EP{EP} ======")
    print(f"============================")

    vote_id_col = SCHEMA[EP]["vote_id"]
    policy_col  = SCHEMA[EP]["policy"]
    member_col  = SCHEMA[EP]["member_id"]

    # -------------------------------
    # LOAD METADATA (topic/committee info)
    # -------------------------------
    meta_path = f"data/votewatch_csv/EP{EP}_Voted main docs.csv"
    if not os.path.exists(meta_path):
        print(f"‚ùå Missing metadata for EP{EP}, skipping.")
        continue

    meta = pd.read_csv(meta_path)

    # Normalize the policy/topic column
    if policy_col not in meta.columns:
        raise ValueError(f"‚ùå EP{EP}: policy column '{policy_col}' NOT FOUND in metadata.")

    meta[policy_col] = (
        meta[policy_col]
        .fillna("unknown")
        .astype(str)
        .str.lower()
        .str.strip()
    )

    # Fix vote id column
    if vote_id_col not in meta.columns:
        raise ValueError(f"‚ùå EP{EP}: vote ID column '{vote_id_col}' NOT FOUND in metadata.")

    meta[vote_id_col] = meta[vote_id_col].astype(str).str.replace(".0","", regex=False)
    id_to_topic = dict(zip(meta[vote_id_col], meta[policy_col]))

    topics = sorted(meta[policy_col].unique())
    print(f"Found {len(topics)} topics.")

    # -------------------------------
    # LOAD MEP √ó VOTE MATRIX
    # -------------------------------
    vote_matrix_path = f"data/all_votes_EP{EP}.csv"
    if not os.path.exists(vote_matrix_path):
        print(f"‚ùå Missing vote matrix for EP{EP}, skipping.")
        continue

    df_votes = pd.read_csv(vote_matrix_path, low_memory=False)

    # Identify vote columns
    vote_cols = [c for c in df_votes.columns if c.isdigit() or c.replace(".0","").isdigit()]
    vote_cols_clean = [c.replace(".0","") for c in vote_cols]

    # Rename to consistent ID format
    df_votes.rename(columns=dict(zip(vote_cols, vote_cols_clean)), inplace=True)

    print(f"Found {len(vote_cols_clean)} vote columns.")

    # -------------------------------
    # OUTPUT DIR
    # -------------------------------
    out_dir = f"topic_networks_EP{EP}"
    os.makedirs(out_dir, exist_ok=True)

    # -------------------------------
    # PER-TOPIC NETWORKS
    # -------------------------------
    for topic in topics:
        print(f"\n--- Topic: {topic} ---")

        # all vote IDs that correspond to this topic
        topic_votes = [vid for vid in vote_cols_clean if id_to_topic.get(vid) == topic]

        if len(topic_votes) == 0:
            print("No votes ‚Üí skip")
            continue

        # Subset vote matrix
        df_topic = df_votes[[member_col] + topic_votes].copy()

        # Drop MEPs with too many missing votes
        df_topic = df_topic.dropna(thresh=0.5*len(topic_votes), axis=0)
        if df_topic.shape[0] < 10:
            print("Too few valid MEPs ‚Üí skip")
            continue

        # dummy encode
        df_dummy = make_dummy_coded_df(df_topic[topic_votes])

        # Drop categories if too many
        cols = [c for c in df_dummy.columns if any(c.endswith(s) for s in [":FOR",":AGAINST",":ABSTENTION"])]
        if len(cols) > 200:
            cols = random.sample(cols, 200)
        df_dummy = df_dummy[cols]

        # Build graph
        G = make_graph_(
            df=df_dummy,
            list_of_nodes=df_dummy.columns,
            alpha=0.05,
            get_p=True,
            remove_non_significant=False,
            remove_nan=False,
            exclude_same_question=False,
            print_=False
        )

        print(f"Graph: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")

        # Save GEXF
        safe_topic = topic.replace(" ","_").replace("&","and").replace("/","_")
        gexf_path = f"{out_dir}/EP{EP}_{safe_topic}.gexf"
        nx.write_gexf(G, gexf_path)

        print(f"Saved: {gexf_path}")

    print(f"\n=== EP{EP} COMPLETE ===")

print("\n########### ALL EPs DONE ###########\n")



Found 24 topics.
Found 19272 vote columns.

--- Topic: agriculture and rural development ---
Graph: 114 nodes, 3107 edges
Saved: topic_networks_EP9/EP9_agriculture_and_rural_development.gexf

--- Topic: budgetary control ---
Graph: 200 nodes, 9620 edges
Saved: topic_networks_EP9/EP9_budgetary_control.gexf

--- Topic: budgets ---
Graph: 200 nodes, 9324 edges
Saved: topic_networks_EP9/EP9_budgets.gexf

--- Topic: civil liberties, justice and home affairs ---
Graph: 200 nodes, 9843 edges
Saved: topic_networks_EP9/EP9_civil_liberties,_justice_and_home_affairs.gexf

--- Topic: constitutional affairs ---
Graph: 134 nodes, 4277 edges
Saved: topic_networks_EP9/EP9_constitutional_affairs.gexf

--- Topic: culture and education ---
Graph: 90 nodes, 1969 edges
Saved: topic_networks_EP9/EP9_culture_and_education.gexf

--- Topic: development ---
Graph: 87 nodes, 1754 edges
Saved: topic_networks_EP9/EP9_development.gexf

--- Topic: economic and monetary affairs ---
Graph: 200 nodes, 9912 edges
Save

In [9]:
import glob
import networkx as nx
import matplotlib.pyplot as plt
import os

# ================================================================
# SETTINGS
# ================================================================
EPS = [7, 8, 9, 10]     # legislatures to process
BASE_DIR = "topic_networks_EP"  # folder structure

print("\n=== Starting rendering of all EP topic networks ===\n")

# ================================================================
# LOOP OVER EPS
# ================================================================
for EP_NUMBER in EPS:
    gexf_dir = f"{BASE_DIR}{EP_NUMBER}"
    
    if not os.path.exists(gexf_dir):
        print(f"Directory {gexf_dir} does not exist ‚Üí skipping EP{EP_NUMBER}.")
        continue
    
    gexf_files = sorted(glob.glob(f"{gexf_dir}/*.gexf"))
    print(f"\nEP{EP_NUMBER}: found {len(gexf_files)} GEXF files in {gexf_dir}")

    # ------------------------------------------------------------
    # PROCESS EACH FILE
    # ------------------------------------------------------------
    for gexf_path in gexf_files:
        print(f"  Loading: {gexf_path}")

        G = nx.read_gexf(gexf_path)

        # --- Layout (spring only ‚Äî no FA2) ---
        pos = nx.spring_layout(G, seed=42)

        # --- Node degree coloring ---
        degrees = dict(G.degree())
        node_colors = [degrees[n] for n in G.nodes]

        # --- Build figure ---
        plt.figure(figsize=(10, 10))
        nx.draw_networkx_edges(G, pos, alpha=0.15, width=0.6)

        nodes = nx.draw_networkx_nodes(
            G, pos,
            node_color=node_colors,
            cmap=plt.cm.viridis,
            node_size=40,
            alpha=0.9
        )

        # Extract topic name from filename
        fname = os.path.basename(gexf_path).replace(".gexf", "")
        fname_clean = fname.replace("_", " ")

        plt.title(f"EP{EP_NUMBER} ‚Äî {fname_clean}", fontsize=14)
        plt.axis("off")

        # Degree colorbar
        plt.colorbar(nodes, label="Node degree", shrink=0.7)

        plt.tight_layout()

        # Save PNG
        out_png = gexf_path.replace(".gexf", ".png")
        plt.savefig(out_png, dpi=200)
        print(f"  ‚Üí Saved plot: {out_png}")

        plt.close()

print("\n=== ALL DONE ===\n")


=== Starting rendering of all EP topic networks ===


EP7: found 23 GEXF files in topic_networks_EP7
  Loading: topic_networks_EP7/EP7_agriculture.gexf
  ‚Üí Saved plot: topic_networks_EP7/EP7_agriculture.png
  Loading: topic_networks_EP7/EP7_budget.gexf
  ‚Üí Saved plot: topic_networks_EP7/EP7_budget.png
  Loading: topic_networks_EP7/EP7_budgetary_control.gexf
  ‚Üí Saved plot: topic_networks_EP7/EP7_budgetary_control.png
  Loading: topic_networks_EP7/EP7_civil_liberties,_justice_and_home_affairs.gexf
  ‚Üí Saved plot: topic_networks_EP7/EP7_civil_liberties,_justice_and_home_affairs.png
  Loading: topic_networks_EP7/EP7_constitutional_and_inter-institutional_affairs.gexf
  ‚Üí Saved plot: topic_networks_EP7/EP7_constitutional_and_inter-institutional_affairs.png
  Loading: topic_networks_EP7/EP7_culture_and_education.gexf
  ‚Üí Saved plot: topic_networks_EP7/EP7_culture_and_education.png
  Loading: topic_networks_EP7/EP7_development.gexf
  ‚Üí Saved plot: topic_networks_EP7/EP7_develo

  plt.colorbar(nodes, label="Node degree", shrink=0.7)


  ‚Üí Saved plot: topic_networks_EP7/EP7_gender_equality.png
  Loading: topic_networks_EP7/EP7_industry,_research_and_energy.gexf
  ‚Üí Saved plot: topic_networks_EP7/EP7_industry,_research_and_energy.png
  Loading: topic_networks_EP7/EP7_internal_market_and_consumer_protection.gexf
  ‚Üí Saved plot: topic_networks_EP7/EP7_internal_market_and_consumer_protection.png
  Loading: topic_networks_EP7/EP7_internal_regulations_of_the_ep.gexf
  ‚Üí Saved plot: topic_networks_EP7/EP7_internal_regulations_of_the_ep.png
  Loading: topic_networks_EP7/EP7_international_trade.gexf
  ‚Üí Saved plot: topic_networks_EP7/EP7_international_trade.png
  Loading: topic_networks_EP7/EP7_juridical_affairs.gexf
  ‚Üí Saved plot: topic_networks_EP7/EP7_juridical_affairs.png
  Loading: topic_networks_EP7/EP7_legal_affairs.gexf
  ‚Üí Saved plot: topic_networks_EP7/EP7_legal_affairs.png
  Loading: topic_networks_EP7/EP7_petitions.gexf
  ‚Üí Saved plot: topic_networks_EP7/EP7_petitions.png
  Loading: topic_networks

In [10]:
import os
import glob
from PIL import Image

EPs = [6, 7, 8, 9, 10]

out_dir = "topic_networks_ALL_horizontal"
os.makedirs(out_dir, exist_ok=True)

topic_to_pngs = {}

# --------------------------------------------------
# Step 1 ‚Äî scan PNGs inside topic_networks_EP*
# --------------------------------------------------
for EP in EPs:
    folder = f"topic_networks_EP{EP}"
    if not os.path.exists(folder):
        print(f"‚ö†Ô∏è Folder {folder} missing ‚Üí skipping EP{EP}")
        continue

    png_files = glob.glob(f"{folder}/EP{EP}_*.png")

    for path in png_files:
        file = os.path.basename(path)

        topic = (
            file.replace(f"EP{EP}_", "")
                .replace(".png", "")
                .strip()
        )

        topic = topic.replace(" ", "_").replace("&","and")

        if topic not in topic_to_pngs:
            topic_to_pngs[topic] = []

        topic_to_pngs[topic].append((EP, path))

# --------------------------------------------------
# Step 2 ‚Äî merge horizontally for each topic
# --------------------------------------------------
for topic, ep_png_list in topic_to_pngs.items():
    ep_png_list = sorted(ep_png_list, key=lambda x: x[0])  # sort by EP number

    print(f"\nüìå Merging topic horizontally: {topic}")
    for ep, path in ep_png_list:
        print(f"   EP{ep}: {path}")

    # Load images
    images = [Image.open(path) for _, path in ep_png_list]

    # Ensure consistent height
    heights = [img.height for img in images]
    max_height = max(heights)
    resized_images = [
        img.resize((int(img.width * max_height / img.height), max_height))
        if img.height != max_height else img
        for img in images
    ]

    # Total width for horizontal concatenation
    total_width = sum(img.width for img in resized_images)

    # Create blank canvas
    merged = Image.new("RGB", (total_width, max_height), color=(255, 255, 255))

    # Paste side-by-side
    x_offset = 0
    for img in resized_images:
        merged.paste(img, (x_offset, 0))
        x_offset += img.width

    # Save
    out_path = f"{out_dir}/{topic}_ALL_EP6_EP10_horizontal.png"
    merged.save(out_path)

    print(f"   ‚úÖ Saved ‚Üí {out_path}")

print("\nüéâ All topic horizontal panels created!")


üìå Merging topic horizontally: economic_and_monetary_affairs
   EP7: topic_networks_EP7/EP7_economic_and_monetary_affairs.png
   EP8: topic_networks_EP8/EP8_economic_and_monetary_affairs.png
   EP9: topic_networks_EP9/EP9_economic_and_monetary_affairs.png
   EP10: topic_networks_EP10/EP10_economic_and_monetary_affairs.png
   ‚úÖ Saved ‚Üí topic_networks_ALL_horizontal/economic_and_monetary_affairs_ALL_EP6_EP10_horizontal.png

üìå Merging topic horizontally: international_trade
   EP7: topic_networks_EP7/EP7_international_trade.png
   EP8: topic_networks_EP8/EP8_international_trade.png
   EP9: topic_networks_EP9/EP9_international_trade.png
   EP10: topic_networks_EP10/EP10_international_trade.png
   ‚úÖ Saved ‚Üí topic_networks_ALL_horizontal/international_trade_ALL_EP6_EP10_horizontal.png

üìå Merging topic horizontally: gender_equality
   EP7: topic_networks_EP7/EP7_gender_equality.png
   EP8: topic_networks_EP8/EP8_gender_equality.png
   EP10: topic_networks_EP10/EP10_gender_equ

In [33]:
import os
from PIL import Image

for topic, ep_png_list in topic_to_pngs.items():
    for ep, path in ep_png_list:
        print("Checking:", path, "‚Üí exists:", os.path.exists(path), "size:", os.path.getsize(path) if os.path.exists(path) else None)
        try:
            Image.open(path)
        except Exception as e:
            print("‚ùå ERROR FOR:", path)
            print("   TYPE:", type(e))
            print("   MESSAGE:", e)

Checking: topic_networks_EP6/EP6_budgetary_control.png ‚Üí exists: True size: 1311784
Checking: topic_networks_EP7/EP7_budgetary_control.png ‚Üí exists: True size: 1927138
Checking: topic_networks_EP8/EP8_budgetary_control.png ‚Üí exists: True size: 1572919
Checking: topic_networks_EP9/EP9_budgetary_control.png ‚Üí exists: True size: 2221190
Checking: topic_networks_EP10/EP10_budgetary_control.png ‚Üí exists: True size: 1402638
Checking: topic_networks_EP6/EP6_gender_equality.png ‚Üí exists: True size: 1719672
Checking: topic_networks_EP7/EP7_gender_equality.png ‚Üí exists: True size: 1505597
Checking: topic_networks_EP8/EP8_gender_equality.png ‚Üí exists: True size: 1677942
Checking: topic_networks_EP9/EP9_gender_equality.png ‚Üí exists: True size: 1546544
Checking: topic_networks_EP10/EP10_gender_equality.png ‚Üí exists: True size: 125057
Checking: topic_networks_EP6/EP6_agriculture.png ‚Üí exists: True size: 2139011
Checking: topic_networks_EP7/EP7_agriculture.png ‚Üí exists: True s