# In this file we start by creating MEP RESIN networks for Gephi visualizations

In [None]:
import numpy as np
import pandas as pd

# -------------------------------------------------------------
# SIMILARITY MATRIX (1, -1, -0.25, 0.25 RULES)
# -------------------------------------------------------------

def similarity_matrix(vote_df):
    """Compute the pairwise vote-similarity matrix between the rows of ``vote_df``.

    Every column whose label consists only of digits is treated as one vote.
    For each pair of rows, every vote on which both rows hold a FOR / AGAINST /
    ABSTENTION value contributes a score:

    * FOR-FOR or AGAINST-AGAINST      -> +1
    * FOR-AGAINST                     -> -1
    * FOR/AGAINST versus ABSTENTION   -> -0.25
    * ABSTENTION-ABSTENTION           -> +0.25

    Votes where either row is marked as not having voted are ignored for that
    pair. The summed score is divided by the number of contributing votes and
    rescaled from [-1, 1] to [0, 1].

    Parameters
    ----------
    vote_df : pandas.DataFrame
        One row per voter; vote columns hold the strings listed in ``mapping``.

    Returns
    -------
    numpy.ndarray
        Symmetric ``(n_rows, n_rows)`` array with values in [0, 1] and a
        diagonal of 1.

    Notes
    -----
    A pair with no contributing votes gets a normalized score of 0 and thus a
    final similarity of 0.5 (not NaN).
    """
    # Vote columns are the ones with an all-digit label. str() keeps this
    # working when the labels are ints instead of strings.
    vote_cols = [c for c in vote_df.columns if str(c).isdigit()]

    mapping = {
        "FOR": 1,
        "AGAINST": -1,
        "ABSTENTION": 0,
        "DID_NOT_VOTE": np.nan,
        "DID_NO": np.nan,
        "DID_NOT_VOTE ": np.nan
    }

    # Convert the vote labels to numeric codes (NaN = did not vote).
    V = vote_df[vote_cols].replace(mapping).to_numpy(dtype=float)

    # Indicator matrices; comparisons with NaN are False, so non-votes drop out.
    yes = (V == 1).astype(float)
    no = (V == -1).astype(float)
    abst = (V == 0).astype(float)

    # Each matrix product is computed once; the mirrored term is its transpose.
    agree = yes @ yes.T + no @ no.T            # YES-YES or NO-NO   -> +1
    yes_no = yes @ no.T
    disagree = yes_no + yes_no.T               # YES-NO / NO-YES    -> -1
    yes_abst = yes @ abst.T
    no_abst = no @ abst.T
    mixed = yes_abst + yes_abst.T + no_abst + no_abst.T   # vs. abstention -> -0.25
    abst_pairs = abst @ abst.T                 # ABST-ABST          -> +0.25

    S = agree - disagree - 0.25 * mixed + 0.25 * abst_pairs
    counts = agree + disagree + mixed + abst_pairs

    # Normalize by the number of contributing votes; pairs without any stay 0.
    S = np.divide(S, counts, out=np.zeros_like(S), where=counts > 0)

    # Convert [-1,1] → [0,1]
    A = 0.5 * S + 0.5

    # diagonal = 1
    np.fill_diagonal(A, 1)

    return A


# -------------------------------------------------------------
# LOAD DATA AND BUILD FULL CSV WITH METADATA + SIM MATRIX
# -------------------------------------------------------------
import os

ep = 6
df = pd.read_csv(f"data/all_votes_main_EP{ep}.csv")

# Sanity check: the similarity columns below are named after 'member.id', so
# report any ID that occurs on more than one row.
id_counts = df["member.id"].value_counts()
non_unique_ids = id_counts[id_counts > 1]
if not non_unique_ids.empty:
    print("Non-unique member IDs found:")
    print(non_unique_ids)
else:
    print("All member IDs are unique.")


# compute similarity matrix (one row/column per row of df, in df order)
A = similarity_matrix(df)

# vector of MEP IDs, used to name columns
mep_ids = df["member.id"].astype(str).tolist()

# vote columns are the ones whose name is purely numeric
vote_cols = [c for c in df.columns if c.isdigit()]

meta_df = pd.DataFrame({
    "member.id": df["member.id"],
    #"first_name": df["member.first_name"],
    #"last_name": df["member.last_name"],
    "party": df["member.group.short_label"],
    # share of votes in which the MEP actually cast FOR / AGAINST / ABSTENTION
    "percentage_votes": df[vote_cols].isin(["FOR", "AGAINST", "ABSTENTION"]).sum(axis=1) / len(vote_cols)
})

# Build similarity dataframe with columns named by MEP ID. Reusing df.index
# keeps the rows aligned with meta_df when the two frames are concatenated.
sim_df = pd.DataFrame(A, index=df.index, columns=[f"sim_with_{mid}" for mid in mep_ids])

# Combine metadata + similarity matrix
final_df = pd.concat([meta_df, sim_df], axis=1)

# to_csv does not create missing directories, so make sure the target exists
os.makedirs("similarity_matrices", exist_ok=True)

# Save CSV with float formatting
final_df.to_csv(f"similarity_matrices/sim_mat_EP{ep}.csv", index=False, float_format="%.4f")

print(f"Saved: sim_mat_EP{ep}.csv")
print(final_df.head())
print(final_df.shape)

All member IDs are unique.
Saved: sim_mat_EP6.csv
   member.id                                              party   
0      28469  Confederal Group of the European United Left -...  \
1      28302                       Independence/Democracy Group   
2      28975  Confederal Group of the European United Left -...   
3      28367  Group of the European People's Party (Christia...   
4      28512                               Non-attached Members   

   percentage_votes  sim_with_28469  sim_with_28302  sim_with_28975   
0          0.736632        1.000000        0.604452        0.848684  \
1          0.067039        0.604452        1.000000        0.589552   
2          0.786911        0.848684        0.589552        1.000000   
3          0.829210        0.653894        0.441964        0.714910   
4          0.824421        0.411918        0.664773        0.406753   

   sim_with_28367  sim_with_28512  sim_with_28246  sim_with_2107  ...   
0        0.653894        0.411918        0.6507

In [18]:
import pandas as pd
import networkx as nx
import numpy as np
import re
import glob
import os


# -----------------------------
# Extract ID number from column
# -----------------------------
def extract_id(col_name):
    """Return the numeric ID embedded in a ``sim_with_<id>`` column name.

    "sim_with_840" → 840. When the remainder is not made up of digits only
    (e.g. "sim_with_840.1"), it is parsed as a float instead of an int.
    """
    remainder = col_name.replace("sim_with_", "")
    return int(remainder) if remainder.isdigit() else float(remainder)


# -----------------------------
# Build graph from CSV file
# -----------------------------
def build_mep_network_from_csv(csv_path, sim_threshold=0.0, min_participation=0.9):
    """Build an undirected, weighted similarity graph from a similarity CSV.

    The CSV is expected to contain the columns ``member.id``, ``party``,
    ``percentage_votes`` and one ``sim_with_<id>`` column per MEP.

    Parameters
    ----------
    csv_path : str
        Path of the similarity CSV to read.
    sim_threshold : float, default 0.0
        Pairs whose similarity is below this value get no edge.
    min_participation : float, default 0.9
        Only MEPs whose ``percentage_votes`` is strictly greater than this
        value are kept in the graph.

    Returns
    -------
    networkx.Graph
        Nodes are integer member IDs with ``party`` and ``percentage_votes``
        attributes; edges carry the similarity as ``weight``.
    """
    df = pd.read_csv(csv_path)
    # only keep sufficiently active MEPs
    df = df[df["percentage_votes"] > min_participation]

    # all similarity columns and the MEP ID each one refers to
    sim_cols = [c for c in df.columns if c.startswith("sim_with_")]
    col_ids = [extract_id(c) for c in sim_cols]

    G = nx.Graph()

    # ---- Add nodes with attributes ----
    for _, row in df.iterrows():
        G.add_node(
            int(row["member.id"]),
            party=row["party"],
            percentage_votes=row["percentage_votes"]
        )

    # The row filter above does not remove the matching columns. Restrict the
    # columns to retained MEPs; otherwise add_edge would silently re-create
    # filtered-out MEPs as nodes without attributes.
    retained = [(col, mep_j) for col, mep_j in zip(sim_cols, col_ids) if mep_j in G]

    # ---- Add edges from similarity values ----
    for _, row in df.iterrows():
        mep_i = int(row["member.id"])

        for col, mep_j in retained:
            if mep_i == mep_j:
                continue  # skip self-loops

            sim_val = float(row[col])

            # skip missing values and pairs below the threshold
            if np.isnan(sim_val) or sim_val < sim_threshold:
                continue

            # undirected graph: ensure each edge only once
            if not G.has_edge(mep_i, mep_j):
                G.add_edge(mep_i, mep_j, weight=sim_val)

    return G


# -----------------------------
# Batch process all your CSVs
# -----------------------------
def process_all_similarity_csvs(folder="similarity_matrices/", threshold=0.2):
    """Convert every similarity CSV in ``folder`` into a GEXF network file.

    For each ``*.csv`` file a graph is built with
    ``build_mep_network_from_csv`` using ``threshold`` as the similarity
    cutoff, and written next to the CSV as
    ``<name>_network_thresh<threshold>.gexf``.

    Parameters
    ----------
    folder : str, default "similarity_matrices/"
        Directory that is searched (non-recursively) for CSV files.
    threshold : float, default 0.2
        Similarity cutoff passed on as ``sim_threshold``.
    """
    # sorted() gives a deterministic processing order
    for csv_path in sorted(glob.glob(os.path.join(folder, "*.csv"))):
        print("Processing:", csv_path)

        G = build_mep_network_from_csv(csv_path, sim_threshold=threshold)

        # Swap only the file extension. A plain str.replace(".csv", ...) would
        # also rewrite ".csv" inside directory names, and would leave the path
        # unchanged (overwriting the input) for an upper-case ".CSV" match.
        stem, _ = os.path.splitext(csv_path)
        out_path = f"{stem}_network_thresh{threshold}.gexf"

        nx.write_gexf(G, out_path)
        print("→ Saved:", out_path)


# -----------------------------
# RUN
# -----------------------------
# Example: build the network for one similarity CSV and export it as GEXF
ep = 9
filename = f"similarity_matrices/sim_mat_EP{ep}.csv"
G = build_mep_network_from_csv(csv_path=filename, sim_threshold=0.5)
nx.write_gexf(G, f"MEP_RESIN_EP{ep}.gexf")
