In [1]:
########################
## A simple example of optimal alignment of lists, to maximize jaccard measure of agreement (between two fitted topic models)
##
## Author: Chris Meaney
## Date: December 2023
########################

In [2]:
## Dependencies
import gurobipy as gp
from gurobipy import GRB

import numpy as np
import pandas as pd

In [3]:
## Define jaccard loss function --- returns cardinality of intersection divided by cardinality of union of two arbitrary sets (a,b); despite the name, larger values mean greater agreement
def jaccard_loss(a, b):
    """Return the Jaccard coefficient |a & b| / |a | b| of two collections.

    Despite the name, this is a similarity score rather than a loss: it is
    1.0 when the two collections hold the same distinct elements and 0.0 when
    they share none. Duplicates within a collection are ignored because both
    inputs are converted to sets.

    Parameters
    ----------
    a, b : iterable of hashable
        The two collections to compare (e.g. the top words of two topics).

    Returns
    -------
    float
        Value in [0, 1]. Two empty collections score 0.0, since there is no
        evidence of agreement, instead of raising ZeroDivisionError.
    """
    set_a, set_b = set(a), set(b)
    union_size = len(set_a | set_b)
    if union_size == 0:
        # Both inputs are empty: the ratio is undefined, so report no overlap.
        return 0.0
    return len(set_a & set_b) / union_size

In [4]:
## Define a function to optimize semantic agreement over arbitrary lists of elements A and B
## The function takes as inputs two lists of lists: A=[[],[],[]] and B=[[],[],[]], for example arising from two topic models
## The function returns the optimal one-to-one alignment as a list of (A[i], B[j]) pairs, chosen so that the total (equivalently, average) jaccard coefficient over the matched pairs is maximized
def optimize_alignment(A, B):
    """Pair the lists in A with the lists in B so that total Jaccard overlap is maximal.

    The pairing is formulated as a linear assignment problem and solved with
    Gurobi as a binary program: x[i, j] = 1 means A[i] is matched with B[j],
    and the objective is the sum of jaccard_loss(A[i], B[j]) over the selected
    pairs.

    Parameters
    ----------
    A, B : list of list
        Two collections of element lists (e.g. the top words of each topic
        from two fitted topic models). They may differ in length.

    Returns
    -------
    list of tuple
        (A[i], B[j]) pairs ordered by i, then j. Every element of the shorter
        input is matched exactly once and every element of the longer input
        at most once, so min(len(A), len(B)) pairs are returned. An empty
        list is returned when either input is empty.

    Raises
    ------
    RuntimeError
        If the solver terminates without producing any solution.
    """
    n_a, n_b = len(A), len(B)
    if n_a == 0 or n_b == 0:
        # Nothing to match; avoid building (and licensing) a solver model.
        return []

    model = gp.Model("list_alignment")

    # Variables: one binary indicator per candidate (i, j) pairing
    alignment_vars = {}
    for i in range(n_a):
        for j in range(n_b):
            alignment_vars[i, j] = model.addVar(vtype=GRB.BINARY, name=f"x_{i}_{j}")

    # Objective function: total Jaccard agreement of the selected pairs
    obj_expr = gp.quicksum(
        alignment_vars[i, j] * jaccard_loss(A[i], B[j])
        for i in range(n_a)
        for j in range(n_b)
    )
    model.setObjective(obj_expr, GRB.MAXIMIZE)

    # Constraints: the shorter side must be fully matched (== 1) while the
    # longer side may leave elements unmatched (<= 1). With equal lengths both
    # families are equalities, i.e. a perfect matching. Forcing equality on
    # both sides with unequal lengths would make the model infeasible.
    for i in range(n_a):
        row_total = gp.quicksum(alignment_vars[i, j] for j in range(n_b))
        if n_a <= n_b:
            model.addConstr(row_total == 1)
        else:
            model.addConstr(row_total <= 1)

    for j in range(n_b):
        col_total = gp.quicksum(alignment_vars[i, j] for i in range(n_a))
        if n_b <= n_a:
            model.addConstr(col_total == 1)
        else:
            model.addConstr(col_total <= 1)

    # Solve the model
    model.optimize()

    if model.SolCount == 0:
        # Without a solution the variables have no X attribute to read.
        raise RuntimeError(
            f"Gurobi found no feasible alignment (status code {model.Status})"
        )

    # Extract the solution. Binary values are only integral up to the solver's
    # integrality tolerance (e.g. 0.9999999), so threshold at 0.5 rather than
    # testing for exact equality with 1.
    alignment = []
    for i in range(n_a):
        for j in range(n_b):
            if alignment_vars[i, j].X > 0.5:
                alignment.append((A[i], B[j]))

    return alignment

In [5]:
## Load the two topic tables (one CSV per fitted topic model) into pandas DataFrames
# NOTE: hard-coded absolute Windows directory; adjust before running on another machine.
file_dir = "C:\\Users\\ChristopherMeaney\\Desktop\\tmp\\pyGurobi_LinearAssignmentExample\\"

# file_dir already ends with a path separator, so the file names are appended directly
utopian_path = f"{file_dir}FINAL_UTOPIAN_TopicTable_NMF_K=50.csv"
emrpc_path = f"{file_dir}FINAL_EMRPC_TopicTable_NMF_K=50.csv"

utopian = pd.read_csv(filepath_or_buffer=utopian_path)
emrpc = pd.read_csv(filepath_or_buffer=emrpc_path)

In [6]:
## UTOPIAN topic table
# Show only the columns at positions 1-5 (word1..word5 in the output), skipping column 0
utopian.iloc[:,1:6]

Unnamed: 0,word1,word2,word3,word4,word5
0,tylenol,advil,tab,headache,tabs
1,mg,tab,tabs,capsules,po
2,fever,diarrhea,vomiting,tylenoladvil,viral
3,neck,head,arm,headache,headaches
4,bw,iron,tsh,ferritin,thyroid
5,work,social,stress,working,treatment
6,bp,systolic,diastolic,htn,norvasc
7,sleep,bed,sleeping,apnea,insomnia
8,anxiety,anxious,panic,social,counselling
9,flu,shot,anaphylactic,influenza,ibuprofen


In [7]:
## EMRPC topic table
# Show only the columns at positions 1-5 (word1..word5 in the output), skipping column 0
emrpc.iloc[:,1:6]

Unnamed: 0,word1,word2,word3,word4,word5
0,pain,chronic,tylenol,gabapentin,percocet
1,ct,scan,head,wife,ultrasound
2,inr,dosage,mg,thx,wife
3,hip,tylenol,xray,oa,physio
4,insulin,dm,lantus,diabetes,fbs
5,flu,diet,exercise,ldl,medications
6,mg,tablet,qhs,tablets,hydromorphone
7,surgery,hospital,surgeon,eye,discharge
8,er,feeling,hospital,discharge,admitted
9,urine,uti,macrobid,culture,neg


In [8]:
## Turn each topic table into a list of lists: one inner list per table row, holding that row's values from columns 1-5
utopian_list, emrpc_list = (
    topic_table.iloc[:, 1:6].to_numpy().tolist()
    for topic_table in (utopian, emrpc)
)

# Number of topics (rows) in each table
[len(utopian_list), len(emrpc_list)]

[50, 50]

In [9]:
# Optimize alignment between two topical summary matrices
# Gurobi writes its solver log to the cell output while the model is solved.
# alignment_result is a list of (utopian_topic, emrpc_topic) pairs, each element being a list of words.
alignment_result = optimize_alignment(utopian_list, emrpc_list)

Set parameter Username
Academic license - for non-commercial use only - expires 2024-03-28
Gurobi Optimizer version 10.0.1 build v10.0.1rc0 (win64)

CPU model: Intel(R) Core(TM) i5-1035G1 CPU @ 1.00GHz, instruction set [SSE2|AVX|AVX2|AVX512]
Thread count: 4 physical cores, 8 logical processors, using up to 8 threads

Optimize a model with 100 rows, 2500 columns and 5000 nonzeros
Model fingerprint: 0x39902ce0
Variable types: 0 continuous, 2500 integer (2500 binary)
Coefficient statistics:
  Matrix range     [1e+00, 1e+00]
  Objective range  [1e-01, 1e+00]
  Bounds range     [1e+00, 1e+00]
  RHS range        [1e+00, 1e+00]
Found heuristic solution: objective 0.8730159
Presolve time: 0.04s
Presolved: 100 rows, 2500 columns, 5000 nonzeros
Variable types: 0 continuous, 2500 integer (2500 binary)

Root relaxation: objective 1.249206e+01, 134 iterations, 0.00 seconds (0.00 work units)

    Nodes    |    Current Node    |     Objective Bounds      |     Work
 Expl Unexpl |  Obj  Depth IntInf |

In [10]:
##
## Return the aligned lists as a pandas DataFrame
##
# Collect the columns under new names so that utopian_list / emrpc_list (the
# solver inputs created in the list-conversion cell) are not overwritten;
# re-running the alignment cell after this one still sees the original lists.
utopian_aligned = [a for a, _ in alignment_result]
emrpc_aligned = [b for _, b in alignment_result]

# Score each pair with jaccard_loss so the reported value is the same
# quantity the solver maximized, rather than a second copy of the formula.
jaccard_list = [jaccard_loss(a, b) for a, b in alignment_result]

res_df = pd.DataFrame({
    'emrpc': emrpc_aligned,
    'utopian': utopian_aligned,
    'jaccard': jaccard_list
})

# Most similar topic pairs first
res_df_ = res_df.sort_values('jaccard', ascending=False)
res_df_

Unnamed: 0,emrpc,utopian,jaccard
28,"[foot, toe, feet, ankle, swelling]","[foot, swelling, ankle, toe, feet]",1.0
40,"[knee, oa, swelling, xray, medial]","[knee, swelling, oa, joint, medial]",0.666667
45,"[hip, tylenol, xray, oa, physio]","[hip, xray, oa, physio, flexion]",0.666667
37,"[back, physio, spine, legs, lumbar]","[back, spine, lumbar, flexion, physio]",0.666667
12,"[ear, ears, hearing, rt, wax]","[ear, hearing, ears, wax, cerumen]",0.666667
10,"[wt, weight, lbs, kg, loss]","[weight, kg, bmi, height, lbs]",0.428571
32,"[flu, diet, exercise, ldl, medications]","[exercise, diet, ldl, screening, cancer]",0.428571
33,"[mg, tablet, qhs, tablets, hydromorphone]","[tablets, tablet, medications, oral, mg]",0.428571
39,"[chest, clear, sob, heart, edema]","[chest, sob, cvs, edema, palpitations]",0.428571
49,"[abdo, diarrhea, soft, stool, fever]","[abdo, diarrhea, stool, bm, masses]",0.428571


In [11]:
##
## Jaccard coefficient of the two pooled vocabularies (all topic words per model, ignoring topic alignment)
##
# Flatten each table's columns 1-5 into a single 1-D array of words
utopian_words = utopian.iloc[:, 1:6].to_numpy().flatten()
emrpc_words = emrpc.iloc[:, 1:6].to_numpy().flatten()

# Distinct UTOPIAN words, built once and reused for both counts
utopian_vocab = set(utopian_words)

numerator = len(utopian_vocab.intersection(emrpc_words))
denominator = len(utopian_vocab.union(emrpc_words))
numerator/denominator

0.3356401384083045

In [12]:
## And compare above Jaccard metric versus that from average of optimal aligned topics
# Mean of the per-pair Jaccard scores over all aligned topic pairs (the sort order of res_df_ does not affect the mean)
res_df_.jaccard.mean()

0.24984126984126984

In [13]:
##########################
## Write solution to disk
##########################

In [14]:
# NOTE: hard-coded absolute Windows path; adjust before running on another machine.
out_path = "C:\\Users\\ChristopherMeaney\\Desktop\\tmp\\pyGurobi_LinearAssignmentExample\\MILP_Solver_JaccardObjective.csv"
# index=False omits the DataFrame index, so only the emrpc, utopian and jaccard columns are written
res_df_.to_csv(path_or_buf=out_path, sep=',', index=False)

In [15]:
#######################################
## Properties of notebook
#######################################

In [16]:
## Notebook last run on following date
# Prints the current local date (YYYY-MM-DD) as a provenance stamp for the outputs above
from datetime import date
print(date.today())

2023-12-27


In [17]:
## Session information
# session_info is a third-party package; if it is missing, install it once
# (prefer `%pip install session_info`, which targets this kernel's environment)
# !pip install session_info
import session_info
# Presumably reports the versions of the packages loaded in this session; see the session_info docs
session_info.show()