# This Jupyter notebook provides sample data and code to calculate the Interdisciplinary Research (IDR) index using an organizational approach, as described in the paper titled "Identifying Interdisciplinary Research in Research Projects."

In [67]:
import numpy as np
import builtins
from scipy.optimize import linprog

## Sample data

In [81]:
# Sample setting: a Research Information System (RIS) that classifies research
# documents into five disciplines. M holds the pairwise distances between
# those disciplines (symmetric, zero on the diagonal).
M = np.array([
    [0.0, 0.9, 0.7, 0.2, 0.8],
    [0.9, 0.0, 0.8, 0.1, 0.3],
    [0.7, 0.8, 0.0, 0.6, 0.5],
    [0.2, 0.1, 0.6, 0.0, 0.9],
    [0.8, 0.3, 0.5, 0.9, 0.0],
])

# A researcher is a vector of discipline weights; the weights should sum to 1.
p1 = np.array([0.3, 0.4, 0.3, 0.0, 0.0])
p2 = np.array([0.0, 0.0, 0.0, 0.5, 0.5])
p3 = np.array([0.0, 0.0, 1.0, 0.0, 0.0])
p4 = np.array([0.0, 1.0, 0.0, 0.0, 0.0])
p5 = np.array([0.2, 0.2, 0.2, 0.4, 0.0])
p6 = np.array([0.0, 0.0, 0.0, 0.6, 0.4])

# A project involves a list of researchers: R1 and R2 are the teams of the
# first and the second project.
R1 = [p1, p2, p3]
R2 = [p4, p5, p6]

# A project is described by a discipline distribution as well.
# NOTE: from here on the names p1 and p2 are rebound to the two *projects*;
# R1 keeps referring to the researcher arrays created above.
p1 = np.array([0.5, 0.5, 0.0, 0.0, 0.0])
p2 = np.array([0.5, 0.0, 0.5, 0.0, 0.0])

### Define procedures to calculate wasserstein distance

In [69]:
# Procedures to calculate the Wasserstein distance between two researchers

# Transform sparse arrays p,q and M to dense arrays by deleting entries (disciplines) which are zero for both p and q.
def discipline_compressor(p, q, M):
    """
    Drop every discipline that has zero weight in both p and q.
    :param p first discipline distribution
    :param q second discipline distribution
    :param M distance matrix between disciplines
    :return (p_dense, q_dense, M_dense): new arrays without the unused
            disciplines; the inputs are left untouched
    """
    # Positions at which both weight vectors are exactly zero.
    unused = np.flatnonzero((np.asarray(p) == 0) & (np.asarray(q) == 0))

    p_dense = np.delete(p, unused)
    q_dense = np.delete(q, unused)
    # Remove the matching rows first, then the matching columns.
    M_dense = np.delete(np.delete(M, unused, axis=0), unused, axis=1)
    return p_dense, q_dense, M_dense


# Make constraints for the linear programming
def constraint_maker(n):
    """
    Build the equality-constraint matrix of the transport problem for an
    n x n plan that is flattened row by row.
    :param n number of disciplines
    :return array of shape (2 * n - 1, n ** 2)
    """
    # Row i sums the i-th row of the plan: flattened entries i*n .. i*n + n - 1.
    row_sums = np.kron(np.eye(n), np.ones((1, n)))
    # Row i sums the i-th column of the plan: flattened entries i, n + i, 2n + i, ...
    column_sums = np.kron(np.ones((1, n)), np.eye(n))
    stacked = np.vstack((row_sums, column_sums))
    # The first row is redundant; dropping it gives a matrix of full row rank.
    return np.delete(stacked, 0, 0)


# calculate the wasserstein distance between person p and q given distance matrix M
def wasserstein(p, q, M):
    """
    Wasserstein (optimal transport) distance between the discipline
    distributions p and q, using M as the ground distance between disciplines.
    :param p discipline distribution of the first person
    :param q discipline distribution of the second person
    :param M distance matrix between disciplines
    :return minimal total transport cost
    :raises RuntimeError if the linear program cannot be solved
    """
    # Disciplines unused by both p and q only enlarge the linear program.
    p_dense, q_dense, M_dense = discipline_compressor(p, q, M)

    # Decision variables: the transport plan flattened row by row; its cost
    # coefficients are the flattened distances.
    obj = M_dense.flatten()
    lhs_eq = constraint_maker(len(p_dense))
    # constraint_maker drops its first row, so drop the first right-hand side too.
    rhs_eq = np.append(p_dense, q_dense)[1:]

    # "revised simplex" was removed from SciPy in 1.11; "highs" is the
    # supported solver and returns the same optimal value.
    opt = linprog(c=obj,
                  A_eq=lhs_eq, b_eq=rhs_eq,
                  method="highs")
    if not opt.success:
        # Do not hand back a meaningless objective value from a failed solve.
        raise RuntimeError("Wasserstein linear program failed: " + str(opt.message))
    return opt.fun

### Define procedures to calculate diversity

In [76]:
def unique_person_project(R):
    """
    Pair every distinct researcher profile of a project with the number of
    times it occurs.
    :param R list of researchers (discipline distributions)
    :return list of (profile, count) tuples; the distinct profiles are the
            rows returned by np.unique(R, axis=0)
    """
    # A single researcher is returned as the original object.
    if len(R) == 1:
        return [(R[0], 1)]

    def occurrences(profile):
        # Count the researchers whose weights all equal those of `profile`.
        return sum(1 for member in R if builtins.all(profile == member))

    return [(profile, occurrences(profile)) for profile in np.unique(R, axis=0)]

# Define procedure to calculate diversity of researchers R
def diversity_calculation(R, M):
    """
    Diversity of a group of researchers: the sum, over every pair of distinct
    researcher profiles, of the product of their relative frequencies and the
    Wasserstein distance between them.
    :param R list of researchers
    :param M distance matrix
    :return diversity score
    """
    head_count = len(R)
    # Distinct profiles paired with their share of the whole group.
    shares = [(profile, count / head_count)
              for profile, count in unique_person_project(R)]

    score = 0
    for position, (profile_a, share_a) in enumerate(shares[:-1]):
        # Only pair with later profiles so that every pair is counted once.
        partial = 0
        for profile_b, share_b in shares[position + 1:]:
            partial += share_b * wasserstein(profile_a, profile_b, M)
        score += share_a * partial

    return score

### Define procedures to calculate relevancy weight

In [86]:
# Define a procedure to calculate the relevancy weights of researchers
def relevancy_weight(R, p, M):
    """
    Relevancy weight of every researcher with respect to a project.
    The weight is 1 minus the smallest distance between a discipline used by
    the researcher and a discipline used by the project (the smallest
    distance is capped at 1).
    :param R list of distribution of disciplines of researchers
    :param p distribution of disciplines of project
    :param M distance matrix
    :return a list of values, one relevancy weight per researcher
    """
    # Disciplines the project actually uses; identical for every researcher.
    project_fields = [j for j in range(len(p)) if p[j] != 0]

    weights = []
    for researcher in R:
        researcher_fields = [i for i, share in enumerate(researcher) if share != 0]
        # Leading 1 is the cap that also applies when there is no pair at all.
        candidates = [1] + [M[i][j] for i in researcher_fields for j in project_fields]
        weights.append(1 - min(candidates))
    return weights

### calculate diversity of researchers R1

In [82]:
# Diversity score of the researcher list R1 under the distance matrix M (the value is shown as the cell output).
diversity_calculation(R1,M)

0.15222222222222223

### calculate diversity of researchers R2

In [83]:
# Diversity score of the researcher list R2 under the distance matrix M (the value is shown as the cell output).
diversity_calculation(R2,M)

0.08444444444444445

### calculate relevancy weights of researchers R1 and project p1

In [87]:
# One relevancy weight per researcher in R1; p1 here is the project distribution it was rebound to in the sample data, not a researcher.
relevancy_weight(R1,p1,M)

[1.0, 0.9, 0.30000000000000004]

### calculate relevancy weights of researchers R2 and project p2

In [88]:
# One relevancy weight per researcher in R2; p2 here is the project distribution it was rebound to in the sample data, not a researcher.
relevancy_weight(R2,p2,M)

[0.19999999999999996, 1.0, 0.8]