# Preparation

### Import packages

In [151]:
import pandas as pd
import numpy as np
import os
import random
from scipy.optimize import linprog
from ast import literal_eval

from datetime import datetime
from dateutil.relativedelta import relativedelta

from itertools import combinations 




### Discipline codes

In [1]:
# VODS discipline codes (level 2, four characters each).
# The list holds 42 entries, which matches the vector length 42 that is
# hard-coded elsewhere in this notebook (e.g. add_vectors_with_weights,
# make_aggregation). The order of this list fixes the order of the components
# produced by make_discipline_vector.
discipline_codes=['0101',
'0102',
'0103',
'0104',
'0105',
'0106',
'0107',
'0199',
'0201',
'0202',
'0203',
'0204',
'0205',
'0206',
'0207',
'0208',
'0299',
'0301',
'0302',
'0303',
'0304',
'0305',
'0306',
'0399',
'0401',
'0402',
'0499',
'0501',
'0502',
'0503',
'0504',
'0505',
'0506',
'0507',
'0508',
'0599',
'0601',
'0602',
'0603',
'0604',
'0699',
'0700']


### Dataframe preparing functions

In [153]:
def from_np_array(array_string):
    """Parse the text form of a NumPy array (e.g. '[0.  0.5 1. ]') into an array.

    The whitespace-separated tokens are re-joined with commas so that the
    string becomes a valid Python list literal, which is then evaluated
    safely with ``literal_eval``. Used as a ``converters`` callback for
    ``pd.read_csv``.
    """
    # Drop the single space that may follow an opening bracket, so that the
    # bracket stays attached to the first number after splitting.
    tokens = array_string.replace('[ ', '[').split()
    list_literal = ','.join(tokens)
    return np.array(literal_eval(list_literal))


def bring_to_level_2(list_of_disc):
    """Truncate every discipline code in the list to its first four characters.

    This maps (level 4) discipline codes to their level-2 code. The order and
    any duplicates of the input are kept.
    """
    return [code[:4] for code in list_of_disc]

def unique(list1):
    """Return the distinct elements of ``list1`` in order of first appearance.

    Membership is tested against a list (not a set), so the elements do not
    need to be hashable.
    """
    seen = []
    for item in list1:
        if item in seen:
            # already collected, skip the duplicate
            continue
        seen.append(item)
    return seen

def get_related_level2_disciplines(df_0, entity_df, entity_string):
    """Add the level-2 disciplines of related entities to ``df_0`` (in place).

    A new column 'Level-2 disciplines ' + entity_string is written. For each
    row it holds one list per entity ID found in ``df_0[entity_string]``: the
    'Level-2 disciplines' value of the first row of ``entity_df`` whose 'ID'
    matches, or an empty list when no row matches.

    Only this non-deduplicated column is added; nothing is returned.

    Example: (df_0, entity_df, entity_string) = (persons, organisations, 'Organisations')
    """
    collected = []
    for row in range(len(df_0)):
        per_entity = []
        for entity_id in df_0[entity_string][row]:
            # look the entity up once and reuse the selection
            matches = entity_df[entity_df['ID'] == entity_id]['Level-2 disciplines']
            if len(matches) == 0:
                per_entity.append([])
            else:
                per_entity.append(matches.iloc[0])
        collected.append(per_entity)
    df_0['Level-2 disciplines '+ entity_string] = collected

def difference_in_months(datetime_end, datetime_start):
    """Return the number of months between two dates, rounded on the day part.

    The difference is taken with ``relativedelta``; its years and months are
    converted to months, and a remaining day count above 14 adds one month.
    """
    delta = relativedelta(datetime_end, datetime_start)
    # more than two weeks left over counts as a full extra month
    rounding_month = 1 if delta.days > 14 else 0
    return delta.years * 12 + delta.months + rounding_month

def string_to_datetime(date_str):
    """Convert a 'YYYY-MM-DD' string (optionally containing 'Z') to a ``date``.

    Every 'Z' character is removed before the string is parsed.
    """
    cleaned = date_str.replace('Z', '') if 'Z' in date_str else date_str
    parsed = datetime.strptime(cleaned, '%Y-%m-%d')
    return parsed.date()

def disciplines_to_weighted_aggregate_vector(list_of_lists, discipline_codes, size_and_duration = None):
    """Aggregate several discipline-code lists into one normalized vector.

    Each inner list (the disciplines of one organisation, project or
    publication) is turned into a discipline vector with
    ``make_discipline_vector``. The vectors are then combined with
    ``add_vectors_with_weights``.

    When ``size_and_duration`` is None every vector gets weight 1. Otherwise
    ``size_and_duration[i]`` is read as (size, duration) for the i-th list and
    the weight is duration / size.
    """
    weighted_vectors = []
    for idx, codes in enumerate(list_of_lists):
        vector = make_discipline_vector(codes, discipline_codes)
        if size_and_duration is None:
            weight = 1
        else:
            entry = size_and_duration[idx]
            size, duration = entry[0], entry[1]
            weight = duration/size
        weighted_vectors.append((vector, weight))
    return add_vectors_with_weights(weighted_vectors)

def make_discipline_vector(level2_list, discipline_codes):
    """Count the codes of ``level2_list`` per discipline and normalize to sum 1.

    ``level2_list`` is a list of (possibly repeated) codes. The returned
    np.array has one component per entry of ``discipline_codes``, in that
    order. When no code of ``discipline_codes`` occurs, the unnormalized
    all-zero count vector is returned.
    """
    counts = np.array([level2_list.count(code) for code in discipline_codes])
    total = sum(counts)
    if total > 0:
        return counts/total
    return counts

def add_vectors_with_weights(list_of_vectors_and_weights, size=42):
    """Return the normalized weighted sum of (vector, weight) pairs.

    The sum starts from a zero vector of length ``size``. If the components
    of the weighted sum add up to a positive number, the sum is divided by
    that number; otherwise it is returned as is.
    """
    accumulated = np.zeros(size)
    for vec, w in list_of_vectors_and_weights:
        accumulated = accumulated + vec*w

    mass = sum(accumulated)
    return accumulated/mass if mass > 0 else accumulated

def normalize(array):
    """Scale ``array`` so that its components sum to exactly 1.

    An array whose sum is 0 is returned unchanged. Otherwise the array is
    divided by its sum. If floating-point rounding leaves the sum different
    from 1, the excess is subtracted from one randomly chosen non-zero
    component.
    """
    total = array.sum()
    if total == 0:
        return array

    scaled = array/total
    if scaled.sum() == 1:
        return scaled

    # absorb the rounding error in a single, randomly picked non-zero entry
    pick = random.choice(np.nonzero(scaled)[0])
    scaled[pick] = scaled[pick]-(scaled.sum()-1)
    return scaled
        
def remove_bookcomponent(list_of_pubs):
    """Return the publication IDs with every 'book-component:' substring removed."""
    return [pub_id.replace('book-component:', '') for pub_id in list_of_pubs]

### Wasserstein functions

In [154]:
def discipline_compressor(p,q,M):
    """Drop the disciplines that are zero in both ``p`` and ``q``.

    Transforms the sparse arrays p, q and M into dense ones: every index i
    with p[i] == 0 and q[i] == 0 is removed from p and q, and the matching
    row and column are removed from M. The inputs are not modified.

    Returns the tuple (p_dense, q_dense, M_dense).
    """
    shared_zeros = [idx for idx in range(len(p)) if p[idx]==0 and q[idx]==0]

    if not shared_zeros:
        # nothing to remove: hand back plain copies
        return p.copy(), q.copy(), M.copy()

    p_dense = np.delete(p, shared_zeros)
    q_dense = np.delete(q, shared_zeros)
    without_columns = np.delete(M, shared_zeros, 1)
    M_dense = np.delete(without_columns, shared_zeros, 0)

    return p_dense,q_dense,M_dense

def constraint_maker(n):
    """Build the equality-constraint matrix for an n x n transport plan.

    The plan is flattened row by row into n**2 variables. Row k of the upper
    half selects the k-th row of the plan (its sum), row k of the lower half
    selects the k-th column. The first row is removed because it is redundant;
    the remaining (2n - 1) x n**2 matrix has full row rank.
    """
    marginals = np.zeros((2*n, n**2))

    for k in range(n):
        marginals[k, k*n:(k+1)*n] = 1   # k-th row of the plan
        marginals[n+k, k::n] = 1        # k-th column of the plan

    # delete the first (redundant) row to get full row rank
    return np.delete(marginals, 0, 0)

def wasserstein(p,q,M):
    """Return the optimal-transport cost between ``p`` and ``q`` under cost ``M``.

    p, q: discipline vectors of equal length.
    M:    square matrix of pairwise distances between disciplines.

    Disciplines that are zero in both vectors are removed first
    (``discipline_compressor``). The remaining problem is solved as a linear
    program: minimize sum(M_ij * x_ij) over x >= 0 (linprog's default bounds),
    with the row sums of x equal to p and the column sums equal to q. The
    first constraint is dropped because it is redundant (see
    ``constraint_maker``), hence the ``[1:]`` on the right-hand side.

    Returns ``opt.fun`` as reported by ``scipy.optimize.linprog``; the solver
    status is not checked.
    """
    p_dense,q_dense,M_dense=discipline_compressor(p,q,M)

    cost = M_dense.flatten()

    lhs_eq = constraint_maker(len(p_dense))
    rhs_eq = np.append(p_dense,q_dense)[1:]

    # The legacy "revised simplex" method was removed from linprog in
    # SciPy 1.11; "highs" (available since SciPy 1.6) is its replacement.
    opt = linprog(c=cost,
                  A_eq=lhs_eq, b_eq=rhs_eq,
                  method="highs")
    return opt.fun


### Load and prepare data

In [155]:
# load distance matrix M (between disciplines)
# NOTE: absolute, machine-specific path; adapt it when running elsewhere.
path = r'C:\Users\lucp11051\Documents\Jupyter_notebooks\2_Discipline_distances/'

# Two (count_type, data) pairs are assigned in a row; the second pair
# overwrites the first, so only 'disc_in_a_person' / 'pub_2014' takes effect.
count_type = 'citation_flow'
data = 'matrix15'
count_type = 'disc_in_a_person'
data = 'pub_2014'

# File name: 'cosine_distance_matrix_' + count_type + data + '.txt'
M = np.loadtxt(path + 'cosine_distance_matrix_' + count_type + data+ '.txt')

In [156]:
# Load the researcher vectors from '<number><string>vectors_with_alphas.csv'
# in the sibling folder '1_Researcher_disciplines'.
number = '0'
string = ''
cur_path = os.getcwd()
filename = number + string  + 'vectors_with_alphas.csv'
new_path = os.path.relpath('..\\1_Researcher_disciplines/'+filename, cur_path)
# The array-valued columns are stored as text in the CSV; from_np_array turns
# each of them back into a np.array while reading.
vectors = pd.read_csv(new_path,converters={'Profile': from_np_array, 'Organisations':from_np_array, 
                                                'Projects': from_np_array, 'Coauthors Projects':from_np_array,
                                                'Publications': from_np_array, 'Coauthors Publications':from_np_array, 'Alphas':from_np_array})
vectors

Unnamed: 0,ID,Profile,Organisations,Projects,Coauthors Projects,Publications,Coauthors Publications,Alphas
0,e011bd37-a5a1-46af-bf93-cf0a36b66ab5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.42424121, 0.42424121, 0.0478696, 0.00778755..."
1,8834112c-33f1-4ee6-8a34-9c09a7b71535,"[0.0, 0.0, 0.0, 0.05, 0.0, 0.25, 0.0, 0.0, 0.0...","[0.0, 0.0, 0.0, 0.33333333, 0.0, 0.33333333, 0...","[0.0, 0.00096712, 0.0, 0.00151976, 0.0, 0.3341...","[0.02592593, 0.01481481, 0.0, 0.02066667, 0.0,...","[0.0, 0.0, 0.01904762, 0.06666667, 0.0, 0.1047...","[0.00081169, 0.00550219, 0.00505051, 0.0232654...","[0.29744163, 0.29744163, 0.14181181, 0.1487208..."
2,b73e0e47-ad11-4ce7-a065-2993e6c3b602,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.03125, 0.0, 0.32291667, 0.03125, ...","[0.03571429, 0.0, 0.0, 0.03571429, 0.32142856,...","[0.0, 0.0, 0.00694444, 0.00529101, 0.37314815,...","[0.0, 0.0, 0.0, 0.0, 0.01851852, 0.0, 0.055555...","[0.00344828, 0.02413793, 0.0, 0.00689655, 0.04...","[0.0, 0.31572506, 0.2685494, 0.21379904, 0.051..."
3,f19619e2-340d-47ef-9792-35d2436febc4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.01666667, 0.0, 0.0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.017...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.29592924, 0.29592924, 0.14796462, 0.1479646..."
4,45230f30-5ba1-4ba1-a305-1307392c1cc1,"[0.0, 0.0, 0.0, 0.0, 0.41666667, 0.0, 0.0, 0.0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.13684211, 0.03508772, 0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.19166667, 0.0...","[0.0, 0.0, 0.0, 0.0, 0.03076923, 0.0, 0.111538...","[0.03385417, 0.0, 0.0, 0.0, 0.16062128, 0.0489...","[0.32856229, 0.32856229, 0.11891404, 0.0974250..."
...,...,...,...,...,...,...,...,...
59,6c4f337a-6fac-41fc-9224-dc0ccbdeeebb,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.66666667, 0.0, 0.0...","[0.0, 0.0, 0.0, 0.03576657, 0.0, 0.46760667, 0...","[0.004536, 0.05308192, 0.0132722, 0.05049871, ...","[0.0, 0.0, 0.0, 0.03639847, 0.0, 0.57088123, 0...","[0.0044011544, 0.0352775726, 0.000505050505, 0...","[0.30678675, 0.30678675, 0.13892685, 0.1533933..."
60,4b74615b-d701-4448-a191-cb19436ee4cb,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.016...","[0.0, 0.0, 0.04444444, 0.0, 0.0, 0.0, 0.0, 0.0...","[0.0, 0.0, 0.16909722, 0.19826389, 0.015625, 0...","[0.00064935, 0.0010101, 0.01818182, 0.0, 0.000...","[0.30921342, 0.30921342, 0.14852896, 0.1403189..."
61,b6fd60a4-c095-43ed-9743-67cd091db492,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.10984848, 0.0, 0.0...","[0.0, 0.000123915737, 0.000247831475, 0.0, 0.0...","[0.29717907, 0.23432379, 0.14858954, 0.1485895..."
62,37b7164d-6bd4-470b-98d5-33b349dfd2b5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.18333333, 0.0, 0.0...","[0.0, 0.000333333333, 0.000666666667, 0.0, 0.0...","[0.33895976, 0.23399294, 0.12275748, 0.1227574..."


In [197]:
# Load the prepared projects from '<number>projects_0_prepared.csv' in the
# sibling folder '0_Preparing_dataframes' (number is set in an earlier cell).
cur_path = os.getcwd()
filename =number+ 'projects_0_prepared.csv'
new_path = os.path.relpath('..\\0_Preparing_dataframes/'+filename, cur_path)
projects_0 = pd.read_csv(new_path)

# The list-valued columns are stored as text; literal_eval parses them back.
# Persons, Organisations and Publications are additionally de-duplicated with
# unique(); publication IDs first lose their 'book-component:' part.
projects_0['Disciplines'] = projects_0['Disciplines'].apply(lambda x: literal_eval(x))
projects_0['Persons'] = projects_0['Persons'].apply(lambda x: unique(literal_eval(x)))
projects_0['Organisations'] = projects_0['Organisations'].apply(lambda x: unique(literal_eval(x)))
projects_0['Publications'] = projects_0['Publications'].apply(lambda x: unique(remove_bookcomponent(literal_eval(x))))
projects_0['Level-2 disciplines']=projects_0['Level-2 disciplines'].apply(lambda x: literal_eval(x))
projects_0['Alias'] = projects_0['Alias'].apply(lambda x: literal_eval(x))
# 'Profile': normalized discipline vector built from the project's own
# level-2 disciplines (a single list, so it gets weight 1).
projects_0['Profile']=projects_0['Level-2 disciplines'].apply(lambda x: disciplines_to_weighted_aggregate_vector([x],discipline_codes))

In [198]:
# add the project array column
def make_aggregation(vectors_df, alphas_list=None):
    """Add an 'Aggregation' column: the alpha-weighted sum of each row's vectors.

    For row i the weights are ``alphas_list`` when given, otherwise the row's
    own 'Alphas' value. The j-th weight multiplies the vector stored in column
    position j + 1 of the row. The sum (length 42) is passed through
    ``normalize``. The DataFrame is modified in place and also returned.
    """
    aggregated = []
    for row in range(len(vectors_df)):
        if alphas_list is None:
            alphas = vectors_df.loc[row, 'Alphas']
        else:
            alphas = alphas_list

        total = np.zeros(42)
        for pos, alpha in enumerate(alphas):
            # column 0 is skipped: the vectors start at column position 1
            total = total + alpha*np.array(vectors_df.iloc[row, pos+1])

        aggregated.append(normalize(total))
    vectors_df['Aggregation'] = aggregated
    return vectors_df
    
        
def make_project_array(projects_df, vectors_df):
    """Add a 'Project array' column listing the vectors of each project's persons.

    For every person ID in a project's 'Persons' list, the 'Aggregation'
    value of the first row of ``vectors_df`` with that 'ID' is collected. If
    the ID is unknown, a message is printed and an empty list is used
    instead. The DataFrame is modified in place and also returned.
    """
    project_arrays = []
    for row in range(len(projects_df)):
        member_vectors = []
        for person_id in projects_df['Persons'][row]:
            matches = vectors_df[vectors_df['ID'] == person_id]['Aggregation']
            if len(matches) == 0:
                member_vector = []
                print('Person ID ' + str(person_id) +' cannot be found.')
            else:
                member_vector = matches.iloc[0]
            member_vectors.append(member_vector)
        project_arrays.append(member_vectors)
    projects_df['Project array'] = project_arrays
    return projects_df
    
# Aggregate each researcher's vectors using the row's own 'Alphas', then
# attach the resulting person vectors to every project.
vectors = make_aggregation(vectors)
projects_0 = make_project_array(projects_0,vectors)

In [199]:
# Fake people: hand-made discipline vectors of length 42 used as test cases.
p1 = np.zeros(42)          # all mass on component 0
p1[0]=1
p2 = np.zeros(42)          # identical to p1
p2[0]=1
p3 = np.zeros(42)          # split evenly over components 0 and 1
p3[0]=0.5
p3[1]=0.5
p4 = np.zeros(42)          # almost the same as p3
p4[0]=0.45
p4[1]=0.55
p5 = np.zeros(42)          # all mass on component 30
p5[30]=1
p6 = np.zeros(42)          # split evenly over components 20 and 21
p6[20]=0.5
p6[21]=0.5
p7=np.full(42,1/42)        # uniform over all 42 components
p8 = np.zeros(42)          # all mass on component 1
p8[1]=1

# Fake projects: [ID, Name, Alias, Project array, Size, Profile]
P1 = ['P1','math', [], np.array([p1]),1,p1]
P2 = ['P2','math+math', [], np.array([p1,p2]),2,p1]
P3 = ['P3','math + cs/math', [], np.array([p1,p3]),2,p3]
P4 = ['P4','2 very similar', [], np.array([p3,p4]),2,p3]
P5 = ['P5','diverse with 1 math', [], np.array([p1,p5,p6]),3,p1]
P6 = ['P6','diverse with 2 math', [], np.array([p1,p2,p5,p6]),4,p1]
P7 = ['P7','diverse with 2 math+1poly', [], np.array([p1,p2,p5,p6,p7]),5,p1]
P8 = ['P8','1 polyvalent', [], np.array([p7]),1,p3]
P9 = ['P9','math + distant?', [], np.array([p1,p6]),2,p7]
P10 = ['P10','math + close?', [], np.array([p1,p8]),2,p7]
fake_projects=[P1, P2, P3, P4, P5, P6, P7, P8, P9, P10]
fake_projects_df=pd.DataFrame(fake_projects,
                   columns=['ID', 'Name', 'Alias', 'Project array', 'Size', 'Profile'])

# Add the fake projects below the real ones.
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result
# (columns missing from the fake projects are filled with NaN).
# NOTE: re-running this cell appends the fake projects again.
projects_0=pd.concat([projects_0, fake_projects_df], ignore_index=True)


fake_projects_df

Unnamed: 0,ID,Name,Alias,Project array,Size,Profile
0,P1,math,[],"[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",1,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,P2,math+math,[],"[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",2,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,P3,math + cs/math,[],"[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",2,"[0.5, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,P4,2 very similar,[],"[[0.5, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",2,"[0.5, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,P5,diverse with 1 math,[],"[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",3,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
5,P6,diverse with 2 math,[],"[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",4,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
6,P7,diverse with 2 math+1poly,[],"[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",5,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
7,P8,1 polyvalent,[],"[[0.023809523809523808, 0.023809523809523808, ...",1,"[0.5, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
8,P9,math + distant?,[],"[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",2,"[0.023809523809523808, 0.023809523809523808, 0..."
9,P9,math + close?,[],"[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",2,"[0.023809523809523808, 0.023809523809523808, 0..."


In [200]:
projects_0

Unnamed: 0,ID,Name,Start,End,Disciplines,Persons,Organisations,Publications,Alias,when_added,Duration,Level-2 disciplines,Size,Profile,Project array
0,fdde0090-f6ed-4938-91a6-13d370125cde,ICP Interuniversity Programme Physical land re...,2015-09-22,2016-09-30,"[04010303, 01070405, 01050313]",[b73e0e47-ad11-4ce7-a065-2993e6c3b602],"[2b317cdb-3a20-47d7-b2bd-75aa36eb9734, f28fd86...",[],[],0.0,12.0,"[0401, 0107, 0105]",1,"[0.0, 0.0, 0.0, 0.0, 0.3333333333333333, 0.0, ...","[[0.01011045692737233, 0.0036358358151722683, ..."
1,0f80da13-5ac3-45c0-a0df-c2bd93051dbe,ERC Professorship GlycoTarget,2014-03-01,2019-02-28,"[010608, 030130, 030116, 030617, 030630, 03012...",[8834112c-33f1-4ee6-8a34-9c09a7b71535],[adbfa1d6-6a4b-4abf-8dc4-ea71c647af26],[],[],0.0,60.0,"[0106, 0301, 0301, 0306, 0306, 0301, 0106, 030...",1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.21428571428571427,...","[[0.0039038559556490676, 0.0026666803951537893..."
2,0e185133-3ddf-4f8a-81d6-2ddc76a49cb6,EU Promotion of Democratic Governance via Func...,2016-07-01,2017-01-31,"[050604, 050602, 050699, 050607, 050606, 05060...","[f19619e2-340d-47ef-9792-35d2436febc4, e011bd3...",[45ff2fc2-f78b-489c-ae38-b6fca2022521],[],[],0.0,7.0,"[0506, 0506, 0506, 0506, 0506, 0506, 0506, 050...",2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.00246607746851462..."
3,fc7f8873-326f-41fe-9c75-08ea05d8058b,Somite development in 3D in vitro synthetic mi...,2018-01-01,2018-12-31,[010699],[05e00835-520a-4d3b-8e53-fbe03c1e43c7],[b9b3c61e-650e-4463-a0e2-6de8f49917ba],[],[],0.0,12.0,[0106],1,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","[[0.0, 0.0, 0.006527271051736335, 0.0045532735..."
4,9be600ee-e306-4fac-8c4b-0e5134b9ff28,Housing for Refugee Inclusion exploring inclus...,2020-01-01,2023-12-31,"[02011199, 05040413, 05040702, 05060199, 06040...",[3de615bf-4167-4fa5-a4d3-0dac8248798a],[3420d399-29f4-4edf-9f9d-252e7b1d400d],[],[],0.0,48.0,"[0201, 0504, 0504, 0506, 0604]",1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2, ...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.49..."
5,ee843e8f-9c18-4c14-bfee-641e18fbd8e0,Lost tributaries of the Scheldt reconstructing...,2019-01-01,2019-12-31,"[06010119, 06010124, 06010108, 06010116]","[45230f30-5ba1-4ba1-a305-1307392c1cc1, acb40a5...",[eda251f4-4796-4ac5-a6de-0e58a70f58aa],[],[],0.0,12.0,"[0601, 0601, 0601, 0601]",2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[0.0028569019455507044, 0.0, 0.0, 0.0, 0.1680..."
6,bf2e0a13-e0bf-48cd-8339-c8c256770b1c,The association between motor skill learning a...,2020-08-04,2024-08-04,"[03011804, 03011899, 03012306, 03012301, 03012...","[d86af692-304d-42dd-bb1f-86d0113be027, 24e9ba0...","[d4a49766-ed89-4de9-a06b-3c45f728354f, 3597372...",[],[],0.0,48.0,"[0301, 0301, 0301, 0301, 0301]",4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[0.00432476951966804, 0.0005451872859852065, ..."
7,7a8539e2-93ad-4864-a4c7-405ec848fdd4,Towards dynamic navigational charts for highly...,2020-09-04,2024-09-04,[02020301],"[e739bf7a-76c2-4e6e-8b6e-4dcf863be95f, de9b527...","[4ad02e7f-d5fa-4d66-98bb-d2f63929649a, 4c3d49c...",[],[],0.0,48.0,[0202],2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[0.002735615519237653, 0.036133047010030485, ..."
8,70295923-ab2c-44c6-ac9f-2ff4f28b5a75,Analysing the impact of head end tail end effe...,2020-05-14,2024-05-14,"[01070310, 04010302, 01050607]","[8c662e7b-3e22-4f37-bc2f-a4c7f8593181, 8d75503...","[f74d3c9c-2f54-4b95-9b41-61afcfe2a878, f1d3171...",[],[],0.0,48.0,"[0107, 0401, 0105]",3,"[0.0, 0.0, 0.0, 0.0, 0.3333333333333333, 0.0, ...","[[0.0010219851943710353, 0.0030239763953679383..."
9,c04fd85a-d90f-438d-b06a-e5606588ba84,2D materials optomechanics in Silicon photonics,2021-01-01,2021-06-30,"[01030999, 01030904]","[2e1cb018-8750-4c8e-a23b-a5244883a5a6, c405840...",[16a28d37-6005-49e4-be1b-46b8c76f6640],[],[],0.0,6.0,"[0103, 0103]",2,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[0.0, 0.0, 0.061052567780869677, 0.0, 0.0, 0...."


# Rao-Stirling index

### Rao Stirling functions

In [161]:
def unique_person_project(Project):
    """Return the distinct person vectors of a project with their frequencies.

    ``Project`` is a 2-D array with one person vector per row. The result is
    a list of (vector, count) tuples, one per distinct row as produced by
    ``np.unique(Project, axis=0)``; count is the number of rows of
    ``Project`` that equal the vector in every component.
    """
    freq_list = []
    for distinct_row in np.unique(Project, axis=0):
        occurrences = sum(1 for pers in Project if all(distinct_row == pers))
        freq_list.append((distinct_row, occurrences))
    return freq_list
def RaoStirling(Project,M):
    """Compute the Rao-Stirling style diversity of a project.

    Identical person vectors are grouped first. For every unordered pair of
    distinct vectors, the Wasserstein distance between them (cost matrix
    ``M``) is multiplied by the two relative frequencies (count / number of
    persons); the products are summed. A project with a single distinct
    vector yields the integer 0.
    """
    groups = unique_person_project(Project)
    n_persons = len(Project)
    total = 0
    for (vec_a, count_a), (vec_b, count_b) in combinations(groups, 2):
        share_a = count_a/n_persons
        share_b = count_b/n_persons
        total = total + wasserstein(vec_a, vec_b, M)*share_a*share_b
    return total


### Calculation

In [162]:

# Print the Rao-Stirling value (with distance matrix M) next to the project
# size, one line per project.
for i in range(projects_0.shape[0]):
    P=projects_0['Project array'][i]
    print(RaoStirling(P,M), projects_0['Size'][i])

0 1
0 1
0.04168858529764472 2
0 1
0 1
0.06651204104029236 2
0.10007580785081517 4
0.039576704118720274 2
0.09952531385691944 3
0.10356102547244074 2
0.07021786571659733 2
0.0991605840467326 2
0.1054743285178599 4
0.12094363842721917 2
0 1
0.17283393027195168 2
0.21362452583824676 2
0.04249974634949182 4
0.030811431349833128 3
0.01572142129024831 2
0.023599869714762873 2
0 1
0 1
0.2509481595203323 3
0 1
0 1
0.19678702895506525 2
0.01139093478947949 2
0.17998361638895263 2
0.12396639628684936 2
0 2


# ADCM (Average distance to center of mass)

## ADC (average distance to centroid)

### ADC functions

In [163]:
def ADC(Project,M):
    """Average distance to centroid.

    The centroid is the component-wise mean of the person vectors in
    ``Project``. Returns the mean Wasserstein distance (cost matrix ``M``)
    between each person and that centroid.
    """
    n_persons = len(Project)
    centroid = np.sum(Project,axis=0)/n_persons
    distances = (wasserstein(person, centroid, M) for person in Project)
    return sum(distances)/n_persons

###  Calculation

In [164]:
# for i in range(projects_0.shape[0]):
#     P=projects_0['Project array'][i]
#     print(ADC(P,M), projects_0['Size'][i])

9.09671700778658e-17 1
6.945810032592605e-17 1
0.08337717059528946 2
1.769627263658105e-17 1
0.0 1
0.13302408208058472 2
0.1765472395144334 4
0.07915340823744042 2
0.1805749830742346 3
0.20712205094488137 2
0.14043573143319485 2
0.198321168093465 2
0.176632738251772 4
0.2418872768544385 2
1.1700025342366997e-16 1
0.34566786054390286 2
0.42724905167649324 2
0.06872576927967797 4
0.05544136113399933 3
0.031442842580496594 2
0.047199739429525656 2
5.888087790512607e-17 1
1.2109350068718101e-17 1
0.453041261915456 3
2.2038060962658744e-17 1
6.57066906993662e-17 1
0.39357405791013045 2
0.022781869578958967 2
0.3599672327779053 2
0.2479327925736987 2
0.0 2


## ADM (average distance to medoid)

### ADM functions

In [165]:
def ADM(Project,M):
    """Average distance to medoid.

    Every person is tried as the medoid: the Wasserstein distances (cost
    matrix ``M``) from all persons to that candidate are summed. The smallest
    such sum, divided by the number of persons, is returned.
    """
    best_total = -1   # -1 marks "no candidate evaluated yet"
    for candidate in Project:
        total = sum(wasserstein(person, candidate, M) for person in Project)
        if best_total == -1 or total < best_total:
            best_total = total

    return best_total/len(Project)

###  Calculation

In [166]:
# for i in range(projects_0.shape[0]):
#     P=projects_0['Project array'][i]
#     print(ADM(P,M), projects_0['Size'][i])

9.09671700778658e-17 1
6.945810032592605e-17 1
0.08337717059528944 2
1.769627263658105e-17 1
0.0 1
0.13302408208058475 2
0.15019784402190328 4
0.07915340823744048 2
0.15917606047800995 3
0.2071220509448815 2
0.14043573143319468 2
0.1983211680934652 2
0.15574208783764834 4
0.24188727685443837 2
1.1700025342366997e-16 1
0.34566786054390325 2
0.4272490516764935 2
0.06161369783238294 4
0.05771375777732002 3
0.031442842580496594 2
0.04719973942952571 2
5.888087790512607e-17 1
1.2109350068718101e-17 1
0.45150563887000184 3
2.2038060962658744e-17 1
6.57066906993662e-17 1
0.39357405791013034 2
0.022781869578958985 2
0.35996723277790527 2
0.2479327925736987 2
0.0 2


## ADPD (average distance to project discipline)

### ADPD functions

In [209]:
def ADPD(Project, M, project_discipline):
    """Average distance to project discipline.

    Returns the mean Wasserstein distance (cost matrix ``M``) between each
    person vector in ``Project`` and the vector ``project_discipline``.
    """
    distances = (wasserstein(person, project_discipline, M) for person in Project)
    return sum(distances)/len(Project)

def MDPD(Project, M, project_discipline):
    """Maximum distance to project discipline.

    Returns the largest Wasserstein distance (cost matrix ``M``) between a
    person vector in ``Project`` and ``project_discipline``. The result is
    never below the starting value 0.
    """
    distances = [wasserstein(person, project_discipline, M) for person in Project]
    # the leading 0 is the floor the running maximum starts from
    return max([0] + distances)
    

###  Calculation

In [210]:
# Print the maximum distance between a project's persons and the project's
# own discipline profile, next to the project size, one line per project.
for i in range(projects_0.shape[0]):
    P=projects_0['Project array'][i]
    PD=projects_0['Profile'][i]
    print(MDPD(P,M,PD), projects_0['Size'][i])

0.44933504134016733 1
0.18862426190946263 1
0.24195716909172882 2
0.5528479805976219 1
0.49928482582360223 1
0.4137764322585176 2
0.5985616471115666 4
0.5816294509641005 2
0.38374218279193967 3
0.861910577002097 2
0.28087146286638937 2
0.5607536772588616 2
0.48125974978359 4
0.4692760052131571 2
0.8798899670035574 1
0.7609224806200657 2
0.8724867586848322 2
0.24322314978189297 4
0.5860314891441822 3
0.10047251566759235 2
0.3285105449839289 2
0.7854057023656175 1
0.46765242062322826 1
0.5469264821084591 3
0.33585083807879146 1
0.19379093655162566 1
0.858374405121978 2
0.1161572657230773 2
0.8028037765839477 2
0.6010622122583796 2
0 2
0 1
0 2
0.29815525555 2
0.029815525555000028 2
0.9740964657 3
0.9740964657 4
0.9740964657 5
0.8065244397428571 1
0.8907012437785713 2
0.8907012437785713 2


# Diameter

In [206]:
def diameter(Project, M):
    """Return the largest pairwise Wasserstein distance within a project.

    Every unordered pair of person vectors in ``Project`` is compared with
    cost matrix ``M``. A project with fewer than two persons yields the
    integer 0.
    """
    pairwise = [
        wasserstein(Project[i], Project[j], M)
        for i in range(len(Project))
        for j in range(i)
    ]
    # the leading 0 is the floor the running maximum starts from
    return max([0] + pairwise)

In [207]:
# for i in range(projects_0.shape[0]):
#     P=projects_0['Project array'][i]
#     print(diameter(P,M), projects_0['Size'][i])

0 1
0 1
0.16675434119057886 2
0 1
0 1
0.2660481641611695 2
0.4289885787448641 4
0.1583068164748811 2
0.4181996432782454 3
0.4142441018897631 2
0.2808714628663893 2
0.3966423361869304 2
0.4350506269426061 4
0.48377455370887656 2
0 1
0.6913357210878066 2
0.8544981033529868 2
0.1949212766313257 4
0.10416160881653816 3
0.06288568516099324 2
0.09439947885905149 2
0 1
0 1
0.9040165190729852 3
0 1
0 1
0.7871481158202607 2
0.04556373915791796 2
0.7199344655558105 2
0.49586558514739737 2
0 2
0 1
0 2
0.29815525555 2
0.029815525555000028 2
0.9823224346 3
0.9823224346 4
0.9823224346 5
0 1
0.9740964657 2
0.5963105111 2


# Comparison

In [224]:
# Uniform distance matrix: 1 between any two different disciplines, 0 on the
# diagonal. NOTE(review): U is not used in this cell — presumably kept so M can
# be swapped for U in the calls below; confirm.
U=np.ones((42,42))-np.identity(42)

# IDs flagged as interdisciplinary, one 'Projects/<id>' entry per line.
cur_path = os.getcwd()
filename = 'idr_ids.txt'
new_path = os.path.relpath('..\\Other/'+filename, cur_path)
idr_array = np.loadtxt(new_path, dtype=str, ndmin=1)
# Build the lookup set once instead of scanning the array for every project.
idr_ids = set(idr_array.tolist())

columns = ['ID', 'Name', 'Size','RaoStirling', 'ADC', 'ADM', 'ADPD',  'MDPD', 'Diameter', 'FWO_IDR']

# Collect one row per project and build the DataFrame once at the end.
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# copied the whole frame on every iteration.
rows = []
for i in range(0,len(projects_0)):
    # A project counts as IDR when its own ID or, failing that, one of its
    # aliases is listed in the IDR file.
    if ('Projects/'+projects_0['ID'][i]) in idr_ids:
        fwo_idr = 1
    else:
        fwo_idr = int(any(('Projects/'+alias) in idr_ids
                          for alias in projects_0.loc[i,'Alias']))

    P=projects_0['Project array'][i]
    PD=projects_0['Profile'][i]
    rows.append([projects_0['ID'][i], projects_0['Name'][i], projects_0['Size'][i], RaoStirling(P,M),
                 ADC(P,M), ADM(P,M), ADPD(P,M,PD), MDPD(P,M,PD), diameter(P,M), fwo_idr])

df = pd.DataFrame(rows, columns=columns)

# RaoStirling and Diameter can hold the integer 0; cast them to float so that
# round() treats them like the other measures.
df = df.astype({'RaoStirling': 'float64', 'Diameter': 'float64'})
df=df.round(3)
df = df.astype({'RaoStirling': 'object', 'Diameter': 'object', 'ADC': 'object', 'MDPD': 'object', 'ADM': 'object'})
df

Unnamed: 0,ID,Name,Size,RaoStirling,ADC,ADM,ADPD,MDPD,Diameter,FWO_IDR
0,fdde0090-f6ed-4938-91a6-13d370125cde,ICP Interuniversity Programme Physical land re...,1,0.0,0.0,0.0,0.449,0.449,0.0,0
1,0f80da13-5ac3-45c0-a0df-c2bd93051dbe,ERC Professorship GlycoTarget,1,0.0,0.0,0.0,0.189,0.189,0.0,0
2,0e185133-3ddf-4f8a-81d6-2ddc76a49cb6,EU Promotion of Democratic Governance via Func...,2,0.042,0.083,0.083,0.233,0.242,0.167,0
3,fc7f8873-326f-41fe-9c75-08ea05d8058b,Somite development in 3D in vitro synthetic mi...,1,0.0,0.0,0.0,0.553,0.553,0.0,1
4,9be600ee-e306-4fac-8c4b-0e5134b9ff28,Housing for Refugee Inclusion exploring inclus...,1,0.0,0.0,0.0,0.499,0.499,0.0,1
5,ee843e8f-9c18-4c14-bfee-641e18fbd8e0,Lost tributaries of the Scheldt reconstructing...,2,0.067,0.133,0.133,0.294,0.414,0.266,1
6,bf2e0a13-e0bf-48cd-8339-c8c256770b1c,The association between motor skill learning a...,4,0.1,0.177,0.15,0.484,0.599,0.429,0
7,7a8539e2-93ad-4864-a4c7-405ec848fdd4,Towards dynamic navigational charts for highly...,2,0.04,0.079,0.079,0.555,0.582,0.158,0
8,70295923-ab2c-44c6-ac9f-2ff4f28b5a75,Analysing the impact of head end tail end effe...,3,0.1,0.181,0.159,0.238,0.384,0.418,0
9,c04fd85a-d90f-438d-b06a-e5606588ba84,2D materials optomechanics in Silicon photonics,2,0.104,0.207,0.207,0.728,0.862,0.414,0


In [225]:
# Show the comparison table ordered by project size (df itself is not changed).
df.sort_values(by=['Size'])

Unnamed: 0,ID,Name,Size,RaoStirling,ADC,ADM,ADPD,MDPD,Diameter,FWO_IDR
0,fdde0090-f6ed-4938-91a6-13d370125cde,ICP Interuniversity Programme Physical land re...,1,0.0,0.0,0.0,0.449,0.449,0.0,0
1,0f80da13-5ac3-45c0-a0df-c2bd93051dbe,ERC Professorship GlycoTarget,1,0.0,0.0,0.0,0.189,0.189,0.0,0
38,P8,1 polyvalent,1,0.0,0.0,0.0,0.807,0.807,0.0,0
3,fc7f8873-326f-41fe-9c75-08ea05d8058b,Somite development in 3D in vitro synthetic mi...,1,0.0,0.0,0.0,0.553,0.553,0.0,1
4,9be600ee-e306-4fac-8c4b-0e5134b9ff28,Housing for Refugee Inclusion exploring inclus...,1,0.0,0.0,0.0,0.499,0.499,0.0,1
22,5a480e0c-b39d-455b-9bda-318f348113ab,Valorisation of Lignin by development of selec...,1,0.0,0.0,0.0,0.468,0.468,0.0,1
24,8a49c146-c7f3-4c49-9919-ad9361b5c425,An inkjet characterization platform for improv...,1,0.0,0.0,0.0,0.336,0.336,0.0,1
25,d72e5b0c-82e5-48d0-b8d1-fc7a8e8fb3cf,The Bright Side of Being Wrong The Perils and ...,1,0.0,0.0,0.0,0.194,0.194,0.0,1
31,P1,math,1,0.0,0.0,0.0,0.0,0.0,0.0,0
21,39099cca-b824-4346-81bd-0cff84e379a5,Validation of mathematical model of pain chara...,1,0.0,0.0,0.0,0.785,0.785,0.0,1
