In [8]:
import pandas as pd
import numpy as np
import os
import random

from ast import literal_eval

from itertools import combinations 


In [9]:
# VODS level-2 discipline codes, written one row per two-digit prefix.
# The position of a code in this list is the index it occupies in every
# discipline vector built from it, so the order must not change.
discipline_codes = [
    '0101', '0102', '0103', '0104', '0105', '0106', '0107', '0199',
    '0201', '0202', '0203', '0204', '0205', '0206', '0207', '0208', '0299',
    '0301', '0302', '0303', '0304', '0305', '0306', '0399',
    '0401', '0402', '0499',
    '0501', '0502', '0503', '0504', '0505', '0506', '0507', '0508', '0599',
    '0601', '0602', '0603', '0604', '0699',
    '0700',
]


In [10]:
def get_related_level2_disciplines(df_0, entity_df, entity_string):
    """Attach the level-2 disciplines of related entities to ``df_0`` (in place).

    For every row of ``df_0`` the IDs listed in ``df_0[entity_string]`` are
    looked up in ``entity_df['ID']`` and the matching
    ``entity_df['Level-2 disciplines']`` values are collected, one entry per ID
    and in the same order as the IDs. An ID without a match contributes an
    empty list. The collected lists are stored in a new column named
    ``'Level-2 disciplines ' + entity_string``.

    Only that one column is added; a de-duplicated variant is not produced
    (callers can apply ``unique`` to the new column if they need one).

    Example: ``(df_0, entity_df, entity_string) = (persons, organisations, 'Organisations')``

    Parameters
    ----------
    df_0 : pandas.DataFrame
        Frame to extend. Each value of ``df_0[entity_string]`` must be an
        iterable of entity IDs.
    entity_df : pandas.DataFrame
        Frame with an ``'ID'`` column and a ``'Level-2 disciplines'`` column.
    entity_string : str
        Name of the column in ``df_0`` holding the related entity IDs.

    Returns
    -------
    None
        ``df_0`` is modified in place.
    """
    # Build the ID -> disciplines lookup once instead of filtering entity_df
    # for every single ID. Missing IDs are dropped (they never compare equal
    # to anything) and keep='first' mirrors the former ``.iloc[0]`` choice
    # when an ID occurs more than once.
    known = entity_df.dropna(subset=['ID']).drop_duplicates(subset='ID', keep='first')
    disciplines_by_id = dict(zip(known['ID'], known['Level-2 disciplines']))

    # Iterate over the values positionally: indexing the column with
    # 0..len-1 is a label lookup and breaks (or misaligns rows) whenever
    # df_0 does not carry a default RangeIndex.
    level2_col = [
        [disciplines_by_id.get(entity_id, []) for entity_id in entity_ids]
        for entity_ids in df_0[entity_string]
    ]
    df_0['Level-2 disciplines ' + entity_string] = level2_col

def unique(list1):
    """Return the distinct elements of ``list1`` in first-seen order.

    Parameters
    ----------
    list1 : iterable
        Elements to de-duplicate. They may be unhashable (e.g. nested lists).

    Returns
    -------
    list
        A new list holding the first occurrence of every distinct element.
    """
    items = list(list1)
    try:
        # dicts keep insertion order, so this removes duplicates in one pass.
        return list(dict.fromkeys(items))
    except TypeError:
        # At least one element is unhashable: fall back to an equality scan.
        unique_list = []
        for x in items:
            if x not in unique_list:
                unique_list.append(x)
        return unique_list

def disciplines_to_weighted_aggregate_vector(list_of_lists, discipline_codes, size_and_duration = None):
    """Combine several discipline lists into one (weighted) aggregate vector.

    Each entry of ``list_of_lists`` (the disciplines of one organisation,
    project or publication) is turned into a discipline vector with
    ``make_discipline_vector``; the vectors are then summed with their weights
    and normalised by ``add_vectors_with_weights``.

    Parameters
    ----------
    list_of_lists : iterable of list
        One list of (possibly repeated) discipline codes per entity.
    discipline_codes : list
        Reference codes; the result has one component per code, in this order.
    size_and_duration : sequence, optional
        When given, element ``i`` must be indexable as ``[0]`` (size) and
        ``[1]`` (duration) and belongs to entry ``i`` of ``list_of_lists``;
        the entry is then weighted by ``duration / size``. A size of 0 is not
        handled here and fails with a division by zero. Without this argument
        every entry has weight 1.

    Returns
    -------
    numpy.ndarray
        Aggregate vector of length ``len(discipline_codes)``.
    """
    list_of_vectors = []
    for i, level2_list in enumerate(list_of_lists):
        vector = make_discipline_vector(level2_list, discipline_codes)
        if size_and_duration is None:
            weight = 1
        else:
            size = size_and_duration[i][0]
            duration = size_and_duration[i][1]
            weight = duration / size
        list_of_vectors.append((vector, weight))
    # Pass the length explicitly: the helper's default of 42 only fits the
    # full code list and would not broadcast against shorter/longer vectors.
    return add_vectors_with_weights(list_of_vectors, size=len(discipline_codes))

def make_discipline_vector(level2_list, discipline_codes):
    """Turn a list of (non-unique) discipline codes into a frequency vector.

    Parameters
    ----------
    level2_list : list
        Discipline codes, repeats allowed. Only its ``count`` method is used.
    discipline_codes : list
        Reference codes; component ``k`` of the result belongs to
        ``discipline_codes[k]``.

    Returns
    -------
    numpy.ndarray
        Float vector of length ``len(discipline_codes)``. When at least one
        reference code occurs in ``level2_list`` the components are the
        occurrence counts divided by their total (so they sum to 1);
        otherwise the vector is all zeros.
    """
    # Always build a float array so the zero vector has the same dtype as a
    # normalised one (it used to come back as an integer array).
    counts = np.array([level2_list.count(code) for code in discipline_codes], dtype=float)
    total = counts.sum()
    if total > 0:
        return counts / total
    return counts

def add_vectors_with_weights(list_of_vectors_and_weights, size=42):
    """Sum ``vector * weight`` over all pairs and normalise the result.

    Parameters
    ----------
    list_of_vectors_and_weights : iterable of (vector, weight)
        Pairs to accumulate; every vector must be compatible with length ``size``.
    size : int, optional
        Length of the accumulator (default 42).

    Returns
    -------
    numpy.ndarray
        The weighted sum divided by the sum of its components when that sum
        is positive; otherwise the weighted sum itself.
    """
    accumulated = np.zeros(size)
    for vec, w in list_of_vectors_and_weights:
        accumulated = accumulated + vec * w

    component_sum = sum(accumulated)
    if not component_sum > 0:
        # Nothing to normalise by (empty input, all zeros or non-positive sum).
        return accumulated
    return accumulated / component_sum


In [11]:
# Load the prepared projects table from the sibling '0_Preparing_dataframes' folder.
number = '0'
cur_path = os.getcwd()  # no longer needed for the path below; retained in case other cells read it
filename = number + 'projects_prepared.csv'
# Build the path with os.path.join: the former hard-coded '..\\' separator is
# only understood on Windows, so the file could not be found elsewhere.
new_path = os.path.join(os.pardir, '0_Preparing_dataframes', filename)
projects = pd.read_csv(new_path)

# 'Persons' is stored in the CSV as the text of a Python list: parse it back
# into a list and drop repeated person IDs while keeping their order.
projects['Persons'] = projects['Persons'].apply(lambda x: unique(literal_eval(x)))


In [12]:
# Load the prepared persons table from the sibling '0_Preparing_dataframes' folder.
number = '0'
cur_path = os.getcwd()  # no longer needed for the path below; retained in case other cells read it
filename = number + 'persons_prepared.csv'
# Build the path with os.path.join: the former hard-coded '..\\' separator is
# only understood on Windows, so the file could not be found elsewhere.
new_path = os.path.join(os.pardir, '0_Preparing_dataframes', filename)
persons = pd.read_csv(new_path)

# 'Level-2 disciplines' is stored as the text of a Python list: parse it back.
persons['Level-2 disciplines'] = persons['Level-2 disciplines'].apply(literal_eval)
# One discipline vector per person, built from that person's own discipline list.
persons['Profile'] = persons['Level-2 disciplines'].apply(
    lambda x: disciplines_to_weighted_aggregate_vector([x], discipline_codes)
)

In [13]:
# Show the loaded projects table (pandas abbreviates the middle rows).
projects

Unnamed: 0,ID,Name,Start,End,Disciplines,Persons,Organisations,Publications,Alias,when_added,Duration,Level-2 disciplines,Size
0,fdde0090-f6ed-4938-91a6-13d370125cde,ICP Interuniversity Programme Physical land re...,2015-09-22,2016-09-30,"['04010303', '01070405', '01050313']",[b73e0e47-ad11-4ce7-a065-2993e6c3b602],"['2b317cdb-3a20-47d7-b2bd-75aa36eb9734', 'f28f...",[],[],0,12,"['0401', '0107', '0105']",1
1,0f80da13-5ac3-45c0-a0df-c2bd93051dbe,ERC Professorship GlycoTarget,2014-03-01,2019-02-28,"['010608', '030130', '030116', '030617', '0306...",[8834112c-33f1-4ee6-8a34-9c09a7b71535],['adbfa1d6-6a4b-4abf-8dc4-ea71c647af26'],[],[],0,60,"['0106', '0301', '0301', '0306', '0306', '0301...",1
2,0e185133-3ddf-4f8a-81d6-2ddc76a49cb6,EU Promotion of Democratic Governance via Func...,2016-07-01,2017-01-31,"['050604', '050602', '050699', '050607', '0506...","[f19619e2-340d-47ef-9792-35d2436febc4, e011bd3...",['45ff2fc2-f78b-489c-ae38-b6fca2022521'],[],[],0,7,"['0506', '0506', '0506', '0506', '0506', '0506...",2
3,fc7f8873-326f-41fe-9c75-08ea05d8058b,Somite development in 3D in vitro synthetic mi...,2018-01-01,2018-12-31,['010699'],[05e00835-520a-4d3b-8e53-fbe03c1e43c7],['b9b3c61e-650e-4463-a0e2-6de8f49917ba'],[],[],0,12,['0106'],1
4,9be600ee-e306-4fac-8c4b-0e5134b9ff28,Housing for Refugee Inclusion exploring inclus...,2020-01-01,2023-12-31,"['02011199', '05040413', '05040702', '05060199...",[3de615bf-4167-4fa5-a4d3-0dac8248798a],['3420d399-29f4-4edf-9f9d-252e7b1d400d'],[],[],0,48,"['0201', '0504', '0504', '0506', '0604']",1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1191,5177ba5f-64c4-4e9e-bf4e-d4eb5e4f25b9,Synthesis of benzodiocane ligin for applicatio...,2016-10-01,2019-09-30,"['020501', '020799', '01061007', '010606', '02...",[6c4f337a-6fac-41fc-9224-dc0ccbdeeebb],['37e63935-be43-4a88-8f83-97da12647a08'],[],[],1,36,"['0205', '0207', '0106', '0106', '0206', '0106...",1
1192,c641efba-1526-4018-80ba-2de9152d0e94,The soil systems under global change SOGLO,2012-10-01,2017-12-31,"['040103', '010704', '010503', '010604', '0107...","[bdf09bca-071b-4682-be43-6c47560ecab3, 47582bc...","['17cf03fe-441e-444b-87f4-62214f211431', 'f414...",[],['fce55ffb-6026-455d-93c0-9389e0f52164'],1,63,"['0401', '0107', '0105', '0106', '0107', '0107...",2
1193,6a2353c9-539f-47da-bc58-65f961a94d47,Congo basin integrated monitoring for forest c...,2010-12-15,2016-12-31,"['010601', '030505', '010401', '010606', '0106...","[9d445952-3146-4bce-8477-b3b681cf0b1f, bdf09bc...","['baf2328c-ac14-4990-8086-1126ac6cea11', '17cf...",[],['3ce1811f-5c7a-4854-8d8a-67e018819947'],1,70,"['0106', '0305', '0104', '0106', '0106', '0106...",3
1194,165ae5f2-aac4-4893-8597-240e351c60c7,ARBOREF integrated 39 lignin first 39 biorefinery,2015-04-01,2019-03-31,"['020602', '040102', '059999', '020404', '0106...","[a190c1ac-b82c-477b-8b16-39a5752d3e79, 6c4f337...","['b58d2d09-7716-4627-851a-d1608fec639f', '37e6...",[],['c0c71b56-d67e-4445-870c-b6b7e9b0da77'],1,48,"['0206', '0401', '0599', '0204', '0106', '0206...",4


In [14]:
# Scratch check: read the matrix saved in 'colab_matrix.txt' and display
# its entry at row 2, column 5 with 5 added.
colab=np.loadtxt('colab_matrix.txt')
colab[2,5]+5

22.596540452215667

In [20]:
# Accumulate, for every pair of persons working on the same project, the
# symmetrised product of their discipline profiles into colab_matrix.

# NOTE(review): only the first 1000 projects are processed, although the
# projects table displayed earlier has rows up to index 1194 - confirm that
# this cut-off is intended.
MAX_PROJECTS = 1000

# Start from the matrix stored on disk. (A zero matrix used to be created
# first, but it was overwritten immediately and never used.)
# NOTE(review): the update below adds symmetric terms only, so the loaded
# matrix is assumed to be symmetric already - confirm.
colab_matrix = np.loadtxt('colab_matrix.txt')

# First profile per person ID, built once instead of filtering `persons`
# again for every person of every project.
profile_by_id = {}
for person_id, profile in zip(persons['ID'], persons['Profile']):
    profile_by_id.setdefault(person_id, profile)

for per_ids in projects['Persons'].iloc[:MAX_PROJECTS]:
    # A project needs at least two persons to form a pair.
    if len(per_ids) < 2:
        continue

    per_vectors = []
    for per_id in per_ids:
        if per_id in profile_by_id:
            per_vectors.append(profile_by_id[per_id])
        else:
            print('Cannot find ' + ': ', per_id)

    for p1, p2 in combinations(per_vectors, 2):
        p1 = np.asarray(p1)
        p2 = np.asarray(p2)
        # Off-diagonal entry (i, j) receives p1[i]*p2[j] + p1[j]*p2[i] ...
        cross = np.outer(p1, p2)
        pair_term = cross + cross.T
        # ... while the diagonal receives p1[i]*p2[i] once, not twice.
        np.fill_diagonal(pair_term, p1 * p2)
        colab_matrix += pair_term

In [21]:
# Show the accumulated matrix as a table, rounded to two decimals.
pd.DataFrame(colab_matrix).round(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,32,33,34,35,36,37,38,39,40,41
0,77.72,77.85,35.17,11.24,7.06,31.1,5.37,4.9,15.12,18.24,...,0.25,1.71,1.39,0.67,4.63,3.18,12.4,3.41,0.11,0.81
1,77.85,204.67,25.58,29.54,9.41,84.94,13.54,4.31,18.76,120.99,...,1.18,1.49,6.57,0.73,3.95,35.99,9.17,7.07,2.0,0.0
2,35.17,25.58,251.24,122.28,19.07,31.86,13.39,6.05,5.87,81.38,...,0.5,0.07,1.03,0.03,0.85,0.1,27.17,1.64,0.09,3.33
3,11.24,29.54,122.28,355.1,16.61,149.92,52.93,10.62,11.43,72.96,...,0.16,0.87,0.0,0.94,4.55,0.0,1.18,6.38,1.3,0.08
4,7.06,9.41,19.07,16.61,53.17,50.06,43.06,1.14,22.95,13.45,...,0.3,8.4,0.0,0.17,22.41,0.27,0.11,12.13,1.08,0.46
5,31.1,84.94,31.86,149.92,50.06,473.53,164.32,11.74,10.52,83.44,...,0.91,2.58,11.45,1.14,6.39,0.49,6.95,7.24,0.53,6.23
6,5.37,13.54,13.39,52.93,43.06,164.32,64.98,3.07,18.02,22.23,...,1.18,10.41,5.99,1.68,16.76,0.85,0.5,16.48,0.7,1.05
7,4.9,4.31,6.05,10.62,1.14,11.74,3.07,0.33,0.64,3.67,...,0.23,0.21,0.08,0.22,0.18,0.0,2.81,0.73,0.03,0.0
8,15.12,18.76,5.87,11.43,22.95,10.52,18.02,0.64,202.07,26.44,...,4.57,12.39,2.1,4.14,17.39,1.22,2.67,98.56,3.64,0.6
9,18.24,120.99,81.38,72.96,13.45,83.44,22.23,3.67,26.44,340.03,...,2.86,2.66,6.04,0.76,1.49,3.06,0.46,14.31,0.79,0.0


In [None]:
import sys
# Raise NumPy's summarisation threshold so arrays are printed in full
# instead of being abbreviated; the setting stays active for later cells.
np.set_printoptions(threshold=sys.maxsize)
# np.set_printoptions(threshold=False)
# Display every entry of the matrix.
colab_matrix

In [None]:
# Scratch check: list all unordered pairs that combinations() yields for a small mixed list.
list(combinations([5,9,'rr','8f'], 2))

In [None]:
# Mean of the diagonal entries of the matrix.
np.diag(colab_matrix).mean()

In [None]:
# Scratch check: is the last decimal digit of i equal to 9?
i = 24
i % 10 == 9