# Import packages

In [719]:
import pandas as pd

In [720]:
#import plotly
#import datapane as dp
#plotly.offline.init_notebook_mode(connected=True)

In [721]:
import re
import unicodedata

import pickle

import Levenshtein
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Upload json files

In [722]:
#files= ['0.json', '1.json', '2.json', '3.json', '4.json','42.json', '15693.json']
files = ['0.json']
# create an empty list to store the DataFrames
dfsList = [pd.read_json(file, orient='records') for file in files]

# combine all DataFrames into a single DataFrame
crossrefDF = pd.concat(dfsList, ignore_index=True)


In [723]:
crossrefDF

Unnamed: 0,items
0,{'abstract': '<jats:p>The article describes th...
1,"{'URL': 'http://dx.doi.org/10.36770/bp.737', '..."
2,{'abstract': '<jats:p>Purpose: It is estimated...
3,{'abstract': '<jats:p>Introduction. The subjec...
4,{'abstract': '<jats:p>Climate change is a chal...
...,...
4995,{'abstract': '<jats:p>University is a remarkab...
4996,{'URL': 'http://dx.doi.org/10.1161/jaha.122.02...
4997,{'abstract': '<jats:p> The use of metaphors in...
4998,{'abstract': '<jats:title>Abstract</jats:title...


# Data preparation 

In [724]:
noAuthors = [i for i in range(len(crossrefDF)) if 'author' not in crossrefDF['items'][i]]

Authors = [i for i in range(len(crossrefDF)) if 'author'  in crossrefDF['items'][i]]

In [725]:
len(noAuthors) + len(Authors) == len(crossrefDF)

True

## Rows with authors

In [726]:
crossrefAuth = crossrefDF.iloc[Authors].copy()

crossrefAuth.reset_index(inplace= True)
crossrefAuth.drop(columns = ['index'], inplace = True)


## Extract 'DOI'

In [727]:
crossrefAuth.loc[:, 'DOI'] = crossrefAuth['items'].apply(lambda x: x['DOI'])

In [728]:
crossrefAuth.head()

Unnamed: 0,items,DOI
0,{'abstract': '<jats:p>The article describes th...,10.36770/bp.723
1,"{'URL': 'http://dx.doi.org/10.36770/bp.737', '...",10.36770/bp.737
2,{'abstract': '<jats:p>Purpose: It is estimated...,10.36948/ijfmr.2022.v04i06.1212
3,{'abstract': '<jats:p>Introduction. The subjec...,10.36887/2524-0455-2020-3-11
4,{'abstract': '<jats:p>Climate change is a chal...,10.3390/agronomy13010117


## Extract 'authors' --- number of authors

In [729]:
crossrefAuth.loc[:,'authors'] = crossrefAuth['items'].apply(lambda x: x['author'])


In [730]:
crossrefAuth.head()

Unnamed: 0,items,DOI,authors
0,{'abstract': '<jats:p>The article describes th...,10.36770/bp.723,[{'ORCID': 'http://orcid.org/0000-0003-1468-38...
1,"{'URL': 'http://dx.doi.org/10.36770/bp.737', '...",10.36770/bp.737,[{'ORCID': 'http://orcid.org/0000-0003-0289-37...
2,{'abstract': '<jats:p>Purpose: It is estimated...,10.36948/ijfmr.2022.v04i06.1212,"[{'given': 'Minaxi Zala', 'family': '-', 'sequ..."
3,{'abstract': '<jats:p>Introduction. The subjec...,10.36887/2524-0455-2020-3-11,[{'ORCID': 'http://orcid.org/0000-0003-2245-35...
4,{'abstract': '<jats:p>Climate change is a chal...,10.3390/agronomy13010117,"[{'given': 'Daniela', 'family': 'Soares', 'seq..."


In [731]:
numAuthors = [len(crossrefAuth.iloc[i]['authors']) for i in range(len(crossrefAuth))]

In [732]:
## yparxoun lathi  ---> kalytera number of affiliations
crossrefAuth.loc[:,'# authors'] = numAuthors

In [733]:
crossrefAuth.head()

Unnamed: 0,items,DOI,authors,# authors
0,{'abstract': '<jats:p>The article describes th...,10.36770/bp.723,[{'ORCID': 'http://orcid.org/0000-0003-1468-38...,1
1,"{'URL': 'http://dx.doi.org/10.36770/bp.737', '...",10.36770/bp.737,[{'ORCID': 'http://orcid.org/0000-0003-0289-37...,2
2,{'abstract': '<jats:p>Purpose: It is estimated...,10.36948/ijfmr.2022.v04i06.1212,"[{'given': 'Minaxi Zala', 'family': '-', 'sequ...",1
3,{'abstract': '<jats:p>Introduction. The subjec...,10.36887/2524-0455-2020-3-11,[{'ORCID': 'http://orcid.org/0000-0003-2245-35...,2
4,{'abstract': '<jats:p>Climate change is a chal...,10.3390/agronomy13010117,"[{'given': 'Daniela', 'family': 'Soares', 'seq...",3


## Extract 'affiliations' --- number of affiliations

In [734]:
def getAff(k):
   return [crossrefAuth['authors'][k][j]['affiliation'] for j in range(len(crossrefAuth['authors'][k]))]
    

In [735]:
Affiliations = [getAff(k) for k in range(len(crossrefAuth))]

crossrefAuth.loc[:,'affiliations'] = Affiliations


In [736]:
numAffil = [len(Affiliations[i]) for i in range(len(crossrefAuth))]

In [737]:
crossrefAuth.loc[:,'# Affil'] = numAffil

In [738]:
crossrefAuth.head()

Unnamed: 0,items,DOI,authors,# authors,affiliations,# Affil
0,{'abstract': '<jats:p>The article describes th...,10.36770/bp.723,[{'ORCID': 'http://orcid.org/0000-0003-1468-38...,1,[[]],1
1,"{'URL': 'http://dx.doi.org/10.36770/bp.737', '...",10.36770/bp.737,[{'ORCID': 'http://orcid.org/0000-0003-0289-37...,2,"[[], []]",2
2,{'abstract': '<jats:p>Purpose: It is estimated...,10.36948/ijfmr.2022.v04i06.1212,"[{'given': 'Minaxi Zala', 'family': '-', 'sequ...",1,[[]],1
3,{'abstract': '<jats:p>Introduction. The subjec...,10.36887/2524-0455-2020-3-11,[{'ORCID': 'http://orcid.org/0000-0003-2245-35...,2,[[{'name': 'Odessa National Polytechnic Univer...,2
4,{'abstract': '<jats:p>Climate change is a chal...,10.3390/agronomy13010117,"[{'given': 'Daniela', 'family': 'Soares', 'seq...",3,"[[], [], []]",3


## Clean 'empty' affiliations

In [739]:
possibleEmptyAff = []

for k in range(len(crossrefAuth)):
    if len(crossrefAuth['affiliations'][k][0]) == 0:
        possibleEmptyAff.append(k)

In [740]:
len(possibleEmptyAff)

2655

In [741]:
crossrefAuth['affiliations'].iloc[2611]

[[], [], []]

In [742]:
nonEmptyAff = []

for k in possibleEmptyAff:
    for j in range(len(crossrefAuth['affiliations'].iloc[k])):
        if len(crossrefAuth['affiliations'].iloc[k][j]) != 0:
            nonEmptyAff.append(k)
    
    

In [743]:

FinalEmptyyAff=  [x for x in possibleEmptyAff if x not in nonEmptyAff] 

In [744]:
FinalNonEmptyAff = [x for x in range(len(crossrefAuth)) if x not in FinalEmptyyAff]

# affilDF: crossrefAuth subdataframe with nonpempty affiliation lists

In [745]:
affilDF = crossrefAuth.iloc[FinalNonEmptyAff].copy()
affilDF.reset_index(inplace = True)
affilDF.drop(columns = ['index'], inplace = True)

## (still some cleaning: cases with empty brackets [{}])

In [746]:
affilDF[affilDF['DOI'] == '10.48130/emst-2022-0020']

Unnamed: 0,items,DOI,authors,# authors,affiliations,# Affil


In [747]:
for k in range(len(affilDF)):
    if len(affilDF['affiliations'][k][0]) != 0 and affilDF['affiliations'][k][0][0] == {}:
        print(k)

In [748]:
emptyBrackets = [k for k in range(len(affilDF)) if len(affilDF['affiliations'][k][0]) != 0 and affilDF['affiliations'][k][0][0] == {}]

In [749]:
affilDF.iloc[emptyBrackets]

Unnamed: 0,items,DOI,authors,# authors,affiliations,# Affil


In [750]:
affilDF.copy()

Unnamed: 0,items,DOI,authors,# authors,affiliations,# Affil
0,{'abstract': '<jats:p>Introduction. The subjec...,10.36887/2524-0455-2020-3-11,[{'ORCID': 'http://orcid.org/0000-0003-2245-35...,2,[[{'name': 'Odessa National Polytechnic Univer...,2
1,{'abstract': '<jats:p>The article analyzes the...,10.37405/1729-7206.2022.1(42).135-147,[{'ORCID': 'http://orcid.org/0000-0002-1205-37...,3,"[[], [{'name': 'Institute of Industrial Econom...",3
2,{'URL': 'http://dx.doi.org/10.36718/1819-4036-...,10.36718/1819-4036-2022-11-40-46,"[{'given': 'Lyudmila Ivanovna', 'family': 'Yak...",2,[[{'name': 'Northern Trans-Ural State Agricult...,2
3,{'abstract': '<jats:p>Introduction. The materi...,10.36887/2524-0455-2020-2-16,[{'ORCID': 'http://orcid.org/0000-0001-5427-79...,1,[[{'name': 'Mykolayiv National Agrarian Univer...,1
4,{'abstract': '<jats:title>Abstract</jats:title...,10.1093/mnras/stac3785,"[{'given': 'Konstantin', 'family': 'Karchev', ...",3,[[{'name': 'Theoretical and Scientific Data Sc...,3
...,...,...,...,...,...,...
1591,{'abstract': '<jats:p>Salmonella enterica is a...,10.1155/2022/4567817,[{'ORCID': 'http://orcid.org/0000-0002-6399-86...,3,"[[{'name': 'Graduate School, University of the...",3
1592,{'abstract': '<jats:p>Background. The applicat...,10.1155/2022/6359841,[{'ORCID': 'http://orcid.org/0000-0002-0010-31...,6,"[[{'name': 'Department of Endodontics, School ...",6
1593,"{'URL': 'http://dx.doi.org/10.1111/tpj.16088',...",10.1111/tpj.16088,"[{'given': 'Marina', 'family': 'Martín‐Dacal',...",14,[[{'name': 'Centro de Biotecnología y Genómica...,14
1594,{'URL': 'http://dx.doi.org/10.1161/jaha.122.02...,10.1161/jaha.122.028501,[{'ORCID': 'http://orcid.org/0000-0001-7214-34...,3,[[{'name': 'Division of Cardiovascular Disease...,3


In [751]:
affilDF.drop(emptyBrackets, inplace = True)

In [752]:
affilDF.reset_index(inplace = True)

In [753]:
affilDF.copy()

Unnamed: 0,index,items,DOI,authors,# authors,affiliations,# Affil
0,0,{'abstract': '<jats:p>Introduction. The subjec...,10.36887/2524-0455-2020-3-11,[{'ORCID': 'http://orcid.org/0000-0003-2245-35...,2,[[{'name': 'Odessa National Polytechnic Univer...,2
1,1,{'abstract': '<jats:p>The article analyzes the...,10.37405/1729-7206.2022.1(42).135-147,[{'ORCID': 'http://orcid.org/0000-0002-1205-37...,3,"[[], [{'name': 'Institute of Industrial Econom...",3
2,2,{'URL': 'http://dx.doi.org/10.36718/1819-4036-...,10.36718/1819-4036-2022-11-40-46,"[{'given': 'Lyudmila Ivanovna', 'family': 'Yak...",2,[[{'name': 'Northern Trans-Ural State Agricult...,2
3,3,{'abstract': '<jats:p>Introduction. The materi...,10.36887/2524-0455-2020-2-16,[{'ORCID': 'http://orcid.org/0000-0001-5427-79...,1,[[{'name': 'Mykolayiv National Agrarian Univer...,1
4,4,{'abstract': '<jats:title>Abstract</jats:title...,10.1093/mnras/stac3785,"[{'given': 'Konstantin', 'family': 'Karchev', ...",3,[[{'name': 'Theoretical and Scientific Data Sc...,3
...,...,...,...,...,...,...,...
1591,1591,{'abstract': '<jats:p>Salmonella enterica is a...,10.1155/2022/4567817,[{'ORCID': 'http://orcid.org/0000-0002-6399-86...,3,"[[{'name': 'Graduate School, University of the...",3
1592,1592,{'abstract': '<jats:p>Background. The applicat...,10.1155/2022/6359841,[{'ORCID': 'http://orcid.org/0000-0002-0010-31...,6,"[[{'name': 'Department of Endodontics, School ...",6
1593,1593,"{'URL': 'http://dx.doi.org/10.1111/tpj.16088',...",10.1111/tpj.16088,"[{'given': 'Marina', 'family': 'Martín‐Dacal',...",14,[[{'name': 'Centro de Biotecnología y Genómica...,14
1594,1594,{'URL': 'http://dx.doi.org/10.1161/jaha.122.02...,10.1161/jaha.122.028501,[{'ORCID': 'http://orcid.org/0000-0001-7214-34...,3,[[{'name': 'Division of Cardiovascular Disease...,3


In [754]:
affilDF.drop(columns = ['index'], inplace = True)

In [755]:
affilDF

Unnamed: 0,items,DOI,authors,# authors,affiliations,# Affil
0,{'abstract': '<jats:p>Introduction. The subjec...,10.36887/2524-0455-2020-3-11,[{'ORCID': 'http://orcid.org/0000-0003-2245-35...,2,[[{'name': 'Odessa National Polytechnic Univer...,2
1,{'abstract': '<jats:p>The article analyzes the...,10.37405/1729-7206.2022.1(42).135-147,[{'ORCID': 'http://orcid.org/0000-0002-1205-37...,3,"[[], [{'name': 'Institute of Industrial Econom...",3
2,{'URL': 'http://dx.doi.org/10.36718/1819-4036-...,10.36718/1819-4036-2022-11-40-46,"[{'given': 'Lyudmila Ivanovna', 'family': 'Yak...",2,[[{'name': 'Northern Trans-Ural State Agricult...,2
3,{'abstract': '<jats:p>Introduction. The materi...,10.36887/2524-0455-2020-2-16,[{'ORCID': 'http://orcid.org/0000-0001-5427-79...,1,[[{'name': 'Mykolayiv National Agrarian Univer...,1
4,{'abstract': '<jats:title>Abstract</jats:title...,10.1093/mnras/stac3785,"[{'given': 'Konstantin', 'family': 'Karchev', ...",3,[[{'name': 'Theoretical and Scientific Data Sc...,3
...,...,...,...,...,...,...
1591,{'abstract': '<jats:p>Salmonella enterica is a...,10.1155/2022/4567817,[{'ORCID': 'http://orcid.org/0000-0002-6399-86...,3,"[[{'name': 'Graduate School, University of the...",3
1592,{'abstract': '<jats:p>Background. The applicat...,10.1155/2022/6359841,[{'ORCID': 'http://orcid.org/0000-0002-0010-31...,6,"[[{'name': 'Department of Endodontics, School ...",6
1593,"{'URL': 'http://dx.doi.org/10.1111/tpj.16088',...",10.1111/tpj.16088,"[{'given': 'Marina', 'family': 'Martín‐Dacal',...",14,[[{'name': 'Centro de Biotecnología y Genómica...,14
1594,{'URL': 'http://dx.doi.org/10.1161/jaha.122.02...,10.1161/jaha.122.028501,[{'ORCID': 'http://orcid.org/0000-0001-7214-34...,3,[[{'name': 'Division of Cardiovascular Disease...,3


# Clean affiliations 

## is_contained(a,b) map : returns true when a is a substring of b 

In [756]:
def is_contained(s, w):
    words = s.split()  # Split the string 's' into a list of words
    for word in words:
        if word not in w:  # If a word from 's' is not found in 'w'
            return False  # Return False immediately
    return True  # If all words from 's' are found in 'w', return True

## 1. "Unique" affiliations --- number of unique affiliations

In [757]:
uniqueAff = []
error_indices = []  # New list to store error indices

for i in range(len(affilDF)):
    try:
        uniqueAff.append(list(set([x[0] for x in [list(d.values()) for d in [item for sublist in affilDF['affiliations'].iloc[i] for item in sublist]]])))
    except TypeError:
        print("Error occurred for i =", i)
        error_indices.append(i)  # Save the index where the error occurred

# Print the error indices
print("Error indices:", error_indices)


Error occurred for i = 13
Error occurred for i = 85
Error occurred for i = 86
Error occurred for i = 87
Error occurred for i = 248
Error occurred for i = 405
Error occurred for i = 919
Error occurred for i = 922
Error occurred for i = 950
Error occurred for i = 951
Error occurred for i = 953
Error occurred for i = 1236
Error occurred for i = 1365
Error occurred for i = 1366
Error occurred for i = 1367
Error occurred for i = 1368
Error occurred for i = 1369
Error indices: [13, 85, 86, 87, 248, 405, 919, 922, 950, 951, 953, 1236, 1365, 1366, 1367, 1368, 1369]


### affiliations having already ids

In [758]:
readyDF = affilDF.iloc[error_indices]

In [759]:
readyDF = readyDF[['DOI', 'affiliations']]

In [760]:
readyDF.reset_index(inplace = True)

### affiliations without any ids

In [761]:
affilDF.drop(error_indices, inplace = True)

In [762]:
affilDF.reset_index(inplace = True)

In [763]:
affilDF

Unnamed: 0,index,items,DOI,authors,# authors,affiliations,# Affil
0,0,{'abstract': '<jats:p>Introduction. The subjec...,10.36887/2524-0455-2020-3-11,[{'ORCID': 'http://orcid.org/0000-0003-2245-35...,2,[[{'name': 'Odessa National Polytechnic Univer...,2
1,1,{'abstract': '<jats:p>The article analyzes the...,10.37405/1729-7206.2022.1(42).135-147,[{'ORCID': 'http://orcid.org/0000-0002-1205-37...,3,"[[], [{'name': 'Institute of Industrial Econom...",3
2,2,{'URL': 'http://dx.doi.org/10.36718/1819-4036-...,10.36718/1819-4036-2022-11-40-46,"[{'given': 'Lyudmila Ivanovna', 'family': 'Yak...",2,[[{'name': 'Northern Trans-Ural State Agricult...,2
3,3,{'abstract': '<jats:p>Introduction. The materi...,10.36887/2524-0455-2020-2-16,[{'ORCID': 'http://orcid.org/0000-0001-5427-79...,1,[[{'name': 'Mykolayiv National Agrarian Univer...,1
4,4,{'abstract': '<jats:title>Abstract</jats:title...,10.1093/mnras/stac3785,"[{'given': 'Konstantin', 'family': 'Karchev', ...",3,[[{'name': 'Theoretical and Scientific Data Sc...,3
...,...,...,...,...,...,...,...
1574,1591,{'abstract': '<jats:p>Salmonella enterica is a...,10.1155/2022/4567817,[{'ORCID': 'http://orcid.org/0000-0002-6399-86...,3,"[[{'name': 'Graduate School, University of the...",3
1575,1592,{'abstract': '<jats:p>Background. The applicat...,10.1155/2022/6359841,[{'ORCID': 'http://orcid.org/0000-0002-0010-31...,6,"[[{'name': 'Department of Endodontics, School ...",6
1576,1593,"{'URL': 'http://dx.doi.org/10.1111/tpj.16088',...",10.1111/tpj.16088,"[{'given': 'Marina', 'family': 'Martín‐Dacal',...",14,[[{'name': 'Centro de Biotecnología y Genómica...,14
1577,1594,{'URL': 'http://dx.doi.org/10.1161/jaha.122.02...,10.1161/jaha.122.028501,[{'ORCID': 'http://orcid.org/0000-0001-7214-34...,3,[[{'name': 'Division of Cardiovascular Disease...,3


In [764]:
affilDF.drop(columns = ['index'], inplace = True)

In [765]:
affilDF.loc[:,'uniqueAff'] = uniqueAff

In [766]:
numUniqueAff = [len(affilDF['uniqueAff'].iloc[i]) for i in range(len(affilDF))]

In [767]:
affilDF.loc[:,'# uniqueAff'] = numUniqueAff

In [768]:
affilDF.head()

Unnamed: 0,items,DOI,authors,# authors,affiliations,# Affil,uniqueAff,# uniqueAff
0,{'abstract': '<jats:p>Introduction. The subjec...,10.36887/2524-0455-2020-3-11,[{'ORCID': 'http://orcid.org/0000-0003-2245-35...,2,[[{'name': 'Odessa National Polytechnic Univer...,2,[Odessa National Polytechnic University],1
1,{'abstract': '<jats:p>The article analyzes the...,10.37405/1729-7206.2022.1(42).135-147,[{'ORCID': 'http://orcid.org/0000-0002-1205-37...,3,"[[], [{'name': 'Institute of Industrial Econom...",3,[Dnipropetrovsk State University of Internal A...,2
2,{'URL': 'http://dx.doi.org/10.36718/1819-4036-...,10.36718/1819-4036-2022-11-40-46,"[{'given': 'Lyudmila Ivanovna', 'family': 'Yak...",2,[[{'name': 'Northern Trans-Ural State Agricult...,2,[Northern Trans-Ural State Agricultural Univer...,1
3,{'abstract': '<jats:p>Introduction. The materi...,10.36887/2524-0455-2020-2-16,[{'ORCID': 'http://orcid.org/0000-0001-5427-79...,1,[[{'name': 'Mykolayiv National Agrarian Univer...,1,[Mykolayiv National Agrarian University],1
4,{'abstract': '<jats:title>Abstract</jats:title...,10.1093/mnras/stac3785,"[{'given': 'Konstantin', 'family': 'Karchev', ...",3,[[{'name': 'Theoretical and Scientific Data Sc...,3,[Theoretical and Scientific Data Science group...,3


## 2. Remove stop words ['from', 'the']

In [769]:
stopWords = ['from', 'the', 'From', 'The']

In [770]:
def remove_stop_words(text):
    words = text.split()
    filtered_words = [word for word in words if word not in stopWords]
    return ' '.join(filtered_words)


# apply the function to the column  affilDF['uniqueAff'] to create column affilDF.loc[:,'uniqueAff1']

affilDF.loc[:,'uniqueAff1'] = affilDF['uniqueAff'].apply(lambda x: [remove_stop_words(s) for s in x])


## 3. Remove parenthesis 

In [771]:
def remove_parentheses(text):
   return re.sub(r'\([^()]*\)', '', text)

# apply the function to each list element of column affilDF['uniqueAff1'] to remove substrings inside parentheses

affilDF.loc[:,'uniqueAff1'] = affilDF['uniqueAff1'].apply(lambda x: [remove_parentheses(s) for s in x])


## 4. Remove @#$%characters and umlauts

In [772]:
def replace_umlauts(text):
    normalized_text = unicodedata.normalize('NFKD', text)
    replaced_text = ''.join(c for c in normalized_text if not unicodedata.combining(c))
    return replaced_text

affNoSymbols = []

for i in range(len(list(affilDF['uniqueAff1']))):
    L = list(affilDF['uniqueAff1'])[i]
    for j in range(len(L)):
        L[j] = re.sub(r'[^\w\s,Α-Ωα-ωぁ-んァ-ン一-龯，;]', '', L[j])
        L[j] = replace_umlauts(L[j])
        L[j] = L[j].replace("  ", " ")
        
    affNoSymbols.append(L)



In [773]:
affNoSymbols = [[item for item in inner_list if item != "inc"] for inner_list in affNoSymbols]


In [774]:
affilDF['uniqueAff1'] = affNoSymbols


## 5. Check 'sub'-affiliations (affiliations that are contained in other affiliations of the same DOI)

In [775]:
newAff0 = []

for k in range(len(affilDF)):
    
    L2 = []
    for s1 in affilDF['uniqueAff1'].iloc[k]:
        is_substring = False
        for s2 in affilDF['uniqueAff1'].iloc[k]:
            if s1 != s2 and s1 in s2:
                is_substring = True
                break
        if not is_substring:
            L2.append(s1)
    newAff0.append(L2)

In [776]:
newAffList = [list(set(newAff0[k])) for k in range(len(newAff0))]

In [777]:
affilDF['uniqueAff2'] = newAffList

## 6. Split strings where ',' or ';' appears    | Apply .lower()

In [778]:
def substringsDict(string):
    split_strings = [re.sub(r'^[\s.]+|[\s.]+$', '', s.strip()) for s in re.split(r'[,;]', string)]
    dict_string = {}
    index = 0

    for value in split_strings:
        if value:
            dict_string[index] = value.lower()
            index += 1

    return dict_string


In [779]:
newAffkomma = []

for k in range(len(affilDF)):
    
    new_list = []
    for item in affilDF['uniqueAff2'].iloc[k]:
        new_list.append(substringsDict(item))


    newAffkomma.append(new_list)



In [780]:
for j in range(len(newAffkomma)):
    for y in newAffkomma[j]:
        if len(y)>1:
            for i in range(len(y)-1):
                if (is_contained('lab', y[i]) or is_contained('dep', y[i]) or is_contained('inst', y[i]) or is_contained('hosp', y[i]) or is_contained('school', y[i]),is_contained('fac', y[i])) and is_contained('univ', y[i+1]):
                    del y[i]
                elif is_contained('lab', y[i]) and (is_contained('college', y[i+1]) or is_contained('inst', y[i+1]) or is_contained('dep', y[i+1]) or is_contained('dep', y[i+1])):
                    del y[i]
                elif is_contained('dep', y[i]) and (is_contained('college', y[i+1]) or is_contained('inst', y[i+1])):
                    del y[i]
                elif is_contained('inst', y[i]) and (is_contained('dep', y[i+1]) or is_contained('acad', y[i+1]) or is_contained('hosp', y[i+1]) or is_contained('fac', y[i+1]) or is_contained('cent', y[i+1]) or is_contained('div', y[i+1])):
                    del y[i]
            #    elif y[i] in city_names+removeList:
             #       del y[i]

            

In [781]:
lightAff = []
for j in range(len(newAffkomma)):
    lightAffj = []
    for y in newAffkomma[j]:
        lightAffj.append(', '.join(list(y.values())))
    lightAff.append(lightAffj)

In [782]:
affilDF['lightAff'] = lightAff

In [783]:
removeList = ['university','research institute','laboratory' , 'universit','gmbh', 'inc', 'university of', 'research center', 
'university college','national institute of', 'school of medicine', "university school", 'graduate school of', 'graduate school of engineering', 
'institute of tropical medicine', 'institute of virology', 'faculty of medicine','laboratory', 'university park', 'institute of science']

city_names = ["Aberdeen", "Abilene", "Akron", "Albany", "Albuquerque", "Alexandria", "Allentown", "Amarillo", "Anaheim", "Anchorage", "Ann Arbor", "Antioch", "Apple Valley", "Appleton", "Arlington", "Arvada", "Asheville", "Athens", "Atlanta", "Atlantic City", "Augusta", "Aurora", "Austin", "Bakersfield", "Baltimore", "Barnstable", "Baton Rouge", "Beaumont", "Bel Air", "Bellevue", "Berkeley", "Bethlehem", "Billings", "Birmingham", "Bloomington", "Boise", "Boise City", "Bonita Springs", "Boston", "Boulder", "Bradenton", "Bremerton", "Bridgeport", "Brighton", "Brownsville", "Bryan", "Buffalo", "Burbank", "Burlington", "Cambridge", "Canton", "Cape Coral", "Carrollton", "Cary", "Cathedral City", "Cedar Rapids", "Champaign", "Chandler", "Charleston", "Charlotte", "Chattanooga", "Chesapeake", "Chicago", "Chula Vista", "Cincinnati", "Clarke County", "Clarksville", "Clearwater", "Cleveland", "College Station", "Colorado Springs", "Columbia", "Columbus", "Concord", "Coral Springs", "Corona", "Corpus Christi", "Costa Mesa", "Dallas", "Daly City", "Danbury", "Davenport", "Davidson County", "Dayton", "Daytona Beach", "Deltona", "Denton", "Denver", "Des Moines", "Detroit", "Downey", "Duluth", "Durham", "El Monte", "El Paso", "Elizabeth", "Elk Grove", "Elkhart", "Erie", "Escondido", "Eugene", "Evansville", "Fairfield", "Fargo", "Fayetteville", "Fitchburg", "Flint", "Fontana", "Fort Collins", "Fort Lauderdale", "Fort Smith", "Fort Walton Beach", "Fort Wayne", "Fort Worth", "Frederick", "Fremont", "Fresno", "Fullerton", "Gainesville", "Garden Grove", "Garland", "Gastonia", "Gilbert", "Glendale", "Grand Prairie", "Grand Rapids", "Grayslake", "Green Bay", "GreenBay", "Greensboro", "Greenville", "Gulfport-Biloxi", "Hagerstown", "Hampton", "Harlingen", "Harrisburg", "Hartford", "Havre de Grace", "Hayward", "Hemet", "Henderson", "Hesperia", "Hialeah", "Hickory", "High Point", "Hollywood", "Honolulu", "Houma", "Houston", "Howell", "Huntington", "Huntington Beach", "Huntsville", "Independence", "Indianapolis", "Inglewood", "Irvine", "Irving", "Jackson", "Jacksonville", "Jefferson", "Jersey City", "Johnson City", "Joliet", "Kailua", "Kalamazoo", "Kaneohe", "Kansas City", "Kennewick", "Kenosha", "Killeen", "Kissimmee", "Knoxville", "Lacey", "Lafayette", "Lake Charles", "Lakeland", "Lakewood", "Lancaster", "Lansing", "Laredo", "Las Cruces", "Las Vegas", "Layton", "Leominster", "Lewisville", "Lexington", "Lincoln", "Little Rock", "Long Beach", "Lorain", "Los Angeles", "Louisville", "Lowell", "Lubbock", "Macon", "Madison", "Manchester", "Marina", "Marysville", "McAllen", "McHenry", "Medford", "Melbourne", "Memphis", "Merced", "Mesa", "Mesquite", "Miami", "Milwaukee", "Minneapolis", "Miramar", "Mission Viejo", "Mobile", "Modesto", "Monroe", "Monterey", "Montgomery", "Moreno Valley", "Murfreesboro", "Murrieta", "Muskegon", "Myrtle Beach", "Naperville", "Naples", "Nashua", "Nashville", "New Bedford", "New Haven", "New London", "New Orleans", "New York", "New York City", "Newark", "Newburgh", "Newport News", "Norfolk", "Normal", "Norman", "North Charleston", "North Las Vegas", "North Port", "Norwalk", "Norwich", "Oakland", "Ocala", "Oceanside", "Odessa", "Ogden", "Oklahoma City", "Olathe", "Olympia", "Omaha", "Ontario", "Orange", "Orem", "Orlando", "Overland Park", "Oxnard", "Palm Bay", "Palm Springs", "Palmdale", "Panama City", "Pasadena", "Paterson", "Pembroke Pines", "Pensacola", "Peoria", "Philadelphia", "Phoenix", "Pittsburgh", "Plano", "Pomona", "Pompano Beach", "Port Arthur", "Port Orange", "Port Saint Lucie", "Port St. Lucie", "Portland", "Portsmouth", "Poughkeepsie", "Providence", "Provo", "Pueblo", "Punta Gorda", "Racine", "Raleigh", "Rancho Cucamonga", "Reading", "Redding", "Reno", "Richland", "Richmond", "Richmond County", "Riverside", "Roanoke", "Rochester", "Rockford", "Roseville", "Round Lake Beach", "Sacramento", "Saginaw", "Saint Louis", "Saint Paul", "Saint Petersburg", "Salem", "Salinas", "Salt Lake City", "San Antonio", "San Bernardino", "San Buenaventura", "San Diego", "San Francisco", "San Jose", "Santa Ana", "Santa Barbara", "Santa Clara", "Santa Clarita", "Santa Cruz", "Santa Maria", "Santa Rosa", "Sarasota", "Savannah", "Scottsdale", "Scranton", "Seaside", "Seattle", "Sebastian", "Shreveport", "Simi Valley", "Sioux City", "Sioux Falls", "South Bend", "South Lyon", "Spartanburg", "Spokane", "Springdale", "Springfield", "St. Louis", "St. Paul", "St. Petersburg", "Stamford", "Sterling Heights", "Stockton", "Sunnyvale", "Syracuse", "Tacoma", "Tallahassee", "Tampa", "Temecula", "Tempe", "Thornton", "Thousand Oaks", "Toledo", "Topeka", "Torrance", "Trenton", "Tucson", "Tulsa", "Tuscaloosa", "Tyler", "Utica", "Vallejo", "Vancouver", "Vero Beach", "Victorville", "Virginia Beach", "Visalia", "Waco", "Warren", "Washington", "Waterbury", "Waterloo", "West Covina", "West Valley City", "Westminster", "Wichita", "Wilmington", "Winston", "Winter Haven", "Worcester", "Yakima", "Yonkers", "York", "Youngstown"]

city_names = [x.lower() for x in city_names]

In [784]:
for j in range(len(newAffkomma)):
    for y in newAffkomma[j]:
        for i in list(y.keys()):

            if y[i] in city_names+removeList:
                del y[i]


In [785]:
affilDF['uniqueAff4'] =  [[list(d.values()) for d in sublist] for sublist in newAffkomma]


# Labels based on legalnames of openAIRE's organizations

In [786]:
uniList = ['institu', 'istituto','univ', 'college', 'center', 'centre' , 'cnrs', 'faculty','school' , 'academy' , 'école', 'hochschule' , 'ecole' ]

labList = ['lab']

hosplList = ['hospital' ,'clinic', 'hôpital']

gmbhList = ['gmbh', 'company' , 'industr', 'etaireia' , 'corporation', 'inc']

musList =  ['museum', 'library']

foundList =  ['foundation' , 'association','organization' ,'society', 'group' ]

deptList = ['district' , 'federation'  , 'government' , 'municipal' , 'county', 'ministry','council', 'agency']

unknownList = ['unknown']

#######   Dictionaries ##########

uniDict = {k: 'Univ/Inst' for k in uniList}   

labDict = {k: 'Laboratory' for k in labList} 

hosplDict = {k: 'Hospital' for k in hosplList}   

gmbhDict = {k: 'Company' for k in gmbhList}   

musDict = {k: 'Museum' for k in musList}   

#schoolDict = {k: 'School' for k in schoolList}   

foundDict = {k: 'Foundation' for k in foundList}   

deptDict = {k: 'Government' for k in deptList}   

unknownDict =  {k: 'Unknown' for k in unknownList}   

categDictsList = [uniDict, labDict, hosplDict, gmbhDict, musDict, #schoolDict, 
                  foundDict, deptDict, unknownDict]

################# Final Dictionary #####################

categDicts = {}
i = 0
while i in range(len(categDictsList)):
    categDicts.update(categDictsList[i])
    i = i+1
    
    


## affiliationsDict

In [787]:
affiliationsDict = {}

for i in range(len(affilDF)):
    affiliationsDict[i] = affilDF['uniqueAff4'].iloc[i]
    


In [788]:
d_new = {}

# iterate over the keys of affiliationsDict
for k in range(len(affiliationsDict)):
    mappedk = []
    # get the list associated with the current key in affiliationsDict
    L = affiliationsDict.get(k, [])
    
    for x in L:
        mapped_listx = [[s, v] for s in x for k2, v in categDicts.items() if k2 in s]
        mappedk.append(mapped_listx)
    
    # add the mapped list to the new dictionary d_new
    d_new[k] = mappedk

In [789]:
affilDF['Dictionary'] = d_new

## What is not included 

In [790]:
notInList = [i for i in range(len(affilDF)) if affilDF['Dictionary'].iloc[i] == [[]]]
    

In [791]:
affilDF.iloc[notInList]

Unnamed: 0,items,DOI,authors,# authors,affiliations,# Affil,uniqueAff,# uniqueAff,uniqueAff1,uniqueAff2,lightAff,uniqueAff4,Dictionary
27,{'URL': 'http://dx.doi.org/10.30965/2589045x-1...,10.30965/2589045x-18802015,"[{'given': 'Heribert', 'family': 'Hallermann',...",1,[[{'name': 'Würzburg'}]],1,[Würzburg],1,[Wurzburg],[Wurzburg],[wurzburg],[[wurzburg]],[[]]
70,{'abstract': '<jats:p>Globalizacja zmienia swo...,10.5604/01.3001.0016.1962,"[{'given': 'Weronika', 'family': 'Jakubczak', ...",1,[[{'name': 'Szkoła Główna Służby Pożarniczej'}]],1,[Szkoła Główna Służby Pożarniczej],1,[Szkoła Głowna Słuzby Pozarniczej],[Szkoła Głowna Słuzby Pozarniczej],[szkoła głowna słuzby pozarniczej],[[szkoła głowna słuzby pozarniczej]],[[]]
82,{'abstract': '<jats:p>The introduction of poly...,10.6036/10715,[{'ORCID': 'http://orcid.org/0000-0001-5554-69...,5,"[[{'name': 'GAIKER (España)'}], [{'name': 'GAI...",5,[GAIKER (España)],1,[GAIKER ],[GAIKER ],[gaiker],[[gaiker]],[[]]
187,{'abstract': '<jats:title>Abstract</jats:title...,10.4049/jimmunol.109.5.1138,"[{'given': 'Leo', 'family': 'Levine', 'sequenc...",1,"[[{'name': 'Division of Biologic Laboratories,...",1,"[Division of Biologic Laboratories, State Labo...",1,"[Division of Biologic Laboratories, State Labo...","[Division of Biologic Laboratories, State Labo...","[massachusetts department of public health, bo...","[[massachusetts department of public health, m...",[[]]
246,{'abstract': '<jats:title>Summary</jats:title>...,10.4049/jimmunol.10.1.55,"[{'given': 'Charles', 'family': 'Krumwiede', '...",3,"[[{'name': 'Bureau of Laboratories, Department...",3,"[Bureau of Laboratories, Department of Health ...",1,"[Bureau of Laboratories, Department of Health ...","[Bureau of Laboratories, Department of Health ...","[department of health, new york city]",[[department of health]],[[]]
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,{'abstract': '<jats:title>Abstract</jats:title...,10.4049/jimmunol.117.5_part_1.1638,"[{'given': 'Bernard H.', 'family': 'Doft', 'se...",5,[[{'name': 'Division of Blood and Blood Produc...,5,[Division of Blood and Blood Products and the ...,1,[Division of Blood and Blood Products and Divi...,[Division of Blood and Blood Products and Divi...,[division of blood and blood products and divi...,[[division of blood and blood products and div...,[[]]
1543,{'abstract': '<jats:title>Abstract</jats:title...,10.4049/jimmunol.154.3.1207,"[{'given': 'M', 'family': 'Ticchioni', 'sequen...",8,[[{'name': 'National Institute of Health and M...,8,[National Institute of Health and Medical Rese...,1,[National Institute of Health and Medical Rese...,[National Institute of Health and Medical Rese...,"[faculty of medicine, nice, france]","[[nice, france]]",[[]]
1564,{'URL': 'http://dx.doi.org/10.33876/2782-3423/...,10.33876/2782-3423/2022-2/231-251,[{'ORCID': 'http://orcid.org/0000-0001-9988-95...,1,[[{'name': 'Институт этнологии и антропологии ...,1,[Институт этнологии и антропологии имени Н.Н. ...,1,[Институт этнологии и антропологии имени НН Ми...,[Институт этнологии и антропологии имени НН Ми...,[институт этнологии и антропологии имени нн ми...,[[институт этнологии и антропологии имени нн м...,[[]]
1569,"{'abstract': '<jats:p xml:lang=""en"">Introducti...",10.35206/jan.1045529,"[{'given': 'Karsten', 'family': 'MÜNSTEDT', 's...",1,[[{'name': 'Ortenau Klinikum Offenburg-Kehl'}]],1,[Ortenau Klinikum Offenburg-Kehl],1,[Ortenau Klinikum OffenburgKehl],[Ortenau Klinikum OffenburgKehl],[ortenau klinikum offenburgkehl],[[ortenau klinikum offenburgkehl]],[[]]


In [792]:
len(affilDF)  - len(notInList)

1495

In [793]:
len(notInList)

84

# affilDF1 ['DOI', 'affiliations', 'Dictionary','uniqueAff4', 'uniqueAff2','# authors','# uniqueAff']

In [794]:
affilDF1 = affilDF[['DOI', 'affiliations','lightAff','Dictionary','uniqueAff4', 'uniqueAff2','# authors','# uniqueAff']]

## New column: category based on the labels 

In [795]:
category = [', '.join(list(set([x[1] for y in affilDF1['Dictionary'].iloc[i] for x in y]))) for i in range(len(affilDF1))]
    


In [796]:
affilDF1 = affilDF1.copy()

In [797]:
affilDF1.loc[:, 'category'] = category


In [798]:
affilDF1.head(10)

Unnamed: 0,DOI,affiliations,lightAff,Dictionary,uniqueAff4,uniqueAff2,# authors,# uniqueAff,category
0,10.36887/2524-0455-2020-3-11,[[{'name': 'Odessa National Polytechnic Univer...,[odessa national polytechnic university],"[[[odessa national polytechnic university, Uni...",[[odessa national polytechnic university]],[Odessa National Polytechnic University],2,1,Univ/Inst
1,10.37405/1729-7206.2022.1(42).135-147,"[[], [{'name': 'Institute of Industrial Econom...",[dnipropetrovsk state university of internal a...,[[[dnipropetrovsk state university of internal...,[[dnipropetrovsk state university of internal ...,[Dnipropetrovsk State University of Internal A...,3,2,"Univ/Inst, Company"
2,10.36718/1819-4036-2022-11-40-46,[[{'name': 'Northern Trans-Ural State Agricult...,[northern transural state agricultural univers...,[[[northern transural state agricultural unive...,[[northern transural state agricultural univer...,[Northern TransUral State Agricultural Univers...,2,1,Univ/Inst
3,10.36887/2524-0455-2020-2-16,[[{'name': 'Mykolayiv National Agrarian Univer...,[mykolayiv national agrarian university],"[[[mykolayiv national agrarian university, Uni...",[[mykolayiv national agrarian university]],[Mykolayiv National Agrarian University],1,1,Univ/Inst
4,10.1093/mnras/stac3785,[[{'name': 'Theoretical and Scientific Data Sc...,[theoretical and scientific data science group...,[[[theoretical and scientific data science gro...,[[theoretical and scientific data science grou...,[Theoretical and Scientific Data Science group...,3,3,"Univ/Inst, Foundation, Company"
5,10.1093/mtomcs/mfac100,[[{'name': 'State Key Laboratory of Microbial ...,"[warsaw university of technology, 00664 warsaw...","[[[warsaw university of technology, Univ/Inst]...","[[warsaw university of technology, 00664 warsa...","[Chair of Analytical Chemistry, Warsaw Univers...",6,4,"Univ/Inst, Laboratory, Government, Company"
6,10.1093/jxb/erac520,[[{'name': 'Department of Ecology and Evolutio...,"[program in genetics and genomics, duke univer...","[[[duke university, Univ/Inst]], [[university ...","[[program in genetics and genomics, duke unive...","[Program in Genetics and Genomics, Department ...",2,2,Univ/Inst
7,10.1093/ijlct/ctac123,[[{'name': 'University Center Salhi Ahmed Naam...,[scientific and technical research center for ...,[[[scientific and technical research center fo...,[[scientific and technical research center for...,[Scientific and Technical Research Center for ...,7,8,"Univ/Inst, Laboratory, Company"
8,10.1080/27690911.2022.2157413,"[[{'name': 'School of Mathematical Sciences, H...","[heilongjiang university, harbin, heilongjiang...","[[[heilongjiang university, Univ/Inst]]]","[[heilongjiang university, harbin, heilongjian...","[School of Mathematical Sciences, Heilongjiang...",1,1,Univ/Inst
9,10.1097/ju.0000000000003138,"[[{'name': 'Duke University Medical Center, Di...","[duke university medical center, division of u...","[[[duke university medical center, Univ/Inst],...","[[duke university medical center, division of ...","[Duke University Medical Center, Division of U...",3,1,Univ/Inst


### new label: rest

In [799]:
affilDF1['category'].iloc[5]

'Univ/Inst, Laboratory, Government, Company'

In [800]:
for i in range(len(affilDF1)):
    if affilDF1['category'].iloc[i] == '':
        affilDF1.iloc[i, affilDF1.columns.get_loc('category')] = 'Rest'


In [801]:
affiliationsSimple = [
    list(set([inner_list[0] for outer_list in affilDF1['Dictionary'].iloc[i] for inner_list in outer_list]))
    for i in range(len(affilDF1))
]


In [802]:
affilDF1['affilSimple'] = affiliationsSimple

In [803]:
len(affilDF1[affilDF1['category'] == 'Rest'])

91

# UNIVS & LABS

In [804]:
univLabs = [i for i in range(len(affilDF1)) if 'Laboratory' in affilDF1['category'].iloc[i] 
            or 'Univ/Inst' in  affilDF1['category'].iloc[i]]

In [805]:

univLabsDF = affilDF1.iloc[univLabs].copy()

In [806]:
univLabsDF.reset_index(inplace = True)

In [807]:
univLabsDF.drop(columns = ['index'], inplace = True)

# Load files from openAIRE

In [808]:
#with open('dixOpenAIRE_Alletc.pkl', 'rb') as f:
#    dixOpenAIRE_Alletc = pickle.load(f)

#with open('dixOpenAIRE_id.pkl', 'rb') as f:
#    dixOpenAIRE_id = pickle.load(f)


In [809]:
with open('dixOpenOrgId.pkl', 'rb') as f:
    dixOpenOrgId = pickle.load(f)


In [810]:
def findID(name):
    lnames = []
    for x in list(dixOpenOrgId1.keys()):
        if name in x:
            lnames.append(x)
    return lnames

## Clean/modify the files

In [811]:
#dixOpenAIRE_Alletc1 =  {k.replace(',', ''): v for k, v in dixOpenAIRE_Alletc.items()}
#dixOpenAIRE_id1 = {k.replace(',', ''): v for k, v in dixOpenAIRE_id.items()}

In [812]:
dixOpenOrgId1 = {k.replace(',', ''): v for k, v in dixOpenOrgId.items()}


In [813]:
dixOpenOrgId1 = {k.replace("  ", ""): v for k, v in dixOpenOrgId1.items()}

In [814]:
for x in dixOpenOrgId1:
    if "  " in x:
        print(x)

In [815]:
dixOpenOrgId1 = {
    replace_umlauts(key): value
    for key, value in dixOpenOrgId1.items()
}


In [816]:
def filter_key(key):
    # Remove all non-alphanumeric characters except Greek letters and Chinese characters
    return re.sub(r'[^\w\s,Α-Ωα-ωぁ-んァ-ン一-龯，]', '', key)

def filter_dictionary_keys(dictionary):
    filtered_dict = {}
    for key, value in dictionary.items():
        filtered_key = filter_key(key)
        filtered_dict[filtered_key] = value
    return filtered_dict


#dixOpenAIRE_Alletc1 = filter_dictionary_keys(dixOpenAIRE_Alletc1)
#dixOpenAIRE_id1 = filter_dictionary_keys(dixOpenAIRE_id1)
dixOpenOrgId1 = filter_dictionary_keys(dixOpenOrgId1)



In [817]:
#del dixOpenAIRE_Alletc1['laboratory']
#del dixOpenAIRE_Alletc1['university hospital']

In [818]:
#del dixOpenAIRE_id1['laboratory']
#del dixOpenAIRE_id1['university hospital']

In [819]:
del dixOpenOrgId1['university hospital']

# MATCHINGS

## Helper functions

### Clean the matchings

In [820]:
def bestSimScore(l1, l2, l3):
    """
    Finds the best match between a 'key word' and several legal names from the OpenAIRE database.
    ---> corrects special cases in the main map that follows

    Args:
        l1: List of light affiliations.
        l2: List of matched OpenAIRE names.
        l3: List of pairs.
        l4: mult

    Returns:
        List: Resulting list containing OpenAIRE names and their similarity scores.
    """
    
    vectorizer = CountVectorizer()
    numUniv = sum([(l1[i].lower()).count('univ') for i in range(len(l1))])
    result = []
    for i in range(len(l1)):
        best = [] 
        s = l1[i]
       # s_contains_university = is_contained("university", s.lower())  
        
       # if not is_contained("univ", s.lower()):
        #    continue  # Skip if s does not contain "university" or "univ"
        
    
        for j in range(len(l2)):
            x = l2[j]  
              
            if l3[j][2] >=0.99 and (is_contained("univ", x.lower()) or is_contained("college", x.lower()) or  is_contained("center", x.lower()) or  is_contained("schule", x.lower())): # If the similarity score of a pair (s,x) was 1, we store it to results list
                result.append([l2[j], 1])
                
            else:
        #        x_contains_university = is_contained("university", x.lower())
                if not is_contained("univ", x.lower()):
                    continue  # Skip if x does not contain "university" or "univ"
                
            
                s_vector = vectorizer.fit_transform([s]).toarray() #Else we compute the similarity of s with the original affiiation name
                x_vector = vectorizer.transform([x]).toarray()
              #  s_vector1 =  vectorizer.transform([s]).toarray()
              #  x_vector1 =  vectorizer.fit_transform([s]).toarray()
                

                # Compute similarity between the vectors
                similarity = cosine_similarity(x_vector, s_vector)[0][0]
               # similarity1 = cosine_similarity(x_vector1, s_vector1)[0][0]
                #similarity2 = Levenshtein.ratio(s,x)


                best.append([l2[j], similarity])#(similarity+similarity2)/2])
        
        if best:
            max_score = max(best, key=lambda x: x[1])[1]
            max_results = [(x[0], x[1]) for x in best if x[1] == max_score]
            if len(max_results) > 1:
                max_results.sort(key=lambda x: (l2.index(x[0]), -x[1]), reverse=False)
                result.append(max_results[-1])
            else:
                result.append(max_results[0])
                
    univ_list = []
    other_list = []
    
    for r in result:
        if is_contained('univ',r[0]):
            univ_list.append(r)
        else:
            other_list.append(r)
            
    if len(univ_list)> numUniv:
        result = univ_list[:numUniv] + other_list
                
    return result

### Find rows with multiple mathcings

In [821]:
def index_multipleMatchings(df):
    multipleMatchings = []
    mult = []

    for i in range(len(df)):
        result_dict = {}
        

        for t in [t[0] for t in df.Pairs.iloc[i]]:
            key = t
            if key in result_dict:
                result_dict[key] += 1
                multipleMatchings.append(i)
                
            else:
                result_dict[key] = 1
        mult.append(result_dict)
    return [list(set(multipleMatchings)), mult]
                
        

## Main map

In [822]:
def Doi_Ids(m, DF, dixOpenAIRE, simU, simG):
    
    """
    Matches affiliations in DataFrame 'DF' with names from dictionary 'dixOpenAIRE' and their openAIRE ids based on similarity scores.

    Args:
        m (int): The number of DOIs to check.
        DF (DataFrame): The input DataFrame containing affiliation data.
        dixOpenAIRE (dict): A dictionary of names from OpenAIRE.
        simU (float): Similarity threshold for universities.
        simG (float): Similarity threshold for non-universities.

    Returns:
        DataFrame: The final DataFrame with matched affiliations and their corresponding similarity scores.
    """
    
    lnamelist = list(dixOpenAIRE.keys())
    dix = {}    # will store indeces and legalnames of organizations of the DOI { i : [legalname1, legalname2,...]}
    deiktes = []  # stores indeces where a match is found
    vectorizer = CountVectorizer()
    similarity_ab = [] # stores lists of similarity scores of the mathces 
    pairs = [] #  pairs[i] =  [ [s,x,t] ] where (s,x) is a match and t the corresponding similarity score
    
    for k in range(m):
        similar_k = []
        pairs_k = []


        for s in DF['affilSimple'].iloc[k]:

            if s in lnamelist:
                deiktes.append(k)
                similarity = 1
                similar_k.append(similarity)
                
                pairs_k.append((s,s,similarity))

                if k not in dix:
                    dix[k] = [s]
                else:
                    dix[k].append(s)
            else:

                for x in lnamelist:
                    
                    if  is_contained(s, x):
                        x_vector = vectorizer.fit_transform([x]).toarray()
                        s_vector = vectorizer.transform([s]).toarray()

                        # Compute similarity between the vectors
                        similarity = cosine_similarity(x_vector, s_vector)[0][0]
                        if similarity > min(simU, simG):
                            if (is_contained('univ', s) and is_contained('univ', x)) and similarity > simU:
                                similar_k.append(similarity)
                                deiktes.append(k)
                                pairs_k.append((s,x,similarity))

                                if k not in dix:
                                    dix[k] = [x]
                                else:
                                    dix[k].append(x)
                            elif (not is_contained('univ', s) and not is_contained('univ', x)) and similarity > simG:
                                similar_k.append(similarity)
                                deiktes.append(k)
                                pairs_k.append((s,x,similarity))

                                if k not in dix:
                                    dix[k] = [x]
                                else:
                                    dix[k].append(x)
                    elif is_contained(x, s):
                        if (is_contained('univ', s) and is_contained('univ', x)):

                            if 'and' in s:
                                list_s = s.split(' and ')
                                for t in list_s:
                                    if is_contained(x, t) and is_contained('univ', t):
                                        t_vector = vectorizer.fit_transform([t]).toarray()
                                        x_vector = vectorizer.transform([x]).toarray()

                            # Compute similarity between the vectors
                                        similarity = cosine_similarity(t_vector, x_vector)[0][0]
                                        if similarity > simU:
                                            similar_k.append(similarity)
                                            deiktes.append(k)
                                            pairs_k.append((s,x,similarity))

                                            if k not in dix:
                                                dix[k] = [x]
                                            else:
                                                dix[k].append(x)
                            
                            else: 
                                s_vector = vectorizer.fit_transform([s]).toarray()
                                x_vector = vectorizer.transform([x]).toarray()

                                # Compute similarity between the vectors
                                similarity = cosine_similarity(s_vector, x_vector)[0][0]
                                if similarity > 0.81: #max(0.82,sim):
                                    similar_k.append(similarity)
                                    deiktes.append(k)
                                    pairs_k.append((s,x,similarity))

                                    if k not in dix:
                                        dix[k] = [x]
                                    else:
                                        dix[k].append(x)
                        elif not is_contained('univ', s) and not is_contained('univ', x):
                            if 'and' in s:
                                list_s = s.split(' and ')
                                for t in list_s:
                                    if is_contained(x, t):
                                        t_vector = vectorizer.fit_transform([t]).toarray()
                                        x_vector = vectorizer.transform([x]).toarray()

                            # Compute similarity between the vectors
                                        similarity = cosine_similarity(t_vector, x_vector)[0][0]
                                        if similarity > simG:
                                            similar_k.append(similarity)
                                            deiktes.append(k)
                                            pairs_k.append((s,x,similarity))

                                            if k not in dix:
                                                dix[k] = [x]
                                            else:
                                                dix[k].append(x)
                            
                            else: 
                                s_vector = vectorizer.fit_transform([s]).toarray()
                                x_vector = vectorizer.transform([x]).toarray()

                                # Compute similarity between the vectors
                                similarity = cosine_similarity(s_vector, x_vector)[0][0]
                                if similarity > simG: #max(0.82,sim):
                                    similar_k.append(similarity)
                                    deiktes.append(k)
                                    pairs_k.append((s,x,similarity))

                                    if k not in dix:
                                        dix[k] = [x]
                                    else:
                                        dix[k].append(x)
                            
        similarity_ab.append(similar_k)   
        similarity_ab = [lst for lst in similarity_ab if lst != []]
        pairs.append(pairs_k)
        
    perc = 100*len(dix)/m
    
    dixDoiAff = {DF['DOI'].iloc[key]: value for key, value in dix.items()} # dictionary {DOI : legalnames} 
    
    #dixDoiPid1 = {key : [dixOpenAIRE[x] for x in value if x in  lnamelist] for key , value in dixDoiAff.items()}
    
   # dixDoiPid = {key : [dixOpenAIRE[x] for x in value] for key , value in dixDoiAff.items()} # dictionary {DOI : PIDs} 
    
    
    
## Define the new Dataframe
    
    doiIdDF = pd.DataFrame()
    doiIdDF['DOI'] = list(dixDoiAff.keys())
    doiIdDF['Affiliations'] = list(DF['affiliations'].iloc[list(set(deiktes))])

    doiIdDF['Unique affiliations'] = list(DF['uniqueAff2'].iloc[list(set(deiktes))])
    doiIdDF['light affiliations'] = list(DF['lightAff'].iloc[list(set(deiktes))])

    
    doiIdDF['# Authors'] = list(DF['# authors'].iloc[list(set(deiktes))])


    doiIdDF['# Unique affiliations'] = list(DF['# uniqueAff'].iloc[list(set(deiktes))])

    doiIdDF['Candidates for matching'] = list(DF['affilSimple'].iloc[list(set(deiktes))])

    doiIdDF['Matched openAIRE names'] = list(dix.values())
    doiIdDF['# Matched orgs'] = [len(list(dix.values())[i]) for i in range(len(list(dix.values())))]
    

    doiIdDF['Similarity score'] = similarity_ab
    perfectSim = [[1 if num >= 1 else 0 for num in inner_list] for inner_list in similarity_ab]

    doiIdDF['Perfect match'] = perfectSim
    perfectSimSum = [sum(x) for x in perfectSim]
    doiIdDF['Perfect sum'] = perfectSimSum
    Pairs = [lst for lst in pairs if lst]
    doiIdDF['Pairs'] = Pairs
    


## Correct the matchings
    
    needCheck = [i for i  in range(len(doiIdDF)) if doiIdDF['# Matched orgs'].iloc[i] - max(doiIdDF['# Authors'].iloc[i],doiIdDF['# Unique affiliations'].iloc[i]) >0 or    i in index_multipleMatchings(doiIdDF)[0]]

    ready = [i for i in range(len(doiIdDF)) if i not in needCheck]
   
    best = [ bestSimScore(doiIdDF['light affiliations'].iloc[i], doiIdDF['Matched openAIRE names'].iloc[i], doiIdDF['Pairs'].iloc[i]) for i in needCheck]
    best_o = []
    best_s = []
    
    for x in best:
        best_o.append([x[i][0]  for i in range(len(x))])
        best_s.append([round(x[i][1],2)  for i in range(len(x))])
    numMathced = [len(best_s[i]) for i in range(len(needCheck))]
    

    
    dfFinal0 = (doiIdDF.iloc[ready]).copy()
    dfFinal0['index'] = ready
    
    dfFinal1 = (doiIdDF.iloc[needCheck]).copy()
    dfFinal1['index'] = needCheck
    dfFinal1['Matched openAIRE names'] = best_o
    dfFinal1['Similarity score'] = best_s
    dfFinal1['# Matched orgs'] = numMathced
    
    finalDF =  pd.concat([dfFinal0, dfFinal1])
    finalDF.set_index('index', inplace=True)
    finalDF.sort_values('index', ascending=True, inplace = True)
    
    ids = [[dixOpenAIRE[x] for x in v] for v in finalDF['Matched openAIRE names']]
    numIds = [len(x) for x in ids]

    finalDF['IDs'] = ids
    finalDF['# IDs'] = numIds
    finalDF['mult'] = index_multipleMatchings(doiIdDF)[1]
# Assuming you want to remove rows where the 'column_name' meets a specific condition
    finalDF = finalDF[~(finalDF['# Matched orgs'] == 0)]
    
    finalDF = finalDF.reset_index(drop=True)


    
    
    return [perc, finalDF]#, doiIdDF, needCheck]
    


In [823]:
result = Doi_Ids(len(univLabsDF), univLabsDF, dixOpenOrgId1, 0.7,0.82)

In [824]:
result[0]

79.00629811056683

# HTML

In [826]:
finaldf = result[1]
#finalDF = finaldf[["DOI", "Unique affiliations", "light affiliations" ,'# Authors','# Unique affiliations', '# Matched orgs','Candidates for matching', 'Matched openAIRE names', 'mult', 'Similarity score' ]]


In [827]:
def update_Z(row):
    new_Z = []
    for i in range(len(row['IDs'])):
        entry = {'openaireId': row['IDs'][i], 'confidence': row['Similarity score'][i]}
        new_Z.append(entry)
    return new_Z

# Update the values in column 'Z' using 'apply'
finaldf['affiliations'] = finaldf.apply(update_Z, axis=1)


In [828]:
finaldf_output = finaldf[['DOI','affiliations']]
finaldf_output = finaldf_output.rename(columns={'DOI': 'doi'}).copy()



In [829]:
#app = dp.App(dp.Page(title="Matchings [explained]", blocks= [dp.Text('Matched '+str(result[0])+'%'), dp.Text('DOIs with aff. in Univ/Inst, Laboratories'), dp.Table(finalDF)]
#             ), 
#             dp.Page(title="Final matchings", blocks= [dp.Text('Matched '+str(result[0])+'%'), dp.Text('DOIs with aff. in Univ/Inst, Laboratories'), dp.Table(finaldf_output)]
#             )
#             )
    


   
#app.save(path="test.html", open=True)

# JSON

In [830]:
match0 = finaldf_output.to_json(orient='records')

# Save the JSON to a file
with open('match0.json', 'w') as f:
    f.write(match0)