# Import packages

In [1]:
import pandas as pd

In [2]:
import plotly
import datapane as dp
plotly.offline.init_notebook_mode(connected=True)

In [3]:
import re
import unicodedata

from collections import defaultdict

import pickle

import Levenshtein
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Upload json files

In [1022]:
#files= ['0.json', '1.json', '2.json', '3.json', '4.json','42.json', '15693.json']
files = ['sample.json']
# create an empty list to store the DataFrames
dfsList = [pd.read_json(file, orient='records') for file in files]


# combine all DataFrames into a single DataFrame
crossrefDF = pd.concat(dfsList, ignore_index=True)

# Data preparation 

In [1023]:
noAuthors = [i for i in range(len(crossrefDF)) if 'author' not in crossrefDF['items'][i]]

Authors = [i for i in range(len(crossrefDF)) if 'author'  in crossrefDF['items'][i]]

## Rows with authors

In [1024]:
crossrefAuth = crossrefDF.iloc[Authors].copy()

crossrefAuth.reset_index(inplace= True)
crossrefAuth.drop(columns = ['index'], inplace = True)


## Extract 'DOI', authors --- number of authors

In [1025]:
crossrefAuth.loc[:, 'DOI'] = crossrefAuth['items'].apply(lambda x: x['DOI'])
crossrefAuth.loc[:,'authors'] = crossrefAuth['items'].apply(lambda x: x['author'])

numAuthors = [len(crossrefAuth.iloc[i]['authors']) for i in range(len(crossrefAuth))]

crossrefAuth.loc[:,'# authors'] = numAuthors

## Extract 'affiliations' --- number of affiliations

In [1026]:
def getAff(k):
   return [crossrefAuth['authors'][k][j]['affiliation'] for j in range(len(crossrefAuth['authors'][k]))]
    
Affiliations = [getAff(k) for k in range(len(crossrefAuth))]

crossrefAuth.loc[:,'affiliations'] = Affiliations

numAffil = [len(Affiliations[i]) for i in range(len(crossrefAuth))]

crossrefAuth.loc[:,'# Affil'] = numAffil

## Clean 'empty' affiliations

In [1027]:
possibleEmptyAff = []

for k in range(len(crossrefAuth)):
    if len(crossrefAuth['affiliations'][k][0]) == 0:
        possibleEmptyAff.append(k)

In [1028]:
len(possibleEmptyAff)

610

In [1029]:
nonEmptyAff = []

for k in possibleEmptyAff:
    for j in range(len(crossrefAuth['affiliations'].iloc[k])):
        if len(crossrefAuth['affiliations'].iloc[k][j]) != 0:
            nonEmptyAff.append(k)
    
FinalEmptyyAff =  [x for x in possibleEmptyAff if x not in nonEmptyAff] 
FinalNonEmptyAff = [x for x in range(len(crossrefAuth)) if x not in FinalEmptyyAff]

# doiDF: crossrefAuth subdataframe with nonpempty affiliation lists

In [1030]:
doiDF = crossrefAuth.iloc[FinalNonEmptyAff].copy()
doiDF.reset_index(inplace = True)
doiDF.drop(columns = ['index'], inplace = True)

## (still some cleaning: cases with empty brackets [{}])

In [1031]:
for k in range(len(doiDF)):
    if len(doiDF['affiliations'][k][0]) != 0 and doiDF['affiliations'][k][0][0] == {}:
        print(k)

In [1032]:
emptyBrackets = [k for k in range(len(doiDF)) if len(doiDF['affiliations'][k][0]) != 0 and doiDF['affiliations'][k][0][0] == {}]

In [1033]:
doiDF.iloc[emptyBrackets]

Unnamed: 0,items,DOI,authors,# authors,affiliations,# Affil


In [1034]:
doiDF.drop(emptyBrackets, inplace = True)

In [1035]:
doiDF.reset_index(inplace = True)

In [1036]:
doiDF.drop(columns = ['index'], inplace = True)

In [1037]:
len(doiDF)

278

# Clean affiliations 

## is_contained(a,b) map : returns true when a is a substring of b 

In [1038]:
def is_contained(s, w):
    words = s.split()  # Split the string 's' into a list of words
    for word in words:
        if word not in w:  # If a word from 's' is not found in 'w'
            return False  # Return False immediately
    return True  # If all words from 's' are found in 'w', return True

## 1. "Unique" affiliations --- number of unique affiliations

In [1039]:
uniqueAff = []
error_indices =[] # New list to store error indices
for i in range(len(doiDF)):
    try:
        uniqueAff.append(list(set([x[0] for x in [list(d.values()) for d in [item for sublist in doiDF['affiliations'].iloc[i] for item in sublist if sublist !=[{}] and item !={}]]])))
    except TypeError:
        print("Error occurred for i =", i)
        error_indices.append(i)  # Save the index where the error occurred
    #except IndexError:
     #   print("IndexError occurred for i =", i)
      #  error_indices.append(i)  # Save the index where the IndexError occurred


# Print the error indices
print("Error indices:", error_indices)

Error indices: []


In [1040]:
doiDF.drop(error_indices, inplace = True)
doiDF.reset_index(inplace = True)
doiDF.drop(columns = ['index'], inplace = True)

In [1041]:
doiDF.loc[:,'uniqueAff'] = uniqueAff

numUniqueAff = [len(doiDF['uniqueAff'].iloc[i]) for i in range(len(doiDF))]

doiDF.loc[:,'# uniqueAff'] = numUniqueAff

## 2. Remove stop words ['from', 'the']

In [1042]:
doiDF.loc[:,'uniqueAff1'] = doiDF['uniqueAff'].apply(lambda x: [s.lower() for s in x])


In [1043]:
stopWords = ['from', 'the', 'of', 'at', 'de','for','et','für','des', 'in','as','a','and']

In [1044]:
def remove_stop_words(text):
    words = text.split()
    filtered_words = [word for word in words if word not in stopWords]
    return ' '.join(filtered_words)


# apply the function to the column  doiDF['uniqueAff'] to create column doiDF.loc[:,'uniqueAff1']

doiDF.loc[:,'uniqueAff1'] = doiDF['uniqueAff'].apply(lambda x: [remove_stop_words(s) for s in x])


## 3. Remove parenthesis 

In [1045]:
def remove_parentheses(text):
   return re.sub(r'\([^()]*\)', '', text)

# apply the function to each list element of column doiDF['uniqueAff1'] to remove substrings inside parentheses

doiDF.loc[:,'uniqueAff1'] = doiDF['uniqueAff1'].apply(lambda x: [remove_parentheses(s) for s in x])


## 4. Remove @#$%characters and umlauts

In [1046]:
def replace_umlauts(text):
    normalized_text = unicodedata.normalize('NFKD', text)
    replaced_text = ''.join(c for c in normalized_text if not unicodedata.combining(c))
    return replaced_text

affNoSymbols = []

for i in range(len(list(doiDF['uniqueAff1']))):
    L = list(doiDF['uniqueAff1'])[i]
    for j in range(len(L)):
        L[j] = re.sub(r'[^\w\s,Α-Ωα-ωぁ-んァ-ン一-龯，]', '', L[j])
        L[j] = L[j].replace("  ", " ")
        L[j] = replace_umlauts(L[j])
        
    affNoSymbols.append(L)



In [1047]:
affNoSymbols

[['Key Laboratory Special Functional Smart Polymer Materials Ministry Industry Information Technology, School Chemistry Chemical Engineering, Northwestern Polytechnical University, Xian, Shaanxi 710072, China',
  'College Bioresources Chemical Materials Engineering, National Demonstration Center Experimental Light Chemistry Engineering Education, Shaanxi University Science Technology, Xian 710021, China'],
 ['School Mechanical Engineering Automation, Harbin Institute Technology, Shenzhen518055, China',
  'Shenzhen Polytechnic, Shenzhen518055, China',
  'College Physics Optoelectronic Engineering, Shenzhen University, Shenzhen518055, China'],
 ['Communal Institution Higher Education Dnipro Academy Continuing Education Dnipropetrovsk Regional Council'],
 ['Department Immunohaematology Blood Bank, Leiden University Hospital, The Netherlands'],
 ['Associate Professor, Dept Civil Engineering, 1200 Larimer St, Univ Colorado Denver, Denver, CO 802173364',
  'Assistant Professor, Dept Civil En

In [1048]:
affNoSymbols = [[item for item in inner_list if item != "inc"] for inner_list in affNoSymbols]

doiDF['uniqueAff1'] = affNoSymbols

## 5. Check 'sub'-affiliations (affiliations that are contained in other affiliations of the same DOI)

In [1049]:
newAff0 = []

for k in range(len(doiDF)):
    
    L2 = []
    for s1 in doiDF['uniqueAff1'].iloc[k]:
        is_substring = False
        for s2 in doiDF['uniqueAff1'].iloc[k]:
            if s1 != s2 and s1 in s2:
                is_substring = True
                break
        if not is_substring:
            L2.append(s1)
    newAff0.append(L2)

In [1050]:
newAffList = [list(set(newAff0[k])) for k in range(len(newAff0))]
doiDF['Unique affiliations'] = newAffList

In [1051]:
allAffsList = []

for doi in newAffList:
    for aff in doi:
        if aff not in allAffsList:
            allAffsList.append(aff)
        

In [1052]:
allAffsList[308]


'University Washington Fred Hutchinson Cancer Center'

## 6. Split strings where ',' or ';' appears    | Apply .lower()

In [1053]:
def substringsDict(string):
    split_strings = [re.sub(r'^[\s.]+|[\s.]+$', '', s.strip()) for s in re.split(r'[,;]', string)]
    dict_string = {}
    index = 0

    for value in split_strings:
        if value:
            modified_value = re.sub(r'\buniversit\w*', 'universit', value, flags=re.IGNORECASE)
            dict_string[index] = modified_value.lower()
            index += 1

    return dict_string

In [1054]:
newAffkomma = []

for aff in allAffsList:
    newAffkomma.append(substringsDict(aff))



In [1055]:
newAffkomma

[{0: 'college bioresources chemical materials engineering',
  1: 'national demonstration center experimental light chemistry engineering education',
  2: 'shaanxi universit science technology',
  3: 'xian 710021',
  4: 'china'},
 {0: 'key laboratory special functional smart polymer materials ministry industry information technology',
  1: 'school chemistry chemical engineering',
  2: 'northwestern polytechnical universit',
  3: 'xian',
  4: 'shaanxi 710072',
  5: 'china'},
 {0: 'school mechanical engineering automation',
  1: 'harbin institute technology',
  2: 'shenzhen518055',
  3: 'china'},
 {0: 'shenzhen polytechnic', 1: 'shenzhen518055', 2: 'china'},
 {0: 'college physics optoelectronic engineering',
  1: 'shenzhen universit',
  2: 'shenzhen518055',
  3: 'china'},
 {0: 'communal institution higher education dnipro academy continuing education dnipropetrovsk regional council'},
 {0: 'department immunohaematology blood bank',
  1: 'leiden universit hospital',
  2: 'the netherlands'}

In [1056]:
newAffkomma[308]

{0: 'universit washington fred hutchinson cancer center'}

In [1057]:
for dict in newAffkomma:
    
    if len(dict)>1:
        for i in range(len(dict)-1):
            if is_contained('progr', dict[i]) and is_contained('dep', dict[i+1]):
                del dict[i]
            elif (is_contained('assistant', dict[i]) or is_contained('researcher', dict[i]) or is_contained('phd', dict[i]) or is_contained('student', dict[i]) or is_contained('section', dict[i]) or is_contained('prof', dict[i]) or is_contained('director', dict[i])) and (not is_contained('school', dict[i+1]) or is_contained('univ', dict[i+1]) or is_contained('inst', dict[i+1]) or is_contained('lab', dict[i+1]) or is_contained('fac', dict[i+1])):
                del dict[i]
            elif (is_contained('engineer', dict[i]) or is_contained('progr', dict[i]) or is_contained('unit', dict[i]) or is_contained('lab', dict[i]) or is_contained('dep', dict[i]) or is_contained('inst', dict[i]) or is_contained('hosp', dict[i]) or is_contained('school', dict[i]) or is_contained('fac', dict[i])) and is_contained('univ', dict[i+1]):
                del dict[i]
            elif is_contained('lab', dict[i]) and (is_contained('college', dict[i+1]) or is_contained('inst', dict[i+1]) or is_contained('dep', dict[i+1]) or is_contained('school', dict[i+1])):
                del dict[i]
            elif is_contained('dep', dict[i]) and (is_contained('tech', dict[i+1]) or is_contained('college', dict[i+1]) or is_contained('inst', dict[i+1]) or  is_contained('hosp', dict[i+1]) or  is_contained('school', dict[i+1]) or  is_contained('fac', dict[i+1])):
                del dict[i]
            elif is_contained('inst',dict[i]) and (is_contained('dep', dict[i+1]) or is_contained('acad', dict[i+1]) or is_contained('hosp', dict[i+1]) or is_contained('fac', dict[i+1]) or is_contained('cent', dict[i+1]) or is_contained('div', dict[i+1])):
                del dict[i]
            elif is_contained('hosp',dict[i]) and is_contained('school', dict[i+1]):
                del dict[i]
         #   elif is_contained('hos',dict[i]) and (is_contained('cen', dict[i+1]):
         #       del dict[i+1]

In [1058]:
lightAff = []
for dict in newAffkomma:
    lightAff.append(', '.join(list(dict.values())))
    

In [1059]:
lightAff[280:310]

['lerner marine laboratory from, miami, florida, urbana, illinois, bimini, bahamas',
 'national yang ming chiao tung universit, hsinchu, taiwan',
 'universit zagreb, croatia',
 'medical college miyazaki from, kiyotake, miyazaki 88916, japan',
 'division allergy pulmonary disease, childrens hospital, stanford, ca',
 'hse universit, moscow',
 'институт проблем передачи информации им аа харкевича россиискои академии наук, г москва',
 'universit malaga, испания',
 'математическии институт им ва стеклова россиискои академии наук, г москва',
 'steklov mathematical institute russian academy sciences, moscow',
 'institute information transmission problems russian academy sciences, moscow',
 'национальныи исследовательскии университет высшая школа экономики, г москва',
 'universit maryland college park, department computer science',
 'universit memphis, department computer science',
 'universit british columbia, vancouver, canada',
 'simon fraser universit, burnaby, canada',
 'starling minds, v

In [1173]:
lightAff[308]

'universit washington fred hutchinson cancer center'

In [1061]:
removeList = ['university','research institute','laboratory' , 'universit','gmbh', 'inc', 'university of', 'research center', 
'university college','national institute of', 'school of medicine', "university school", 'graduate school of', 'graduate school of engineering', 
'institute of tropical medicine', 'institute of virology', 'faculty of medicine','laboratory', 'university park', 'institute of science','Polytechnic University']

city_names = ["Aberdeen", "Abilene", "Akron", "Albany", "Albuquerque", "Alexandria", "Allentown", "Amarillo", "Anaheim", "Anchorage", "Ann Arbor", "Antioch", "Apple Valley", "Appleton", "Arlington", "Arvada", "Asheville", "Athens", "Atlanta", "Atlantic City", "Augusta", "Aurora", "Austin", "Bakersfield", "Baltimore", "Barnstable", "Baton Rouge", "Beaumont", "Bel Air", "Bellevue", "Berkeley", "Bethlehem", "Billings", "Birmingham", "Bloomington", "Boise", "Boise City", "Bonita Springs", "Boston", "Boulder", "Bradenton", "Bremerton", "Bridgeport", "Brighton", "Brownsville", "Bryan", "Buffalo", "Burbank", "Burlington", "Cambridge", "Canton", "Cape Coral", "Carrollton", "Cary", "Cathedral City", "Cedar Rapids", "Champaign", "Chandler", "Charleston", "Charlotte", "Chattanooga", "Chesapeake", "Chicago", "Chula Vista", "Cincinnati", "Clarke County", "Clarksville", "Clearwater", "Cleveland", "College Station", "Colorado Springs", "Columbia", "Columbus", "Concord", "Coral Springs", "Corona", "Corpus Christi", "Costa Mesa", "Dallas", "Daly City", "Danbury", "Davenport", "Davidson County", "Dayton", "Daytona Beach", "Deltona", "Denton", "Denver", "Des Moines", "Detroit", "Downey", "Duluth", "Durham", "El Monte", "El Paso", "Elizabeth", "Elk Grove", "Elkhart", "Erie", "Escondido", "Eugene", "Evansville", "Fairfield", "Fargo", "Fayetteville", "Fitchburg", "Flint", "Fontana", "Fort Collins", "Fort Lauderdale", "Fort Smith", "Fort Walton Beach", "Fort Wayne", "Fort Worth", "Frederick", "Fremont", "Fresno", "Fullerton", "Gainesville", "Garden Grove", "Garland", "Gastonia", "Gilbert", "Glendale", "Grand Prairie", "Grand Rapids", "Grayslake", "Green Bay", "GreenBay", "Greensboro", "Greenville", "Gulfport-Biloxi", "Hagerstown", "Hampton", "Harlingen", "Harrisburg", "Hartford", "Havre de Grace", "Hayward", "Hemet", "Henderson", "Hesperia", "Hialeah", "Hickory", "High Point", "Hollywood", "Honolulu", "Houma", "Houston", "Howell", "Huntington", "Huntington Beach", "Huntsville", "Independence", "Indianapolis", "Inglewood", "Irvine", "Irving", "Jackson", "Jacksonville", "Jefferson", "Jersey City", "Johnson City", "Joliet", "Kailua", "Kalamazoo", "Kaneohe", "Kansas City", "Kennewick", "Kenosha", "Killeen", "Kissimmee", "Knoxville", "Lacey", "Lafayette", "Lake Charles", "Lakeland", "Lakewood", "Lancaster", "Lansing", "Laredo", "Las Cruces", "Las Vegas", "Layton", "Leominster", "Lewisville", "Lexington", "Lincoln", "Little Rock", "Long Beach", "Lorain", "Los Angeles", "Louisville", "Lowell", "Lubbock", "Macon", "Madison", "Manchester", "Marina", "Marysville", "McAllen", "McHenry", "Medford", "Melbourne", "Memphis", "Merced", "Mesa", "Mesquite", "Miami", "Milwaukee", "Minneapolis", "Miramar", "Mission Viejo", "Mobile", "Modesto", "Monroe", "Monterey", "Montgomery", "Moreno Valley", "Murfreesboro", "Murrieta", "Muskegon", "Myrtle Beach", "Naperville", "Naples", "Nashua", "Nashville", "New Bedford", "New Haven", "New London", "New Orleans", "New York", "New York City", "Newark", "Newburgh", "Newport News", "Norfolk", "Normal", "Norman", "North Charleston", "North Las Vegas", "North Port", "Norwalk", "Norwich", "Oakland", "Ocala", "Oceanside", "Odessa", "Ogden", "Oklahoma City", "Olathe", "Olympia", "Omaha", "Ontario", "Orange", "Orem", "Orlando", "Overland Park", "Oxnard", "Palm Bay", "Palm Springs", "Palmdale", "Panama City", "Pasadena", "Paterson", "Pembroke Pines", "Pensacola", "Peoria", "Philadelphia", "Phoenix", "Pittsburgh", "Plano", "Pomona", "Pompano Beach", "Port Arthur", "Port Orange", "Port Saint Lucie", "Port St. Lucie", "Portland", "Portsmouth", "Poughkeepsie", "Providence", "Provo", "Pueblo", "Punta Gorda", "Racine", "Raleigh", "Rancho Cucamonga", "Reading", "Redding", "Reno", "Richland", "Richmond", "Richmond County", "Riverside", "Roanoke", "Rochester", "Rockford", "Roseville", "Round Lake Beach", "Sacramento", "Saginaw", "Saint Louis", "Saint Paul", "Saint Petersburg", "Salem", "Salinas", "Salt Lake City", "San Antonio", "San Bernardino", "San Buenaventura", "San Diego", "San Francisco", "San Jose", "Santa Ana", "Santa Barbara", "Santa Clara", "Santa Clarita", "Santa Cruz", "Santa Maria", "Santa Rosa", "Sarasota", "Savannah", "Scottsdale", "Scranton", "Seaside", "Seattle", "Sebastian", "Shreveport", "Simi Valley", "Sioux City", "Sioux Falls", "South Bend", "South Lyon", "Spartanburg", "Spokane", "Springdale", "Springfield", "St. Louis", "St. Paul", "St. Petersburg", "Stamford", "Sterling Heights", "Stockton", "Sunnyvale", "Syracuse", "Tacoma", "Tallahassee", "Tampa", "Temecula", "Tempe", "Thornton", "Thousand Oaks", "Toledo", "Topeka", "Torrance", "Trenton", "Tucson", "Tulsa", "Tuscaloosa", "Tyler", "Utica", "Vallejo", "Vancouver", "Vero Beach", "Victorville", "Virginia Beach", "Visalia", "Waco", "Warren", "Washington", "Waterbury", "Waterloo", "West Covina", "West Valley City", "Westminster", "Wichita", "Wilmington", "Winston", "Winter Haven", "Worcester", "Yakima", "Yonkers", "York", "Youngstown"]

city_names = [x.lower() for x in city_names]

In [1062]:
for dict in newAffkomma:
    for i in list(dict.keys()):

        if dict[i] in city_names+removeList:
            del dict[i]


In [1063]:
affDF = pd.DataFrame()
affDF['Original Affiliations'] = allAffsList
affDF['Light Affiliations'] = lightAff
affDF['Keywords'] =  [list(d.values()) for d in newAffkomma]


In [1064]:
affDF.iloc[308]

Original Affiliations    University Washington Fred Hutchinson Cancer C...
Light Affiliations       universit washington fred hutchinson cancer ce...
Keywords                 [universit washington fred hutchinson cancer c...
Name: 308, dtype: object

In [1065]:
affDF[affDF['Keywords'] == 'virginia tech']

Unnamed: 0,Original Affiliations,Light Affiliations,Keywords


# Labels based on legalnames of openAIRE's organizations

In [1066]:
uniList = ['institu', 'istitut', 'univ', 'coll', 'center','polytechnic', 'centre' , 'cnrs', 'faculty','school' , 'academy' , 'akadem','école', 'hochschule' , 'ecole', 'tech', 'observ']

labList = ['lab']

hosplList = ['hospital' ,'clinic', 'hôpital', 'klinik','oncol','medical']

gmbhList = ['gmbh', 'company' , 'industr', 'etaireia' , 'corporation', 'inc']

musList =  ['museum', 'library']

foundList =  ['foundation' , 'association','organization' ,'society', 'group' ]

deptList = ['district' , 'federation'  , 'government' , 'municipal' , 'county','council', 'agency']
# miistry -> out

unknownList = ['unknown']

#######   Dictionaries ##########

uniDict = {k: 'Univ/Inst' for k in uniList}   

labDict = {k: 'Laboratory' for k in labList} 

hosplDict = {k: 'Hospital' for k in hosplList}   

gmbhDict = {k: 'Company' for k in gmbhList}   

musDict = {k: 'Museum' for k in musList}   

#schoolDict = {k: 'School' for k in schoolList}   

foundDict = {k: 'Foundation' for k in foundList}   

deptDict = {k: 'Government' for k in deptList}   

unknownDict =  {k: 'Unknown' for k in unknownList}   

categDictsList = [uniDict, labDict, hosplDict, gmbhDict, musDict, #schoolDict, 
                  foundDict, deptDict, unknownDict]

################# Final Dictionary #####################

categDicts = {}
i = 0
while i in range(len(categDictsList)):
    categDicts.update(categDictsList[i])
    i = i+1
    
    


## affiliationsDict

In [1067]:
affiliationsDict = {}

for i in range(len(affDF)):
    affiliationsDict[i] = affDF['Keywords'].iloc[i]

In [1068]:
d_new = {}

# iterate over the keys of affiliationsDict
for k in range(len(affiliationsDict)):
    # get the list associated with the current key in affiliationsDict
    L = affiliationsDict.get(k, [])
    mapped_listx = [[s, v] for s in L for k2, v in categDicts.items() if k2 in s]
    

    # add the mapped list to the new dictionary d_new
    d_new[k] = mapped_listx

In [1069]:
affDF['Dictionary'] = list(d_new.values())

In [1070]:
notInList = [i for i in range(len(affDF)) if affDF['Dictionary'].iloc[i] == []]
    

In [1071]:
len(doiDF)  - len(notInList)

241

In [1072]:
len(notInList)

37

In [1073]:
affDF['Original Affiliations'].iloc[243]

'Department Biomedical Engineering Mechanics, Virginia Tech, Blacksburg, VA, USA'

In [1074]:
affDF[240:245]

Unnamed: 0,Original Affiliations,Light Affiliations,Keywords,Dictionary
240,"Division Dermatology, UCLA School Medicine 90024","division dermatology, ucla school medicine 90024","[division dermatology, ucla school medicine 90...","[[ucla school medicine 90024, Univ/Inst]]"
241,"Department Chemistry, University British Colum...","universit british columbia, vancouver, bc, can...","[universit british columbia, bc, canada, v6t 1z1]","[[universit british columbia, Univ/Inst]]"
242,"Serra Hx00FAnter Fellow Programme, Universitat...","universit pompeu fabra, barcelona, spain","[universit pompeu fabra, barcelona, spain]","[[universit pompeu fabra, Univ/Inst]]"
243,"Department Biomedical Engineering Mechanics, V...","virginia tech, blacksburg, va, usa","[virginia tech, blacksburg, va, usa]","[[virginia tech, Univ/Inst]]"
244,"BCN MedTech, Department Information Communicat...","bcn medtech, universit pompeu fabra, barcelona...","[bcn medtech, universit pompeu fabra, barcelon...","[[bcn medtech, Univ/Inst], [universit pompeu f..."


# doiDF1 ['DOI', 'affiliations', 'Dictionary', 'uniqueAff2','# authors','# uniqueAff']

## New column: category based on the labels 

In [1075]:
category = [', '.join(list(set([x[1] for x in affDF['Dictionary'].iloc[i]]))) for i in range(len(affDF))]
    


In [1076]:
affDF.loc[:, 'Category'] = category


### new label: rest

In [1077]:
for i in range(len(affDF)):
    if affDF['Category'].iloc[i] == '':
        affDF.iloc[i, affDF.columns.get_loc('Category')] = 'Rest'


In [1078]:
affiliationsSimple = [
    list(set([x[0] for x in affDF['Dictionary'].iloc[i]]))
    for i in range(len(affDF))
]


In [1079]:
affDF['Keywords'] = affiliationsSimple

In [1080]:
affiliationsSimple[308]

['universit washington fred hutchinson cancer center']

# radius

In [1081]:
def strRadiusU(string):
    string = string.lower()
    radius = 3
    
    strList = string.split()
    indices = []
    result = []

    for i, x in enumerate(strList):
        if is_contained('univers',x):
            indices.append(i)
            
    for r0 in indices:
        lmin =max(0,r0-radius)
        lmax =min(r0+radius, len(strList))
        s = strList[lmin:lmax]
        
        result.append(' '.join(s))
    
    return result 

In [1082]:
strRadiusU('universit washington and fred hutchinson cancer center')

['universit washington and']

In [1083]:
def strRadiusH(string):
    string = string.lower()
    radius = 3
    
    strList = string.split()
    indices = []
    result = []

    for i, x in enumerate(strList):
        if is_contained('hospital',x):
            indices.append(i)
            
    for r0 in indices:
        lmin =max(0,r0-radius-1)
        lmax =min(r0+radius, len(strList))
        s = strList[lmin:lmax]
        
        result.append(' '.join(s))
    
    return result 

In [1084]:
strRadiusH('kinki central hospital mutual aid')

['kinki central hospital mutual aid']

In [1085]:
def strRadiusC(string):
    string = string.lower()
    radius = 2
    
    strList = string.split()
    indices = []
    result = []

    for i, x in enumerate(strList):
        if is_contained('clinic',x) or is_contained('klinik',x):
            indices.append(i)
            
    for r0 in indices:
        lmin =max(0,r0-radius-1)
        lmax =min(r0+radius, len(strList))
        s = strList[lmin:lmax]
        
        result.append(' '.join(s))
    
    return result 

In [1086]:
strRadiusC('scripps clinic medical group')

['scripps clinic medical']

In [1087]:
affiliationsSimple[134]

['tokushima universit']

In [1088]:
te = 'E Daguer, Interventional Radiology, Hospital Universitario Fundacion Jimenez Diaz, Madrid, Spain'

In [1089]:
'universit' in x.lower()

True

In [1090]:
affiliationsSimpleN = []

for i in range(len(affiliationsSimple)):
    inner = []
    for str in affiliationsSimple[i]:
        if 'universit' in str:
            for x in strRadiusU(str):
                inner.append(x)
        elif 'hospital' in str or 'hôpital' in str:
            for x in strRadiusH(str):
                inner.append(x)
        elif 'clinic' in str or 'klinik' in str:
            for x in strRadiusH(str):
                inner.append(x)
                
        else:
            inner.append(str)
    
    affiliationsSimpleN.append(inner)      
            

In [1091]:
print([x])

['northwestern universit']


In [1092]:
inner = []
for str in [te.lower()]:
    if 'universit' in str:
        for x in strRadiusU(str):
            inner.append(x)
    elif  'hospital' in str or 'hôpital' in str:
        print('im here')
        for x in strRadiusH(str):
            inner.append(x)
    elif 'clinic' in str or 'klinik' in str:
        for x in strRadiusH(str):
            inner.append(x)
            
    else:
        inner.append(str)
print(inner)
print([te])

['interventional radiology, hospital universitario fundacion jimenez']
['E Daguer, Interventional Radiology, Hospital Universitario Fundacion Jimenez Diaz, Madrid, Spain']


In [1093]:
if 'universit' not in te.split() and ( 'hospital' in te.split() or 'hôpital' in te.split()):
    print('dfg')

In [1094]:
te.lower()

'e daguer, interventional radiology, hospital universitario fundacion jimenez diaz, madrid, spain'

In [1095]:
te.lower().split()

['e',
 'daguer,',
 'interventional',
 'radiology,',
 'hospital',
 'universitario',
 'fundacion',
 'jimenez',
 'diaz,',
 'madrid,',
 'spain']

In [1096]:
'universitario' in (te.lower()).split()

True

In [1097]:
strRadiusU('Hospital Universitario Fundacion Jimenez Diaz')

['hospital universitario fundacion jimenez']

In [1098]:
affDF['Keywords'] = affiliationsSimpleN

# UNIVS & LABS

In [1099]:
affDF['Category'].unique()

array(['Univ/Inst', 'Government, Univ/Inst', 'Univ/Inst, Hospital',
       'Laboratory, Univ/Inst', 'Rest', 'Company, Hospital',
       'Univ/Inst, Laboratory', 'Laboratory, Univ/Inst, Hospital',
       'Company, Univ/Inst', 'Laboratory', 'Foundation', 'Hospital',
       'Univ/Inst, Foundation, Hospital', 'Foundation, Hospital',
       'Government, Univ/Inst, Hospital', 'Government',
       'Company, Laboratory', 'Company',
       'Univ/Inst, Laboratory, Hospital', 'Univ/Inst, Foundation',
       'Laboratory, Hospital'], dtype=object)

In [1100]:
univLabs = [i for i in range(len(affDF)) if 'Laboratory' in affDF['Category'].iloc[i] 
            or 'Univ/Inst' in  affDF['Category'].iloc[i]]

In [1101]:
affDF['Category'].iloc[12]

'Univ/Inst, Hospital'

In [1102]:
12 in univLabs

True

In [1103]:
len(univLabs)

407

In [1104]:
univLabsEtc = [i for i in range(len(affDF)) if 'Laboratory' in affDF['Category'].iloc[i] 
            or 'Univ/Inst' in  affDF['Category'].iloc[i]]

In [1105]:
 'Univ/Inst' in affDF['Category'].iloc[12]

True

In [1106]:

univLabsDF = affDF.iloc[univLabs].copy()
univLabsDF.reset_index(inplace = True)
univLabsDF.drop(columns = ['index'], inplace = True)

In [1107]:
affDF['Category'].iloc[6]

'Univ/Inst, Hospital'

# Etc

In [1108]:
Etc = [i for i in range(len(affDF)) if i not in univLabs and ('Hospital'  in  affDF['Category'].iloc[i]
            or 'Foundation' in  affDF['Category'].iloc[i]
            or 'Company' in  affDF['Category'].iloc[i]
            or 'Museum' in  affDF['Category'].iloc[i]
            or 'Government' in  affDF['Category'].iloc[i])]


EtcDF = affDF.iloc[Etc].copy()
EtcDF.reset_index(inplace = True)
EtcDF.drop(columns = ['index'], inplace = True)

print(len(Etc), len(Etc)/len(affDF))

23 0.04925053533190578


# Hospitals

In [1109]:
hospitals = [i for i in range(len(affDF)) if i not in univLabs and 'Hospital' in affDF['Category'].iloc[i]]

In [1110]:

hospitalDF = affDF.iloc[hospitals].copy()
hospitalDF.reset_index(inplace = True)
hospitalDF.drop(columns = ['index'], inplace = True)

In [1111]:
hospitalDF

Unnamed: 0,Original Affiliations,Light Affiliations,Keywords,Dictionary,Category
0,Anhui No 2 Provincial Peoples Hospital,anhui no 2 provincial peoples hospital,[no 2 provincial peoples hospital],"[[anhui no 2 provincial peoples hospital, Hosp...","Company, Hospital"
1,Beijing Jishuitan Hospital,beijing jishuitan hospital,[beijing jishuitan hospital],"[[beijing jishuitan hospital, Hospital]]",Hospital
2,Jilin Provincial Hospital Tuberculosis,jilin provincial hospital tuberculosis,[jilin provincial hospital tuberculosis],"[[jilin provincial hospital tuberculosis, Hosp...","Company, Hospital"
3,Beijing Heping Li Hospital,beijing heping li hospital,[beijing heping li hospital],"[[beijing heping li hospital, Hospital]]",Hospital
4,Changchun Infectious Disease Hospital,changchun infectious disease hospital,[changchun infectious disease hospital],"[[changchun infectious disease hospital, Hospi...",Hospital
5,"Argonne Cancer Research Hospital , , 1 Chicago...","argonne cancer research hospital, 1 chicago, i...",[argonne cancer research hospital],"[[argonne cancer research hospital, Hospital]]",Hospital
6,"1Immunology, Mayo Clinic, Rochester, MN","1immunology, mayo clinic, rochester, mn",[],"[[mayo clinic, Hospital]]",Hospital
7,"4Health Science Research, Mayo Clinic, Rochest...","4health science research, mayo clinic, rochest...",[],"[[mayo clinic, Hospital]]",Hospital
8,"Department Neurosurgery, Hanwa Memorial Hospital",hanwa memorial hospital,[hanwa memorial hospital],"[[hanwa memorial hospital, Hospital]]",Hospital
9,"Department Molecular Immunology, Scripps Clini...","department molecular immunology, scripps clini...",[],"[[scripps clinic research foundation from, Hos...","Foundation, Hospital"


# REST

In [1112]:
rest = [i for i in range(len(affDF)) if affDF['Category'].iloc[i] == 'Rest']

In [1113]:
restDF = affDF.iloc[rest].copy()
restDF.reset_index(inplace = True)
restDF.drop(columns = ['index'], inplace = True)

In [1114]:
len(restDF)/len(affDF)

0.07922912205567452

In [1115]:
affDF['Category'].unique()

array(['Univ/Inst', 'Government, Univ/Inst', 'Univ/Inst, Hospital',
       'Laboratory, Univ/Inst', 'Rest', 'Company, Hospital',
       'Univ/Inst, Laboratory', 'Laboratory, Univ/Inst, Hospital',
       'Company, Univ/Inst', 'Laboratory', 'Foundation', 'Hospital',
       'Univ/Inst, Foundation, Hospital', 'Foundation, Hospital',
       'Government, Univ/Inst, Hospital', 'Government',
       'Company, Laboratory', 'Company',
       'Univ/Inst, Laboratory, Hospital', 'Univ/Inst, Foundation',
       'Laboratory, Hospital'], dtype=object)

In [1116]:
len(univLabsDF)/len(affDF)

0.8715203426124197

In [1117]:
subs = [substringsDict(w) for w in list(restDF['Light Affiliations'])]

# Load files from openAIRE

In [1118]:
#with open('dixOpenAIRE_Alletc.pkl', 'rb') as f:
#    dixOpenAIRE_Alletc = pickle.load(f)

#with open('dixOpenAIRE_id.pkl', 'rb') as f:
#    dixOpenAIRE_id = pickle.load(f)


In [1119]:

with open('dixOpenOrgId.pkl', 'rb') as f:
    dixOpenOrgId = pickle.load(f)

In [1120]:

with open('dixUnivEtcOpen.pkl', 'rb') as f:
    dixUnivEtcOpen  = pickle.load(f)

In [1121]:
#with open('dixOpenOrgId.pkl', 'rb') as f:
#    dixOpenOrgId = pickle.load(f)


## Clean/modify the files

In [1122]:
def filter_key(key):
    # Remove all non-alphanumeric characters except Greek letters and Chinese characters
    modified_key = re.sub(r'[^\w\s,Α-Ωα-ωぁ-んァ-ン一-龯，]', '', key)
    modified_key = re.sub(r'\buniversit\w*', 'universit', modified_key, flags=re.IGNORECASE)
    modified_key = modified_key.replace(' and ', ' ')
    return modified_key

    
def filter_dictionary_keys(dictionary):
    filtered_dict = {}
    for key, value in dictionary.items():
        filtered_key = filter_key(key)
        filtered_dict[filtered_key] = value
    return filtered_dict

In [1123]:
def cleanDict(dix):
    dix1 =  {k.replace(',', ''): v for k, v in dix.items()}
    
    dix1 = {replace_umlauts(key): value
    for key, value in dix1.items()}
    
    dix1 = filter_dictionary_keys(dix1)
    
    dix2 = {}
    
    for key, value in dix1.items():
        updated_key = ' '.join([word for word in key.split() if word.lower() not in stopWords])
        dix2[updated_key] = value
        
    for x in list(dix2.keys()):
        if len(x) <3:
            del dix2[x]
            
    if 'universit hospital' in list(dix2.keys()):
        del dix2['universit hospital']
        
    if 'universit school' in list(dix2.keys()):
        del dix2['universit school']
        
    if 'ni universit' in list(dix2.keys()):
        del dix2['ni universit']

        
    if 's v universit' in list(dix2.keys()):
        del dix2['s v universit']

    if 'k l universit' in list(dix2.keys()):
        del dix2['k l universit']
        
    return dix2
    

In [1124]:
dixOpenOrgId2 = cleanDict(dixOpenOrgId)

In [1125]:
dixUnivEtcOpen2 = cleanDict(dixUnivEtcOpen)

In [1126]:
def findID(name):
    lnames = []
    for x in list(dixOpenOrgId2.keys()):
        if name.lower() in x:
            lnames.append(x)
    return lnames

# MATCHINGS

## Helper functions

### Clean the matchings

In [1127]:
def bestSimScore(l1, l2, l3, l4, simU, simG):
    """
    Finds the best match between a 'key word' and several legal names from the OpenAIRE database.
    ---> corrects special cases in the main map that follows

    Args:
        l1: List of light affiliations.
        l2: number of candidates.
        l3: List of pairs.
        l4: mult

    Returns:
        List: Resulting list containing OpenAIRE names and their similarity scores.
    """
    
    vectorizer = CountVectorizer()
    numUniv = sum([(l1[i].lower()).count('univ') for i in range(len(l1))])
    result = []
    for i in range(len(l1)):
        best = [] 
        s = l1[i]
       # s_contains_university = is_contained("university", s.lower())  
        
       # if not is_contained("univ", s.lower()):
       #     continue  # Skip if s does not contain "university" or "univ"
        
    
        for j in range(len(l3)):
            x = l3[j][1] 
           
            if [x, l3[j][2]] in result:
                    continue
            
            if l4[l3[j][0]] == 1:
               
                if  is_contained('univ', x.lower()) and  l3[j][2]> simU:
                    result.append([x, l3[j][2]])
                elif  l3[j][2] >simG:
                    result.append([x, l3[j][2]])

                
              
            elif l3[j][2] >=0.99 and (is_contained("univ", x.lower()) or is_contained("college", x.lower()) or  is_contained("center", x.lower()) or  is_contained("schule", x.lower())): # If the similarity score of a pair (s,x) was 1, we store it to results list
                result.append([l3[j][1], 1])
                
            else:
                try:
            #        x_contains_university = is_contained("university", x.lower())
                    if not is_contained("univ", x.lower()):
                        continue  # Skip if x does not contain "university" or "univ"
                    
                    if (is_contained('hosp', x.lower()) and not is_contained('hosp', s)) or (not is_contained('hosp', x.lower()) and is_contained('hosp', s)):
                        continue
                    s_vector = vectorizer.fit_transform([s]).toarray() #Else we compute the similarity of s with the original affiiation name
                    x_vector = vectorizer.transform([x]).toarray()
                #  s_vector1 =  vectorizer.transform([s]).toarray()
                #  x_vector1 =  vectorizer.fit_transform([s]).toarray()
                    

                    # Compute similarity between the vectors
                    similarity = cosine_similarity(x_vector, s_vector)[0][0]
                    if similarity> 0.1:
                # similarity1 = cosine_similarity(x_vector1, s_vector1)[0][0]
                    #similarity2 = Levenshtein.ratio(s,x)


                        best.append([x, similarity])#(similarity+similarity2)/2])
                except:
                    KeyError
                    
        if best:
            max_numbers = defaultdict(float)
            for item in best:
                string, number = item
                max_numbers[string] = max(max_numbers[string], number)

# Create a new list with the elements having the maximum number for each string
            reduced_best = [[string, number] for string, number in best if number == max_numbers[string]]

#            max_score = max(best, key=lambda x: x[1])[1]
#            max_results = [(x[0], x[1]) for x in best if x[1] == max_score]
           # if len(reduced_best) > 1:
            reduced_best.sort(key=lambda x: x[1], reverse=True)
            #reduced_best.sort(key=lambda x: (l2.index(x[0]), -x[1]), reverse=False)
           #     result.append(reduced_best[-1])
            #else:
            result = result + reduced_best
                
    univ_list = []
    other_list = []
    
    for r in result:
        if is_contained('univ',r[0]):
            univ_list.append(r)
        else:
            other_list.append(r)
    
    limit =  min(numUniv, l2)

    if len(univ_list)> limit:
        result = univ_list[:limit] + other_list
                
    return result

### Find rows with multiple mathcings

In [1128]:
def index_multipleMatchings(df):
    multipleMatchings = []
    mult = []

    for i in range(len(df)):
        result_dict = {}
        

        for t in [t[0] for t in df.Pairs.iloc[i]]:
            key = t
            if key in result_dict:
                result_dict[key] += 1
                multipleMatchings.append(i)
                
            else:
                result_dict[key] = 1
        mult.append(result_dict)
    return [list(set(multipleMatchings)), mult]

## Main map

In [1166]:
def Aff_Ids(m, DF, dixOpenAIRE, simU, simG):
    
    """
    Matches affiliations in DataFrame 'DF' with names from dictionary 'dixOpenAIRE' and their openAIRE ids based on similarity scores.

    Args:
        m (int): The number of DOIs to check.
        DF (DataFrame): The input DataFrame containing affiliation data.
        dixOpenAIRE (dict): A dictionary of names from OpenAIRE.
        simU (float): Similarity threshold for universities.
        simG (float): Similarity threshold for non-universities.

    Returns:
        DataFrame: The final DataFrame with matched affiliations and their corresponding similarity scores.
    """
    vectorizer = CountVectorizer()

    lnamelist = list(dixOpenAIRE.keys())
    dix = {}    # will store indeces and legalnames of organizations of the DOI { i : [legalname1, legalname2,...]}
    deiktes = []  # stores indeces where a match is found
    similarity_ab = [] # stores lists of similarity scores of the mathces 
    pairs = [] #  pairs[i] =  [ [s,x,t] ] where (s,x) is a match and t the corresponding similarity score
    
    for k in range(m):
        similar_k = []
        pairs_k = []


        for s in DF['Keywords'].iloc[k]:

            if s in lnamelist:
                deiktes.append(k)
                similarity = 1
                similar_k.append(similarity)
                
                pairs_k.append((s,s,similarity))

                if k not in dix:
                    dix[k] = [s]
                else:
                    dix[k].append(s)
            else:

                for x in lnamelist:
                    
                    if  is_contained(s, x):

                        x_vector = vectorizer.fit_transform([x]).toarray()
                        s_vector = vectorizer.transform([s]).toarray()

                        # Compute similarity between the vectors
                        similarity = cosine_similarity(x_vector, s_vector)[0][0]
                        if similarity > min(simU, simG):
                            if (is_contained('univ', s) and is_contained('univ', x)) and similarity > simU:
                                similar_k.append(similarity)
                                deiktes.append(k)
                                pairs_k.append((s,x,similarity))

                                if k not in dix:
                                    dix[k] = [x]
                                else:
                                    dix[k].append(x)
                            elif (not is_contained('univ', s) and not is_contained('univ', x)) and similarity > simG:
                                similar_k.append(similarity)
                                deiktes.append(k)
                                pairs_k.append((s,x,similarity))

                                if k not in dix:
                                    dix[k] = [x]
                                else:
                                    dix[k].append(x)
                                    
                    elif is_contained(x, s):
                        if (is_contained('univ', s) and is_contained('univ', x)):

                            if ' and ' in s:
                                list_s = s.split(' and ')
                                
                                if list_s:
                                    for q in list_s:
                                        if is_contained('univ', q):

                                            q_vector = vectorizer.fit_transform([q]).toarray()
                                            x_vector = vectorizer.transform([x]).toarray()

                                # Compute similarity between the vectors
                                            similarity = cosine_similarity(q_vector, x_vector)[0][0]
                                            if similarity > simU:
                                                similar_k.append(similarity)
                                                deiktes.append(k)
                                                pairs_k.append((s,x,similarity))

                                                if k not in dix:
                                                    dix[k] = [x]
                                                else:
                                                    dix[k].append(x)
                            
                            else: 

                                s_vector = vectorizer.fit_transform([s]).toarray()
                                x_vector = vectorizer.transform([x]).toarray()

                                # Compute similarity between the vectors
                                similarity = cosine_similarity(s_vector, x_vector)[0][0]
                                if similarity > simU: #max(0.82,sim):
                                    similar_k.append(similarity)
                                    deiktes.append(k)
                                    pairs_k.append((s,x,similarity))

                                    if k not in dix:
                                        dix[k] = [x]
                                    else:
                                        dix[k].append(x)
                        elif not is_contained('univ', s) and not is_contained('univ', x):
                           # if 'and' in s:
                           #     list_s = s.split(' and ')
                           #     if list_s:
                           #         for t in list_s:
                           #             if is_contained(x, t):

                           #                 t_vector = vectorizer.fit_transform([t]).toarray()
                           #                 x_vector = vectorizer.transform([x]).toarray()

                                # Compute similarity between the vectors
                           #                 similarity = cosine_similarity(t_vector, x_vector)[0][0]
                           #                 if similarity > simG:
                           #                     similar_k.append(similarity)
                           #                     deiktes.append(k)
                           #                     pairs_k.append((s,x,similarity))

                           #                     if k not in dix:
                        #                        dix[k] = [x]
                            #                    else:
                            #                        dix[k].append(x)
                            #            
                            #            if is_contained(t, x):

                            #                x_vector = vectorizer.fit_transform([x]).toarray()
                            #                t_vector = vectorizer.transform([t]).toarray()

                                # Compute similarity between the vectors
                            #                similarity = cosine_similarity(t_vector, x_vector)[0][0]
                            #                if similarity > simG:
                            #                    similar_k.append(similarity)
                            #                    deiktes.append(k)
                            #                    pairs_k.append((s,x,similarity))

                             #                   if k not in dix:
                             #                       dix[k] = [x]
                             #                   else:
                             #                       dix[k].append(x)
                           # else: 

                            s_vector = vectorizer.fit_transform([s]).toarray()
                            x_vector = vectorizer.transform([x]).toarray()

                            # Compute similarity between the vectors
                            similarity = cosine_similarity(s_vector, x_vector)[0][0]
                            if similarity > simG: #max(0.82,sim):
                                similar_k.append(similarity)
                                deiktes.append(k)
                                pairs_k.append((s,x,similarity))

                                if k not in dix:
                                    dix[k] = [x]
                                else:
                                    dix[k].append(x)
                            
        similarity_ab.append(similar_k)   
        similarity_ab = [lst for lst in similarity_ab if lst != []]
        pairs.append(pairs_k)
        
 
    
    
## Define the new Dataframe
    
    affIdDF = pd.DataFrame()
    affIdDF['Original affiliations'] = list(DF['Original Affiliations'].iloc[list(set(deiktes))])

    affIdDF['Light affiliations'] = list(DF['Light Affiliations'].iloc[list(set(deiktes))])

    affIdDF['Candidates for matching'] = list(DF['Keywords'].iloc[list(set(deiktes))])


    affIdDF['Matched openAIRE names'] = list(dix.values())
    affIdDF['# Matched orgs'] = [len(list(dix.values())[i]) for i in range(len(list(dix.values())))]
    

    affIdDF['Similarity score'] = similarity_ab

    Pairs = [lst for lst in pairs if lst]
    affIdDF['Pairs'] = Pairs
    affIdDF['mult'] = index_multipleMatchings(affIdDF)[1]




## Correct the matchings
    needCheck = list(set([i for i in range(len(affIdDF)) for k in list(affIdDF['mult'].iloc[i].values()) if k>1]))
    

    ready = [i for i in range(len(affIdDF)) if i not in needCheck]
    
   
    best = [ bestSimScore([affIdDF['Light affiliations'].iloc[i]], len(affIdDF['Candidates for matching'].iloc[i]), affIdDF['Pairs'].iloc[i],affIdDF['mult'].iloc[i], simU, simG) for i in needCheck]
    best_o = []
    best_s = []
    
    for x in best:
        best_o.append([x[i][0]  for i in range(len(x))])
        best_s.append([round(x[i][1],2)  for i in range(len(x))])
    numMathced = [len(best_s[i]) for i in range(len(needCheck))]
    

    
    dfFinal0 = (affIdDF.iloc[ready]).copy()
    dfFinal0['index'] = ready
    
    dfFinal1 = (affIdDF.iloc[needCheck]).copy()
    dfFinal1['index'] = needCheck
    dfFinal1['Matched openAIRE names'] = best_o
    dfFinal1['Similarity score'] = best_s
    dfFinal1['# Matched orgs'] = numMathced
    
    finalDF =  pd.concat([dfFinal0, dfFinal1])
    finalDF.set_index('index', inplace=True)
    finalDF.sort_values('index', ascending=True, inplace = True)
    
    ids = [[dixOpenAIRE[x] for x in v] for v in finalDF['Matched openAIRE names']]
    numIds = [len(x) for x in ids]

    finalDF['IDs'] = ids
    finalDF['# IDs'] = numIds
    finalDF = finalDF[~(finalDF['# Matched orgs'] == 0)]
    
    finalDF = finalDF.reset_index(drop=True)
    perc = 100*len(finalDF)/m



    
    
    return [perc,finalDF, affIdDF, needCheck]
    


In [1167]:
result = Aff_Ids(len(univLabsDF), univLabsDF, dixUnivEtcOpen2, 0.7,0.82)

In [1168]:
result[2][150:170]

Unnamed: 0,Original affiliations,Light affiliations,Candidates for matching,Matched openAIRE names,# Matched orgs,Similarity score,Pairs,mult
150,Department Materials Science Engineering Sun Y...,department materials science engineering sun y...,[engineering sun yatsen universit guangzhou 51...,[sun yatsen universit],1,[0.7071067811865477],[(engineering sun yatsen universit guangzhou 5...,{'engineering sun yatsen universit guangzhou 5...
151,Guangzhou Key Laboratory Analytical Chemistry ...,guangzhou key laboratory analytical chemistry ...,[south china normal universit guangzhou 510006],"[south china normal universit, universit south...",2,"[0.8164965809277261, 0.7071067811865477]",[(south china normal universit guangzhou 51000...,{'south china normal universit guangzhou 51000...
152,"Department Microbiology, The George Washington...",the george washington universit medical center...,"[district columbia, the george washington univ...","[universit washington medical center, universi...",4,"[0.8164965809277261, 0.7071067811865477, 0.707...",[(the george washington universit medical cent...,{'the george washington universit medical cent...
153,"Laboratory Microbiology Immunology, National I...","national institute dental research, national i...","[district columbia, national institutes health...","[national institutes health, national research...",3,"[0.8660254037844388, 0.8660254037844388, 0.894...","[(national institutes health from, national in...","{'national institutes health from': 1, 'nation..."
154,"Institute General Pathology, University Palerm...","universit palermo, ismedacnr, italy",[universit palermo],[universit palermo],1,[1],"[(universit palermo, universit palermo, 1)]",{'universit palermo': 1}
155,"University ErlangenNurnberg, Faculty Medicine,...","universit erlangennurnberg, faculty medicine, ...","[faculty medicine, universit erlangennurnberg]","[nur universit, universit, friedrichalexanderu...",5,"[0.7071067811865475, 0.7071067811865475, 0.707...","[(universit erlangennurnberg, nur universit, 0...",{'universit erlangennurnberg': 5}
156,"Department Chemistry, University British Colum...","universit british columbia, vancouver, bc, can...",[universit british columbia],[universit british columbia],1,[1],"[(universit british columbia, universit britis...",{'universit british columbia': 1}
157,"Serra Hx00FAnter Fellow Programme, Universitat...","universit pompeu fabra, barcelona, spain",[universit pompeu fabra],[universit pompeu fabra],1,[1],"[(universit pompeu fabra, universit pompeu fab...",{'universit pompeu fabra': 1}
158,"Department Biomedical Engineering Mechanics, V...","virginia tech, blacksburg, va, usa",[virginia tech],[virginia tech],1,[1],"[(virginia tech, virginia tech, 1)]",{'virginia tech': 1}
159,"BCN MedTech, Department Information Communicat...","bcn medtech, universit pompeu fabra, barcelona...","[bcn medtech, universit pompeu fabra]",[universit pompeu fabra],1,[1],"[(universit pompeu fabra, universit pompeu fab...",{'universit pompeu fabra': 1}


In [1131]:
dixUnivEtcOpen2['department electronics information technology']	

'20|openorgs____::c481c821fc73db88dce3319b5f3e4811'

In [1169]:
len(result[1])/len(affDF)

0.6531049250535332

In [1170]:
result[0]

74.93857493857494

In [1171]:
m = result[1][['Original affiliations','Candidates for matching','Matched openAIRE names','Similarity score']]

In [1172]:
len(m)

305

In [1136]:

app = dp.App(#dp.Page(title="Matchings [explained]", blocks= [dp.Text('DOIs with aff. in Univ/Inst, Laboratories'), dp.DataTable(finalDF.drop(columns =['Candidates old']))]
             #),
             dp.Page(title="Matchings [final]", 
                     blocks= [dp.Text('DOIs with aff. in Univ/Inst, Laboratories'), dp.Table(m)]
             )
             )
    


   
app.save(path="sampleNA.html", open=True)

App saved to ./sampleNA.html

In [1137]:
m['Original affiliations'].iloc[71]

'Department Clinical Immunology, University Hospital, Copenhagen, Denmark'

In [1138]:
dixUnivEtcOpen2[ 'department electronics information technology']

'20|openorgs____::c481c821fc73db88dce3319b5f3e4811'

# affs -> excel 

In [1139]:
m.drop(columns = ['Candidates for matching']).to_excel('sampleN4.xlsx', index=False)


In [1140]:
dixOpenOrgId2['universit']

'20|openorgs____::8811a59d4b638fbf9952ebdcc0789516'

In [1141]:
h = dp.Table(m)
app = dp.App(dp.Page(title="Matchings [final]", blocks=[dp.Text('DOIs with aff. in Univ/Inst, Laboratories'), h]))

# Save and open the App
app.save(path="friday.html", open=True)

App saved to ./friday.html

In [1142]:
dict_aff_open = {x: y for x, y in zip(result[1]['Original affiliations'], result[1]['Matched openAIRE names'])}
dict_aff_id = {x: y for x, y in zip(result[1]['Original affiliations'], result[1]['IDs'])}
dict_aff_score = {x: y for x, y in zip(result[1]['Original affiliations'], result[1]['Similarity score'])}

In [1143]:
pids = []
for i in range(len(doiDF)):
    pidsi = []
    for aff in doiDF['Unique affiliations'].iloc[i]:
        if aff in list(dict_aff_id.keys()):
            pidsi = pidsi + dict_aff_id[aff]
        elif 'unmatched organization(s)' not in pidsi:
            pidsi = pidsi + ['unmatched organization(s)']
    pids.append(pidsi)
            
            
        

In [1144]:
names = []
for i in range(len(doiDF)):
    namesi = []
    for aff in doiDF['Unique affiliations'].iloc[i]:
        if aff in list(dict_aff_open.keys()):
            namesi = namesi +  dict_aff_open[aff]
        elif 'unmatched organization(s)' not in namesi:
            namesi = namesi + ['unmatched organization(s)']
    names.append(namesi)

In [1145]:
scores = []
for i in range(len(doiDF)):
    scoresi = []
    for aff in doiDF['Unique affiliations'].iloc[i]:
        if aff in list(dict_aff_score.keys()):
            scoresi = scoresi +  dict_aff_score[aff]
        elif 'unmatched organization(s)' not in scoresi:
            scoresi = scoresi + ['-']
    scores.append(scoresi)

In [1146]:
doiDF['Matched openAIRE names'] = names
doiDF['IDs'] = pids
doiDF['Scores'] = scores

In [1147]:
unmatched = [i for i in range(len(doiDF)) if doiDF['Matched openAIRE names'].iloc[i] == ['unmatched organization(s)']]
        

In [1148]:
matched = [i for i in range(len(doiDF))  if i not in unmatched]

In [1149]:
(len(doiDF)-len(unmatched))/len(doiDF)

0.7410071942446043

In [1150]:
finalDF =  doiDF[['DOI',"Unique affiliations",'Matched openAIRE names','IDs', 'Scores']].iloc[matched]

In [1151]:
finalDF.reset_index(inplace = True)

In [1152]:
def update_Z(row):
    new_Z = []
    for i in range(len(row['IDs'])):
        entry = {'OpenAIREid': row['IDs'][i], 'Confidence': row['Scores'][i]}
        new_Z.append(entry)
    return new_Z

matching = finalDF.apply(update_Z, axis=1)

In [1153]:
finalDF.loc[:,'Matchings'] = matching

In [1154]:
finalDFShort = finalDF[['Unique affiliations','Matched openAIRE names','Scores']]

In [1155]:
findID('La Jolla Institute Allergy')

['la jolla institute allergy immunology']

# Output

In [1156]:
doiDF_output = finalDF[['DOI','Matchings']]


# JSON

In [1157]:
match0 = doiDF_output.to_json(orient='records', lines=True)

# Save the JSON to a file
with open('match0.json', 'w') as f:
    f.write(match0)

In [1158]:
df.to_excel('sampleFinal.xlsx', index=False)


# HTML

In [1159]:


df = finalDFShort

# Export DataFrame to CSV
df.to_csv('data.csv', index=False, quoting=1)

# Read CSV file into new DataFrame
df_with_quotes = pd.read_csv('data.csv')

# Create the table
table = dp.Table(df_with_quotes)

# Create the App
app = dp.App(dp.Page(title="Matchings [final]", blocks=[dp.Text('DOIs with aff. in Univ/Inst, Laboratories'), table]))

# Save and open the App
app.save(path="sample1.html", open=True)


App saved to ./sample1.html

In [1160]:
DF = doiDF.iloc[matched]

In [1161]:
DF.reset_index(inplace = True)

In [1162]:
len(DF)

206

In [1163]:

app = dp.App(#dp.Page(title="Matchings [explained]", blocks= [dp.Text('DOIs with aff. in Univ/Inst, Laboratories'), dp.DataTable(finalDF.drop(columns =['Candidates old']))]
             #),
             dp.Page(title="Matchings [final]", 
                     blocks= [dp.Text('DOIs with aff. in Univ/Inst, Laboratories'), dp.Table(DF[['Unique affiliations', 'Matchings']])]
             )
             )
    


   
app.save(path="sampleNA.html", open=True)


KeyError: "['Matchings'] not in index"

# JSON

# Excel

# proxeiro

In [None]:
affDF

Unnamed: 0,Original Affiliations,Light Affiliations,Keywords,Dictionary,Category
0,College Bioresources Chemical and Materials En...,college bioresources chemical and materials en...,[college bioresources chemical and materials e...,[[college bioresources chemical and materials ...,Univ/Inst
1,Key Laboratory Special Functional and Smart Po...,"northwestern polytechnical universit, xian, sh...",[northwestern polytechnical universit],"[[northwestern polytechnical universit, Univ/I...",Univ/Inst
2,"School Mechanical Engineering and Automation, ...","school mechanical engineering and automation, ...","[harbin institute technology, school mechanica...",[[school mechanical engineering and automation...,Univ/Inst
3,"Shenzhen Polytechnic, Shenzhen518055, China","shenzhen polytechnic, shenzhen518055, china",[shenzhen polytechnic],"[[shenzhen polytechnic, Univ/Inst]]",Univ/Inst
4,College Physics and Optoelectronic Engineering...,"shenzhen universit, shenzhen518055, china",[shenzhen universit],"[[shenzhen universit, Univ/Inst]]",Univ/Inst
...,...,...,...,...,...
463,"Department Medicine, University California San...","universit california san diego, la jolla ca 92...",[universit california san diego],"[[universit california san diego, Univ/Inst]]",Univ/Inst
464,"Division Molecular Immunology, La Jolla Instit...","division molecular immunology, la jolla instit...",[la jolla institute allergy and immunology],"[[la jolla institute allergy and immunology, U...",Univ/Inst
465,"Institute Biochemistry, University Lausanne, E...","universit lausanne, epalinges, switzerland",[universit lausanne],"[[universit lausanne, Univ/Inst]]",Univ/Inst
466,"Department Immunology and Microbiology, Rush M...","rush medical college, chicago, il 60612",[rush medical college],"[[rush medical college, Univ/Inst], [rush medi...","Univ/Inst, Hospital"


In [None]:
 list(affDF.Keywords)

[['college bioresources chemical and materials engineering',
  'shaanxi universit science  technology'],
 ['northwestern polytechnical universit'],
 ['harbin institute technology',
  'school mechanical engineering and automation'],
 ['shenzhen polytechnic'],
 ['shenzhen universit'],
 ['communal institution higher education dnipro academy continuing education dnipropetrovsk regional council'],
 ['leiden universit hospital'],
 ['univ alabama birmingham'],
 ['univ colorado denver'],
 ['maharishi markandeshwar institute physiotherapy and rehabilitation'],
 ['maharishi markandeshwar institute physiotherapy and rehabilitation'],
 ['aksaray universit'],
 ['the cooperative research center vaccine technology',
  'royal brisbane hospital'],
 ['pan african universit institute governance'],
 ['masaryk universit'],
 ['national institutes health', 'clinical center'],
 ['national institute arthritis and musculoskeletal and skin diseases'],
 ['national institute child health and human development'],
 

In [None]:
for i in range(len(affDF)):
    if 'Universit Pompeu Fabra'.lower() in affDF.Keywords.iloc[i]:
        print(i)

242
244


In [None]:
doiDF[85:95]

Unnamed: 0,items,DOI,authors,# authors,affiliations,# Affil,uniqueAff,# uniqueAff,uniqueAff1,Unique affiliations,Matched openAIRE names,IDs,Scores
85,{'abstract': '<jats:title>Summary</jats:title>...,10.4049/jimmunol.108.3.845,"[{'given': 'Stanley', 'family': 'Yachnin', 'se...",1,"[[{'name': 'Department of Medicine, The Univer...",1,"[Argonne Cancer Research Hospital , and the , ...",2,"[Argonne Cancer Research Hospital , and , 1 Ch...","[Argonne Cancer Research Hospital , and , 1 Ch...","[unmatched organization(s), universit chicago]","[unmatched organization(s), 20|openorgs____::0...","[-, 0.816496580927726]"
86,{'URL': 'http://dx.doi.org/10.36535/0236-1914-...,10.36535/0236-1914-2022-07-5,"[{'given': 'Alik A.', 'family': 'Chebotaev', '...",4,[[{'name': 'Scientific Center of Complex Trans...,4,[Moscow Automobile and Road Construction State...,2,[Moscow Automobile and Road Construction State...,[Scientific Center Complex Transport Problems ...,"[unmatched organization(s), moscow automobile ...","[unmatched organization(s), 20|openorgs____::9...","[-, 0.7745966692414834]"
87,{'URL': 'http://dx.doi.org/10.1109/gtsd54989.2...,10.1109/gtsd54989.2022.9989196,"[{'given': 'M.', 'family': 'Pallavi', 'sequenc...",5,"[[{'name': 'Manipal Institute of Technology,De...",5,"[Manipal Institute of Technology,Department of...",4,"[Manipal Institute Technology,Department Aeron...","[Manipal Institute Technology,Department ECE,U...","[unmatched organization(s), manipal universit]","[unmatched organization(s), 20|openorgs____::e...","[-, -, -, 1]"
88,{'URL': 'http://dx.doi.org/10.1109/icus55513.2...,10.1109/icus55513.2022.9987042,"[{'given': 'Yunqi', 'family': 'Cheng', 'sequen...",4,[[{'name': 'College of Artificial Intelligence...,4,"[College of Artificial Intelligence, Nankai Un...",2,"[College Artificial Intelligence, Nankai Unive...","[School Vehicle and Mobility, Tsinghua Univers...","[tsinghua universit, nankai universit]",[20|openorgs____::343e897b3a9e6f6facaf57aec940...,"[1, 1]"
89,{'abstract': '<jats:p>The aim: To evaluate the...,10.36740/wlek202211125,"[{'given': 'Roman A.', 'family': 'Zhuravchak',...",3,[[{'name': 'STATE INSTITUTION OF SCIENCE «RESE...,3,[STATE INSTITUTION OF SCIENCE «RESEARCH AND PR...,1,[STATE INSTITUTION OF SCIENCE RESEARCH AND PRA...,[STATE INSTITUTION OF SCIENCE RESEARCH AND PRA...,[unmatched organization(s)],[unmatched organization(s)],[-]
90,{'abstract': '<jats:p> ▪ Abstract The field o...,10.1146/annurev.biophys.31.101101.140930,"[{'given': 'Boris M.', 'family': 'Slepchenko',...",4,[[{'name': 'Center for Biomedical Imaging Tech...,4,"[Center for Biomedical Imaging Technology, Uni...",1,"[Center Biomedical Imaging Technology, Univers...","[Center Biomedical Imaging Technology, Univers...",[universit connecticut health center],[20|openorgs____::2439cf05fc5ef8ee1f975b76a552...,[1]
91,{'abstract': '<jats:title>Abstract</jats:title...,10.4049/jimmunol.157.5.1840,"[{'given': 'Z K', 'family': 'Ballas', 'sequenc...",3,"[[{'name': 'Department of Internal Medicine, U...",3,"[Department of Internal Medicine, University o...",1,"[Department Internal Medicine, University Iowa...","[Department Internal Medicine, University Iowa...",[universit iowa],[20|openorgs____::7a628b9647b55a6f498d19a2c7ea...,[0.7071067811865475]
92,{'URL': 'http://dx.doi.org/10.1055/s-0042-1759...,10.1055/s-0042-1759813,"[{'given': 'Isaac', 'family': 'Golden', 'seque...",1,[[{'name': 'Human Research and Ethics Committe...,1,"[Human Research and Ethics Committee, National...",1,"[Human Research and Ethics Committee, National...","[Human Research and Ethics Committee, National...",[national institute integrative medicine],[20|openorgs____::a532f2cb72908ebd6f7868ade898...,[1]
93,{'URL': 'http://dx.doi.org/10.47576/2712-7559_...,10.47576/2712-7559_2022_5_5_438,"[{'given': 'Zinaida Magomedovna', 'family': 'Z...",3,[[{'name': 'Chechen State University Named aft...,3,[Chechen State University Named after A. A. Ka...,2,[Chechen State University Named after A A Kady...,[Chechen State University Named after A A Kady...,"[chechen state universit, dagestan state unive...",[20|openorgs____::5c57fc3a0c473a02a5e5a663bdc2...,"[0.7071067811865477, 1]"
94,{'abstract': '<jats:title>Abstract</jats:title...,10.4049/jimmunol.118.4.1335,"[{'given': 'Philip R. B.', 'family': 'McMaster...",3,"[[{'name': 'Laboratory of Microbial Immunity, ...",3,"[Laboratory of Microbial Immunity, National In...",1,"[Laboratory Microbial Immunity, National Insti...","[Laboratory Microbial Immunity, National Insti...",[national institutes health],[20|openorgs____::4adb23846cbb6dd772b7d6b3131a...,[0.8660254037844388]


In [None]:
affDF[240:245]

Unnamed: 0,Original Affiliations,Light Affiliations,Keywords,Dictionary,Category
240,"Division Dermatology, UCLA School Medicine 90024","division dermatology, ucla school medicine 90024",[ucla school medicine 90024],"[[ucla school medicine 90024, Univ/Inst]]",Univ/Inst
241,"Department Chemistry, University British Colum...","universit british columbia, vancouver, bc, can...",[universit british columbia],"[[universit british columbia, Univ/Inst]]",Univ/Inst
242,"Serra Hx00FAnter Fellow Programme, Universitat...","universit pompeu fabra, barcelona, spain",[universit pompeu fabra],"[[universit pompeu fabra, Univ/Inst]]",Univ/Inst
243,Department Biomedical Engineering and Mechanic...,department biomedical engineering and mechanic...,[department biomedical engineering and mechanics],[[department biomedical engineering and mechan...,Hospital
244,"BCN MedTech, Department Information and Commun...","bcn medtech, universit pompeu fabra, barcelona...",[universit pompeu fabra],"[[universit pompeu fabra, Univ/Inst]]",Univ/Inst


In [None]:
findID('department')

['new south wales department primary industries',
 'yunnan provincial department transportation',
 'nenjiang department northeast agricultural institute',
 'alabama department public health',
 'los angeles county department public health',
 'alabama department agriculture industries',
 'baltimore county department health',
 'douglas county health department',
 'county sonoma department health services',
 'science technology department zhejiang province',
 'emergency department nurses association',
 'department health',
 'zhejiang provincial public security department',
 'department technical adult education',
 'higher technical education department',
 'macoupin county public health department',
 'dade county medical examiner department',
 'davidson county health department',
 'department communities local government',
 'district department transportation',
 'michigan department technology management budget',
 'anson county health department',
 'madison county health department',
 'keno

In [None]:
dixOpenOrgId2['Virginia Tech']

KeyError: 'Virginia Tech'

In [None]:
vectorizer = CountVectorizer()


x = 'vrije universit medical center'
s = 'vrije universit'

s_vector = vectorizer.fit_transform([s]).toarray()
x_vector = vectorizer.transform([x]).toarray()

# Compute similarity between the vectors
similarity = cosine_similarity(s_vector, x_vector)[0][0]
similarity

# NTD

In [None]:
l1 = ['p e',
 'dept civ engrg, seaton hall, kansas state univ, manhattan, ks email reddiceksuedu',
 'postdoctoral res assoc, dept civ engrg, seaton hall, kansas state univ, manhattan, ks 66506']

l2 = 1

l3 = [('kansas state univ', 'kansas state universit', 0.816496580927726),
 ('kansas state univ', 'kansas state universit salina', 0.7071067811865475)]

l4 = {'kansas state univ': 2}

simU = 0.7

simG = 0.82

In [None]:
import os

folder_path = '/Users/myrto/Downloads/output-v1'  # Specify the path to your folder here

file_range = range(28189)  # Specify the range of file names to check (0 to 1000 in this case)

existing_files = set(os.listdir(folder_path))  # Get the set of existing file names in the folder

missing_files = [f"{i}.json" for i in file_range if f"{i}.json" not in existing_files]

print(missing_files)
