In [3]:
## Import packages

import pandas as pd
import re
##import unicodedata
import html
import sys

from collections import defaultdict

import pickle

## Import text vectorization and cosine-similarity utilities
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## Import functions -- main algorithm

sys.path.append('..')

from create_input import *
from affro import *

In [5]:
# Load the pickled ROR-related lookup dictionaries.
# NOTE(review): pickle.load can execute arbitrary code -- only point this at trusted files.

def _load_pickle(path):
    """Deserialize and return the object stored in the pickle file at ``path``."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


dix_acad = _load_pickle('../dictionaries/dix_acad.pkl')
dix_mult = _load_pickle('../dictionaries/dix_mult.pkl')
dix_city = _load_pickle('../dictionaries/dix_city.pkl')
dix_country = _load_pickle('../dictionaries/dix_country.pkl')


In [6]:
## Read the JSON input file

file = 'sample.json'

crossref_df = pd.read_json(file, orient='records')

## Start of the cleaning: keep only the records that carry author information.

# Positions of the records whose 'items' entry contains an 'author' key.
authors = [pos for pos, item in enumerate(crossref_df['items']) if 'author' in item]

# Sub-frame restricted to those records, re-indexed from 0.
crossref_auth = crossref_df.iloc[authors].reset_index(drop=True)

# Pull the DOI and the author list out of each 'items' entry into their own columns.
crossref_auth['DOI'] = crossref_auth['items'].apply(lambda item: item['DOI'])
crossref_auth['authors'] = crossref_auth['items'].apply(lambda item: item['author'])

def getAff(k):
    """Return the affiliation entries of every author of record ``k``.

    Parameters
    ----------
    k : int
        Index label of the record in ``crossref_auth``.

    Returns
    -------
    list
        One element per author, in author order: the value stored under the
        author's ``'affiliation'`` key, or an empty list when the author has
        no such key.
    """
    # Look the record's author list up once instead of once per author.
    record_authors = crossref_auth['authors'][k]
    # A missing 'affiliation' key is treated as "no affiliation" rather than
    # aborting the whole run with a KeyError.
    return [author.get('affiliation', []) for author in record_authors]
    
# Per-record affiliation data: one element per author, as returned by getAff.
affiliations = [getAff(k) for k in range(len(crossref_auth))]

crossref_auth.loc[:,'affiliations'] = affiliations


## Clean 'empty' affiliations

# Records whose first author has no affiliation entry. A record with no
# authors at all is also a candidate (indexing [0] on it used to raise IndexError).
possible_empty_aff = [
    k for k, author_affs in enumerate(affiliations)
    if len(author_affs) == 0 or len(author_affs[0]) == 0
]

# Candidates in which at least one author does have an affiliation entry.
non_empty_aff = [
    k for k in possible_empty_aff
    if any(len(author_aff) != 0 for author_aff in affiliations[k])
]

# Candidates in which no author has any affiliation entry.
_rescued = set(non_empty_aff)  # set membership instead of repeated list scans
final_emptyy_aff = [k for k in possible_empty_aff if k not in _rescued]

# Every other record is kept.
_discarded = set(final_emptyy_aff)
final_non_empty_aff = [k for k in range(len(crossref_auth)) if k not in _discarded]


# doi_df: sub-dataframe of crossref_auth with non-empty affiliation lists, re-indexed from 0

doi_df = crossref_auth.iloc[final_non_empty_aff].reset_index(drop=True)

# Remaining cleaning step: records whose first author's first affiliation is an empty dict ([{}]).

empty_brackets = [
    k for k in range(len(doi_df))
    if len(doi_df['affiliations'][k][0]) != 0 and doi_df['affiliations'][k][0][0] == {}
]

# Remove those records and re-index the frame from 0.
doi_df = doi_df.drop(index=empty_brackets).reset_index(drop=True)


# 1. "Unique" affiliations --- the distinct affiliation values of each record

unique_aff = []
error_indices = []  # positions of the records for which a TypeError was raised
for i in range(len(doi_df)):
    try:
        # Flatten the per-author lists, skipping [{}] lists and empty dicts.
        aff_dicts = [
            item
            for sublist in doi_df['affiliations'].iloc[i]
            for item in sublist
            if sublist != [{}] and item != {}
        ]
        # Keep the first value stored in each affiliation dict.
        # NOTE(review): presumably this is the affiliation name -- confirm against the input schema.
        value_lists = [list(d.values()) for d in aff_dicts]
        first_values = [values[0] for values in value_lists]
        # De-duplicate; an unhashable value raises TypeError and the record is logged below.
        unique_aff.append(list(set(first_values)))
    except TypeError:
        print("Error occurred for i =", i)
        error_indices.append(i)  # remember the position so the record can be dropped later
    # An IndexError handler was tried here at some point and is intentionally not active.




In [7]:
# Drop the records for which no unique-affiliation list could be built.
# The length check keeps this cell idempotent: once the rows are gone (or when
# nothing failed) doi_df and unique_aff already line up, and dropping
# error_indices again from the re-indexed frame would remove the wrong rows.
if len(doi_df) != len(unique_aff):
    doi_df = doi_df.drop(index=error_indices).reset_index(drop=True)

doi_df['unique_aff'] = unique_aff


def _drop_contained(strings):
    """Return the entries of ``strings`` that are not contained in a different entry."""
    return [
        s1 for s1 in strings
        if not any(s1 != s2 and s1 in s2 for s2 in strings)
    ]


# Per record: affiliations that are not a substring of another affiliation of the same record.
new_aff0 = [_drop_contained(affs) for affs in doi_df['unique_aff']]

# De-duplicate and sort, so the order is the same on every run
# (plain set iteration order of strings changes between interpreter sessions).
new_aff_list = [sorted(set(affs)) for affs in new_aff0]
doi_df['Unique affiliations'] = new_aff_list


In [8]:
# Build the input frame for the matching step. An empty doi_df is mapped to an
# empty frame with the same column names used by the guarded call in the
# following cell, instead of being passed to create_df_algorithm.
if len(doi_df) > 0:
    academia_df = create_df_algorithm(doi_df)
else:
    academia_df = pd.DataFrame(columns=['Original affiliations', 'Light affiliations', 'Keywords', 'Dictionary', 'Category'])

In [9]:
# Run create_df_algorithm only when doi_df has rows; otherwise use an empty
# frame carrying the five column names listed below.
academia_df = (
    create_df_algorithm(doi_df)
    if len(doi_df) > 0
    else pd.DataFrame(columns=['Original affiliations', 'Light affiliations', 'Keywords', 'Dictionary', 'Category'])
)
    

    

In [10]:
# Run the matching step on the prepared affiliations.
# NOTE(review): 0.65 and 0.82 are presumably similarity thresholds -- confirm against Aff_Ids.
if len(academia_df) > 0:
    result = Aff_Ids(len(academia_df), academia_df, dix_acad, dix_mult, dix_city, dix_country, 0.65, 0.82)
else:
    # Always bind `result`, so the `len(result)` check in the following cell
    # does not raise NameError when there was nothing to match.
    result = pd.DataFrame(columns=['Original affiliations', 'Matched organizations', 'unique ROR', 'Similarity score'])


# prepare the outputs 


In [11]:
def _as_list(value):
    """Return ``value`` itself when it is a list, otherwise wrap it in a one-element list."""
    return value if isinstance(value, list) else [value]


def _gather(lookup, affiliations):
    """Concatenate, in order, the ``lookup`` entries of the affiliations that are keys of ``lookup``.

    Non-list entries are wrapped in a list first, so names, ROR ids and scores
    are all collected the same way.
    """
    gathered = []
    for aff in affiliations:
        if aff in lookup:  # direct dict membership; no key-list copy per lookup
            gathered.extend(_as_list(lookup[aff]))
    return gathered


def update_Z(row):
    """Pair each entry of ``row['ROR']`` with the entry of ``row['Scores']`` at the same position.

    Returns a list of ``{'RORid': ..., 'Confidence': ...}`` dicts; the list is
    empty when either of the two sequences is empty.
    """
    return [
        {'RORid': ror, 'Confidence': score}
        for ror, score in zip(row['ROR'], row['Scores'])
    ]


if len(result) > 0:

    affs_match = result[['Original affiliations', 'Matched organizations', 'unique ROR']]

    # Lookup tables keyed by the original affiliation string.
    dict_aff_open = dict(zip(result['Original affiliations'], result['Matched organizations']))
    dict_aff_id = dict(zip(result['Original affiliations'], result['unique ROR']))
    dict_aff_score = {
        aff: _as_list(score)
        for aff, score in zip(result['Original affiliations'], result['Similarity score'])
    }

    # Per record: matched names, ROR ids and scores of its unique affiliations.
    names = [_gather(dict_aff_open, affs) for affs in doi_df['Unique affiliations']]
    pids = [_gather(dict_aff_id, affs) for affs in doi_df['Unique affiliations']]
    scores = [_gather(dict_aff_score, affs) for affs in doi_df['Unique affiliations']]

    doi_df['Matched organizations'] = names
    doi_df['ROR'] = pids
    doi_df['Scores'] = scores

    # Split the record positions into those without and with a matched organization.
    unmatched = [i for i, orgs in enumerate(names) if orgs == []]
    _unmatched = set(unmatched)  # set membership instead of repeated list scans
    matched = [i for i in range(len(doi_df)) if i not in _unmatched]

    # reset_index() keeps the old positions in an 'index' column.
    final_df0 = doi_df.iloc[matched].reset_index()

    final_df = final_df0[['DOI', "Unique affiliations", 'Matched organizations', 'ROR', 'Scores']].copy()

    # Built row by row rather than with DataFrame.apply(axis=1), so the result
    # is a Series of lists even when final_df has no rows.
    matching = pd.Series(
        [update_Z(row) for _, row in final_df.iterrows()],
        index=final_df.index,
        dtype=object,
    )

    final_df['Matchings'] = matching

    final_df_short = final_df[['Unique affiliations', 'Matched organizations', 'ROR', 'Scores']]

    # 3. JSON [Final output]: one {"DOI": ..., "Matchings": [...]} object per line.

    doi_df_output = final_df[['DOI', 'Matchings']]

    dois_match = doi_df_output.to_json(orient='records', lines=True)

    # Save the JSON to a file
    with open('dois_match.json', 'w') as f:
        f.write(dois_match)
else:
    # Nothing was matched: bind the frames that are exported to CSV later as
    # empty tables, so that export does not fail with a NameError.
    # No JSON file is written in this case.
    affs_match = pd.DataFrame(columns=['Original affiliations', 'Matched organizations', 'unique ROR'])
    final_df_short = pd.DataFrame(columns=['Unique affiliations', 'Matched organizations', 'ROR', 'Scores'])
    

In [13]:
# Write the affiliation-level and the record-level tables as CSV files, without the index column.
for frame, csv_path in ((affs_match, 'affs_match.csv'), (final_df_short, 'dois_match.csv')):
    frame.to_csv(csv_path, index=False)
