In [8]:
from datetime import datetime
import pandas as pd

import bibtexparser

In [9]:
with open('publications.bib') as bibtex_file:
    bib_database = bibtexparser.load(bibtex_file)

In [10]:
org_data = pd.read_csv('collab-data/coa-name-org.csv')
org_data['first_initial'] = org_data.first_name.apply(lambda s: s.strip()[0])
org_data

Unnamed: 0,last_name,first_name,organization,first_initial
0,Abernathy,Ryan,Columbia,R
1,Apel,E. C.,NCAR,E
2,Asher,Elizabeth,NCAR,E
3,Atlas,E. L.,University of Miami,E
4,Aumont,Olivier,"LOCEAN, CNRS",O
...,...,...,...,...
151,Xie,Shang-Ping,Scripps,S
152,Yang,Simon,UCLA,S
153,Yang,Bo,NOAA,B
154,Yashayaev,Igor,Bedford,I


In [11]:
advisor_advisee = pd.read_csv('collab-data/advisors-advisees.csv')
advisor_advisee['first_initial'] = advisor_advisee.first_name.apply(lambda s: s.strip()[0])
advisor_advisee

Unnamed: 0,last_name,first_name,organization,first_initial
0,Dunbar,Robert,Stanford University,R
1,Arrigo,Kevin,Stanford University,K
2,Caldeira,Ken,Stanford University,K
3,Eddebar,Yassir,Scripps University,Y
4,Rohr,Tyler,MIT/WHOI,T
5,Frank,Boudinot,"University of Colorado, Boulder",B
6,Brady,Riley,"University of Colorado, Boulder",R
7,Cantarero,Sebastian,"University of Colorado, Boulder",S
8,Sylvester,Zephyr,"University of Colorado, Boulder",Z


In [20]:
since_year = datetime.now().year - 3

my_names = [
    r'\textbfLong, \textbfM \textbfC',
    r'textbf{Long}}, {\textbf{M}}~{\textbf{C}}',
]

special_char = {    
    r'$^\dagger$': '',
    r'$^*$': '',
    r'\"o': 'ö',
    r"\'e": 'é',
    r"\'o": 'ó',
    r'\"o': 'ö',
    r'\"e': 'ë',
    r"\'\i": 'í',
}

def filter_author(author_entry):
    author_list = [s.strip()
                   .replace('{', '')
                   .replace('}', '')
                   .replace('~', ' ') 
                   .replace('et al.', '')
                   for s in entry['author'].split(' and ')]
    
    for s in my_names:
        if s in author_list:
            author_list.remove(s)
    
    for i, name in enumerate(author_list):
        for find, replace in special_char.items():
            if find in name:
                author_list[i] = name.replace(find, replace)
    
    author_list = [s for s in author_list if s]
    
    for i, name in enumerate(author_list):
        if ',' not in author_list[i]:
            tmp_list= author_list[i].split(' ')
            name_last = tmp_list[-1]
            name_first_mid = " ".join(tmp_list[:-1])
            author_list[i] = f'{name_last}, {name_first_mid}'            
    
    return author_list

authors = []
for entry in bib_database.entries:
    if int(entry['year']) >= since_year:
        author_i = filter_author(entry['author'])
        if len(author_i) > 20:
            continue
        authors.extend(author_i)

authors = sorted(list(set(authors)))

rows = []
for i, name in enumerate(authors):
    last_name = name.split(',')[0].strip()
    first_name = name.split(',')[1].strip()
    initials = ' '.join([s[0] for s in first_name.split(' ')])
    first_initial = initials[0]
    
    sel = (org_data.last_name == last_name) & (org_data.first_initial == first_initial)
    if any(sel):
        org = org_data.loc[sel].organization.values[0]
    else:
        org = 'missing'
    
    
    rows.append(dict(
        first_initial=first_initial,
        initials=initials,
        last_name=last_name,
        first_name=first_name, 
        organization=org,
    ))
    
df = pd.DataFrame(rows)
df = pd.concat((df, advisor_advisee)).sort_values(by="last_name")
df = df.drop_duplicates(subset=["first_initial", "last_name"])
df

Unnamed: 0,first_initial,initials,last_name,first_name,organization
0,E,E C,Apel,E. C.,NCAR
1,K,,Arrigo,Kevin,Stanford University
2,E,E,Asher,E.,NCAR
3,E,E L,Atlas,E. L.,University of Miami
4,O,O,Aumont,Olivier,"LOCEAN, CNRS"
...,...,...,...,...,...
180,R,R,Wang,R.,missing
184,D,D B,Whitt,Daniel B,NASA/AIMES
185,S,S,Xie,Shang-Ping,Scripps
186,B,B,Yang,Bo,NOAA


In [21]:
df.loc[df.organization == "missing"].to_csv('collab-data/missing_org.csv')
df.loc[df.organization == "missing"]

Unnamed: 0,first_initial,initials,last_name,first_name,organization
24,C,C A,Cuevas,Carlos A.,missing
41,R,R P,Fernandez,Rafael P.,missing
57,N,N,Haëntjens,Nils,missing
59,A,A J,Hills,Alan J.,missing
98,Q,Q,Li,Qingfeng,missing
126,F,F,Moore,Fred,missing
132,E,E J,Morgan,Eric J.,missing
135,J,J,Nishioka,J,missing
136,C,C,Nissen,C.,missing
137,H,H,Obata,H,missing


In [22]:
columns = ["last_name", "first_name", "organization"]
keep = df.organization != "missing"
df.loc[keep][columns].to_csv("collab-data/coa-output.csv", index=False)
df.loc[keep][columns]

Unnamed: 0,last_name,first_name,organization
0,Apel,E. C.,NCAR
1,Arrigo,Kevin,Stanford University
2,Asher,E.,NCAR
3,Atlas,E. L.,University of Miami
4,Aumont,Olivier,"LOCEAN, CNRS"
...,...,...,...
181,Wang,Siyuan,NOAA
184,Whitt,Daniel B,NASA/AIMES
185,Xie,Shang-Ping,Scripps
186,Yang,Bo,NOAA
