In [22]:
# Build a timestamp string from the current UTC time (gmtime); as the last
# expression of the cell, the string is displayed as the cell output.
from time import gmtime, strftime
strftime('Late update %Y-%m-%d %H:%M:%S UT', gmtime())

'Late update 2020-06-02 21:49:27 UT'

In [23]:
# Import packages 
from astroquery.alma import Alma
from astroquery import nasa_ads as na

import urllib
import xml.etree.ElementTree as ET

import pandas as pd

import numpy as np
import re

In [24]:
def getvalueofnode(node):
    """Return the ``text`` attribute of ``node``; a missing node (None) yields None."""
    if node is None:
        return None
    return node.text

def get_parse_xml(url):
    """Download the document at ``url`` and parse it as XML.

    Parameters
    ----------
    url : str
        Address handed to ``urllib.request.Request``.

    Returns
    -------
    xml.etree.ElementTree.Element
        Root element of the parsed document.
    """
    # A bare ``import urllib`` does not load the ``request`` submodule, so
    # import it explicitly here instead of relying on another package having
    # imported it as a side effect.
    import urllib.request

    req = urllib.request.Request(url)

    # The context manager closes the connection once the payload is read.
    with urllib.request.urlopen(req) as response:
        XmlData = response.read()

    return ET.fromstring(XmlData)


def gettaglist(root_xml):
    """Collect the distinct tag names of the grandchildren of ``root_xml``.

    Every child of ``root_xml`` is walked and the tag of each of its own
    children is recorded once. The order of the returned list is whatever
    the intermediate set yields.
    """
    unique_tags = {metad.tag for items in root_xml for metad in items}
    return list(unique_tags)

def generateDF():
    """Query telbib for ALMA publications and return them as a DataFrame.

    The first child of each response is read as the total number of
    entries; every following child is treated as one article. When the total
    exceeds ``maxhit``, additional pages are requested with ``&start=``
    offsets in steps of ``maxhit``.

    Returns
    -------
    pandas.DataFrame
        One row per article with the columns listed in ``taglist``. For the
        multi-entry tags (authors, programids) the child texts are
        concatenated into a single string, each one prefixed with ':'.
    """
    url='http://telbib.eso.org/api.php?telescope[]=ALMA'

    #taglist = gettaglist(root)
    taglist =['authors', 'year', 'programids', 'bibcode', 'journal']
    tag_multientory = ['authors','programids']
    maxhit = 500

    def xml2records(root):
        """Turn every article element after the leading count into a dict."""
        records = []
        for article in root[1:]:
            record = {}
            for metad in article:
                if metad.tag not in taglist:
                    continue
                if metad.tag in tag_multientory:
                    # An empty child element has text None; treat it as ''.
                    value = ''.join(':{:}'.format((child.text or '').strip())
                                    for child in metad)
                else:
                    value = metad.text
                record[metad.tag] = value
            records.append(record)
        return records

    root = get_parse_xml(url)

    numFound = int(root[0].text)
    print("Total number of entry is ", numFound)

    # Gather plain dicts and build the frame once at the end:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    records = xml2records(root)

    if numFound > maxhit:
        for i in range(1, numFound // maxhit + 1):
            url2 = url + '&start={:}'.format(i*maxhit)
            print('Quering: {:}'.format(url2))
            records.extend(xml2records(get_parse_xml(url2)))

    return pd.DataFrame(records, columns=taglist)


# 0. Obtain list of ALMA publications from telbib

In [26]:
# Uncomment the next line to rebuild the publication list with generateDF()
# instead of using the pickle file read below.
#df_xml = generateDF()

# Uncomment the next line to write the rebuilt list to the pickle file.
#df_xml.to_pickle('./df_telbib_alma.pkl')

# Load the publication list from the local pickle file.
df_xml = pd.read_pickle('./df_telbib_alma.pkl')

In [27]:
df_xml.head()

Unnamed: 0,authors,year,programids,bibcode,journal
0,":Oteo, I.:Zwaan, M. A.:Ivison, R. J.:Smail, I....",2016,":2012.1.00076.S , ALMA-Partner:...",2016ApJ...822...36O,ApJ
1,":Klitsch, Anne:Péroux, Céline:Zwaan, Martin A....",2019,":2017.1.00466.S , ALMA-Partner:...",2019MNRAS.490.1220K,MNRAS
2,":Liu, Daizhong:Schinnerer, E.:Groves, B.:Magne...",2019,":2011.0.00064.S , ALMA-Partner:...",2019ApJ...887..235L,ApJ
3,":Liu, Daizhong:Lang, P.:Magnelli, B.:Schinnere...",2019,":2011.0.00064.S , ALMA-Partner:...",2019ApJS..244...40L,ApJS
4,":Fudamoto, Yoshinobu:Oesch, P. A.:Magnelli, B....",2020,":2011.0.00064.S , ALMA-Partner:...",2020MNRAS.491.4724F,MNRAS


### 1. Update the dataframe to keep only ALMA program IDs

In [10]:
def updateprogramids(df,match_column='bibcode'):
    """Replace the ``programids`` column by a comma-joined list of project codes.

    Codes are the substrings of ``programids`` matching "20dd.?.ddddd.???".
    They are de-duplicated per ``match_column`` value and joined with ','.
    Rows without any match end up with a missing value after the join.
    """
    # pattern matching "20dd.?.ddddd.???"
    pattern = r'(?P<programids>20\d{2}\..\.\d{5}\.\S{1,3})'

    # One row per (match_column value, extracted code), duplicates removed.
    raw_ids = df.set_index(match_column)['programids']
    extracted = raw_ids.str.extractall(pattern)
    unique_codes = extracted.reset_index(level=0).drop_duplicates()

    # Collapse the codes of each publication into a single string.
    joined_codes = (
        unique_codes
        .groupby(by=match_column)['programids']
        .apply(','.join)
        .to_frame()
    )

    # Drop the original column first so the join does not clash on its name.
    without_ids = df.drop(columns='programids')
    return without_ids.join(joined_codes, on=match_column)

In [11]:
df_xml = updateprogramids(df_xml)
df_xml.head()

Unnamed: 0,authors,year,bibcode,journal,programids
0,":Oteo, I.:Zwaan, M. A.:Ivison, R. J.:Smail, I....",2016,2016ApJ...822...36O,ApJ,"2012.1.00076.S,2012.1.00080.S,2012.1.00088.S,2..."
1,":Klitsch, Anne:Péroux, Céline:Zwaan, Martin A....",2019,2019MNRAS.490.1220K,MNRAS,"2017.1.00466.S,2017.1.00467.S,2017.1.00471.S,2..."
2,":Liu, Daizhong:Schinnerer, E.:Groves, B.:Magne...",2019,2019ApJ...887..235L,ApJ,"2011.0.00064.S,2011.0.00097.S,2011.0.00539.S,2..."
3,":Liu, Daizhong:Lang, P.:Magnelli, B.:Schinnere...",2019,2019ApJS..244...40L,ApJS,"2011.0.00064.S,2011.0.00097.S,2011.0.00539.S,2..."
4,":Fudamoto, Yoshinobu:Oesch, P. A.:Magnelli, B....",2020,2020MNRAS.491.4724F,MNRAS,"2011.0.00064.S,2011.0.00097.S,2011.0.00539.S,2..."


### 2. Add author affiliations to the dataframe

In [14]:
# Placeholder value: substitute your own ADS API token before running the
# queries below, and avoid saving a real token in the notebook.
na.ADS.TOKEN = 'your-token-here'

def addaffiliations(df):
    """Look up the affiliation entry of every bibcode in ``df`` through ADS.

    Each bibcode is queried individually with ``na.ADS.query_simple`` and
    the result is narrowed to the rows whose ``bibcode`` equals the query.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``bibcode`` column.

    Returns
    -------
    list
        One entry per row of ``df``, in row order: the ``aff`` value of the
        first matching ADS record, or an empty list when no record matched.
    """
    aff = []
    for bibcode in df['bibcode']:
        result = na.ADS.query_simple(bibcode)
        # The query may return other records as well; keep exact matches only.
        matched = result[result['bibcode'] == bibcode]
        if len(matched) == 0:
            # Indexing an empty result would raise IndexError and discard
            # everything collected so far; record an empty entry instead.
            print('No ADS record matched', bibcode)
            aff.append([])
            continue
        aff.append(matched['aff'][0])
    return aff

In [15]:
%%time

df_xml['affiliations'] = addaffiliations(df_xml)
df_xml.head()

CPU times: user 6.81 s, sys: 629 ms, total: 7.44 s
Wall time: 8.62 s


Unnamed: 0,authors,year,bibcode,journal,programids,affiliations
0,":Oteo, I.:Zwaan, M. A.:Ivison, R. J.:Smail, I....",2016,2016ApJ...822...36O,ApJ,"2012.1.00076.S,2012.1.00080.S,2012.1.00088.S,2...","[Institute for Astronomy, University of Edinbu..."
1,":Klitsch, Anne:Péroux, Céline:Zwaan, Martin A....",2019,2019MNRAS.490.1220K,MNRAS,"2017.1.00466.S,2017.1.00467.S,2017.1.00471.S,2...","[European Southern Observatory, Karl-Schwarzsc..."
2,":Liu, Daizhong:Schinnerer, E.:Groves, B.:Magne...",2019,2019ApJ...887..235L,ApJ,"2011.0.00064.S,2011.0.00097.S,2011.0.00539.S,2...","[Max-Planck-Institut für Astronomie, Königstuh..."
3,":Liu, Daizhong:Lang, P.:Magnelli, B.:Schinnere...",2019,2019ApJS..244...40L,ApJS,"2011.0.00064.S,2011.0.00097.S,2011.0.00539.S,2...","[Max-Planck-Institut für Astronomie, Königstuh..."
4,":Fudamoto, Yoshinobu:Oesch, P. A.:Magnelli, B....",2020,2020MNRAS.491.4724F,MNRAS,"2011.0.00064.S,2011.0.00097.S,2011.0.00539.S,2...","[Department of Astronomy, University of Geneva..."


In [16]:
# Uncomment the next line to write the current dataframe to the pickle file.
#df_xml.to_pickle('./df_telbib_alma_aff.pkl')
# Load the dataframe from the local pickle file.
df_xml = pd.read_pickle('./df_telbib_alma_aff.pkl')
#df_xml.head()

# 3. Add countries per author
## Manual fixes may be needed because the affiliations are sometimes incomplete.

In [201]:

# Ordered keyword rules used by addcountries: the first rule that has any
# keyword contained in the candidate string wins, so the order is significant.
_COUNTRY_RULES = (
    ('UK', ('UK', 'United Kingdom', 'DH1 3LE', 'London', 'EH9 3HJ',
            'WC1E 6BT', 'CB3 0HA')),
    ('USA', ('United States', 'USA')),
    ('Korea', ('Korea',)),
    ('Mexico', ('México', 'MÉxico')),
    ('Japan', ('Japan', 'Tokyo', 'Japa', 'Nagano', 'Chiba', 'Niigata',
               'Ibaraki', 'Sagamihara', 'Ehime', 'Kyoto', 'Osaka', 'Nagoya',
               'Kanagawa', 'Hokkaido',
               'Kavli Institute for The Physics of The universe', 'NAOJ')),
    ('France', ('France', 'Pessac', 'Toulouse', 'École Normale Supérieure')),
    ('Italy', ('Italia', 'Italy')),
    # Vienna was previously reported as Italy; it is the capital of Austria.
    ('Austria', ('Vienna',)),
    ('Netherlands', ('Netherland',)),
    ('Spain', ('Spain', 'CSIC')),
    ('Canada', ('Canada',)),
    ('Australia', ('Australia', 'ASTRO 3D')),
    ('Vietnam', ('Viet Nam',)),
    ('Chile', ('Chile', 'Santiago', 'JAO')),
    ('Taiwan', ('ASIAA', 'Taïwan')),
    ('Denmark', ('Lyngby', 'Denmark')),
    ('Indonesia', ('Indonesia',)),
    ('Taiwan', ('Taipei',)),
    ('USA', ('CfA', 'Berkeley', 'Virginia',
             'National Optical Astronomy Observatory',
             'Kavli Institute for Particle Astrophysics and Cosmology and Department of Particle Physics and Astrophysics',
             'Colorado', 'Las Vegas', 'Einstein Fellow',
             'Lyman Spitzer Jr Fellow', 'NHFP Sagan Fellow',
             'NSF Astronomy and Astrophysics Postdoctoral Fellow',
             'The National Radio Astronomy Observatory')),
    ('Japan', ('WPI',)),
    ('Serbia', ('Beograd',)),
    ('Brazil', ('Brasil',)),
    ('Germany', ('Germany', 'Germnay', 'Humboldt Research Fellow')),
    ('New Zealand', ('NZ',)),
    ('Russia', ('Russia',)),
    ('India', ('Swarnajayanti',)),
    ('Sweden', ('Onsala',)),
)


def _clean_country_token(token):
    """Normalise one comma-separated piece of an affiliation string."""
    return token.replace('the ','The ').replace('.','').strip()


def _token_from_end(parts, pos):
    """Cleaned piece at negative index ``pos``, or '' when ``parts`` is too short."""
    if len(parts) < -pos:
        return ''
    return _clean_country_token(parts[pos])


def _canonical_country(name):
    """Country of the first keyword rule matching ``name``, or None."""
    for country, keywords in _COUNTRY_RULES:
        if any(keyword in name for keyword in keywords):
            return country
    return None


def addcountries(df, known_countries=None):
    """Derive one country name per affiliation for every row of ``df``.

    The candidate is the last comma-separated piece of the first
    ';'-separated segment of the affiliation (after stripping a few HTML
    entities and institute prefixes). It is then adjusted by a series of
    special cases and finally mapped through the ordered keyword rules in
    ``_COUNTRY_RULES``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs an ``affiliations`` column (iterable of strings per row) and
        an ``authors`` column (':'-separated string with a leading ':').
    known_countries : iterable of str, optional
        Names considered valid. Candidates that match no rule and are not in
        this collection are printed for manual inspection. When omitted, the
        index of a module-level ``appendix_df`` is used if one exists;
        otherwise the check is skipped.

    Returns
    -------
    list of list of str
        For each row of ``df``, the country names in affiliation order.
    """
    if known_countries is None:
        # Look the table up at call time so the function also works before
        # the cell defining ``appendix_df`` has been run.
        region_table = globals().get('appendix_df')
        if region_table is not None:
            known_countries = set(region_table.index)
    else:
        known_countries = set(known_countries)

    countries = []
    for i in df.index:
        _tmp = []
        # ``i`` is an index label, so use label-based access throughout.
        for auth,affname in enumerate(df.at[i, 'affiliations']):
            cleaned = (affname.replace('&amp;','').replace('&gt;','')
                       .replace('&lt;','').replace('CONICET;','')
                       .replace('IPAG;',''))
            parts = cleaned.split(';')[0].split(',')
            countryname = _clean_country_token(parts[-1])

            if countryname == '-':
                # The authors string starts with ':', hence the +1 offset.
                names = df.at[i, 'authors'].split(':')
                author = names[auth+1] if auth+1 < len(names) else ''
                if any(word in author
                       for word in ('Partnership', 'Collaboration', 'Team')):
                    countryname = 'Collaboration'
            if countryname == '':
                print(i,auth,'WhiteSpace')
                countryname = _token_from_end(parts, -2)
                if countryname == '':
                    countryname = _token_from_end(parts, -3)
            if 'Corresponding author' in countryname:
                # Fall back to the raw affiliation; keep ``parts`` a list so
                # the [-2] lookups below still address whole pieces.
                parts = affname.split(',')
                countryname = affname.split(' ')[-1].replace(';','')
            if '&lt' in countryname:
                countryname = _token_from_end(parts, -2)
                print(countryname)
            # Check Taiwan or China
            if ('ROC' in countryname) or ('R.O.C' in countryname) or ('China' in countryname):
                previous = _token_from_end(parts, -2)
                if ('Taiwan' in countryname) or ('Taiwan' in previous):
                    countryname = 'Taiwan'
                else:
                    countryname = 'China'

            canonical = _canonical_country(countryname)
            if canonical is not None:
                countryname = canonical
            elif re.match(r'\d+', countryname):
                # Candidate starts with digits: use the preceding piece.
                try:
                    countryname2 = parts[-2].replace('the ','The ').replace('.','')
                    print(i,auth,countryname, end='->')
                    countryname = countryname2.strip()
                    print(countryname)
                except IndexError:
                    print(i,auth,'IndexError',parts)
            elif known_countries is not None and countryname not in known_countries:
                # Unrecognised name: report it for manual follow-up.
                print(i,countryname)
            _tmp.append(countryname)
        countries.append(_tmp)
    return countries


In [203]:
df_xml['countries'] = addcountries(df_xml)

821 0 WhiteSpace
875 0 WhiteSpace
875 1 WhiteSpace
1443 0 WhiteSpace
1443 1 WhiteSpace
1443 2 WhiteSpace
1746 0 WhiteSpace
1856 0 34055->Korea
1856 6 34055->Korea


In [205]:
appendix_df = pd.read_pickle('./df_region.pkl')

# Flatten the per-row country lists into a single list.
uniquecountries = [name for per_row in df_xml.countries for name in per_row]

# Names that occur in only one of the two collections (displayed as cell output).
set(appendix_df.index).symmetric_difference(uniquecountries)

{'Unknown'}

In [206]:
def updatedf_country(bibcode,countrylist,df=None):
    """Overwrite the ``countries`` entry of the row with the given bibcode.

    Parameters
    ----------
    bibcode : str
        Value to look for in the ``bibcode`` column; the first matching row
        is updated.
    countrylist : list
        New value stored in that row's ``countries`` cell.
    df : pandas.DataFrame, optional
        Frame to modify in place. Defaults to the module-level ``df_xml``,
        resolved when the function is called rather than when it is defined,
        so a later re-assignment of ``df_xml`` is picked up.

    Raises
    ------
    IndexError
        If no row has the requested bibcode.
    """
    if df is None:
        df = df_xml
    matches = df.index[df.bibcode==bibcode]
    if len(matches) == 0:
        raise IndexError('bibcode {!r} not found in dataframe'.format(bibcode))
    # ``matches`` holds index labels, so write by label; a single .at call
    # also avoids chained assignment on a temporary column object.
    df.at[matches[0], 'countries'] = countrylist


In [207]:

# Manual overrides: replace the automatically derived country list of
# individual papers, identified by bibcode. Calls without ``df=`` rely on
# the default frame of updatedf_country.
updatedf_country('2016ApJ...822L..10I',['USA', 'Japan', 'Japan'],df=df_xml)
updatedf_country('2014ApJ...786..114L',['Taiwan', 'Taiwan', 'USA', 'Taiwan', 'Taiwan', 'Taiwan'],df=df_xml)
updatedf_country('2014AJ....148....9I',['USA','Japan'],df=df_xml)
updatedf_country('2015Sci...348..311M',['Sweden','Unknown','Unknown','Unknown','Sweden'],df=df_xml)
updatedf_country('2017SciA....3E0022P',['USA','USA','USA','USA','UK','Poland','UK','USA'],df=df_xml)
updatedf_country('2017A&A...606A..53P',['France', 'France', 'France', 'France', 'France', 'France'],
                 df=df_xml)
updatedf_country('2013A&A...555A.112P',['France', 'UK', 'France', 'UK', 'France', 'UK', 'France', 'France', 'France', 'France', 'Italy'],
                 df=df_xml)
updatedf_country('2017AdAst2017E...6A',['Malaysia', 'Malaysia', 'Taiwan', 'Malaysia'])
updatedf_country('2013AJ....146...91I',['USA', 'Japan'])
updatedf_country('2018Galax...6...94C',['Spain', 'Spain', 'Spain', 'France'])
updatedf_country('2017Sci...355.1285N',['USA', 'India', 'USA', 'USA', 'USA', 'USA'])
updatedf_country('2012ApJ...761L..32W',['Taiwan', 'USA', 'Taiwan'])
updatedf_country('2013ApJ...778..179W',['USA', 'USA', 'USA', 'UK', 'UK', 'UK', 'Germany', 'UK', 'USA', 'UK', 'UK', 'Germany', 'USA', 'China', 'UK', 'UK', 'Austria', 'Germany', 'Germany', 'Netherlands'])
updatedf_country('2013A&A...557A.132M',['Netherlands', 'Netherlands', 'Netherlands', 'Netherlands', 'Taiwan', 'Netherlands', 'Japan', 'Germany', 'Chile', 'Netherlands', 'Netherlands', 'USA', 'Chile', 'Germany'])
updatedf_country('2015ApJ...799..194C',['UK', 'UK', 'UK', 'UK', 'UK', 'UK', 'Germany', 'USA', 'Canada', 'UK', 'UK', 'Austria', 'UK', 'UK', 'Germany', 'Germany', 'Germany', 'Germany', 'Germany', 'Denmark', 'Germany', 'Netherlands'])


In [208]:
# Write the dataframe to a pickle file, then rebind df_xml to the copy read
# back from that file.
df_xml.to_pickle('./df_telbib_alma_aff_contries.pkl')
df_xml = pd.read_pickle('./df_telbib_alma_aff_contries.pkl')

In [209]:
appendix_df

Unnamed: 0,0
-,-
Belgium,EU
Bulgaria,EU
Cyprus,Europa+Africa
Greece,Europa+Africa
...,...
Armenia,Oceania+Asia
Crimea,Oceania+Asia
Georgia,Oceania+Asia
Kazakhstan,Oceania+Asia


In [149]:
df_xml.iloc[1456]

authors         :Li, Jianan:Wang, Ran:Riechers, Dominik:Walter...
year                                                         2020
bibcode                                       2020ApJ...889..162L
journal                                                       ApJ
programids                                         2015.1.01265.S
affiliations    [Department of Astronomy, School of Physics, P...
countries       [China, Corresponding author, USA, Germany, It...
Name: 1456, dtype: object