In [1]:
import re
from urllib.parse import quote_plus

import bs4
import pandas as pd
from bs4 import BeautifulSoup
from requests import get

import sinta_scraper as sinta

In [2]:
def author_extractor(soup):
    """Extract the author entries listed on one SINTA author-search result page.

    Every element matching the CSS selector ``.text-blue`` is treated as an
    author link: its text is the author name and the first run of digits in
    its ``href`` is the SINTA id.  Affiliation, NIDN and expertise areas are
    read from the elements that follow the link in document order.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML of a result page.

    Returns
    -------
    list of dict
        One dict per author with the keys ``id``, ``name``, ``NIDN``,
        ``affiliation`` and ``areas`` (a list holding at most three entries).
        ``NIDN`` and ``affiliation`` are ``None`` when the page provides no
        usable ``<dd>`` content for the author.
    """
    max_areas = 3  # the page lists at most three expertise links per author
    author_links = soup.select(".text-blue")
    result = []
    for pos, link in enumerate(author_links):
        # Name and SINTA id come straight from the author link.
        name = link.text.strip()
        author_id = re.findall(r'\d+', link['href'])[0]

        # The next author's link marks where this author's entry ends.
        next_link = author_links[pos + 1] if pos + 1 < len(author_links) else None

        # Affiliation and NIDN.  Reset for every author so that values from
        # the previous entry can never leak into this one.
        affil_name = None
        NIDN = None
        affil_dd = link.find_next('dd')
        if affil_dd is not None:
            for child in affil_dd:
                # Skip children that carry no nested content (e.g. bare text).
                if not getattr(child, 'contents', None):
                    continue
                affil_name = child.contents[0]
                NIDN = child.find_next('dd').contents[2].replace(' : ', '')

        # Expertise: walk the following <a> tags.  Anchors before the first
        # expertise link are skipped; the walk ends at the first anchor after
        # the run of expertise links, at the next author's link, at the end of
        # the document, or once ``max_areas`` entries were collected.
        areas = []
        anchor = link.find_next('a')
        while anchor is not None and anchor is not next_link and len(areas) < max_areas:
            if 'area-item-small' in (anchor.get('class') or []):
                areas.append(anchor.contents[0])
            elif areas:
                break
            anchor = anchor.find_next('a')

        result.append({'id': author_id,
                       'name': name,
                       'NIDN': NIDN,
                       'affiliation': affil_name,
                       'areas': areas
                       })
    return result

def search_author(term, timeout=30):
    """Search SINTA for authors matching ``term`` and collect every result page.

    Parameters
    ----------
    term : str
        Free-text search term (typically an author name).
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up.

    Returns
    -------
    list of dict
        The concatenated output of ``author_extractor`` for all result pages.

    Raises
    ------
    requests.HTTPError
        If the server answers any page request with an error status.
    """
    query = quote_plus(term)  # spaces become '+', other special characters are escaped

    def fetch_page(page_no):
        # Download one result page and parse it.
        url = f'https://sinta.ristekbrin.go.id/authors?page={page_no}&q={query}&search=2&ag=&sort=year2&view='
        response = get(url, timeout=timeout)
        response.raise_for_status()
        return BeautifulSoup(response.content, 'html.parser')

    soup = fetch_page(1)

    # The table caption holds the page counter; its second integer is taken as
    # the number of pages.  Without a usable caption only this page is read.
    caption = soup.find('caption')
    numbers = []
    if caption is not None and caption.contents:
        numbers = [int(s) for s in caption.contents[0].split() if s.isdigit()]
    p_max = numbers[1] if len(numbers) > 1 else 1

    result = list(author_extractor(soup))
    # Pages 2..p_max inclusive; never request a page beyond the last one.
    for page_no in range(2, p_max + 1):
        result.extend(author_extractor(fetch_page(page_no)))
    return result

In [3]:
# Get the authors of the Biology department programmes (S1, S2, S3) from SINTA
dept_id = ['46201', '46101', '46001']
affil_id = '384'
dept_frames = []
for de in dept_id:
    # Tag every author row with the department code it was fetched for.
    dept_frames.append(pd.DataFrame(sinta.dept_authors(de, affil_id)).assign(department=de))
# Concatenate once (DataFrame.append no longer exists in pandas >= 2.0).
ugm_bio_SINTA = pd.concat(dept_frames)

# Load the manually curated list; keep NIDN as text so leading zeros survive
ugm_bio_MANUAL = pd.read_csv('../data/UGM/data_NIDN-versi-sinta.txt', index_col=0,
                             dtype={'NIDN': str})

# People present both in the manual list and in SINTA (matched by name)
ugm_bio_MERGE = ugm_bio_MANUAL.merge(ugm_bio_SINTA, on='name')

# People of the manual list that are not listed in SINTA yet
listed_mask = ugm_bio_MANUAL['name'].isin(ugm_bio_SINTA['name'])
ugm_bio_not_listed = ugm_bio_MANUAL[~listed_mask]

# Combine both groups; rows keep the index of the frame they came from
ugm_bio_ALL = pd.concat([ugm_bio_MERGE, ugm_bio_not_listed])

# Drop retired faculty members
ugm_bio_RETIRED = pd.read_csv('../data/UGM/ugm_bio_retired.txt')
retired_mask = ugm_bio_ALL['name'].isin(ugm_bio_RETIRED['Retired'])
ugm_bio_ALL = ugm_bio_ALL[~retired_mask]
ugm_bio_ALL

Unnamed: 0,name,NIDN,id,department
0,Tri Rini Nuringtyas,0027037304,21396,46101
1,Woro Anindito Sri Tunjung,0520077903,6021239,46201
2,Lisna Hidayati,0021038604,6002859,46201
4,Aries Bagus Sasongko,0511038303,6017558,46201
5,Suwarno Hadisusanto,0016115402,6017929,46201
...,...,...,...,...
26,Arief Muammar,1120170055,,
46,Donan Satria Yudha,05 261080 01,,
49,Dila Hening Windyaraini,1120160047,,
56,Luthfi Nurhidayat,0014118703,,


In [4]:
# Missing SINTA ID: search SINTA by name for everyone without an id and keep
# only the hits affiliated with Universitas Gadjah Mada.
missing_SINTAID = ugm_bio_ALL[pd.isnull(ugm_bio_ALL['id'])]
nama = missing_SINTAID['name']
sintaid_frames = []
for person in nama:
    hits = pd.DataFrame(search_author(person))
    if hits.empty:
        # No search result at all: there is no 'affiliation' column to filter on.
        continue
    sintaid_frames.append(hits[hits['affiliation'] == 'Universitas Gadjah Mada'])
# Concatenate once (DataFrame.append no longer exists in pandas >= 2.0).
if sintaid_frames:
    output_missing_SINTAID = pd.concat(sintaid_frames)
else:
    output_missing_SINTAID = pd.DataFrame(columns=['id', 'name', 'NIDN', 'affiliation', 'areas'])
output_missing_SINTAID

Unnamed: 0,id,name,NIDN,affiliation,areas
0,257321,RARASTOETI PRATIWI,0025106305,Universitas Gadjah Mada,"[Biochemistry, Molecular Biology, Immunology]"
0,21472,YEKTI ASIH PURWESTRI,0023057104,Universitas Gadjah Mada,[]
0,5973460,ENDANG SEMIARTI,00231162o4,Universitas Gadjah Mada,[]
0,42920,EKO AGUS SUYONO,0018127101,Universitas Gadjah Mada,"[Algal Culture, Biotechnology & Engineering]"
0,6014421,MATIN NUHAMUNADA,1120170053,Universitas Gadjah Mada,"[Biotechnology, Systems & Synthetic Biology, M..."
0,6014471,MUKHLISH JAMAL MUSA HOLLE,1120170105,Universitas Gadjah Mada,[Ecology]
1,6023357,INDRA LESMANA,1120170054,Universitas Gadjah Mada,[Molecular Genetics]
1,6017987,ARIEF MUAMMAR,1120170055,Universitas Gadjah Mada,[]
0,5981593,DONAN SATRIA YUDHA,05 261080 01,Universitas Gadjah Mada,[]
0,6014567,DILA HENING WINDYARAINI,1120160047,Universitas Gadjah Mada,"[Parasitology, Helminthology, Protozoology]"


In [5]:
# Missing NIDN: search SINTA by name for everyone without an NIDN and keep
# only the hits affiliated with Universitas Gadjah Mada.
missing_NIDN = ugm_bio_ALL[pd.isnull(ugm_bio_ALL['NIDN'])]
nama = missing_NIDN['name']
nidn_frames = []
for person in nama:
    hits = pd.DataFrame(search_author(person))
    if hits.empty:
        # No search result at all: there is no 'affiliation' column to filter on.
        continue
    nidn_frames.append(hits[hits['affiliation'] == 'Universitas Gadjah Mada'])
# Concatenate once (DataFrame.append no longer exists in pandas >= 2.0).
if nidn_frames:
    output_missing_NIDN = pd.concat(nidn_frames)
else:
    output_missing_NIDN = pd.DataFrame(columns=['id', 'name', 'NIDN', 'affiliation', 'areas'])
output_missing_NIDN

Unnamed: 0,id,name,NIDN,affiliation,areas
0,6200486,AKBAR REZA,10049103,Universitas Gadjah Mada,"[Coral Reef Ecology, Seascape Ecology, Marine ..."
0,6721418,NOVITA YUSTINADIAR,29038702,Universitas Gadjah Mada,"[Plant Anatomy, Plant Structure and Developmen..."
0,6690307,ANNAS RABBANI,29088904,Universitas Gadjah Mada,[Plant Systematic]
0,6721424,NUR INDAH SEPTRIANI,12098904,Universitas Gadjah Mada,"[Animal Biology, Animal Development, Fisheries..."
0,6751699,SIDIQ PERMANA PUTRA,13089201,Universitas Gadjah Mada,[]
10,6042638,UTAMININGSIH,18048708,Universitas Gadjah Mada,[Plant structure and development]
0,6760087,WAHYU ARISTYANING PUTRI,28088807,Universitas Gadjah Mada,"[biotechnology, microbiology, virology]"
0,6760508,DWI SENDI PRIYONO,30099206,Universitas Gadjah Mada,"[Molecular Ecology, Evolution, Genetic Conserv..."


In [6]:
# Fill the gaps of the faculty table with the SINTA search results and save it.
# Only rows that already exist are updated: a search hit whose key is unknown
# to the table is ignored instead of being added as a new, mostly empty row.
df1 = ugm_bio_ALL.copy()

# 1) NIDN looked up by SINTA id
nidn_by_id = (output_missing_NIDN
              .dropna(subset=['id', 'NIDN'])
              .drop_duplicates(subset='id', keep='last')  # one NIDN per id
              .set_index('id')['NIDN'])
id_known = df1['id'].isin(nidn_by_id.index)
df1.loc[id_known, 'NIDN'] = df1.loc[id_known, 'id'].map(nidn_by_id)

# 2) SINTA id looked up by NIDN (compared before whitespace is stripped below)
id_by_nidn = (output_missing_SINTAID
              .dropna(subset=['id', 'NIDN'])
              .drop_duplicates(subset='NIDN', keep='last')  # one id per NIDN
              .set_index('NIDN')['id'])
nidn_known = df1['NIDN'].isin(id_by_nidn.index)
df1.loc[nidn_known, 'id'] = df1.loc[nidn_known, 'NIDN'].map(id_by_nidn)

df1 = df1.reset_index(drop=True)
# Remove blanks inside NIDN strings; missing or non-text values are left as they are.
df1['NIDN'] = df1['NIDN'].map(lambda v: v.replace(' ', '') if isinstance(v, str) else v)
df1.to_csv('../tables/ugm_bio_clean.csv')

In [9]:
# Read the identifiers as text: numeric parsing would drop the leading zeros of
# NIDN and could give the merge key 'id' a different dtype in the two tables.
df = pd.read_csv('../tables/ugm_bio_clean.csv', index_col=0,
                 dtype={'id': str, 'NIDN': str})
# The SINTA details are read from a cached file; it was produced once with:
#sinta_get = sinta.authors(df.id)
#df2 = pd.DataFrame(sinta_get)
#df2.to_csv('../tables/ugm_bio_clean_sinta.csv')
df2 = pd.read_csv('../tables/ugm_bio_clean_sinta.csv', index_col=0,
                  dtype={'id': str})
# Inner join: only people with SINTA details are kept.
df = df.merge(df2, on='id')
df['Instansi'] = 'Universitas Gadjah Mada'
df.to_csv('../tables/ugm_bio_ready.csv')