In [7]:
import re
import bs4
from bs4 import BeautifulSoup
from requests import get

import pandas as pd
import sinta_scraper as sinta

import numpy as np

In [8]:
def _is_area_link(tag):
    """Return True when *tag* is an element whose first CSS class is 'area-item-small'."""
    if tag is None:
        return False
    classes = tag.get('class') or []
    return bool(classes) and classes[0] == 'area-item-small'


def page_extractor(soup):
    """Extract one record per author link found on a parsed search-result page.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML of a result page. Every element matched by the CSS
        selector ``.text-blue`` is treated as an author link.

    Returns
    -------
    list of dict
        One dict per author link with the keys ``id`` (first run of digits in
        the link's ``href``), ``name`` (stripped link text), ``NIDN``,
        ``affiliation`` and ``areas`` (at most three expertise labels).
        ``NIDN`` and ``affiliation`` are ``None`` when no ``<dd>`` element
        follows the author link.

    Raises
    ------
    IndexError
        If an author link's ``href`` contains no digits, or an element has
        fewer children than the positions read below.
    """
    result = []
    for author_link in soup.select(".text-blue"):
        # Name and id: the id is the first group of digits in the href.
        name = author_link.text.strip()
        author_id = re.findall(r'\d+', author_link['href'])[0]

        # Affiliation and NIDN. Both are reset for every author so that a
        # missing or empty <dd> cannot silently reuse the previous author's
        # values (or raise NameError on the first author).
        affil_name = None
        NIDN = None
        affil_dd = author_link.find_next('dd')
        if affil_dd is not None:
            for child in affil_dd:
                # First child node of the element inside the <dd>.
                affil_name = child.contents[0]
                # Third child node of the following <dd>, with the ' : '
                # separator removed.
                NIDN = child.find_next('dd').contents[2].replace(' : ', '')

        # Expertise: start at the next 'area-item-small' link and keep taking
        # the directly following <a> elements while they are expertise links,
        # up to three in total.
        # NOTE(review): the first lookup scans the rest of the document, so an
        # author without any expertise link presumably picks up the next
        # author's entries - confirm against the page markup.
        areas = []
        area_link = author_link.find_next('a', {'class': "area-item-small"})
        while area_link is not None and len(areas) < 3:
            areas.append(area_link.contents[0])
            area_link = area_link.find_next('a')
            # Stop on the end of the document (None), on a link without a
            # class attribute, or on a link of any other class.
            if not _is_area_link(area_link):
                break

        result.append({'id': author_id,
                       'name': name,
                       'NIDN': NIDN,
                       'affiliation': affil_name,
                       'areas': areas
                       })
    return result

def search_author(term):
    """Search the SINTA author listing for *term* and collect every result page.

    Parameters
    ----------
    term : str
        Search text. Spaces are replaced by ``+`` before the text is placed
        in the query string; no other escaping is applied.

    Returns
    -------
    list of dict
        The concatenated output of ``page_extractor`` for result pages
        1 through the last page reported by the first page's ``<caption>``.
        When the caption is missing or holds fewer than two integers, only
        the first page is read.
    """
    term = term.replace(' ', '+')  # format search for url

    def _load_page(page_no):
        """Download and parse result page number *page_no*."""
        url = f'https://sinta.ristekbrin.go.id/authors?page={page_no}&q={term}&search=2&ag=&sort=year2&view='
        html = get(url)
        return BeautifulSoup(html.content, 'html.parser')

    soup = _load_page(1)

    # Number of pages: the second integer in the caption's first text node
    # (presumably of the form "Page 1 of N ..." - verify against the site).
    p_max = 1
    caption = soup.find('caption')
    if caption is not None and caption.contents:
        numbers = [int(s) for s in caption.contents[0].split() if s.isdigit()]
        if len(numbers) > 1:
            p_max = numbers[1]

    # Results from the first page.
    result = list(page_extractor(soup))

    # Remaining pages are 2..p_max inclusive. The previous loop requested
    # pages 2..p_max+1 (one page past the end) and logged each request with
    # the number of the page before it.
    for page_no in range(2, p_max + 1):
        print(f'{term}: retrieving page {page_no} of {p_max}')
        result.extend(page_extractor(_load_page(page_no)))
    return result

In [11]:
# Get authors from the biology departments (S1, S2, S3)
affil_name = 'ipb'
affil_name_long = 'Institut Pertanian Bogor'
affil_id = '428'
dept_id = [46002, 54107, 54113, 54007, 46102, 46004, 47101, 54153, 47202, 46104, 54208, 46103, 46101, 54132, 54053, 46003, 46201, 54032]

# Build one frame per department and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0 and re-copied the whole
# accumulated frame on every iteration.
dept_frames = []
for dept in dept_id:
    dept_authors = pd.DataFrame(sinta.dept_authors(dept, affil_id))
    #print(dept, dept_authors)
    dept_authors.loc[:, 'department'] = dept  # remember which department each row came from
    dept_frames.append(dept_authors)
SINTA = pd.concat(dept_frames) if dept_frames else pd.DataFrame()
SINTA['NIDN'] = np.nan  # placeholder column; the values are filled in by a later cell
ALL = SINTA

# Load data in manual
#MANUAL = pd.read_csv('../data/ITB/data_NIDN-versi-sinta.txt', index_col=0, dtype=str)

# Merge both in Manual and in SINTA
#MERGE = MANUAL.merge(SINTA, left_on='name', right_on='name')

# Not listed yet in SINTA
#mask = MANUAL['name'].isin(SINTA.name)
#not_listed = MANUAL[~mask]

# Append all data together
#ALL = MERGE.append(not_listed)

# Filter out retired faculty members: drop every row whose name appears in
# the 'Retired' column of the retired-staff file.
RETIRED = pd.read_csv(f'../data/{affil_name.upper()}/{affil_name}_bio_retired.txt')
mask = ALL.name.isin(RETIRED.Retired)
ALL = ALL[~mask]
#ALL

#ALL.loc[ALL[ALL.id.isnull()].index, 'id'] = ALL.loc[ALL[ALL.id.isnull()].index, 'id_manual']
#ALL = ALL.drop(columns=['id_manual'])


# Sort by name, renumber the rows and save the combined list.
ALL = ALL.sort_values(by=['name']).reset_index(drop=True)
ALL.to_csv(f'../data/{affil_name.upper()}/all.csv')
ALL

Unnamed: 0,id,name,department,NIDN
0,6186794,Abdul Haris Mustari,54153,
1,5996964,Achmad Farajallah,46002,
2,6000226,Agus Ghautsun Niam,54208,
3,6038006,Agus Hikmat,54153,
4,5985693,Agus Priyono Kartono,54053,
...,...,...,...,...
112,6165395,Yohana Caecilia Sulistyaningsih,46103,
113,36148,Yohanes Aris Purwanto,54208,
114,5988733,Yudiwanti Wahyu Endro Kusumo,54007,
115,5976282,Yuliana Maria Diah Ratnadewi,46003,


In [12]:
# Missing NIDN: search SINTA by name for every author without an NIDN and
# keep only the hits whose affiliation equals affil_name_long.
missing_NIDN = ALL[pd.isnull(ALL.NIDN)]
nama = missing_NIDN.name

# Collect one filtered frame per searched name and concatenate once.
# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated
# frame on every iteration.
hit_frames = []
for author_name in nama:
    print(author_name)
    records = search_author(author_name)
    if not records:
        # A search without hits yields a frame with no 'affiliation' column,
        # so the filter below would fail; skip it instead.
        continue
    hits = pd.DataFrame(records)
    hit_frames.append(hits[hits.affiliation == affil_name_long])

if hit_frames:
    output_missing_NIDN = pd.concat(hit_frames)
else:
    # Keep the expected columns so later cells can still index by 'id'.
    output_missing_NIDN = pd.DataFrame(columns=['id', 'name', 'NIDN', 'affiliation', 'areas'])
output_missing_NIDN

Abdul Haris Mustari
Achmad Farajallah
Agus Ghautsun Niam
Agus Hikmat
Agus Priyono Kartono
Agus Sutejo
Akhmad Endang Zainal Hasan
Amrozi
Ani Mardiastuti
Anja Meryandini
Antonius Suwanto
Aris Tjahjoleksono
Aris Tri Wahyudi
Awang Maharijaya
Bambang Purwantara
Bambang Suryobroto
Berry Juliandi
Burhanuddin Masyud
Darda Efendi
Dede Aulia Rahman
Dede Setiadi
Dedy Duryadi Solihin
Desrial
Desta Wirnas
Dewi Sukma
Didy Sopandie
Dimas Andrianto
Dinar Tri Soelistyowati
Djarot Sasongko Hami Seno
Dorly
Dwi Andreas Santosa
Edy Hartulistiyoso
Gatot Pramuhadi
Gayuh Rahayu
Gunawan Djajakirana
Hadisunarso
Hajrial Aswidinnoor
Hamim
Hamim: retrieving page 1 of 5
Hamim: retrieving page 2 of 5
Hamim: retrieving page 3 of 5
Hamim: retrieving page 4 of 5
Hamim: retrieving page 5 of 5
Hasim
Hasim: retrieving page 1 of 4
Hasim: retrieving page 2 of 4
Hasim: retrieving page 3 of 4
Hasim: retrieving page 4 of 4
Hilda Akmal
Hirmas Fuady Putra
Husnawati
I Made Artika
Ibnul Qayim
Iman Rusmana
Iman Supriatna
Ivan Perma

Unnamed: 0,id,name,NIDN,affiliation,areas
0,6186794,ABDUL HARIS MUSTARI,0015106508,Institut Pertanian Bogor,[Ekologi Satwaliar]
0,5996964,ACHMAD FARAJALLAH,0027046503,Institut Pertanian Bogor,[Zoology]
0,6000226,AGUS GHAUTSUN NIAM,0011068206,Institut Pertanian Bogor,[Biosystem engineering]
0,6038006,AGUS HIKMAT,0018096208,Institut Pertanian Bogor,"[Plant Conservation, Plant Ecology, Ethnobotany]"
0,5985693,AGUS PRIYONO KARTONO,0021026609,Institut Pertanian Bogor,"[Biodiversity Conservation, Mammals Ecology, B..."
...,...,...,...,...,...
0,6165395,YOHANA CAECILIA SULISTYANINGSIH,0011056209,Institut Pertanian Bogor,"[Medicinal plant, Plant structure and histoche..."
0,36148,YOHANES ARIS PURWANTO,0007036406,Institut Pertanian Bogor,"[food engineering, postharvest engineering, re..."
0,5988733,YUDIWANTI WAHYU ENDRO KUSUMO,0007116311,Institut Pertanian Bogor,[Pant Breeding]
0,5976282,YULIANA MARIA DIAH RATNADEWI,0026035706,Institut Pertanian Bogor,[secondary metabolism in in vitro culture]


In [13]:
# Copy the NIDN found by the name search into the author list, matching rows
# on the SINTA id.
#
# The lookup is de-duplicated first (first occurrence wins): the same author
# can be returned by more than one name search, and a duplicated id would
# make a per-id lookup return several values instead of one.
df2 = output_missing_NIDN.drop_duplicates(subset='id').set_index('id', drop=False)

# Compare ids as strings so a dtype difference between the two sources
# cannot prevent a match.
nidn_by_id = df2['NIDN'].copy()
nidn_by_id.index = nidn_by_id.index.astype(str)

# Vectorised fill. Ids that are missing from ALL are simply ignored here;
# the previous row-by-row .loc assignment appended a new, otherwise empty
# row for each of them.
df1 = ALL.copy()
found_nidn = df1['id'].astype(str).map(nidn_by_id)
df1['NIDN'] = found_nidn.where(found_nidn.notna(), df1['NIDN'])

# A reverse fill (SINTA id looked up by NIDN from `output_missing_SINTAID`)
# and whitespace stripping of NIDN were previously sketched here but are
# disabled.

# Drop rows without a name, renumber and save.
df1 = df1[~df1.name.isnull()]
df1 = df1.reset_index(drop=True)
df1.to_csv(f'../tables/{affil_name}_bio_clean.csv')

In [14]:
# Paths of the three tables this cell reads and writes.
clean_csv = f'../tables/{affil_name}_bio_clean.csv'
sinta_csv = f'../tables/{affil_name}_bio_clean_sinta.csv'
ready_csv = f'../tables/{affil_name}_bio_ready.csv'

# Reload the cleaned author table with every column read as text.
df = pd.read_csv(clean_csv, index_col=0, dtype=str)

# Fetch the per-author details for all ids, then round-trip them through a
# CSV file so they are also read back as text (presumably to make the 'id'
# columns of both tables comparable - confirm).
sinta_get = sinta.authors(df.id)
df2 = pd.DataFrame(sinta_get)
df2.to_csv(sinta_csv)
df2 = pd.read_csv(sinta_csv, index_col=0, dtype=str)

# Inner-join both tables on 'id', tag every row with the institution name
# and write the final table.
df = df.merge(df2, on='id').assign(Instansi=affil_name_long)
df.to_csv(ready_csv)