In [1]:
import re
from pathlib import Path
from urllib.parse import quote_plus

import bs4
import pandas as pd
import sinta_scraper as sinta
from bs4 import BeautifulSoup
from requests import get

In [2]:
def author_extractor(soup):
    """Extract author records from one parsed SINTA author-search result page.

    Every element matching the CSS class ``text-blue`` is treated as the link
    of one author. The remaining fields are read from the elements that follow
    that link in document order, up to (not including) the next author link.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML of a result page.

    Returns
    -------
    list of dict
        One dict per author with the keys ``id`` (first run of digits in the
        link's ``href``), ``name``, ``NIDN``, ``affiliation`` and ``areas``
        (at most three expertise labels, possibly empty). ``NIDN`` and
        ``affiliation`` are ``None`` when no ``<dd>`` element with children
        follows the author link.
    """
    max_areas = 3  # keep at most three expertise labels per author
    author_links = soup.select(".text-blue")
    result = []
    for position, link in enumerate(author_links):
        # Name and SINTA id come from the author link itself.
        name = link.text.strip()
        author_id = re.findall(r'\d+', link['href'])[0]

        # Reset per author so a record can never inherit the values of the
        # previous author when the expected <dd> is missing or empty.
        affil_name = None
        NIDN = None
        first_dd = link.find_next('dd')
        if first_dd is not None:
            # The children of the first <dd> carry the affiliation; the <dd>
            # after it carries the NIDN as its third content node (' : <NIDN>').
            for child in first_dd:
                affil_name = child.contents[0]
                NIDN = child.find_next('dd').contents[2].replace(' : ', '')

        # The next author's link marks the end of this author's section, so an
        # author without expertise does not pick up the next author's areas.
        if position + 1 < len(author_links):
            next_author = author_links[position + 1]
        else:
            next_author = None

        # Walk the following <a> tags: skip unrelated links until the first
        # expertise link, then collect the consecutive run of expertise links.
        areas = []
        anchor = link.find_next('a')
        while (anchor is not None
               and anchor is not next_author
               and len(areas) < max_areas):
            if 'area-item-small' in (anchor.get('class') or []):
                areas.append(anchor.contents[0])
            elif areas:
                break  # the run of expertise links has ended
            anchor = anchor.find_next('a')

        result.append({'id': author_id,
                       'name': name,
                       'NIDN': NIDN,
                       'affiliation': affil_name,
                       'areas': areas})
    return result

def _fetch_author_page(term, page_no):
    """Download one page of SINTA author-search results and parse it.

    ``term`` must already be URL-encoded. A 4xx/5xx response raises
    ``requests.HTTPError`` instead of being parsed as if it were a result page.
    """
    url = f'https://sinta.ristekbrin.go.id/authors?page={page_no}&q={term}&search=2&ag=&sort=year2&view='
    response = get(url, timeout=30)  # do not hang forever on a stalled connection
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')


def search_author(term):
    """Search SINTA for authors matching ``term`` and return every result.

    Parameters
    ----------
    term : str
        Free-text search term, e.g. an author name.

    Returns
    -------
    list of dict
        The records produced by ``author_extractor`` for every result page,
        first page first.

    Notes
    -----
    The number of pages is taken from the second integer found in the first
    content node of the page's ``<caption>`` (presumably "Page X of Y ..." --
    TODO confirm against the live markup). If the caption is missing or holds
    fewer than two integers, only the first page is read.
    """
    # Spaces become '+' exactly as before; other reserved characters are now
    # percent-encoded. '+' is kept safe so pre-formatted terms still work.
    term = quote_plus(term, safe='+')

    soup = _fetch_author_page(term, 1)

    # Work out how many result pages exist.
    p_max = 1
    caption = soup.find('caption')
    if caption is not None and caption.contents:
        numbers = [int(s) for s in caption.contents[0].split() if s.isdigit()]
        if len(numbers) > 1:
            p_max = numbers[1]

    result = list(author_extractor(soup))

    # Pages 2..p_max inclusive. The previous while-loop incremented the page
    # counter before fetching and therefore also requested page p_max + 1.
    for p_ctr in range(2, p_max + 1):
        print(f'{term}: retrieving page {p_ctr} of {p_max}')
        result.extend(author_extractor(_fetch_author_page(term, p_ctr)))
    return result

In [3]:
# Get the authors registered in SINTA for the Biology departments (S1, S2, S3)
dept_id = ['46201', '46101', '46001', '46202', '46203', '54107']
affil_id = '379'
dept_frames = []
for de in dept_id:
    data = pd.DataFrame(sinta.dept_authors(de, affil_id))
    data['department'] = de
    dept_frames.append(data)
# DataFrame.append was removed in pandas 2.0; collect the per-department
# frames and concatenate them once instead of growing a frame in the loop.
SINTA = pd.concat(dept_frames)

# Load the manually curated list
MANUAL = pd.read_csv('../data/ITB/data_NIDN-versi-sinta.txt', index_col=0, dtype=str)

# Rows present both in MANUAL and in SINTA (matched on name)
MERGE = MANUAL.merge(SINTA, on='name')

# Not listed yet in SINTA
mask = MANUAL['name'].isin(SINTA.name)
not_listed = MANUAL[~mask]

# Stack matched and unmatched rows. ignore_index gives every row a unique
# label; MERGE and not_listed otherwise share index labels, which makes any
# later label-based selection or assignment ambiguous.
ALL = pd.concat([MERGE, not_listed], ignore_index=True)

# Filter retired faculty members
RETIRED = pd.read_csv('../data/ITB/itb_bio_retired.txt')
mask = ALL.name.isin(RETIRED.Retired)
ALL = ALL[~mask]

# Where SINTA gave no id, fall back to the manually recorded one
ALL = ALL.assign(id=ALL['id'].fillna(ALL['id_manual'])).drop(columns=['id_manual'])
ALL = ALL.sort_values(by=['name']).reset_index(drop=True)
ALL.to_csv('../data/ITB/all.csv')
ALL

Unnamed: 0,name,NIDN,id,department
0,Adi Pancoro,19055904,6034881,46201
1,Aep Supriyadi,9016101,6035303,
2,Agus Dana Permana,22085804,5990755,46201
3,Agustania,11085908,6035644,
4,Ahim Ruswandi,3067006,6035348,
...,...,...,...,...
72,Wawan Gunawan,0407047505,6665397,
73,Yayat Hidayat,12046802,20238,
74,Yeyet Setiawati,17066708,6036269,
75,Yooce Yustiana,7036712,6042267,


In [4]:
# Missing SINTA ID: search SINTA by name for every author without an id
missing_SINTAID = ALL[pd.isnull(ALL.id)]
nama = missing_SINTAID.name
records = []
for i in nama:
    print(i)
    data = search_author(i)
    # Keep only hits affiliated with ITB. Filtering the plain records (rather
    # than a DataFrame) also works when a search returns no hits at all.
    records.extend(rec for rec in data
                   if rec['affiliation'] == 'Institut Teknologi Bandung')
# Build the frame once (DataFrame.append was removed in pandas 2.0); the
# explicit columns keep the layout stable even when nothing was found.
output_missing_SINTAID = pd.DataFrame(
    records, columns=['id', 'name', 'NIDN', 'affiliation', 'areas'])
output_missing_SINTAID

In [5]:
# Missing NIDN: search SINTA by name for every author without an NIDN
missing_NIDN = ALL[pd.isnull(ALL.NIDN)]
nama = missing_NIDN.name
records = []
for i in nama:
    print(i)
    data = search_author(i)
    # Keep only hits affiliated with ITB. Filtering the plain records (rather
    # than a DataFrame) also works when a search returns no hits at all.
    records.extend(rec for rec in data
                   if rec['affiliation'] == 'Institut Teknologi Bandung')
# Build the frame once (DataFrame.append was removed in pandas 2.0); the
# explicit columns keep the layout stable even when nothing was found.
output_missing_NIDN = pd.DataFrame(
    records, columns=['id', 'name', 'NIDN', 'affiliation', 'areas'])
output_missing_NIDN

In [6]:
# Export the combined author table with a fresh 0..n-1 index.
# NOTE: values found in output_missing_NIDN / output_missing_SINTAID are not
# merged back into the table here, so `NIDN` and `id` may still contain gaps.
df1 = ALL.reset_index(drop=True)
df1.to_csv('../tables/itb_bio_clean.csv')

In [7]:
df = pd.read_csv('../tables/itb_bio_clean.csv', index_col=0, dtype=str)

# Author details from SINTA are cached on disk. Fetch them only when the cache
# is missing, so the notebook runs from a fresh checkout without editing code.
sinta_cache = Path('../tables/itb_bio_clean_sinta.csv')
if not sinta_cache.exists():
    # Rows without an id cannot be looked up (and would not merge below anyway).
    pd.DataFrame(sinta.authors(df.id.dropna())).to_csv(sinta_cache)
# Always read back from disk so every column is str, cached or freshly fetched.
df2 = pd.read_csv(sinta_cache, index_col=0, dtype=str)

df = df.merge(df2, on='id')
df['Instansi'] = 'Institut Teknologi Bandung'
df.to_csv('../tables/itb_bio_ready.csv')