## Load Libraries

In [11]:
from io import BytesIO
from urllib.parse import quote_plus

import bs4
import pandas as pd
import pycurl
from bs4 import BeautifulSoup
from pandas import DataFrame
from tqdm import tqdm_notebook

## Define Functions

In [4]:
def search(url):
    """Download ``url`` with pycurl and return the response parsed as HTML.

    Parameters
    ----------
    url : str
        Address of the page to fetch.

    Returns
    -------
    bs4.BeautifulSoup
        The response body parsed with the built-in ``html.parser``.

    Raises
    ------
    pycurl.error
        Propagated from ``perform()`` when the transfer fails.
    """
    buffer = BytesIO()
    c = pycurl.Curl()
    try:
        c.setopt(c.URL, url)
        c.setopt(c.WRITEDATA, buffer)
        c.perform()
    finally:
        # Always release the curl handle, including when the transfer raises.
        c.close()
    return BeautifulSoup(buffer.getvalue(), 'html.parser')

def get_author_id(term, affiliation_id='384'):
    """Look up an author's SINTA ID through the affiliation author search page.

    Parameters
    ----------
    term : str
        Author name. It is upper-cased before searching and matching.
    affiliation_id : str or int, optional
        Value of the ``id`` query parameter of the affiliation page.
        Defaults to ``'384'``, the value this notebook has always used.

    Returns
    -------
    str
        The ``href`` of the matching anchor with the
        ``/authors/detail/?id=`` prefix and ``&view=overview`` suffix removed.
        When several anchors match, the last one on the page wins.

    Raises
    ------
    ValueError
        If no anchor on the result page matches the upper-cased name.
    """
    term_up = term.upper()
    # quote_plus turns spaces into '+' (as before) and also escapes characters
    # such as '&', '#' or non-ASCII letters that would otherwise corrupt the query.
    term_ref = quote_plus(term_up)

    url = ('http://sinta2.ristekdikti.go.id/affiliations/detail?q=' + term_ref
           + '&search=1&view=authors&id=' + str(affiliation_id))
    author_search = search(url)

    auth_idx = None
    for a in author_search.find_all('a', href=True):
        # `in` on a Tag tests its direct children, so this matches anchors
        # that hold a text node equal to the upper-cased name.
        if term_up in a:
            url_id = a['href']
            auth_idx = (url_id
                        .replace('/authors/detail/?id=', "")
                        .replace('&view=overview', ""))

    if auth_idx is None:
        raise ValueError('no SINTA author found for ' + repr(term))
    return auth_idx

def get_author_rank(auth_id1):
    """Print the ranking and publication statistics shown on an author's page.

    Fetches the author overview page for ``auth_id1`` and prints one line per
    statistic found: national rank, affiliation rank, SINTA score, and the
    Scopus / Google Scholar figures. Statistics whose label or value element
    is absent from the page are skipped.

    Parameters
    ----------
    auth_id1 : str
        SINTA author ID; it is concatenated into the page URL.

    Returns
    -------
    None
    """
    url2 = 'http://sinta2.ristekdikti.go.id/authors/detail?id='+auth_id1+'&view=overview'
    info = search(url2)

    # (text looked for in the label, prefix printed before the value)
    text_labels = (
        ("Rank in National", 'Rank in National '),
        ("Rank in Affiliation", 'Rank in Affiliation '),
        (" Score", 'SINTA Score '),
    )
    for a in info.find_all('div', class_='stat-lbl-big'):
        for i in a.contents:
            for needle, prefix in text_labels:
                if needle in i:
                    # The value is rendered in the 'stat-num' div before the label.
                    x = a.find_previous('div', class_="stat-num")
                    if x is not None and x.contents:
                        print(prefix + str(x.contents[0]))

    # (text looked for in the child's src attribute, prefix printed before the value)
    source_labels = (
        ("scopus", 'Scopus '),
        ("scholar", 'Google Scholar '),
    )
    for a in info.find_all('div', class_='uk-width-2-10 uk-row-first stat-lbl-pub'):
        for i in a.contents:
            # Text nodes and tags without a src attribute cannot identify a source.
            if not isinstance(i, bs4.element.Tag):
                continue
            src = i.get('src', '')
            for needle, prefix in source_labels:
                if needle in src:
                    x = a.find_next('div', class_="uk-width-2-10 stat-num-pub")
                    if x is not None and x.contents:
                        print(prefix + str(x.contents[0]))

def get_author_info(term):
    """Collect an author's SINTA ID and statistics as one table row.

    Resolves ``term`` to a SINTA ID with ``get_author_id``, fetches the
    author overview page and reads the five statistics from it.

    Parameters
    ----------
    term : str
        Author name to look up.

    Returns
    -------
    list
        ``[term, sinta_id, rank_national, rank_affiliation, sinta_score,
        scopus, scholar]``; the five statistics are plain strings taken
        from the page.

    Raises
    ------
    ValueError
        If any of the five statistics cannot be found on the page
        (previously this surfaced as an ``UnboundLocalError``).
    """
    auth_id2 = get_author_id(term)

    url2 = 'http://sinta2.ristekdikti.go.id/authors/detail?id='+auth_id2+'&view=overview'
    info = search(url2)

    stats = {}

    def _store(key, value_div):
        # Keep the first child of the value div as a plain str so the row does
        # not hold a reference to the whole parse tree.
        if value_div is not None and value_div.contents:
            stats[key] = str(value_div.contents[0])

    # (text looked for in the label, key under which the value is stored)
    text_labels = (
        ("Rank in National", 'Rank National'),
        ("Rank in Affiliation", 'Rank Affiliation'),
        (" Score", 'Sinta Score'),
    )
    for a in info.find_all('div', class_='stat-lbl-big'):
        for i in a.contents:
            for needle, key in text_labels:
                if needle in i:
                    # The value is rendered in the 'stat-num' div before the label.
                    _store(key, a.find_previous('div', class_="stat-num"))

    # (text looked for in the child's src attribute, key for the value)
    source_labels = (
        ("scopus", 'Scopus'),
        ("scholar", 'Scholar'),
    )
    for a in info.find_all('div', class_='uk-width-2-10 uk-row-first stat-lbl-pub'):
        for i in a.contents:
            # Text nodes and tags without a src attribute cannot identify a source.
            if not isinstance(i, bs4.element.Tag):
                continue
            src = i.get('src', '')
            for needle, key in source_labels:
                if needle in src:
                    _store(key, a.find_next('div', class_="uk-width-2-10 stat-num-pub"))

    wanted = ('Rank National', 'Rank Affiliation', 'Sinta Score', 'Scopus', 'Scholar')
    missing = [key for key in wanted if key not in stats]
    if missing:
        raise ValueError('author page for ' + repr(term) + ' lacks: ' + ', '.join(missing))

    df_data = [term, auth_id2] + [stats[key] for key in wanted]
    return df_data

## Test

In [5]:
# Smoke test for get_author_id: resolve one author name to its SINTA ID.
# author_id2 is reused by other cells to build the author-detail URL.
term = 'Matin Nuhamunada'
author_id2 = get_author_id(term)
author_id2

'6014421'

In [8]:
# Fetch and parse the author's overview page for the SINTA ID held in
# author_id2 (the URL has the same form as the one built in get_author_rank).
url2 = 'http://sinta2.ristekdikti.go.id/authors/detail?id='+author_id2+'&view=overview'
info = search(url2)
#info

In [12]:
# Empty results table with one column per value returned by get_author_info;
# the index is labelled 'No'.
df_data = DataFrame(
    columns=[
        'Nama',
        "SINTA ID",
        "Rank National",
        "Rank Affiliation",
        "Sinta Score",
        "Scopus",
        "Scholar",
    ]
).rename_axis('No')

In [13]:
# Smoke test with two authors: each lookup result becomes one row of df_data.
biologi = {'Nama': ["Matin Nuhamunada", "Indra Lesmana"]}
df_biologi = DataFrame(data=biologi)
df_biologi.index.name = 'No'
row_total = len(df_biologi)
for row in tqdm_notebook(range(row_total)):
    author_name = df_biologi.loc[row, "Nama"]
    df_data.loc[row] = get_author_info(author_name)
df_data

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))




Unnamed: 0_level_0,Nama,SINTA ID,Rank National,Rank Affiliation,Sinta Score,Scopus,Scholar
No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,Matin Nuhamunada,6014421,11023,848,1.37,3,5
1,Indra Lesmana,6023357,22802,1205,0.29,2,3


# Data Mining of Lecturers from the Faculty of Biology UGM

In [20]:
# Download and parse the faculty's lecturer listing; the cells below walk this
# parse tree to pull out the lecturer names.
lecturer = search('http://biologi.ugm.ac.id/en/lectures/')
#lecturer

In [66]:
# First extraction attempt: collect the leading text of every tag nested
# directly inside a <strong> element of the tab contents.
# Note: lec_list is rebuilt from scratch by later cells in this notebook.
lec_list = []
for i in lecturer.find_all('div', class_='elementor-tab-content elementor-clearfix'):
    for a in i.contents:
        # Only tags can be descended into; skip text nodes between elements.
        if not isinstance(a, bs4.element.Tag):
            continue
        for x in a.contents:
            if not isinstance(x, bs4.element.Tag):
                continue
            for b in x.find_all('strong'):
                for c in b.contents:
                    # Keep only nested tags whose first child is plain text;
                    # empty tags are skipped instead of raising IndexError.
                    if (isinstance(c, bs4.element.Tag)
                            and c.contents
                            and isinstance(c.contents[0], bs4.element.NavigableString)):
                        lec_list.append(str(c.contents[0]))

In [178]:
# Second extraction attempt: for every <strong> element in the tab contents,
# collect its direct text children and the leading text of its nested tags.
# Note: lec_list is rebuilt from scratch by a later cell in this notebook.
lec_list = []
for i in lecturer.find_all('div', class_='elementor-tab-content elementor-clearfix'):
    for a in i.contents:
        # Only tags can be descended into; skip text nodes between elements.
        if not isinstance(a, bs4.element.Tag):
            continue
        for x in a.contents:
            if not isinstance(x, bs4.element.Tag):
                continue
            for b in x.find_all('strong'):
                for c in b.contents:
                    if isinstance(c, bs4.element.NavigableString):
                        # Text sitting directly inside <strong>.
                        lec_list.append(str(c))
                    elif c.contents and isinstance(c.contents[0], bs4.element.NavigableString):
                        # Nested tag: take its leading text; empty tags are skipped.
                        lec_list.append(str(c.contents[0]))
#lec_list                                       

from bs4 import NavigableString
# Final extraction: a lecturer name is the first child of the first <strong>
# found inside each link of the tab contents.
lec_list = []
for i in lecturer.find_all('div', class_='elementor-tab-content elementor-clearfix'):
    for a in i.contents:
        # Text nodes cannot be descended into; only walk element children.
        if isinstance(a, NavigableString):
            continue
        for x in a.contents:
            if isinstance(x, NavigableString):
                continue
            for b in x.find_all('a'):
                c = b.find("strong")
                # Skip links without <strong> and empty <strong> tags
                # (the latter would otherwise raise IndexError).
                if c is not None and c.contents:
                    lec_list.append(str(c.contents[0]))
lec_list

In [162]:
import re
# Academic titles, degree suffixes and separators removed from each scraped
# name. Order matters: every token is removed in turn from the result of the
# previous removal, so longer variants are listed before their shorter forms.
title_tokens = (
    'Prof. Dr.\xa0',
    'Prof. Dr. ',
    'Prof. Dra. ',
    'Dr.rer.nat. ',
    'Dr. rer.nat. ',
    'Dr.biol.hom. ',
    'Dr.med.vet. ',
    'Dra. ',
    'Drs. ',
    'Dr. ',
    'Dr.\xa0',
    'S.S., ',
    ', ',
    'M.Sc.',
    'Ph.D.',
    'M.Si.',
    'M.Si',
    'M.S.',
    'M.App.Sc.',
    'S.Si. ',
    'S.Si.',
    'S.U.',
    'M.Biotech.',
    ' M.Env.Sc',
    'M.Agr.Sc.',
    'M. Eng.',
    'M.Agr.',
    'M.Kes.',
    'M.P.',
    'DAP',
    'MS.',
    'St.',
    '\xa0',
)
replaced = []
for raw_name in lec_list:
    bare_name = raw_name
    for token in title_tokens:
        bare_name = bare_name.replace(token, '')
    replaced.append(bare_name)
#replaced

In [163]:
# Manual spelling corrections applied to the title-stripped names, followed by
# removal of any remaining '.' and ','. The pairs are applied in order, each on
# the result of the previous one.
name_fixes = (
    ('Niken Satuti Handayani', 'Niken Satuti Nur Handayani'),
    ('Ganies Riza A.', 'Ganies Riza Aristya'),
    ('L. Hartanto Nugroho', 'Laurentius Hartanto Nugroho'),
    ('Diah Rahmawati', 'Diah Rachmawati'),
    ('Rr. Upiek Ngesti W.A.', 'Rr Upiek Ngesti Wibawaning A'),
    ('Soenarwan Heri Poerwanto', 'Soenarwan Hery Poerwanto'),
    ('Dila Hening Windyarini', 'Dila Hening Windyaraini'),
    ('Susilo Hadi', 'Susilohadi'),
    ('Hendry T.S.S.G. Saragih', 'Hendry TSSG Saragih'),
    ('R.C. Hidayat Soesilohadi', 'R C Hidayat Soesilohadi'),
    ('Mulyati Sarto', 'Mulyati'),
    ('Slamet Widianto', 'Slamet Widiyanto'),
    ('Agnes Endang Sutariningsih', 'Agnes Endang Sutariningsih S'),
    ('.', ''),
    (',', ''),
)
replaced2 = []
for stripped_name in replaced:
    fixed_name = stripped_name
    for old_text, new_text in name_fixes:
        fixed_name = fixed_name.replace(old_text, new_text)
    replaced2.append(fixed_name)

In [171]:
# Put the corrected names in a one-column frame whose index is labelled 'No'.
biologi2 = dict(Nama=replaced2)
df_biologi2 = DataFrame(data=biologi2).rename_axis('No')
# Row label 27 is excluded and the remaining rows renumbered from 0.
# NOTE(review): the notebook does not record why row 27 is dropped — confirm.
df_biologi3 = df_biologi2.drop(index=[27]).reset_index(drop=True)
#df_biologi3

In [165]:
import os, sys
from time import sleep
# Resume from the cached results in data2.csv when that file exists,
# otherwise start from an empty results table.
if not os.path.isfile('data2.csv'):
    df_data2 = DataFrame(columns=('Nama',"SINTA ID", "Rank National", "Rank Affiliation", "Sinta Score", "Scopus", "Scholar"))
    df_data2.index.name = 'No'
else:
    df_data2 = pd.read_csv('data2.csv', index_col=0)
pbar = tqdm_notebook(range(len(df_biologi3))) #to make progressbar
for i in pbar:
    # Rows already present in the table are not fetched again.
    if i in df_data2.index:
        continue
    lecturer_name = df_biologi3.loc[i, "Nama"]
    try:
        df_data2.loc[i] = get_author_info(lecturer_name)
    except Exception as exc:
        # Best effort: report the failing name and its cause, then carry on.
        # Catching Exception (not a bare except) lets Ctrl-C interrupt the run.
        print('error ' + lecturer_name + ' (' + repr(exc) + ')')
        continue
    # Iterating over pbar already advances the bar once per item, so no manual
    # pbar.update(1) is needed (it made the bar count successes twice).
    pbar.set_description('processed: %d' % (i))
    # Pause between successful requests.
    sleep(1)

HBox(children=(IntProgress(value=0, max=68), HTML(value='')))

In [176]:
#df_data2

In [167]:
# Write the collected rows to data2.csv, the file the scraping cell reads to
# resume and the ranking cell reads to build its table.
df_data2.to_csv('data2.csv')

In [181]:
# Allow up to 70 rows to be displayed so the whole ranking is visible.
pd.set_option('display.max_rows', 70)
# Reload the saved results and order them by ascending 'Rank National',
# numbering the rows from 1.
df_data3 = pd.read_csv('data2.csv', index_col=0)
rank_bio = df_data3.sort_values('Rank National')
rank_bio.index = pd.RangeIndex(start=1, stop=len(rank_bio) + 1)
rank_bio

Unnamed: 0,Nama,SINTA ID,Rank National,Rank Affiliation,Sinta Score,Scopus,Scholar
1,Endang Semiarti,5973460,121,10,47.51,24,164
2,Nastiti Wijayanti,5972899,400,50,26.54,20,79
3,Yekti Asih Purwestri,21472,969,116,15.39,6,34
4,Budi Setiadi Daryono,21420,1153,136,13.59,32,296
5,Laurentius Hartanto Nugroho,5977709,1332,159,12.26,16,72
6,Rarastoeti Pratiwi,257321,2679,299,7.66,8,91
7,Maryani,6016534,2719,304,7.57,4,13
8,Eko Agus Suyono,42920,3377,367,6.2,17,50
9,Diah Rachmawati,6014449,3530,382,5.93,7,58
10,Woro Anindito Sri Tunjung,6021239,4374,453,4.73,8,38


## Data Mining UB

In [None]:
# Fetch and parse the author's overview page for the SINTA ID in author_id2.
# NOTE(review): author_id2 is defined in the Test section, so this cell depends
# on that earlier cell having been run — confirm this is intended here.
url2 = 'http://sinta2.ristekdikti.go.id/authors/detail?id='+author_id2+'&view=overview'
info = search(url2)
#info