In [1]:
import pandas as pd
import sinta_scraper as sinta
import numpy as np

# search_author has not been released as part of sinta_scraper yet, so make the
# repository's local scripts directory importable and load it from there.
import os
import sys
module_path = os.path.abspath('../scripts/')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
from author_search import search_author

import ast

In [2]:
def _fill_missing_from_search(authors, column, names, affil_name_long, prune_unknown_ids=False):
    """Fill null ``column`` values of ``authors`` in place using SINTA name searches.

    Each entry of ``names`` is passed to ``search_author``. Non-empty results are
    restricted to rows whose ``affiliation`` equals ``affil_name_long`` and
    accumulated; after every search, each row of ``authors`` that still lacks
    ``column`` is matched by upper-cased ``name`` against the accumulated
    results and takes the first match's ``column`` value.

    Args:
        authors: DataFrame with a unique index and at least ``name`` and
            ``column`` columns. Modified in place.
        column: Name of the column to fill (``'id'`` or ``'NIDN'``).
        names: Iterable of author names to search for.
        affil_name_long: Affiliation string the search results must match.
        prune_unknown_ids: When True, after each search the accumulated results
            are reduced to rows whose ``id`` appears in ``authors['id']``.

    NOTE(review): assumes ``search_author`` returns a list of records carrying
    ``name``, ``affiliation``, ``id`` and ``NIDN`` fields, with ``name`` already
    upper-cased — confirm against scripts/author_search.py.
    """
    found = None
    total = len(names)
    for num, author_name in enumerate(names):
        print(f'({num+1}/{total}) {author_name}')
        hits = search_author(author_name)
        if hits != []:
            hits = pd.DataFrame(hits)
            hits = hits[hits.affiliation == affil_name_long]
            # DataFrame.append was removed in pandas 2.0; concat is the replacement.
            found = hits if found is None else pd.concat([found, hits])

            # Re-check every row that is still missing the value, not just the
            # one that was searched, because results accumulate across searches.
            for idx in authors[authors[column].isnull()].index:
                key = authors.loc[idx, 'name'].upper()
                matches = found[found['name'].isin([key])]
                if len(matches) > 0:
                    authors.loc[idx, column] = matches[column].values[0]

            if prune_unknown_ids:
                found = found[found['id'].isin(authors['id'])]
        else:
            print(f'[WARNING]: {author_name} not found in SINTA Database')


def sinta_life_sciences(univ):
    """Build the life-sciences author table of one university from SINTA.

    Steps:
      1. Read ``../data/<UNIV>/<univ>_affil_information.txt``, a Python-literal
         dict with keys ``affil_name``, ``affil_name_long``, ``affil_id`` and
         ``dept_id`` (a list of department ids).
      2. Collect the authors of every department via ``sinta.dept_authors``.
      3. Outer-merge with the optional manual list ``<affil_name>_data_NIDN.txt``
         and drop names listed in the optional ``<affil_name>_bio_retired.txt``.
      4. Look up missing SINTA ids and missing NIDN values by author name.
      5. Write the intermediate and cleaned tables, fetch per-author metadata
         with ``sinta.authors`` and write ``../tables/<affil_name>_bio_ready.csv``.

    Args:
        univ: University key as used in the data folder; the folder name is
            ``univ.upper()`` and the file prefix is ``univ`` as given.

    Returns:
        pandas.DataFrame: the cleaned author table merged with the SINTA
        metadata on ``id``, with an added ``Instansi`` column holding
        ``affil_name_long``.

    Raises:
        FileNotFoundError: if the affiliation information file is missing.
            Missing manual or retired files are tolerated.
    """
    print(f'**Gathering information for {univ.upper()}...')

    with open(f"../data/{univ.upper()}/{univ}_affil_information.txt", "r", encoding='utf-8') as f:
        d = ast.literal_eval(f.read())

    affil_name = d['affil_name']
    affil_name_long = d['affil_name_long']
    affil_id = d['affil_id']
    dept_id = d['dept_id']

    print(d)

    # Get authors by dept_id.
    print(f'**Gathering authors ids...')
    dept_frames = []
    for de in dept_id:
        data = pd.DataFrame(sinta.dept_authors(de, affil_id))
        data['department'] = de
        dept_frames.append(data)
    # ignore_index gives every author a unique row label; without it each
    # department restarts at 0 and label-based .loc access hits several rows.
    SINTA = pd.concat(dept_frames, ignore_index=True) if dept_frames else pd.DataFrame()

    # Merge with the manually curated list when one exists.
    try:
        MANUAL = pd.read_csv(f'../data/{affil_name.upper()}/{affil_name}_data_NIDN.txt', index_col=0, dtype=str)
    except FileNotFoundError:
        MERGE = SINTA
    else:
        MERGE = pd.merge(MANUAL, SINTA, on=["name", "id"], how='outer')

    # Filter out retired faculty members when a retired list exists.
    # .copy() makes ALL an independent frame so the .loc writes below are safe.
    try:
        RETIRED = pd.read_csv(f'../data/{affil_name.upper()}/{affil_name}_bio_retired.txt')
    except FileNotFoundError:
        ALL = MERGE.copy()
    else:
        ALL = MERGE[~MERGE['name'].isin(RETIRED['Retired'])].copy()

    print(f'Found {len(ALL)} authors')

    # NOTE(review): NIDN presumably comes from the manual file; when it is absent
    # and SINTA supplies no such column, start from an empty one instead of
    # failing on the attribute access below — confirm against sinta.dept_authors.
    if 'NIDN' not in ALL.columns:
        ALL['NIDN'] = None

    # Get missing SINTA ids.
    missing_SINTAID = ALL[ALL['id'].isnull()]
    if len(missing_SINTAID) > 0:
        print(f'**{len(missing_SINTAID)} authors missing SINTA id. Finding missing ids...')
        _fill_missing_from_search(ALL, 'id', missing_SINTAID['name'], affil_name_long)

    # Get missing NIDN.
    missing_NIDN = ALL[ALL['NIDN'].isnull()]
    if len(missing_NIDN) > 0:
        print(f'**{len(missing_NIDN)} authors missing NIDN. Finding missing NIDN...')
        _fill_missing_from_search(ALL, 'NIDN', missing_NIDN['name'], affil_name_long,
                                  prune_unknown_ids=True)

    # Save the full table, then drop authors without a SINTA id and clean up.
    ALL.to_csv(f'../data/{affil_name.upper()}/{affil_name}_data_NIDN-versi-sinta.csv')
    ALL = ALL[~ALL['id'].isnull()].reset_index(drop=True)
    # An NIDN that could not be resolved is still NaN (a float); only strings
    # can have their spaces stripped.
    ALL['NIDN'] = [v.replace(' ', '') if isinstance(v, str) else v for v in ALL['NIDN']]
    ALL.to_csv(f'../tables/{affil_name}_bio_clean.csv')
    print(f'Total authors from {affil_name_long}: {len(ALL)}')

    # Get metadata from SINTA. The table is re-read with dtype=str so every
    # column, including id, is a string.
    print(f'**Getting information from SINTA...')
    df = pd.read_csv(f'../tables/{affil_name}_bio_clean.csv', index_col=0, dtype=str)
    sinta_get = sinta.authors(df.id)
    df2 = pd.DataFrame(sinta_get)
    df = df.merge(df2, on='id')
    df['Instansi'] = affil_name_long
    df.to_csv(f'../tables/{affil_name}_bio_ready.csv')
    print(f'[JOB DONE]: {affil_name_long}')
    return df

In [3]:
# Each non-hidden entry under ../data names one university; keep its lowercase form.
univ = [
    entry.lower()
    for entry in os.listdir('../data')
    if entry[:1] != '.'
]
univ

['ugm', 'ub', 'itb', 'ui', 'ipb']

In [4]:
# Run the whole pipeline once per university. Each call writes its CSV files
# under ../data and ../tables; the returned DataFrame is not kept here.
for u in univ:
    sinta_life_sciences(u)

**Gathering information for UGM...
{'affil_name': 'ugm', 'affil_name_long': 'Universitas Gadjah Mada', 'affil_id': 384, 'dept_id': [46201, 46101, 46001, 46202, 46102, 46002, 54111]}
**Gathering authors ids...
Found 88 authors
**1 authors missing SINTA id. Finding missing ids...
(1/1) Wiko Arif Wibowo
**1 authors missing NIDN. Finding missing NIDN...
(1/1) Wiko Arif Wibowo
Total authors from Universitas Gadjah Mada: 87
**Getting information from SINTA...
[JOB DONE]: Universitas Gadjah Mada
**Gathering information for UB...
{'affil_name': 'ub', 'affil_name_long': 'Universitas Brawijaya', 'affil_id': 404, 'dept_id': ['46201', '46101', '46001']}
**Gathering authors ids...
Found 33 authors
**10 authors missing NIDN. Finding missing NIDN...
(1/10) Rodiyati Azrianingsih
(2/10) Nunung Harijati
(3/10) Aris Soewondo
(4/10) Eko Suyanto
(5/10) Setijono Samino
(6/10) Aminatun Munawarti
(7/10) Viky Vidayanti
(8/10) Muhamad Imam
(9/10) Tri Ardyati
(10/10) Suharjono
Total authors from Universitas Braw