##### Notebook ini berfungsi untuk mengambil data yang terdapat pada Google Scholar, menggabungkannya dengan data yang sudah ditentukan di dalam spreadsheet, dan kemudian membentuknya menjadi sebuah file JSON yang berisi data jurnal-jurnal setiap dosen.
##### Untuk penggunaannya, cukup masukkan URL Google Scholar orang yang diinginkan ke variabel profileURL, kemudian jalankan (run) seluruh cell.
###### Notebook ini dimaksudkan untuk penggunaan pribadi dan tidak pernah dimaksudkan untuk dibagikan, sehingga akan ada beberapa hal yang sulit dimengerti akibat penamaan ataupun penulisannya.



In [2]:
import datetime as dt
import json
import os
from urllib.parse import urljoin

import lxml  # not referenced by name; BeautifulSoup is asked for the 'lxml' parser
import pandas as pd
import requests
from bs4 import BeautifulSoup
from openpyxl import load_workbook

- Informatika (done)
- Sistem Informasi (on going)
- Teknik Komputer (done)
- Teknik Fisika (done)
- Teknik Elektro (done)
- Ilkom (done)
- DKV (done)
- Manajemen (done)
- Akuntansi (done)
- Perhotelan (done)
- MMT (done)
- Arsitektur (done)
- Jurnalistik (done)
- Film (done)

In [3]:
"""
Dalam spreadsheet, terdapat sheet-sheet yang dinamakan berdasarkan jurusan
"""
# Run configuration: which department sheet to read and which Google Scholar
# profile to scrape. Empty spreadsheet cells are turned into "" so that later
# truthiness checks treat them as missing.
sheet_name = "Sistem Informasi"
sheet = pd.read_excel(
    "DATA DOSEN - ID Google Scholar.xlsx",
    sheet_name=sheet_name,
).fillna("")
profileURL = "https://scholar.google.com/citations?hl=id&user=iTeTouEAAAAJ"

In [4]:
# Download the Google Scholar profile page and parse it with BeautifulSoup.

headers = {
    'User-agent':
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}

proxies = {
  'http': os.getenv('HTTP_PROXY')
}

# cstart/pagesize are appended so the publication list is requested from
# offset 0 with a large page size.
# NOTE(review): the server may cap the number of rows actually returned.
response = requests.get(
    profileURL + "&cstart=0&pagesize=1000",
    headers=headers,
    proxies=proxies,
    timeout=30,  # without a timeout a stalled connection would block forever
)
# Fail here on an HTTP error (e.g. rate limiting) rather than later with a
# confusing AttributeError while parsing an error page.
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, 'lxml')

In [5]:
# Extract the profile summary: name, interest labels, citation metrics,
# profile photo URL and the time of this scrape.

nama = soup.select_one("#gsc_prf_i #gsc_prf_in").text
label = [tag.text for tag in soup.select('#gsc_prf_i .gsc_prf_inta')]

# Set to True by the publication loop when a spreadsheet field is empty.
warningNa = False

# The metrics are read by position from the "gsc_rsb_std" table cells:
# index 0 -> citations, 2 -> h-index, 4 -> i10-index.
# NOTE(review): presumably the odd-indexed cells hold the "since <year>"
# column, which is skipped here -- confirm against the page markup.
stats = soup.find_all("td", {"class": "gsc_rsb_std"})
try:
    totalCite = stats[0].text
    hindex = stats[2].text
    i10 = stats[4].text
except IndexError:
    # The metrics table is missing or shorter than expected: blank all three
    # values and flag the run instead of aborting.
    totalCite = ""
    hindex = ""
    i10 = ""
    warning = True
else:
    warning = False

# The photo "src" may be relative to scholar.google.com or already absolute.
# urljoin resolves relative values against the Scholar host and returns
# absolute URLs unchanged.
imageURL = urljoin(
    "https://scholar.google.com",
    soup.select_one("#gsc_prf_pua #gsc_prf_pup-img")["src"],
)

lastupdate = dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S")


In [6]:
# Skeleton of the output document: a single "researchers" entry holding an
# empty summary mapping and an empty publication list.
data = dict(
    researchers=dict(
        summary={},
        publication=[],
    ),
)

In [7]:
# Copy the scraped profile values into the summary section, then show the
# whole document for a quick visual check.
data["researchers"]["summary"].update({
    "nama": nama,
    "label": label,
    "image": imageURL,
    "citation": totalCite,
    "hindex": hindex,
    "i10index": i10,
    "lastupdate": lastupdate,
})
print(data)


{'researchers': {'summary': {'nama': 'Friska Natalia Ferdinand', 'label': [], 'image': 'https://scholar.googleusercontent.com/citations?view_op=view_photo&user=iTeTouEAAAAJ&citpid=1', 'citation': '272', 'hindex': '10', 'i10index': '10', 'lastupdate': '2022-05-04 14:05:04'}, 'publication': []}}


In [8]:
# Build one publication record per row of the Scholar publication table.
# When a row's title also appears in the spreadsheet's "Judul Artikel" column,
# its publisher URL, public URL and DOI are taken from the first matching
# spreadsheet row. Otherwise those three fields stay empty strings and
# `warning` is set.

for article_info in soup.select('#gsc_a_b .gsc_a_tr'):
    title = article_info.select_one('.gsc_a_t .gsc_a_at').text

    # Spreadsheet rows whose title matches this publication exactly.
    selectedCol = sheet.loc[sheet['Judul Artikel'] == title]
    # Diagnostic output: the matched title with its number of matching rows.
    print(selectedCol["Judul Artikel"].value_counts().to_string())

    if selectedCol.empty:
        publisherURL = ""
        publicURL = ""
        doi = ""
        warning = True
    else:
        publisherURL = selectedCol["Publisher URL"].iloc[0]
        publicURL = selectedCol["Public URL"].iloc[0]
        doi = selectedCol["DOI"].iloc[0]

    # Flag the run when any of the three spreadsheet fields is empty.
    if not publisherURL or not publicURL or not doi:
        warningNa = True

    authors = article_info.select_one('.gsc_a_at+ .gs_gray').text
    cite = article_info.select_one('.gsc_a_ac').text
    year = article_info.select_one('.gsc_a_y').text
    # urljoin avoids the "//" that plain concatenation produces when the href
    # already starts with "/".
    link = urljoin(
        "https://scholar.google.com/",
        article_info.select_one('.gsc_a_at')['href'],
    )

    articleData = {
        "title": title,
        "author": authors,
        "cited": cite,
        "year": year,
        "link": link,
        "detail": {
            "description": "",
            "public_url": publicURL,
            "author_url": profileURL,
            "DOI": doi,
            "url_publication": publisherURL
        },
    }
    data["researchers"]["publication"].append(articleData)
    

Boundary delineation of MRI images for lumbar spinal stenosis detection through semantic segmentation using deep neural networks    1
A decision making model for strategic aliance-based network design in express delivery services    1
Genetic Algorithm-based Approach to Multi Objective Decision Making Model for Strategic Alliances in Express Courier Services    1
Genetic Algorithm-based Approach to Multi Objective Decision Making Model for Strategic Alliances in Express Courier Services    1
A Fuzzy Set-Theoretic Approach to the Weak Strategic Alliance for the Survival of Multiple Service Centers in Express Courier Services    1
A Fuzzy Set-Theoretic Approach to the Weak Strategic Alliance for the Survival of Multiple Service Centers in Express Courier Services    1
A Compromised Decision Making Model for Implementing Strategic Alliances in Express Courier Services    1
Series([], )
The use of web scraping in computer parts and assembly price comparison    1
Collaborative system design

In [9]:
# Save the collected data as ./output/<sheet name>/<scholar user id>2.json
# and print a short report of the run.

directory = f"./output/{sheet_name}"
if not os.path.exists(directory):
    os.makedirs(directory)

# The Scholar user id is the value of the "user=" query parameter.
fileName = profileURL.split("user=")[1].split("&")[0]
print(fileName)

if warning:
    print("Some error occured")
if warningNa:
    print("Some nan exist")

with open(f"{directory}/{fileName}2.json", 'w') as outfile:
    json.dump(data, outfile, sort_keys=True, indent=4)

# Sanity checks: the image URL and whether it contains ".com", then the
# citation total and the number of publications collected.
print(
    ".com" in data["researchers"]["summary"]["image"],
    data["researchers"]["summary"]["image"],
)
print(
    data["researchers"]["summary"]["citation"],
    len(data["researchers"]["publication"]),
)

iTeTouEAAAAJ
Some error occured
Some nan exist
True https://scholar.googleusercontent.com/citations?view_op=view_photo&user=iTeTouEAAAAJ&citpid=1
272 100
