## Imports

In [None]:
pip install beautifulsoup4

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [1]:
from requests import get
from bs4 import BeautifulSoup
from warnings import warn
from time import sleep
from random import randint
import numpy as np, pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

## Scraping

## Function Title Scraping per Genre

In [35]:
pages = np.arange(1, 21)

# headers = {'Accept-Language': 'en-US,en;q=0.8'} # If this is not specified, the default language is Chinese
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0"
headers = {"user-agent" : USER_AGENT, 'Accept-Language': 'en-US,en;q=0.8'}

#initialize empty lists to store the variables scraped
hrefs = []

def title(topic):
  for page in pages:
    
    # get request
    url = f"https://digilib.uns.ac.id/dokumen/search/{page}?q={topic}"
    response = get(url, headers=headers)

    sleep(randint(8,15))
    
    #throw warning for status codes that are not 200
    if response.status_code != 200:
        warn('Request: {}; Status code: {}'.format(requests, response.status_code))

    #parse the content of current iteration of request
    soup = BeautifulSoup(response.text, 'html.parser')
        
    all_links = soup.find_all('div', class_='judul')
    
    for link in all_links:
      href = link.a.get('href')
      hrefs.append(href)

In [36]:
topics = ['machine+learning']
for topic in topics:
  title(topic)

In [37]:
href = pd.DataFrame({'link': hrefs})
href

Unnamed: 0,link
0,https://digilib.uns.ac.id/dokumen/detail/97633...
1,https://digilib.uns.ac.id/dokumen/detail/98779...
2,https://digilib.uns.ac.id/dokumen/detail/88107...
3,https://digilib.uns.ac.id/dokumen/detail/90281...
4,https://digilib.uns.ac.id/dokumen/detail/90628...
...,...
195,https://digilib.uns.ac.id/dokumen/detail/87649...
196,https://digilib.uns.ac.id/dokumen/detail/87569...
197,https://digilib.uns.ac.id/dokumen/detail/87643...
198,https://digilib.uns.ac.id/dokumen/detail/87644...


## Scraping Content 

In [38]:
def digilib(URL):
  # desktop user-agent
  resp = get(URL, headers= headers)

  if resp.status_code == 200:
      soup = BeautifulSoup(resp.content, "html.parser")

  table = soup.find('table')

  rows = table.find_all('tr')
  data = []
  for row in rows:
    cells = row.find_all('td')
    row_data = []
    for cell in cells:
        row_data.append(cell.text.strip())  # Extract the text content of the cell and remove any leading/trailing whitespaces
    data.append(row_data)

  df = pd.DataFrame(data)
  df = df.T

  new_header = df.iloc[0]  # Extract the first row as the new header
  df = df[1:]  # Remove the first row from the DataFrame
  df.columns = new_header  # Assign the new header to the DataFrame columns
  df = df.drop(1).reset_index(drop=True)
  columns_to_keep = ['Judul', 'Subyek', 'Abstrak', 'Pembimbing']  # Names of the columns you want to keep
  df = df.drop(df.columns.difference(columns_to_keep), axis=1)

  return df['Judul'].to_string(index=False), df['Subyek'].to_string(index=False), df['Abstrak'].to_string(index=False), df['Pembimbing'].to_string(index=False)

In [39]:
URL = 'https://digilib.uns.ac.id/dokumen/detail/76778/Analisis-Sentimen-Masyarakat-pada-Media-Sosial-Twitter-terhadap-Pilkada-Dki-Jakarta-2017-dengan-Pendekatan-Prediktif-dan-Deskriptif'
digilib(URL)

('Analisis Sentimen Masyarakat pada Media Sosial ...',
 'SENTIMENT ANALYSIS, NATURAL LANGUAGE PROCESSING...',
 'ABSTRAKTujuan utama dari penelitian ini adalah ...',
 '1. Afrizal Doewes S.Kom., M.Sc. 2. Dr. Eng Anto...')

In [40]:
all_result = []
for ind, url in enumerate(href['link']):
  print('-----------')
  print(ind, url)
  if url is not None:
    value = digilib(url)
    all_result.append(value)
  else:
    all_result.append(None)

-----------
0 https://digilib.uns.ac.id/dokumen/detail/97633/Identifikasi-Hipertensi-menggunakan-Heart-Rate-Variability-Fotopletismogram-dan-Algoritma-Machine-Learning
-----------
1 https://digilib.uns.ac.id/dokumen/detail/98779/ANALISIS-SENTIMEN-TERKAIT-TANGGAPAN-TUDUHAN-NEGATIF-PANDEMI-ATAS-COVID-19-TERHADAP-CHINA-MELALUI-AKUN-TWITTER-ZHAO-LIJIAN-DAN-HUA-CHUNYING
-----------
2 https://digilib.uns.ac.id/dokumen/detail/88107/Rasio-Keuangan-pada-Lima-Bank-di-Indonesia-dengan-Artificial-Neural-Networks
-----------
3 https://digilib.uns.ac.id/dokumen/detail/90281/Model-Prediksi-Kekeringan-Menggunakan-Machine-Learning-Kasus-di-Kabupaten-Wonogiri
-----------
4 https://digilib.uns.ac.id/dokumen/detail/90628/Tinjauan-Teknis-Studi-Supervised-Machine-Learning-dan-Implementasinya-pada-Aplikasi-Herbify-untuk-Dataset-Tanaman-Herbal
-----------
5 https://digilib.uns.ac.id/dokumen/detail/93157/Pengaruh-Variabel-Bebas-dalam-Analisis-Kapasitas-Dukung-dan-Penurunan-Fondasi-Tiang-Menggunakan-Correlation

In [41]:
final_result = pd.DataFrame(all_result, columns=['Judul', 'Subyek', 'Abstrak', 'Pembimbing'])

In [42]:
final_result

Unnamed: 0,Judul,Subyek,Abstrak,Pembimbing
0,Identifikasi Hipertensi menggunakan Heart Rate...,-,Hipertensi adalah keadaan di mana tekanan dara...,"1. Prof. Nuryani, S.Si., M.Si., Ph.D.2. Dr. Um..."
1,ANALISIS SENTIMEN TERKAIT TANGGAPAN TUDUHAN NE...,-,Covid 19 telah menginfeksi lebih dari 80 juta ...,"1. Arofah Minasari, S.S. M.A."
2,Rasio Keuangan pada Lima Bank di Indonesia den...,-,Krisis yang terjadi di Indonesia telah mengaki...,"1. Dr. Dewi Retno Sari S., S.Si., M.Kom.2. Nug..."
3,Model Prediksi Kekeringan Menggunakan Machine ...,-,Kekeringan adalah kondisi dimana kebutuhan air...,"1. Dr. Ir. RR. Rintis Hadiani, M.T.2. Ir. Adi ..."
4,Tinjauan Teknis Studi Supervised Machine Learn...,,Penggunaan teknologi dalam kehidupan sehari-ha...,"1. Dr.Eng. Aditya Rio Prabowo, S.T., M.T., M.E..."
...,...,...,...,...
195,Eksperimentasi Model PJBL dengan Menggunakandi...,,Awalinda Adji Nur Said. K2317009. EKSPERIMENTA...,"1. Dr. Sarwanto. S.Pd, M.Pd. 2. Dewanto Harjun..."
196,Engaging Students in A Digital Storytelling Pr...,-,ABSTRACTSaptiwi Rohayati. S891902020. 2022. En...,"1. Dr. Nur Arifah Drajati, M. Pd.2. Prof. Dr. ..."
197,Exploring Student’s Learning Enjoyment and Voc...,,Given that Digital Game-Based Learning has gai...,"1. Dr. Nur Arifah Drajati, M.Pd2. Ellisa Indri..."
198,Pengaruh Model Pembelajaran Problem Solving Da...,-,Penelitian ini bertujuan untuk menguji 1) perb...,"1. Prof. Dr. Sunardi, M.Sc.2. Dr. Susilaningsi..."


In [43]:
final_result.to_csv('data_digilib.csv', index=False)