In [30]:
from serpapi import GoogleSearch
import os

import requests
from bs4 import BeautifulSoup
import re

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import DeepLake
import os
from deeplake.core.vectorstore import VectorStore

import openai


def search_news():
    """Run the fixed health-news query through SerpApi and return the raw response.

    The request parameters are hard-coded: a Google search on
    ``google.co.id`` with ``hl``/``gl`` set to ``id``, ``tbm`` set to
    ``nws`` and ``num`` set to ``10``. The API key is read from the
    ``SERP_API_KEY`` environment variable (``None`` when it is unset).

    Returns:
        dict: Whatever ``GoogleSearch.get_dict()`` returns for the query.
    """
    query_params = {
        "engine": "google",
        "q": "Berita kesehatan health terkini penyakit",
        "location_requested": "Indonesia",
        "location_used": "Indonesia",
        "google_domain": "google.co.id",
        "hl": "id",
        "gl": "id",
        "device": "desktop",
        "tbm": "nws",
        "num": "10",
        "api_key": os.getenv("SERP_API_KEY"),
    }

    return GoogleSearch(query_params).get_dict()


def embedding_function(texts, model="text-embedding-ada-002"):
    """Embed one or more texts with the OpenAI embeddings endpoint.

    Args:
        texts: A single string, or an iterable of strings, to embed.
        model: Name of the OpenAI embedding model to request.

    Returns:
        list: The ``embedding`` entry of every item in the response's
        ``data`` list. An empty list is returned, without calling the
        API, when there is nothing to embed.
    """
    if isinstance(texts, str):
        texts = [texts]

    # Newlines are flattened to spaces before the texts are sent.
    cleaned = [t.replace("\n", " ") for t in texts]

    if not cleaned:
        # Nothing to embed: skip the request instead of sending an empty
        # input list (which the endpoint is expected to reject).
        return []

    response = openai.Embedding.create(input=cleaned, model=model)
    return [item['embedding'] for item in response['data']]


def get_news_content(timeout=10):
    """Download every article found by ``search_news`` and flatten its text.

    Each result link is fetched with ``requests``; the page is parsed with
    BeautifulSoup's ``html.parser`` and every run of whitespace in the
    extracted text is collapsed to a single space.

    Args:
        timeout: Seconds to wait for each HTTP request before giving up on
            that link. Without a timeout a single stalled server would
            block the whole crawl indefinitely.

    Returns:
        dict: Maps each successfully fetched URL to its flattened page
        text. Links that fail to download or do not answer with HTTP 200
        are skipped.
    """
    results = search_news()

    # Treat a response without "news_results" as "no articles" rather than
    # failing with a KeyError.
    links = [item["link"] for item in results.get("news_results", [])]

    contents = dict()
    for link in links:
        try:
            response = requests.get(link, timeout=timeout)
        except requests.RequestException:
            # Best-effort crawl: one unreachable or slow site must not
            # abort the remaining downloads.
            continue

        if response.status_code != 200:
            continue

        soup = BeautifulSoup(response.content, 'html.parser')
        # r'\s+' already matches newlines, so a single substitution is
        # enough to normalise all whitespace.
        contents[link] = re.sub(r'\s+', ' ', soup.text)

    return contents


def store_and_embed_news(contents, chunk_size = 1000):
    """Chunk each article, embed the chunks and add them to the vector store.

    Args:
        contents: Mapping of source URL to the article text, as produced
            by ``get_news_content``.
        chunk_size: Maximum number of characters per chunk. Chunks are
            consecutive and non-overlapping.

    Raises:
        ValueError: If ``chunk_size`` is not a positive number.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    dataset_path = 'hub://luisfrentzen/data'
    vector_store = VectorStore(
        path = dataset_path,
    )

    for source, text in contents.items():
        # The slice width must match the stride; a fixed width of 1000 would
        # overlap chunks for smaller sizes and drop text for larger ones.
        chunks = [text[start:start + chunk_size]
                  for start in range(0, len(text), chunk_size)]

        if not chunks:
            # Empty article text: there is nothing to embed or store.
            continue

        print(source)
        vector_store.add(text = chunks,
                        embedding_function = embedding_function,
                        embedding_data = chunks,
                        # One independent metadata dict per chunk.
                        metadata = [{"source": source} for _ in chunks])

In [22]:
# Run the news search and scrape every result page (performs network requests).
cnt = get_news_content()

In [24]:
# Display the scraped contents (source URL -> flattened page text) for inspection.
cnt

{'http://beritamagelang.id/pemkab-magelang-dorong-pembentukan-tim-one-health-kabupaten-magelang-cegah-penyakit-zoonosis': ' Berita Magelang - Pemkab Magelang Dorong Pembentukan Tim One Health Kabupaten Magelang Cegah Penyakit Zoonosis Menu Tulis Artikel Beranda Informasi Publik Kolom Wawancara Tokoh Foto Video News Kesehatan Edukasi Olahraga Wisata & Kuliner Komoditas Bencana Beranda Pemkab Magelang Dorong Pembentukan Tim One Health Kabupaten Magelang Cegah Penyakit Zoonosis Pemkab Magelang Dorong Pembentukan Tim One Health Kabupaten Magelang Cegah Penyakit Zoonosis 02 Agustus 2023 13:23 Remmy Saputra Dilihat 1128 kali Sekretaris Daerah Kabupaten Magelang didampingi Kepala Dinas Kesehatan, Kepala Dispeterikan dan Provincial Coordinator AIHSP Jawa Tengah dr. Hartanto saat membuka acara Lokakarya Pembentukan Tim One Health Kabupaten Magelang. BERITAMAGELANG.ID- Sekretaris Daerah Kabupaten Magelang Adi Waryanto bersama Kepala Dinas Kesehatan Sunaryo, Kepala Dinas Peternakan dan Perikanan 

In [31]:
# Chunk, embed and add the scraped articles to the Deep Lake vector store.
store_and_embed_news(cnt)

Your Deep Lake dataset has been successfully created!


 

http://beritamagelang.id/pemkab-magelang-dorong-pembentukan-tim-one-health-kabupaten-magelang-cegah-penyakit-zoonosis


Creating embedding data: 100%|██████████| 1/1 [00:01<00:00,  1.45s/it]
 

Dataset(path='hub://luisfrentzen/data', tensors=['embedding', 'id', 'metadata', 'text'])

  tensor      htype      shape     dtype  compression
  -------    -------    -------   -------  ------- 
 embedding  embedding  (7, 1536)  float32   None   
    id        text      (7, 1)      str     None   
 metadata     json      (7, 1)      str     None   
   text       text      (7, 1)      str     None   
https://ugm.ac.id/id/berita/23086-hasil-survei-i-namhs-satu-dari-tiga-remaja-indonesia-memiliki-masalah-kesehatan-mental/


Creating embedding data: 100%|██████████| 2/2 [00:01<00:00,  1.31it/s]
 

Dataset(path='hub://luisfrentzen/data', tensors=['embedding', 'id', 'metadata', 'text'])

  tensor      htype      shape      dtype  compression
  -------    -------    -------    -------  ------- 
 embedding  embedding  (18, 1536)  float32   None   
    id        text      (18, 1)      str     None   
 metadata     json      (18, 1)      str     None   
   text       text      (18, 1)      str     None   
https://www.bandung.go.id/news/read/7726/jokowi-resmikan-rumah-sakit-mayapada-kota-bandung


Creating embedding data: 100%|██████████| 1/1 [00:00<00:00,  1.80it/s]
 

Dataset(path='hub://luisfrentzen/data', tensors=['embedding', 'id', 'metadata', 'text'])

  tensor      htype      shape      dtype  compression
  -------    -------    -------    -------  ------- 
 embedding  embedding  (22, 1536)  float32   None   
    id        text      (22, 1)      str     None   
 metadata     json      (22, 1)      str     None   
   text       text      (22, 1)      str     None   
https://health.detik.com/berita-detikhealth/d-6735498/who-wanti-wanti-ancaman-pandemi-baru-susul-covid-19-ini-penyakit-yang-disoroti


Creating embedding data: 100%|██████████| 1/1 [00:01<00:00,  1.40s/it]
 

Dataset(path='hub://luisfrentzen/data', tensors=['embedding', 'id', 'metadata', 'text'])

  tensor      htype      shape      dtype  compression
  -------    -------    -------    -------  ------- 
 embedding  embedding  (28, 1536)  float32   None   
    id        text      (28, 1)      str     None   
 metadata     json      (28, 1)      str     None   
   text       text      (28, 1)      str     None   
https://health.grid.id/read/353676171/11-isu-kesehatan-jadi-sorotan-di-2023-mulai-long-covid-hingga-populasi-lanjut-usia-yang-meningkat?page=all


Creating embedding data: 100%|██████████| 2/2 [00:01<00:00,  1.02it/s]
 

Dataset(path='hub://luisfrentzen/data', tensors=['embedding', 'id', 'metadata', 'text'])

  tensor      htype      shape      dtype  compression
  -------    -------    -------    -------  ------- 
 embedding  embedding  (41, 1536)  float32   None   
    id        text      (41, 1)      str     None   
 metadata     json      (41, 1)      str     None   
   text       text      (41, 1)      str     None   
http://p2p.kemkes.go.id/penguatan-sistem-kesehatan-dalam-pengendalian-covid-19/


Creating embedding data: 100%|██████████| 3/3 [00:02<00:00,  1.15it/s]
 

Dataset(path='hub://luisfrentzen/data', tensors=['embedding', 'id', 'metadata', 'text'])

  tensor      htype      shape      dtype  compression
  -------    -------    -------    -------  ------- 
 embedding  embedding  (63, 1536)  float32   None   
    id        text      (63, 1)      str     None   
 metadata     json      (63, 1)      str     None   
   text       text      (63, 1)      str     None   
https://www.antaranews.com/berita/3144461/penanggulangan-penyakit-rabies-gunakan-pendekatan-one-health


Creating embedding data: 100%|██████████| 2/2 [00:03<00:00,  1.57s/it]
 

Dataset(path='hub://luisfrentzen/data', tensors=['embedding', 'id', 'metadata', 'text'])

  tensor      htype      shape      dtype  compression
  -------    -------    -------    -------  ------- 
 embedding  embedding  (83, 1536)  float32   None   
    id        text      (83, 1)      str     None   
 metadata     json      (83, 1)      str     None   
   text       text      (83, 1)      str     None   
https://health.detik.com/berita-detikhealth/d-6791029/riwayat-kesehatan-jet-li-sempat-sakit-sampai-vakum-akting-gegara-penyakit-ini


Creating embedding data: 100%|██████████| 1/1 [00:00<00:00,  1.38it/s]
 

Dataset(path='hub://luisfrentzen/data', tensors=['embedding', 'id', 'metadata', 'text'])

  tensor      htype      shape      dtype  compression
  -------    -------    -------    -------  ------- 
 embedding  embedding  (88, 1536)  float32   None   
    id        text      (88, 1)      str     None   
 metadata     json      (88, 1)      str     None   
   text       text      (88, 1)      str     None   
https://www.kompas.id/baca/humaniora/2023/05/03/krisis-kesehatan-mental-melonjak-di-kalangan-remaja


Creating embedding data: 100%|██████████| 1/1 [00:00<00:00,  1.14it/s]
 

Dataset(path='hub://luisfrentzen/data', tensors=['embedding', 'id', 'metadata', 'text'])

  tensor      htype      shape      dtype  compression
  -------    -------    -------    -------  ------- 
 embedding  embedding  (96, 1536)  float32   None   
    id        text      (96, 1)      str     None   
 metadata     json      (96, 1)      str     None   
   text       text      (96, 1)      str     None   
https://dinkes.jakarta.go.id/berita/read/lima-tahapan-pencegahan-penyakit-di-rumah-sehat-jakarta


Creating embedding data: 100%|██████████| 1/1 [00:00<00:00,  1.20it/s]
 

Dataset(path='hub://luisfrentzen/data', tensors=['embedding', 'id', 'metadata', 'text'])

  tensor      htype       shape      dtype  compression
  -------    -------     -------    -------  ------- 
 embedding  embedding  (102, 1536)  float32   None   
    id        text      (102, 1)      str     None   
 metadata     json      (102, 1)      str     None   
   text       text      (102, 1)      str     None   
