# Predibase
- LLM FineTuning

In [None]:
! pip install -q predibase datasets

[31mERROR: Operation cancelled by user[0m[31m
[0m

In [None]:
import warnings

warnings.filterwarnings('ignore')

In [None]:
import os

from predibase import Predibase, FinetuningConfig, DeploymentConfig

# Read the API token from the environment instead of hardcoding it in the
# notebook. A token that was committed with a shared notebook must be treated
# as leaked and revoked.
pb = Predibase(api_token=os.environ["PREDIBASE_API_TOKEN"])

# Smoke-test the connection by prompting a deployed base model.
lorax_client = pb.deployments.client("mistral-7b-instruct-v0-2") # Insert deployment name here
resp = lorax_client.generate("[INST] What are some popular tourist spots in San Francisco? [/INST]")
print(resp.generated_text)

Connected to Predibase as User(id=93892638-e454-4e9d-8b14-8c96ffa98fbe, 
username=shdpwls0114@sju.ac.kr)


KeyboardInterrupt: 

In [None]:
from datasets import load_dataset_builder
import csv

def hfdataset_to_csv(datalist: list, csv_file_name, max=-1):
  """Start a fine-tuning CSV for the items in `datalist`.

  Opens `csv_file_name` for writing and emits a header row whose columns are
  the keys of the local template: `prompt`, `completion` and `split`.

  NOTE(review): the visible loop only checks the `max` bound and never writes
  a data row, so this cell looks truncated -- confirm against the original
  notebook before relying on it.
  NOTE(review): with the default `max=-1` the check `i>=max` is already true
  for the first item, so the loop exits immediately. If -1 is meant as
  "no limit", the check needs a guard. `max` also shadows the builtin.
  """
  # Prompt/completion templates with str.format-style placeholders.
  # NOTE(review): nothing in the visible code fills these placeholders.
  template={
      "prompt":
      """<|im_start|>system\nThe following passage is content from a news report. Please summarize this passage in one sentence or less.<|im_end|>\n<|im_start|>user\n{input_text}\n<|im_end|>
      <|im_start|>passage\n {content}
      <|im_start|>summary\n""",
      "completion":"{headline}<|im_end|>",
      "split":"train"}

  # newline='' lets the csv module control line endings itself.
  with open(csv_file_name,'w',newline='') as csvfile:
    fieldnames = template.keys()
    writer=csv.DictWriter(csvfile,fieldnames=fieldnames)
    writer.writeheader()

    # Stop once `max` items have been visited.
    for i,d in enumerate(datalist):
      if i>=max:
        break

In [None]:
def validate_data_csv(csv_file_name):
  """Check that every row has non-empty `prompt`, `completion` and `split` values.

  Raises AssertionError on the first empty value (KeyError if a column is
  missing from the header). Returns True when every row passes.
  """
  required_columns = ('prompt', 'completion', 'split')
  with open(csv_file_name, 'r') as csv_handle:
    for record in csv.DictReader(csv_handle):
      for column in required_columns:
        assert record[column]

  return True

In [None]:
from tokenizers import Tokenizer
tokenizer=Tokenizer.from_pretrained('upstage/solar-1-mini-tokenizer')

def compute_cost(csv_file_name,price_per_million_tokens=0.5):
  """Estimate the price of one pass over the dataset stored in `csv_file_name`.

  Each row's `completion` and `prompt` are joined with a space and tokenized
  with the module-level `tokenizer`. The summed token count is scaled by
  `price_per_million_tokens`.
  """
  with open(csv_file_name,'r') as csv_handle:
    # Collect every row's text first, then tokenize the texts one by one.
    texts=[record['completion']+" "+record['prompt'] for record in csv.DictReader(csv_handle)]
    token_total=sum(len(tokenizer.encode(text).tokens) for text in texts)

  return token_total/1000000*price_per_million_tokens

In [None]:
import time
from datasets import load_dataset

# Load the TLDR news dataset and keep handles to its train and test splits.
hfdataset=load_dataset('JulesBelveze/tldr_news',trust_remote_code=True)
train_hfdatast=hfdataset['train']
test_hfdataset=hfdataset['test']

dataset_name='tldr_news_toy'
csv_file_name=f"{dataset_name}.csv"
try:
  # Bind the existing dataset to `pb_dataset`: the fine-tuning cell reads that
  # name, and the previous `pd_dataset` spelling left it undefined (NameError)
  # whenever the dataset had already been uploaded.
  pb_dataset=pb.datasets.get(dataset_name)
  print(f"Dataset found: {pb_dataset}")
except RuntimeError:
  print("Dataset not found, creating...")

  # Build a small CSV from the training split, sanity-check it and report the
  # estimated cost before uploading.
  hfdataset_to_csv(train_hfdatast,csv_file_name,max=300)

  print(f'Dataset Validation: {validate_data_csv(csv_file_name)}')
  print(f'One step FT cost: {compute_cost(csv_file_name)} USD')

  print("uploading dataset...")
  pb_dataset=pb.datasets.from_file(csv_file_name,name=dataset_name)

In [None]:
# Create an adapter repository
# NOTE(review): exists_ok=True presumably lets this call succeed when the
# repository already exists, so the cell can be re-run -- confirm in the SDK docs.
repo_name='news-summarizer-model'
repo=pb.repos.create(name=repo_name,description='TLDR News Summarizer Experiments',exists_ok=True)
print(repo)

In [None]:
# Start a fine-tuning job, blocks until training is finished
# NOTE(review): the description below says "defaults", but epochs and rank are
# overridden (see the inline default values) -- confirm the label is intended.
adapter=pb.adapters.create(
    config=FinetuningConfig(
        base_model='solar-mini-chat-240612',
        epochs=1, # default=3
        rank=1, # default=16
    ),
    dataset=pb_dataset,  # Also accepts the dataset name as a string
    repo=repo,
    description="initial model with defaults"
)

In [None]:
adapter

In [None]:
# Adapter identifier in "<repo>/<tag>" form.
adapter_id='/'.join((adapter.repo,str(adapter.tag)))
adapter_id

In [None]:
# Get adapter, blocking call if training is still in progress
adapter=pb.adapters.get(adapter_id)
adapter

In [None]:
input_prompt="""
<|im_start|>system
The following passage is content from a news report. Please summarize this passage in one sentence or less.<|im_end|>
<|im_start|>user

## Mounting

In [None]:
# Mount Google Drive in Colab
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
cd drive/MyDrive/공모전

[Errno 2] No such file or directory: 'drive/MyDrive/공모전'
/content/drive/MyDrive/공모전


In [None]:
! pip install pytesseract
! pip install pdf2image

!apt-get install tesseract-ocr
!apt-get install libtesseract-dev

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
libtesseract-dev is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.


## 자연어처리: PDF에서 하이라이트 정보 추출하기

In [None]:
import fitz  # PyMuPDF
import re  # 정규 표현식 모듈

def extract_highlighted_text(pdf_path):
    """Collect the text under every highlight annotation in a PDF.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        A list with one string per highlight annotation, in page order. Each
        string concatenates the text clipped from the annotation's rectangles.
    """
    highlighted_texts = []

    # The context manager closes the document even if extraction raises.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            for annot in page.annots():
                if annot.type[0] != 8:  # 8 is the highlight annotation type code
                    continue
                # Vertices are consumed four points at a time. An annotation
                # without vertices contributes an empty string, and a trailing
                # incomplete group is ignored instead of raising IndexError.
                quad_points = annot.vertices or []
                text = ""
                for i in range(0, len(quad_points) - 3, 4):
                    # Build the clip rectangle from the first and last point of the group.
                    rect = fitz.Rect(quad_points[i], quad_points[i + 3])
                    text += page.get_text("text", clip=rect)
                highlighted_texts.append(text)

    return highlighted_texts

def parse_highlighted_text(texts):
    """Turn 'field: value' snippets into a dictionary.

    Every regex match in every text contributes one entry. Both sides are
    stripped of surrounding whitespace, and a later match for the same field
    name overwrites an earlier one.
    """
    field_pattern = re.compile(r'([^:]+):\s*(.*)')  # matches 'field name: value'
    return {
        field.strip(): value.strip()
        for snippet in texts
        for field, value in field_pattern.findall(snippet)
    }

# Set the file path and run the extraction and parsing functions.
pdf_path = '석경에이티_분기보고서.pdf'
highlights = extract_highlighted_text(pdf_path)
parsed_data = parse_highlighted_text(highlights)

# Print each parsed field on its own line.
for key, value in parsed_data.items():
    print(f"{key}: {value}")


In [None]:
!pip install openai

In [None]:
import os

from openai import OpenAI # openai==1.2.0

# Read the Upstage API key from the environment instead of hardcoding it.
# A key that was committed with the notebook must be treated as leaked.
client = OpenAI(
  api_key=os.environ["UPSTAGE_API_KEY"],
  base_url="https://api.upstage.ai/v1/solar"
)

# Request a streamed completion so the answer can be printed as it arrives.
stream = client.chat.completions.create(
  model="solar-1-mini-chat",
  messages=[
    {
      "role": "system",
      "content": "You are a helpful assistant."
    },
    {
      "role": "user",
      "content": "please summarize the following text: '기술 내용: 산화 마그네슘, 수산화 마그네슘 등 알루미나 대체 고열전도도/경량화 소재 개발, 고열전도도 및 경량화 기술, Size별 Granulation 조절 기술, 응집 제어 기술"
    }
  ],
  stream=True,
)

# Print only the chunks that carry text.
for chunk in stream:
  if chunk.choices[0].delta.content is not None:
    print(chunk.choices[0].delta.content, end="")

# Solar-Upstage

## Document OCR

In [None]:
import os

import requests

# Read the Upstage API key from the environment instead of hardcoding it.
api_key = os.environ["UPSTAGE_API_KEY"]
filename = "석경에이티_분기보고서.pdf"

url = "https://api.upstage.ai/v1/document-ai/ocr"
headers = {"Authorization": f"Bearer {api_key}"}
# Keep the upload handle in a `with` block so it is closed after the request.
with open(filename, "rb") as document_file:
    files = {"document": document_file}
    response = requests.post(url, headers=headers, files=files)
print(response.json())

{'apiVersion': '1.1', 'confidence': 0.9379, 'metadata': {'pages': [{'height': 3509, 'page': 1, 'width': 2480}, {'height': 3509, 'page': 2, 'width': 2480}, {'height': 3509, 'page': 3, 'width': 2480}, {'height': 3509, 'page': 4, 'width': 2480}, {'height': 3509, 'page': 5, 'width': 2480}, {'height': 3509, 'page': 6, 'width': 2480}, {'height': 3509, 'page': 7, 'width': 2480}, {'height': 3509, 'page': 8, 'width': 2480}, {'height': 3509, 'page': 9, 'width': 2480}, {'height': 3509, 'page': 10, 'width': 2480}, {'height': 3509, 'page': 11, 'width': 2480}, {'height': 3509, 'page': 12, 'width': 2480}, {'height': 3509, 'page': 13, 'width': 2480}, {'height': 3509, 'page': 14, 'width': 2480}, {'height': 3509, 'page': 15, 'width': 2480}, {'height': 3509, 'page': 16, 'width': 2480}, {'height': 3509, 'page': 17, 'width': 2480}, {'height': 3509, 'page': 18, 'width': 2480}, {'height': 3509, 'page': 19, 'width': 2480}, {'height': 3509, 'page': 20, 'width': 2480}, {'height': 3509, 'page': 21, 'width': 2480

## PDF정리

### 긴 PDF를 짧은 PDF로 분할

In [None]:
!pip install pymupdf


Collecting pymupdf
  Downloading PyMuPDF-1.24.9-cp310-none-manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting PyMuPDFb==1.24.9 (from pymupdf)
  Downloading PyMuPDFb-1.24.9-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.4 kB)
Downloading PyMuPDF-1.24.9-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m34.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading PyMuPDFb-1.24.9-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.9/15.9 MB[0m [31m48.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDFb, pymupdf
Successfully installed PyMuPDFb-1.24.9 pymupdf-1.24.9


In [None]:
import os
import fitz

def split_pdf(input_file, batch_size):
    """Split a PDF into consecutive files of at most `batch_size` pages.

    Each part is saved as `<input name without extension>_<first>_<last>.pdf`,
    where both page indices are 0-based, and its file name is printed.

    Args:
        input_file: Path of the PDF to split.
        batch_size: Maximum number of pages per output file; must be >= 1.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}.")

    # The output name stem does not depend on the batch, so compute it once.
    input_file_basename = os.path.splitext(input_file)[0]

    # Context managers close both documents even if saving a part fails.
    with fitz.open(input_file) as input_pdf:
        num_pages = len(input_pdf)
        print(f"Total number of pages: {num_pages}")

        for start_page in range(0, num_pages, batch_size):
            # The last page index is inclusive, hence the -1.
            end_page = min(start_page + batch_size, num_pages) - 1

            output_file = f"{input_file_basename}_{start_page}_{end_page}.pdf"
            print(output_file)
            with fitz.open() as output_pdf:
                output_pdf.insert_pdf(input_pdf, from_page=start_page, to_page=end_page)
                output_pdf.save(output_file)

# Input arguments
# The part files are written next to the input, named <basename>_<start>_<end>.pdf.
input_file = "석경에이티_분기보고서.pdf"  # Replace with a file of your own
batch_size = 10  # Maximum available value is 100
split_pdf(input_file, batch_size)

Total number of pages: 147
석경에이티_분기보고서_0_9.pdf
석경에이티_분기보고서_10_19.pdf
석경에이티_분기보고서_20_29.pdf
석경에이티_분기보고서_30_39.pdf
석경에이티_분기보고서_40_49.pdf
석경에이티_분기보고서_50_59.pdf
석경에이티_분기보고서_60_69.pdf
석경에이티_분기보고서_70_79.pdf
석경에이티_분기보고서_80_89.pdf
석경에이티_분기보고서_90_99.pdf
석경에이티_분기보고서_100_109.pdf
석경에이티_분기보고서_110_119.pdf
석경에이티_분기보고서_120_129.pdf
석경에이티_분기보고서_130_139.pdf
석경에이티_분기보고서_140_146.pdf


### 레이아웃 분석
- HTML로 변환

In [None]:
! pip install requests



In [None]:
from glob import glob
import json
import os
import requests

# Read the Upstage API key from the environment instead of hardcoding it here.
# A key that was committed with the notebook must be treated as leaked.
API_KEY = os.environ["UPSTAGE_API_KEY"]

def call_layout_analysis(input_file, output_file):
    """Run Upstage layout analysis on a PDF and save the JSON response.

    Args:
        input_file: Path of the PDF to upload.
        output_file: Path of the JSON file the response is written to.

    Raises:
        ValueError: If the API responds with a status code other than 200.
    """
    # Open the upload inside a `with` block so the handle is closed once the
    # request has been sent; it was previously left open.
    with open(input_file, "rb") as document:
        response = requests.post(
            "https://api.upstage.ai/v1/document-ai/layout-analysis",
            headers={"Authorization": f"Bearer {API_KEY}"},
            data={"ocr": False},
            files={"document": document})

    # Fail before touching the output file when the request was rejected.
    if response.status_code != 200:
        raise ValueError(f"Unexpected status code {response.status_code}.")

    # ensure_ascii=False keeps non-ASCII text readable in the saved file.
    with open(output_file, "w") as f:
        json.dump(response.json(), f, ensure_ascii=False)

# Find all shorter PDFs related to input_file
# (every file whose name is the input's basename followed by "_<anything>.pdf")
input_file = "석경에이티_분기보고서.pdf"
short_input_files = glob(os.path.splitext(input_file)[0] + "_*.pdf")

# Send request and save response for all shorter PDFs
# Each response is stored next to its PDF with a .json extension.
for short_input_file in short_input_files:
    print(short_input_file)
    short_output_file = os.path.splitext(short_input_file)[0] + ".json"
    call_layout_analysis(short_input_file, short_output_file)

### 일부 요소 자르기

In [None]:
"""
Requirements:
- `pip install pymupdf` to import fitz
- `pip install pillow` to import PIL
"""

import json
import fitz

from PIL import Image

def get_page_sizes(data):
    """Map each page number to its `[width, height]` from the response metadata."""
    sizes = {}
    for entry in data["metadata"]["pages"]:
        dimensions = [entry["width"], entry["height"]]
        sizes[entry["page"]] = dimensions
    return sizes


def pdf2image(input_file, page_num, dpi=300):
    """Render one page of a PDF as a PIL image.

    Args:
        input_file: Path of the PDF file.
        page_num: 1-based number of the page to render.
        dpi: Resolution passed to PyMuPDF when rasterising the page.

    Returns:
        A PIL image in "RGB" mode holding the rendered page.
    """
    # The context manager closes the document even if rendering raises.
    with fitz.open(input_file) as doc:
        # page_num is 1-based, document indexing is 0-based.
        pixmap = doc[page_num - 1].get_pixmap(dpi=dpi)
        page_img = Image.frombytes("RGB", (pixmap.width, pixmap.height), pixmap.samples)
    return page_img


def normalize_coordinates(coordinates, output_page_size):
    """Return the bounding box of `coordinates` as fractions of the page size.

    `coordinates` is a sequence of dicts with "x" and "y" entries and
    `output_page_size` is `[width, height]`. The result is the tuple
    `(x_min / width, y_min / height, x_max / width, y_max / height)`.
    """
    xs = [point["x"] for point in coordinates]
    ys = [point["y"] for point in coordinates]
    left, top = min(xs), min(ys)
    right, bottom = max(xs), max(ys)

    # Scale each edge by the matching page dimension.
    page_width = output_page_size[0]
    page_height = output_page_size[1]
    return left / page_width, top / page_height, right / page_width, bottom / page_height


def crop_image(img, coordinates, output_file):
    """Crop `img` to a box given in page fractions and save it to `output_file`.

    `coordinates` is `(x1, y1, x2, y2)` with each value a fraction of the image
    width or height. The values are scaled to pixels and truncated with `int`.
    """
    left, top, right, bottom = coordinates
    width, height = img.size
    pixel_box = (
        int(left * width),
        int(top * height),
        int(right * width),
        int(bottom * height),
    )
    img.crop(pixel_box).save(output_file)

# Input parameters: one of the split PDFs and the layout-analysis JSON saved
# for it by the cells above. The sample names "paper_0_9.*" do not exist in
# this project and made the cell fail with FileNotFoundError.
input_file = "석경에이티_분기보고서_0_9.pdf"
json_file = "석경에이티_분기보고서_0_9.json"
output_file = "paper_cropped_figure.png"

# Load the layout-analysis result; the file is only needed while parsing.
with open(json_file, "r") as f:
    data = json.load(f)

# Get the size of each page
page_sizes = get_page_sizes(data)

# Get bounding box for the first figure and crop the image
for element in data["elements"]:
    if element["category"] == "figure":
        page_num = element["page"]
        coordinates = element["bounding_box"]
        output_page_size = page_sizes[page_num]
        pdf_image = pdf2image(input_file, page_num)
        normalized_coordinates = normalize_coordinates(coordinates, output_page_size)
        crop_image(pdf_image, normalized_coordinates, output_file)
        break  # only the first figure is cropped

FileNotFoundError: [Errno 2] No such file or directory: 'paper_0_9.json'

## Embeddings

In [None]:
import os

import numpy as np
from openai import OpenAI

# Read the Upstage API key from the environment instead of hardcoding it.
client = OpenAI(
    api_key=os.environ["UPSTAGE_API_KEY"],
    base_url="https://api.upstage.ai/v1/solar"
)

# Embed the query with the query model ...
query_result = client.embeddings.create(
    model = "solar-embedding-1-large-query",
    input = "What makes Solar LLM small yet effective?"
).data[0].embedding

# ... and the document with the passage model.
document_result = client.embeddings.create(
    model = "solar-embedding-1-large-passage",
    input = "SOLAR 10.7B: Scaling Large Language Models with Simple yet Effective Depth Up-Scaling. DUS is simple yet effective in scaling up high performance LLMs from small ones. "
).data[0].embedding

# Score the pair with the dot product of the two embedding vectors.
similarity = np.dot(np.array(query_result), np.array(document_result))
print(f"Similarity between query and document: {similarity}")

Similarity between query and document: 0.3974243426178674


## Chat

In [None]:
import os

from openai import OpenAI # openai==1.2.0

# Read the Upstage API key from the environment instead of hardcoding it.
# A key that was committed with the notebook must be treated as leaked.
client = OpenAI(
  api_key=os.environ["UPSTAGE_API_KEY"],
  base_url="https://api.upstage.ai/v1/solar"
)

# Request a streamed completion so the answer can be printed as it arrives.
stream = client.chat.completions.create(
  model="solar-1-mini-chat",
  messages=[
    {
      "role": "system",
      "content": "Summarize this sentence."
    },
    {
      "role": "user",
      "content": "기술 내용- 입자 사이즈 조절 기술(Ultra Fine ~ 20μm) - 불순물 제거를 위한 정제 기술- 조영성(X-Ray 불투과성) 소재 개발- 수지와 혼합 후 경화시 투명성 확보"
    }
  ],
  stream=True,
)

# Print only the chunks that carry text.
for chunk in stream:
  if chunk.choices[0].delta.content is not None:
    print(chunk.choices[0].delta.content, end="")

# Use with stream=False
# print(stream.choices[0].message.content)

해당 문장은 의료 영상 진단을 위한 구강 내 조영제 개발에 필요한 기술 내용을 요약한 것입니다. 입자 사이즈 조절 기술을 사용하여 입자 크기를 미세하게 조절하여 20μm 이하로 만들어야 합니다. 이를 통해 불순물을 효과적으로 제거하여 고순도의 조영제를 얻을 수 있습니다. 또한, 조영성을 부여하기 위해 X-Ray 불투과성 소재를 개발해야 합니다. 마지막으로, 수지와 혼합 후 경화시 투명성을 확보하여 영상 진단에 필요한 조건을 충족시켜야 합니다.