In [None]:
import pandas as pd
import numpy as np
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
import platform
import gc
import sys
import argparse
from glob import glob
from google.colab import drive
from tqdm import tqdm
from pathlib import Path
from joblib import Parallel, delayed
import re
import random
import requests
import urllib.request
import json
from copy import deepcopy
import copy
from dataclasses import dataclass
from tqdm import tqdm
tqdm.pandas()

from konlpy.tag import Mecab
import transformers
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, TrainingArguments, AutoModelForCausalLM, BitsAndBytesConfig, PreTrainedTokenizerFast
from datasets import load_dataset
from trl import DPOTrainer, SFTTrainer
import bitsandbytes as bnb
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model, PeftModel
import logging
logger = logging.getLogger(__name__)
from typing import Optional, Dict, Sequence
from Korpora import Korpora
from Korpora import KowikiTextKorpus, KorNLIKorpus
# from googletrans import Translator
from dask import bag, diagnostics

import torch
import pytorch_lightning as pl
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

In [None]:
def print_system_specs():
    """Print a short report of the visible CUDA devices and the host platform.

    The report contains, in order: whether CUDA is available, the number of
    CUDA devices, one section per device (name, compute capability, total
    memory in bytes) when CUDA is available, and finally the processor,
    OS and Python version reported by ``platform``.

    Only devices visible to this process are listed; the notebook sets
    ``CUDA_VISIBLE_DEVICES='0'`` in its import cell.

    Returns:
        None. Everything is written to stdout.
    """
    is_cuda_available = torch.cuda.is_available()
    print("CUDA Available:", is_cuda_available)

    num_cuda_devices = torch.cuda.device_count()
    print("Number of CUDA devices:", num_cuda_devices)

    if is_cuda_available:
        for i in range(num_cuda_devices):
            # All queries take the device index directly, so no torch.device
            # object needs to be constructed here.
            print(f"--- CUDA Device {i} ---")
            print("Name:", torch.cuda.get_device_name(i))
            print("Compute Capability:", torch.cuda.get_device_capability(i))
            print("Total Memory:", torch.cuda.get_device_properties(i).total_memory, "bytes")

    print("--- CPU Information ---")
    print("Processor:", platform.processor())
    print("System:", platform.system(), platform.release())
    print("Python Version:", platform.python_version())


print_system_specs()

In [None]:
# Shell command: print the NVIDIA driver / GPU status table for this runtime.
!nvidia-smi

In [None]:
# Mount Google Drive at /content/drive; the data-loading cell reads from MyDrive below it.
drive.mount('/content/drive')

In [None]:
# Load the data
# Folder on the mounted Google Drive that holds the CSV files.
data_location = '/content/drive/MyDrive/llm'
data_path = Path(data_location)

# Training frame; its '답변_1' column is back-translated in the last cell.
train = pd.read_csv(data_path / 'train.csv')

# Back-translation using the Papago Open API

In [None]:
# Papago translator

def back_translate_papago(sentence, lang, PROB = 1):
  """Back-translate a Korean sentence through `lang` with the Papago NMT Open API.

  The sentence is translated ko -> `lang`, and that result is translated
  `lang` -> ko. The second translation is returned.

  Credentials are read from the environment variables NAVER_CLIENT_ID and
  NAVER_CLIENT_SECRET so that they are never stored in the notebook.

  Note: a later cell defines another `back_translate_papago` (Selenium based);
  once that cell has run it shadows this function.

  Args:
    sentence: Korean text to back-translate.
    lang: Papago language code of the intermediate language (e.g. 'en').
    PROB: Unused; kept so existing calls that pass it keep working.

  Returns:
    The back-translated Korean text as a `str`.

  Raises:
    RuntimeError: If either credential environment variable is missing or empty.
    requests.HTTPError: If the API answers with an error status code.
    KeyError: If a response body does not have the expected
      message/result/translatedText structure.
  """
  client_id = os.environ.get('NAVER_CLIENT_ID')
  client_secret = os.environ.get('NAVER_CLIENT_SECRET')
  if not client_id or not client_secret:
    raise RuntimeError('Set the NAVER_CLIENT_ID and NAVER_CLIENT_SECRET environment variables before calling back_translate_papago.')

  url = 'https://openapi.naver.com/v1/papago/n2mt'

  headers = {
    'Content-Type': 'application/json',
    'X-Naver-Client-Id': client_id,
    'X-Naver-Client-Secret': client_secret
  }

  def _translate(text, source, target):
    # One Papago request; both directions share the same endpoint and headers.
    payload = {'source': source, 'target': target, 'text': text}
    # The timeout keeps a stalled connection from blocking the whole apply loop.
    response = requests.post(url, json.dumps(payload), headers = headers, timeout = 30)
    # Fail with the HTTP error instead of an opaque KeyError on the error body.
    response.raise_for_status()
    return str(response.json()['message']['result']['translatedText'])

  intermediate = _translate(sentence, 'ko', lang)
  translated_back = _translate(intermediate, lang, 'ko')

  return translated_back

# parallel apply

def back_translate_parallel1(dataset, translate_column, lang, save_file = True):
  """Back-translate one column of `dataset` using a dask bag.

  Every value of `dataset[translate_column]` is passed to
  `back_translate_papago(value, lang = lang)` and the results are stored in
  a new column named '<translate_column>_translate' (the same name that
  `back_translate_parallel2` writes).

  Args:
    dataset: DataFrame holding the column to translate; it is modified in place.
    translate_column: Name of the column to back-translate.
    lang: Intermediate language code forwarded to `back_translate_papago`.
    save_file: Unused by this function; kept for a signature parallel to
      `back_translate_parallel2`.

  Returns:
    The same `dataset` object, with the new column added.
  """
  sentences = dataset[translate_column].tolist()
  translate_bag = bag.from_sequence(sentences).map(lambda x: back_translate_papago(x, lang = lang))

  # Show dask's progress bar while the bag is evaluated.
  with diagnostics.ProgressBar():
    bag_completed = translate_bag.compute()

  # Column suffix was misspelled '_tranlsate'; aligned with back_translate_parallel2.
  dataset[f'{translate_column}_translate'] = bag_completed

  return dataset

def back_translate_parallel2(dataset, translate_column, lang, save_file = True):
  """Back-translate one column of `dataset` row by row with a tqdm progress bar.

  Each value of `dataset[translate_column]` is passed to
  `back_translate_papago(value, lang)` and the results are stored in the new
  column '<translate_column>_translate'. `progress_apply` is available
  because the import cell calls `tqdm.pandas()`.

  If any row raises, the error is printed and (when `save_file` is true) the
  frame is written to '/content/back_translate.csv'. Because the column
  assignment never completes in that case, the saved frame does not contain
  the translated column.

  Note: a later cell defines another `back_translate_parallel2`; once that
  cell has run it shadows this function.

  Args:
    dataset: DataFrame holding the column to translate; it is modified in place.
    translate_column: Name of the column to back-translate.
    lang: Intermediate language code forwarded to `back_translate_papago`.
    save_file: Whether to dump `dataset` to CSV when an error occurs.

  Returns:
    The same `dataset` object.
  """
  try:
    # `lang` is forwarded to the translator (it used to be hard-coded to 'en').
    dataset[f'{translate_column}_translate'] = dataset[translate_column].progress_apply(lambda x: back_translate_papago(x, lang))

  except Exception as e:
    print(f"Error occurred: {str(e)}")
    if save_file:
      save_path = '/content/back_translate.csv'
      file_exists = os.path.exists(save_path)
      # Append to an existing file, but write the header row only once.
      dataset.to_csv(save_path, index = False, mode = 'a' if file_exists else 'w', header = not file_exists, encoding = 'utf-8-sig')

  return dataset

# Cumulative save

def cumulative_storage(df, route):
  """Append `df` to the CSV file at `route`, creating the file if needed.

  The first call for a given path writes the header and the rows; later
  calls append rows only, so the file stays a single well-formed table.
  The index is never written and the file is encoded as 'utf-8-sig'.

  Args:
    df: DataFrame whose rows should be stored.
    route: Path of the CSV file to create or extend.

  Returns:
    None.
  """
  file_exists = os.path.exists(route)
  df.to_csv(
    route,
    index = False,  # was misspelled 'idnex', which made the first write raise TypeError
    mode = 'a' if file_exists else 'w',
    header = not file_exists,  # do not repeat the header row when appending
    encoding = 'utf-8-sig',
  )

# Back-translation by crawling the Papago web page with Selenium

In [None]:
# Replace any existing Chromium install with the one from the saiarcot895 chromium-beta PPA.
!sudo add-apt-repository ppa:saiarcot895/chromium-beta
!sudo apt remove chromium-browser
!sudo snap remove chromium
!sudo apt install chromium-browser

# Install Selenium and the chromedriver package, then copy the driver binary into /usr/bin.
!pip3 install selenium
!apt-get update
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin/

# NOTE(review): sys.path is Python's module search path, not the executable PATH;
# this line probably does not affect how Selenium locates chromedriver — confirm before relying on it.
sys.path.insert(0, '/usr/lib/chromium-browser/chromedriver')

In [None]:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service

In [None]:
# Papago Translator Crawling Rule

def back_translate_papago(sentence, lang):
  """Translate `sentence` by driving the Papago web page in headless Chrome.

  A new browser is started for every call. The sentence is typed into the
  source box, two buttons on the page are clicked with fixed 5-second waits
  in between, and the text of the target box is returned.

  NOTE(review): the two clicks appear to switch languages and trigger the
  reverse translation, but that depends on the current Papago page layout —
  confirm that the selectors still match and that the result really is a
  back-translation.

  This definition replaces the Open API function of the same name defined in
  an earlier cell.

  Args:
    sentence: Text typed into the Papago source box.
    lang: Not used by this implementation; the languages are whatever the
      page selects. Kept so callers can pass it.

  Returns:
    The text read from the target box ('#txtTarget > span').

  Raises:
    selenium.common.exceptions.WebDriverException: If the page cannot be
      loaded or one of the elements cannot be found. The browser is shut
      down before the exception propagates.
  """
  options = webdriver.ChromeOptions()
  options.add_argument('--headless')
  options.add_argument('--no-sandbox')
  options.add_argument('--disable-dev-shm-usage')
  driver = webdriver.Chrome(options = options)

  try:
    papago_url = 'https://papago.naver.com/'
    driver.get(papago_url)
    time.sleep(5)  # fixed wait for the page to finish loading

    # Type the sentence into the source text area.
    driver.find_element(By.CSS_SELECTOR, '#txtSource').send_keys(sentence)
    # Click the button inside the language-select area (selector is tied to generated class names).
    driver.find_element(By.CSS_SELECTOR, '#root > div > div.wrap___1rX6i.rwd.rwd___3Qe-c.banner_active___3MQbf > section > div > div:nth-child(1) > div:nth-child(3) > div > div.lang_select___3h6b5 > button').click()
    time.sleep(5)

    driver.find_element(By.XPATH, '//*[@id="root"]/div/div[1]/section/div/div[1]/div[2]/div/div[2]/button').click()
    time.sleep(5)

    # Read the text shown in the target box.
    back_translated = driver.find_element(By.CSS_SELECTOR, '#txtTarget > span').text
    time.sleep(5)  # pause kept from the original before the browser is closed
  finally:
    # Always end the session so a failing row does not leave a Chrome process behind;
    # quit() also closes the window, so a separate close() is not needed.
    driver.quit()

  return back_translated

def back_translate_parallel2(dataset, translate_column, lang, save_file = True):
  """Back-translate one column of `dataset` row by row and save the result.

  Each value of `dataset[translate_column]` is passed to
  `back_translate_papago(value, lang)` and the results are stored in the new
  column '<translate_column>_translate'. `progress_apply` is available
  because the import cell calls `tqdm.pandas()`.

  When `save_file` is true the frame is written to
  '/content/back_translate.csv': overwritten after a successful run, or
  created/appended (header written only once) when a row raised. In the
  error case the column assignment never completes, so the saved frame does
  not contain the translated column.

  This definition replaces the function of the same name from an earlier cell.

  Args:
    dataset: DataFrame holding the column to translate; it is modified in place.
    translate_column: Name of the column to back-translate.
    lang: Language code forwarded to `back_translate_papago`.
    save_file: Whether to write `dataset` to CSV.

  Returns:
    The same `dataset` object.
  """
  # Defined before the try block: it used to be assigned only in the except
  # branch, so the save after a successful run raised UnboundLocalError.
  save_path = '/content/back_translate.csv'

  try:
    # `lang` is forwarded to the translator (it used to be hard-coded to 'en').
    dataset[f'{translate_column}_translate'] = dataset[translate_column].progress_apply(lambda x: back_translate_papago(x, lang))
    if save_file:
      dataset.to_csv(save_path, index = False, mode = 'w')

  except Exception as e:
    print(f"Error occurred: {str(e)}")
    if save_file:
      file_exists = os.path.exists(save_path)
      # Append to an existing file, but write the header row only once.
      dataset.to_csv(save_path, index = False, mode = 'a' if file_exists else 'w', header = not file_exists)

  return dataset

In [None]:
# Back-translate the '답변_1' column of `train`. The helper adds the new column to
# `train` in place and returns the same frame, so `train_aug` and `train` are one object.
train_aug = back_translate_parallel2(dataset = train, translate_column = '답변_1', lang = 'en', save_file = True)