In [None]:
import pandas as pd
import numpy as np
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
import platform
import gc
import sys
import argparse
from glob import glob
from google.colab import drive
from tqdm import tqdm
from pathlib import Path
from joblib import Parallel, delayed
import re
import random
import requests
import urllib.request
import json
from copy import deepcopy
import copy
from dataclasses import dataclass
from tqdm import tqdm
tqdm.pandas()

from konlpy.tag import Mecab
import transformers
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, TrainingArguments, AutoModelForCausalLM, BitsAndBytesConfig, PreTrainedTokenizerFast
from datasets import load_dataset
from trl import DPOTrainer, SFTTrainer
import bitsandbytes as bnb
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model, PeftModel
import logging
logger = logging.getLogger(__name__)
from typing import Optional, Dict, Sequence
from Korpora import Korpora
from Korpora import KowikiTextKorpus, KorNLIKorpus
# from googletrans import Translator
from dask import bag, diagnostics

import torch
import pytorch_lightning as pl
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

In [None]:
def print_system_specs():
    """Print a short hardware/software report for the current runtime.

    Reports whether CUDA is available, how many CUDA devices are visible and,
    for each one, its name, compute capability and total memory in bytes.
    Then prints the processor string, OS name/release and Python version.

    Returns:
        None. Everything is written to stdout.
    """
    # Check if CUDA is available
    is_cuda_available = torch.cuda.is_available()
    print("CUDA Available:", is_cuda_available)
    # Get the number of available CUDA devices
    num_cuda_devices = torch.cuda.device_count()
    print("Number of CUDA devices:", num_cuda_devices)
    if is_cuda_available:
        for i in range(num_cuda_devices):
            # Per-device properties are queried by index; no device handle is needed.
            print(f"--- CUDA Device {i} ---")
            print("Name:", torch.cuda.get_device_name(i))
            print("Compute Capability:", torch.cuda.get_device_capability(i))
            print("Total Memory:", torch.cuda.get_device_properties(i).total_memory, "bytes")
    # Get CPU information
    print("--- CPU Information ---")
    print("Processor:", platform.processor())
    print("System:", platform.system(), platform.release())
    print("Python Version:", platform.python_version())
print_system_specs()

In [None]:
!nvidia-smi

In [None]:
# Mount Google Drive at /content/drive (uses the `google.colab` drive helper imported above).
drive.mount('/content/drive')

In [None]:
# Load the training data from the project folder on the mounted Drive
data_location = '/content/drive/MyDrive/llm'
data_path = Path(data_location)

# train.csv is read as-is; no dtype or index options are applied here.
train = pd.read_csv(data_path / 'train.csv')

In [None]:
if __name__ == '__main__':
  # Notebook-wide configuration: hyper-parameters, seeding, and selection of the
  # sentence-transformer / LLM checkpoints used by the following cells.

  def _str2bool(value):
    """Parse a command-line boolean.

    `type=bool` would turn every non-empty string (including 'False') into True,
    so the accepted spellings are listed explicitly here.
    """
    if isinstance(value, bool):
      return value
    lowered = str(value).strip().lower()
    if lowered in ('true', 't', 'yes', 'y', '1'):
      return True
    if lowered in ('false', 'f', 'no', 'n', '0'):
      return False
    raise argparse.ArgumentTypeError(f'expected a boolean value, got {value!r}')

  def _resolve_hub_id(requested, table, what):
    """Return the hub id of the first `table` entry whose key is a substring of `requested`.

    Raises:
      ValueError: if no key matches, instead of silently leaving the result undefined.
    """
    for key, hub_id in table:
      if key in requested:
        return hub_id
    known = [key for key, _ in table]
    raise ValueError(f'Unknown {what} {requested!r}; expected a name containing one of {known}')

  parser = argparse.ArgumentParser(description = 'paul77ms')
  parser.add_argument('--st_model_name', default = None, type = str)
  parser.add_argument('--tok_model_name', default = None, type = str)
  parser.add_argument('--model_name', default = None, type = str)
  parser.add_argument('--optimizer', default = 'adamw', type = str)
  parser.add_argument('--learning_rate', default = 1e-4, type = float)
  parser.add_argument('--batch_size', default = 32, type = int)
  parser.add_argument('--epochs', default = 10, type = int)
  parser.add_argument('--seed', default = 0, type = int)
  parser.add_argument('--shuffle', default = True, type = _str2bool)
  parser.add_argument('--num_workers', default = 0, type = int)
  # Parse an empty argv so that only the defaults above are used.
  args = parser.parse_args([])

  OPTIMIZER = args.optimizer
  BATCH_SIZE = args.batch_size
  LEARNING_RATE = args.learning_rate
  EPOCHS = args.epochs
  SEED = args.seed
  SHUFFLE = args.shuffle
  NUM_WORKERS = args.num_workers


  device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

  def Seed_Fixer(seed=SEED):
    """Seed Python, NumPy, PyTorch and Lightning with `seed` and make cuDNN deterministic."""
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Use the argument, not the global SEED, so an explicit seed is honoured everywhere.
    pl.seed_everything(seed)

  Seed_Fixer()

  # Sentence Transformer
  # https://github.com/jhgan00/ko-sentence-transformers
  # Keys are matched as substrings in order, so a more specific key must come
  # before any key it contains ('klue_small' before 'klue').
  ST_MODEL_TABLE = (
    ('ko-sroberta-multitask', 'jhgan/ko-sroberta-multitask'),
    ('ko-sbert-multitask', 'jhgan/ko-sbert-multitask'),
    ('nli', 'jhgan/ko-sbert-nli'),
    ('sts', 'jhgan/ko-sbert-sts'),
    # NOTE(review): this id has no organisation prefix - confirm it resolves on the hub.
    ('klue_small', 'klue-roberta-small-nli-sts'),
    # NOTE(review): previously 'klue/robert-base', which looks like a typo - confirm.
    ('klue', 'klue/roberta-base'),
    ('huffon', 'Huffon/sentence-klue-roberta-base'),
    ('bert', 'kykim/bert-kor-base'),
  )
  if args.st_model_name is None:
    args.st_model_name = 'nli'
  ST_MODEL = SentenceTransformer(
    _resolve_hub_id(args.st_model_name, ST_MODEL_TABLE, 'sentence-transformer name')
  ).to(device)

  # Inference Sentence Transformer
  INFERENCE_ST_MODEL = SentenceTransformer('distiluse-base-multilingual-cased-v1').to(device)

  # Tokenizer & Large Language Model (LLM)
  # Same substring matching as above: the size-specific 'koalpaca' variants are
  # listed before plain 'koalpaca', and 'qlora_koalpaca_12.8b' before 'koalpaca_12.8b'.
  LLM_TABLE = (
    ('mistral', 'davidkim205/komt-mistral-7b-v1'),
    ('upstage_solar', 'Upstage/SOLAR-10.7B-v1.0'),
    ('solar2', 'Edentns/DataVortexS-10.7B-dpo-v1.0'),
    ('solar1', 'LDCC/LDCC-SOLAR-10.7B'),
    ('qlora_koalpaca_12.8b', 'beomi/qlora-koalpaca-polyglot-12.8b-50step'),
    ('koalpaca_12.8b', 'EleutherAI/polyglot-ko-12.8b'),
    ('koalpaca_5.8b', 'beomi/KoAlpaca-Polyglot-5.8B'),
    ('koalpaca', 'mncai/Mistral-7B-v0.1-alpaca-1k'),
    ('cokal', 'HumanF-MarkrAI/COKAL-DPO-13b-v2'),
    ('yanolja', 'yanolja/KoSOLAR-10.7B-v0.2'),
  )
  if args.model_name is None:
    args.model_name = 'yanolja'
  model_name = _resolve_hub_id(args.model_name, LLM_TABLE, 'LLM name')

In [None]:
# Sentence Transformer

def encode_question_sentence_transformer(df, model, device):
  """Embed every row's two questions as one combined sentence.

  The `질문_1` and `질문_2` values of a row are joined with a single space and
  passed to `model.encode`; each result is cast to float16.

  Args:
    df: DataFrame providing the `질문_1` and `질문_2` columns.
    model: object whose `encode(text, device=...)` returns an array with `.astype`.
    device: forwarded unchanged to `model.encode`.

  Returns:
    np.ndarray built from the per-row float16 embeddings, in row order.
  """
  first_col, second_col = '질문_1', '질문_2'
  question_pairs = zip(df[first_col], df[second_col])

  # One encode call per row, with the progress bar sized to the frame.
  vectors = [
    model.encode(first + ' ' + second, device=device).astype(np.float16)
    for first, second in tqdm(question_pairs, total=len(df))
  ]
  return np.array(vectors)

In [None]:
# One float16 embedding per training row (both questions joined), computed with ST_MODEL.
embs = encode_question_sentence_transformer(train, ST_MODEL, device)

In [None]:
# Re-Ranking

def rerank_answers(df, question_embeddings, model, device):
  """Rank each row's five answers by cosine similarity to that row's question embedding.

  For every row, the texts in `답변_1` .. `답변_5` are encoded with `model` and
  compared with the row's entry in `question_embeddings`. The column
  `<answer>_rank` then holds the 1-based rank of that answer (1 = most similar).
  Ties keep the original answer order.

  Args:
    df: DataFrame with the columns `답변_1` .. `답변_5`; it is not modified.
    question_embeddings: one embedding per row of `df`, in the same row order.
    model: object whose `encode(list_of_texts, device=...)` returns one vector per text.
    device: forwarded unchanged to `model.encode`.

  Returns:
    A copy of `df` with the five `<answer>_rank` integer columns added.

  Raises:
    ValueError: if `question_embeddings` does not have exactly one entry per row.
  """
  answers = ['답변_1', '답변_2', '답변_3', '답변_4', '답변_5']
  reranked_df = df.copy()

  if len(question_embeddings) != len(reranked_df):
    raise ValueError(
      f'expected one question embedding per row ({len(reranked_df)}), got {len(question_embeddings)}'
    )

  # Ranks are collected by row position and written back column-wise at the end,
  # so the result does not depend on the frame's index labels.
  ranks = np.zeros((len(reranked_df), len(answers)), dtype=np.int64)

  for row_pos in tqdm(range(len(reranked_df))):
    # Each row is scored against its own answers.
    answer_embeddings = model.encode(reranked_df.iloc[row_pos][answers].tolist(), device=device)
    q_question_embedding = np.asarray(question_embeddings[row_pos]).reshape(1, -1)

    similarities = cosine_similarity(q_question_embedding, answer_embeddings)[0]
    # Answer positions ordered best-first; the stable sort keeps ties in answer order.
    best_first = np.argsort(-similarities, kind='stable')
    # Invert the ordering: the answer at best_first[k] receives rank k + 1.
    ranks[row_pos, best_first] = np.arange(1, len(answers) + 1)

  for col_pos, answer in enumerate(answers):
    reranked_df[f'{answer}_rank'] = ranks[:, col_pos]

  return reranked_df

In [None]:
# Returns a copy of `train` with the `<answer>_rank` columns attached; `train` itself is untouched.
train_reranked = rerank_answers(train, embs, ST_MODEL, device)

In [None]:
# Re-Ranking Filtered Answers Augmentation
#
# For k = 1..N, fill `reranked{k}` with the text of whichever answer column has
# the value k in its matching `<answer>_rank` column.

N = 3
for idx in range(N):
  # Object dtype from the start: these columns receive answer strings below, and
  # writing strings into a float64 NaN column relies on a deprecated silent upcast.
  train_reranked[f'reranked{idx+1}'] = np.full(len(train_reranked), np.nan, dtype=object)
reranked_columns = train_reranked.columns[train_reranked.columns.str.endswith('_rank')]

for rank_col in reranked_columns:
  answer_col = rank_col.replace('_rank', '')
  for idx in range(N):
    mask = train_reranked[rank_col] == idx+1

    # Single .loc on both sides instead of chained indexing.
    train_reranked.loc[mask, f'reranked{idx+1}'] = train_reranked.loc[mask, answer_col]

In [None]:
# Preview the first rows, including the `_rank` and `reranked*` columns added above.
train_reranked.head()