# LLM 구현

## metadata 구조

In [1]:
import pandas as pd

In [2]:
# Sample metadata catalogue, written one dataset per row:
# (title, owning team, column names).
_catalog_rows = [
    (
        'Resource usage statistics for cloud infrastructure',
        'Computing Team',
        [1, 2, 3, 4, 5],
    ),
    (
        'Server load data during peak and off-peak hours over the last year',
        'Resource Team',
        [2, 4, 6, 1, 9],
    ),
    (
        'Customer complaint records Q1',
        'Customer Support Team',
        ['id', 'date', 'purchase', 'complaint', 'product'],
    ),
    (
        'Customer service logs Q1',
        'Customer Support Team',
        ['id', 'product', 'price', 'date', 'detail'],
    ),
    (
        'Employee performance reviews 2023',
        'HR',
        ['employee id', 'salary', 'performance', 'grading', 'date'],
    ),
    (
        'Customer feedback from product launch surveys',
        'Marketing Team',
        ['id', 'detail', 'date', 'feedback', 'customer_since', 'annual purchase'],
    ),
]

# Column-oriented view of the rows above; this is what the DataFrame is built from.
data_ = {
    'title': [row[0] for row in _catalog_rows],
    'owner': [row[1] for row in _catalog_rows],
    'columns': [row[2] for row in _catalog_rows],
}
metadata = pd.DataFrame(data_)

In [3]:
metadata

Unnamed: 0,title,owner,columns
0,Resource usage statistics for cloud infrastruc...,Computing Team,"[1, 2, 3, 4, 5]"
1,Server load data during peak and off-peak hour...,Resource Team,"[2, 4, 6, 1, 9]"
2,Customer complaint records Q1,Customer Support Team,"[id, date, purchase, complaint, product]"
3,Customer service logs Q1,Customer Support Team,"[id, product, price, date, detail]"
4,Employee performance reviews 2023,HR,"[employee id, salary, performance, grading, date]"
5,Customer feedback from product launch surveys,Marketing Team,"[id, detail, date, feedback, customer_since, a..."


## Llama 2 chat 모델 불러오기

In [None]:
from transformers import AutoTokenizer, BitsAndBytesConfig, pipeline
import torch
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
import numpy as np

In [None]:
# Hugging Face model id of the Llama 2 chat checkpoint.
llama_2 = "meta-llama/Llama-2-7b-chat-hf"

# Tokenizer for the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(llama_2)

# Quantization settings: load the model in 4-bit, compute in float16.
quantization = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Generation settings handed to the text-generation pipeline.
generation_settings = dict(
    max_new_tokens=1024,
    do_sample=True,
    temperature=0.01,
    top_p=0.9,
    repetition_penalty=1.15,  # lets us control repetition in the output
)

# LangChain also provides a pipeline wrapper that helps create the model.
llm = HuggingFacePipeline.from_model_id(
    model_id=llama_2,
    task="text-generation",
    model_kwargs={"quantization_config": quantization},
    pipeline_kwargs=generation_settings,
)

## 사용 함수

In [None]:
import gensim.downloader

In [None]:
# Helper functions
def get_user_query():
  """Prompt the user with "Enter your request: " and return the typed text."""
  return input("Enter your request: ")

# Extract the keywords from the LLM's answer
def keyword_extraction(text:str):
  """Pull the entries of a numbered list ("1. ...", "2. ...", up to "10. ...") out of text.

  Markers are located in ascending order (1, then 2, ...), so a stray number
  elsewhere in the text is not mistaken for a list marker. Each keyword is the
  text between its marker and the next one; the last keyword found ends at the
  first blank line after it, or at the end of the text when there is none.

  Args:
    text: Raw text expected to contain a numbered list of keywords.

  Returns:
    A list of at most 10 stripped, non-empty keyword strings, in list order.
    Entries whose marker is missing are omitted, so the list is empty when no
    numbered list is present.
  """
  import re  # local import: this file has no top-level `import re`

  if not text:
    return []

  # A marker is a 1-2 digit number followed by ". ". The look-behind rejects
  # numbers glued to a preceding word character (e.g. "Q1. ", "2021. ").
  marker_pattern = re.compile(r'(?<!\w)(\d{1,2})\. ')

  markers = []
  expected = 1
  for match in marker_pattern.finditer(text):
    if int(match.group(1)) != expected:
      continue
    markers.append(match)
    expected += 1
    if expected > 10:
      break

  key_list = []
  for position, match in enumerate(markers):
    if position + 1 < len(markers):
      chunk = text[match.end():markers[position + 1].start()]
    else:
      # Last entry: stop at the first blank line so trailing commentary
      # after the list is not swallowed into the keyword.
      chunk = text[match.end():].split('\n\n', 1)[0]
    chunk = chunk.strip()
    if chunk:
      key_list.append(chunk)
  return key_list

# Metadata search function
def metadata_search(keywords:list):
    """Score every metadata title against the keywords and return the matched titles.

    The keywords are joined with single spaces into one query string, which is
    passed together with all values of the module-level `metadata['title']`
    column (metadata must already exist as a DataFrame) to
    `DocSim(model_50).calculate_similarity`.

    Args:
        keywords: Keyword strings to search with.

    Returns:
        A list holding the 'Data' entry of each result, in the order the
        similarity call returned them.
        NOTE(review): the original comment said this returns 3 targets in
        descending similarity order; count and ordering come from
        DocSim.calculate_similarity, which is not visible here - confirm.
    """
    joined_keywords = ' '.join(keywords)
    scorer = DocSim(model_50)
    candidate_titles = list(metadata['title'])
    matches = scorer.calculate_similarity(joined_keywords, candidate_titles)
    return [match['Data'] for match in matches]

def print_search_result(result:list):
    """Print the first three search hits with their owners and return them as a dict.

    Owners are looked up in the module-level `metadata` DataFrame by exact
    title match, taking the first matching row.

    Args:
        result: Dataset titles; the first three entries are used.

    Returns:
        A dict mapping each of the three titles to its owner, in the same
        order as the titles appear in `result`.
    """
    first, second, third = result[0], result[1], result[2]
    owner_of = {
        title: metadata.loc[metadata['title'] == title, 'owner'].values[0]
        for title in (first, second, third)
    }
    print(f'''The following is the most likely results that may contain the data you need.
          1. {first} by {owner_of[first]}
          2. {second} by {owner_of[second]}
          3. {third} by {owner_of[third]}
          ''')
    return owner_of

# Resolve the dataset the user picks by number
def user_selection(phase1_result:dict):
    """Ask the user to choose one of the listed datasets by its 1-based number.

    The prompt repeats until the input is an integer between 1 and the number
    of entries in `phase1_result`. Zero and negative numbers are rejected
    rather than being treated as indexes counted from the end of the list.

    Args:
        phase1_result: Mapping of dataset title to owner, in display order.

    Returns:
        A two-item list `[title, owner]` for the chosen entry.

    Raises:
        ValueError: If `phase1_result` is empty, since no input could ever be valid.
    """
    titles = list(phase1_result)
    if not titles:
        raise ValueError('phase1_result is empty; there is nothing to select.')

    while True:
        selection = input('''Please select the data you want to generate the issue with DataSquare LLM! (Select number): ''')
        try:
            number = int(selection)
        except ValueError:
            print('Please type in the number only. (eg. 1 or 2)')
            continue
        if 1 <= number <= len(titles):
            title = titles[number - 1]
            return [title, phase1_result[title]]
        # Out-of-range choice: same hint as for non-numeric input, then re-prompt.
        print('Please type in the number only. (eg. 1 or 2)')

## 모델 사용 프롬프트

In [None]:
# Phase 1 prompt: asks the model for 10 keywords describing the data the user needs.
# Placeholder: {user_query}.
phase1_prompt = """From the given user query, extract 10 keywords that represent the data that user needs.

### User Query: {user_query}

### Response:
"""

# Phase 2 prompt: asks the model to fill in the data-request template addressed
# to the dataset owner. Placeholders: {user_query}, {dataset_1}.
phase2_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Fill out the 'Request Template' based on the following input with user request and relevant datasets. The 'Request Template' is sent from the user to the relevant dataset owner.

### Input:
User Request: {user_query}

Relevant Dataset: {dataset_1}

'Request Template'
To: (format: Dataset owner)
From: (format: User name, Team name)
Subject:
Request detail:
Data usage purpose:
Data file format (eg. .csv, .jpeg, .py, .json, etc.):
Data period: (format: YYYY-MM-DD)
Data example:

### Response:
"""

## LangChain 코드

In [None]:
from langchain import PromptTemplate

In [None]:
# Chain 1: take the user query and extract keywords, then search the metadata
from langchain import PromptTemplate

prompt1 = PromptTemplate.from_template(phase1_prompt)
prompt2 = PromptTemplate.from_template(phase2_prompt)

# Fixed example request used to drive both phases.
query = (
    "I’m Emily, a customer service operations manager. "
    "I am managing a project aimed at reducing customer complaints by 15% over the next quarter. "
    "To support this initiative, I need access to customer service data for Q1 "
    "to analyze complaint trends and identify key areas for service improvement."
)

# Phase 1: prompt -> model -> keyword extraction -> metadata search.
phase1 = prompt1 | llm | keyword_extraction | metadata_search
output_phase1 = phase1.invoke(dict(user_query=query))

# Take the first search hit and look up its owner by exact title match.
final_data_title = output_phase1[0]
title_matches = metadata['title'] == final_data_title
final_data_owner = metadata.loc[title_matches, 'owner'].values[0]

# Phase 2: fill the request template for the selected dataset.
phase2 = prompt2 | llm
dataset_description = f"{final_data_title} owned by {final_data_owner}"
output_phase2 = phase2.invoke(dict(user_query=query, dataset_1=dataset_description))

In [None]:
# Keep only the text that follows the first '### Response:' marker.
# When the marker is absent, str.find returns -1; slicing from -1 + len(marker)
# would silently drop the first 12 characters, so the full output is kept instead.
response_marker = '### Response:'
marker_index = output_phase2.find(response_marker)
if marker_index == -1:
    response_final = output_phase2
else:
    response_final = output_phase2[marker_index + len(response_marker):]
print(response_final)

To: Marketing Team

From: Emily, Customer Service Operations Manager

Subject: Request for Customer Feedback Data for Project Complaint Reduction Initiative

Request Detail: We are in need of accessing the customer feedback data collected through product launch surveys conducted by your team during Q1. This information will be used to analyze trends and identify key areas for service improvement to reduce customer complaints by 15% over the next quarter.

Data Usage Purpose: To evaluate customer satisfaction levels and identify areas where improvements can be made to minimize complaints.

Data File Format: Please provide the data in a.csv format.

Data Period: Can you provide the data for Q1 (2023-01-01 - 2023-03-31)?

Data Example: An example of the data would be:

| Survey ID | Customer Name | Age Group | Gender | Product Launch Experience | Overall Satisfaction Rating |\n| --- | --- | --- | --- | --- | --- |\n| 1 | John Doe | 35 | Male | Positive | 4/5 |\n| 2 | Jane Smith | 27 | Female | Neutral | 3/5 |\n| 3 | Bob Johnson | 62 | Male | Negative | 2/5 |\n\nPlease let me know if there are any issues or concerns regarding this request.