In [None]:
!pip install grobid-client langchain openai faiss-cpu PyPDF2 tiktoken chromadb

In [None]:
import openai
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter
from langchain.docstore.document import Document
from typing import (
    AbstractSet,
    Any,
    Callable,
    Collection,
    Dict,
    Generator,
    List,
    Literal,
    Mapping,
    Optional,
    Set,
    Tuple,
    Union,
)

from langchain.prompts.prompt import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI
from langchain.chains.summarize import load_summarize_chain
from pprint import pprint

In [None]:



def read_kakao(file_path, encoding=None):
    """
    Detect the export type of a KakaoTalk export file and parse it.

    Parameters
    ----------
    file_path: string
        Path to the KakaoTalk export text file.

    encoding: string, optional
        Encoding used to read the file. ``None`` uses the system default.

    Returns
    -------
    Whatever ``parse`` returns for the detected export type.
    """
    # The export type (device + language) is detected by the loader's static helper.
    file_type = KakaoTextLoader.check_kakao_export_file_type(file_path)
    # ``parse`` takes the encoding as its third argument.
    return parse(file_type, file_path, encoding)






In [None]:
import logging
from typing import List, Optional

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.helpers import detect_file_encodings

logger = logging.getLogger(__name__)


import re
from datetime import datetime
from typing import List, Dict
# Date/time expressions that appear in KakaoTalk export files.
# They are used both to detect which client/language produced the export
# and to find where each message starts.
# Raw strings keep regex escapes such as \S and \[ away from Python's own escape handling.
kakaotalk_datetime_pattern_dict = {
    'window_ko_date': r"-{15} [0-9]{4}년 [0-9]{1,2}월 [0-9]{1,2}일 \S요일 -{15}",
    'window_ko_time': r"((\[)([^\[])+(\])) ((\[오)\S [0-9]{1,2}:[0-9]{1,2}(\]))",
    'android_ko': r"([0-9]){4}년 ([0-9]){1,2}월 ([0-9]){1,2}일 오\S ([0-9]){1,2}:([0-9]){1,2}",
    # [A-Za-z] instead of [A-z]: the latter also matches the characters [ \ ] ^ _ and `.
    'android_en': r"([A-Za-z])+ ([0-9]){1,2}, ([0-9]){4}, ([0-9]){1,2}:([0-9]){1,2} \SM",
}


def _str_to_datetime(file_type, text):
    """
    Convert a KakaoTalk timestamp string into a ``datetime``.

    The language is the second ``_``-separated part of ``file_type``
    (e.g. ``'android_ko'`` -> ``'ko'``). For Korean text the markers
    오전/오후 are rewritten to AM/PM before the string is parsed.
    """
    strptime_formats = {
        'ko': '%Y년 %m월 %d일 %p %I:%M',
        'en': '%B %d, %Y, %I:%M %p',
    }
    lang = file_type.split('_')[1]

    normalized = text
    if lang == 'ko':
        # strptime's %p needs AM/PM, so swap the Korean markers first.
        for korean, english in (('오전', 'AM'), ('오후', 'PM')):
            normalized = normalized.replace(korean, english)

    return datetime.strptime(normalized, strptime_formats[lang])


def _window_message(file_type, buffer, date, time_pattern):
    """Build a message dict from one buffered Windows-export entry, or return None if it is not a message."""
    # Only buffers that start with "[user] [오전/오후 h:mm]" hold a message.
    if not re.match(time_pattern, buffer):
        return None
    name_token, time_token, text = buffer.split(']', maxsplit=2)
    time = time_token.replace('[', '').strip()
    return {
        'datetime': _str_to_datetime(file_type, f"{date} {time}"),
        'user_name': name_token.replace('[', '').strip(),
        'text': text.strip(),
    }


def _android_message(file_type, buffer, msg_exist_check_pattern):
    """Build a message dict from one buffered Android-export entry, or return None if it is not a message."""
    if not re.match(msg_exist_check_pattern, buffer):
        return None
    header, separator, text = buffer.partition(" : ")
    if not separator:
        # The line has a timestamp and a colon but no "user : text" separator; not a chat message.
        return None
    # header looks like "<datetime>, <user>"; the user name follows the last comma.
    header_tokens = header.rsplit(",", maxsplit=1)
    return {
        'datetime': _str_to_datetime(file_type, header_tokens[0].strip()),
        'user_name': header_tokens[1].strip(),
        'text': text.strip(),
    }


def parse(file_type, file_path, encoding=None, datetime_pattern_dict=kakaotalk_datetime_pattern_dict):
    """
    Parse the text of a KakaoTalk export file.
    Messages are split on the datetime patterns; lines that do not start a new
    message are appended to the previous one, so multi-line messages stay together.

    Parameters
    ----------
    file_type: string
        one of 'window_ko', 'android_ko' or 'android_en'

    file_path: string
        path of the export file

    encoding: string, optional
        encoding passed to ``open``; ``None`` uses the system default

    datetime_pattern_dict: dict
        datetime patterns used in KakaoTalk export files

    Returns
    -------
    parsed_msgs:
        the result of ``extract`` applied to the parsed messages. Each parsed
        message is a dict with the keys 'datetime', 'user_name' and 'text'.
    """
    msgs = []

    if file_type == 'window_ko':     # Windows export
        date_pattern = datetime_pattern_dict['window_ko_date']
        time_pattern = datetime_pattern_dict['window_ko_time']

        with open(file_path, encoding=encoding) as file:
            # buffer collects the current message, including its continuation lines
            buffer = ''
            date = ''

            for line in file:
                is_date_line = re.match(date_pattern, line)
                # A date separator or a "[user] [time]" header closes the message buffered so far.
                if is_date_line or re.match(time_pattern, line):
                    msg = _window_message(file_type, buffer, date, time_pattern)
                    if msg is not None:
                        msgs.append(msg)

                    if is_date_line:
                        # Drop the dashes, then drop the trailing weekday word.
                        date = line.replace('-', '').strip().rsplit(" ", 1)[0]
                        buffer = ''
                    else:
                        buffer = line
                else:
                    buffer += line

            # The last message has no following header line, so flush it explicitly.
            msg = _window_message(file_type, buffer, date, time_pattern)
            if msg is not None:
                msgs.append(msg)

    else:  # Android export
        datetime_pattern = datetime_pattern_dict[file_type]
        msg_exist_check_pattern = datetime_pattern + ",.*:"

        with open(file_path, encoding=encoding) as file:
            # buffer collects the current message, including its continuation lines
            buffer = ''
            for line in file:
                if re.match(datetime_pattern, line):
                    msg = _android_message(file_type, buffer, msg_exist_check_pattern)
                    if msg is not None:
                        msgs.append(msg)
                    buffer = line
                else:
                    buffer += line

            # The last message has no following timestamp line, so flush it explicitly.
            msg = _android_message(file_type, buffer, msg_exist_check_pattern)
            if msg is not None:
                msgs.append(msg)

    parsed_msgs = extract(msgs)
    return parsed_msgs

def extract(msgs: List[Dict[str, object]]):
    """
    Turn parsed messages into "user : text" lines.

    Consecutive messages from the same user are merged into one line, their
    texts joined by a single space. Messages without a user name or datetime,
    empty messages and placeholder messages (emoticon / photo / video / "." / ",")
    are left out. The input list and its dicts are not modified.

    Parameters
    ----------
    msgs: list
        dicts with the keys 'datetime', 'user_name' and 'text'

    Returns
    -------
    string
        one "user : text" line per run of messages, joined with newlines
    """
    exclude_msg_pattern = r'^(이모티콘|사진|동영상|\.|,|)$'

    extracted_msgs = []
    run_nick = None   # sender of the run currently being collected
    run_texts = []    # kept texts of that run

    for m in msgs:
        nick = m.get('user_name')

        # A change of sender ends the current run, even if this message itself is skipped.
        if nick != run_nick:
            if run_texts:
                extracted_msgs.append(f"{run_nick} : {' '.join(run_texts)}")
            run_nick, run_texts = nick, []

        text = (m.get('text') or '').strip()
        if nick is None or m.get('datetime') is None or not text:
            continue
        # Filter placeholders per message so they cannot slip through inside a merged text.
        if re.match(exclude_msg_pattern, text):
            continue
        run_texts.append(text)

    if run_texts:
        extracted_msgs.append(f"{run_nick} : {' '.join(run_texts)}")

    return '\n'.join(extracted_msgs)


class KakaoTextLoader(BaseLoader):
    """Load a KakaoTalk export text file as a single Document.

    The export type (device + language) is detected when the loader is
    created; ``load`` parses the chat with ``parse`` and wraps the result.

    Args:
        file_path: Path to the file to load.

        encoding: File encoding to use. If `None`, the file will be loaded
        with the default system encoding.

        autodetect_encoding: Whether to try to autodetect the file encoding
            if the specified encoding fails.
    """

    def __init__(
        self,
        file_path: str,
        encoding: Optional[str] = None,
        autodetect_encoding: bool = False,
    ):
        """Initialize with file path and detect the export type."""
        self.file_path = file_path
        self.encoding = encoding
        self.autodetect_encoding = autodetect_encoding
        # Read the header with the same encoding that will be used for parsing.
        self.kakao_file_type = self.check_kakao_export_file_type(file_path, encoding=encoding)

    @staticmethod
    def check_kakao_export_file_type(file_path,
                                datetime_pattern_dict=kakaotalk_datetime_pattern_dict,
                                encoding=None):
        """
        Check the device type and language of kakaotalk_export_file.
        It is done based on the datetime patterns found in the first lines of the file.

        Parameters
        ----------
        file_path: string

        datetime_pattern_dict: dict
            datetime patterns used in KakaoTalk export files

        encoding: string, optional
            encoding passed to ``open``; ``None`` uses the system default

        Returns
        -------
        file_type: string
            the first two ``_``-separated parts of the matching pattern key,
            e.g. 'window_ko', 'android_ko' or 'android_en'

        Raises
        ------
        ValueError
            if none of the patterns occurs in the first five lines
        """
        # Only the first five lines are inspected.
        with open(file_path, 'r', encoding=encoding) as f:
            for _ in range(5):
                line = f.readline()
                if not line:
                    break

                for file_type, pattern in datetime_pattern_dict.items():
                    if re.search(pattern, line):
                        # 'window_ko_date' / 'window_ko_time' both map to 'window_ko'.
                        return '_'.join(file_type.split('_')[:2])

        # Distinct file types derivable from the pattern keys, in dict order.
        supported_types = list(dict.fromkeys('_'.join(key.split('_')[:2]) for key in datetime_pattern_dict))
        raise ValueError(f'Error: Cannot know the device type and language of the file.\nPlease check the file is a kakaotalk export file or the export enviroment is in among {str(supported_types)}')

    def load(self) -> List[Document]:
        """Parse the export file and return it as a one-element list of Documents.

        If decoding fails and ``autodetect_encoding`` is set, every detected
        encoding is tried in turn. Other failures are raised as RuntimeError.
        """
        text = ""
        try:
            text = parse(self.kakao_file_type, self.file_path, encoding=self.encoding)
        except UnicodeDecodeError as e:
            if self.autodetect_encoding:
                detected_encodings = detect_file_encodings(self.file_path)
                for encoding in detected_encodings:
                    logger.debug("Trying encoding: %s", encoding.encoding)
                    try:
                        # Re-run the chat parser so the fallback yields the same format as the normal path.
                        text = parse(self.kakao_file_type, self.file_path, encoding=encoding.encoding)
                        break
                    except UnicodeDecodeError:
                        continue
                    except Exception as inner:
                        raise RuntimeError(f"Error loading {self.file_path}") from inner
                else:
                    logger.warning("No detected encoding could decode %s; returning an empty document", self.file_path)
            else:
                raise RuntimeError(f"Error loading {self.file_path}") from e
        except Exception as e:
            raise RuntimeError(f"Error loading {self.file_path}") from e
        # TODO: align with chat datetime in metadata
        metadata = {"source": self.file_path}
        return [Document(page_content=text, metadata=metadata)]

In [None]:
async def split(docs:List[Document], chunk_size):
    """Split documents into chunks of at most ``chunk_size`` with no overlap.

    Splitting tries paragraph breaks first, then line breaks, then spaces,
    and finally individual characters.
    """
    splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", " ", ""],
        chunk_size=chunk_size,
        chunk_overlap=0,
    )
    chunks = splitter.split_documents(docs)
    return chunks

In [None]:
# Token limit per OpenAI model name; get_batch_size divides by these values.
# NOTE(review): presumably the maximum context length of each model — confirm against the OpenAI model list.
model_token_mapping = {
    "gpt-4": 8192,
    "gpt-4-0314": 8192,
    "gpt-4-32k": 32768,
    "gpt-4-32k-0314": 32768,
    "gpt-3.5-turbo": 4096,
    "gpt-3.5-turbo-0301": 4096,
    "text-ada-001": 2049,
    "ada": 2049,
    "text-babbage-001": 2040,
    "babbage": 2049,
    "text-curie-001": 2049,
    "curie": 2049,
    "davinci": 2049,
    "text-davinci-003": 4097,
    "text-davinci-002": 4097,
    "code-davinci-002": 8001,
    "code-davinci-001": 8001,
    "code-cushman-002": 2048,
    "code-cushman-001": 2048,
}
# TODO: prompt number of token
def get_batch_size(model_name, k, chunk_size, token_limits=None):
    """Return the batch size derived from the retrieved text size and the model limit.

    The result is ``(chunk_size * k) // limit`` for the given model, clamped to
    at least 1 because it is passed on as the LLM ``batch_size``, which cannot be 0.

    Args:
        model_name: Key into the token-limit mapping.
        k: Number of retrieved chunks.
        chunk_size: Size of one chunk.
        token_limits: Optional mapping of model name to limit. Defaults to
            the notebook-level ``model_token_mapping``.

    Raises:
        KeyError: If ``model_name`` is not in the mapping.
    """
    limits = model_token_mapping if token_limits is None else token_limits
    return max(1, (chunk_size * k) // limits[model_name])
    
    
# TODO: max prompt size
# InvalidRequestError: This model's maximum context length is 4097 tokens, however you requested 9224 tokens (8968 in your prompt; 256 for the completion). Please reduce your prompt; or completion length.
# Settings shared by the cells below: splitter chunk size, number of chunks to
# retrieve (k) and the OpenAI model name.
chunk_size = 1500
k = 10
model_name = "text-davinci-003"
# batch_size is later passed to the OpenAI LLM constructor.
batch_size = get_batch_size(model_name, k, chunk_size)
print(batch_size)


# Parse the KakaoTalk export into one Document, then split it into chunks.
loader = KakaoTextLoader('./data/kakao.txt', encoding='utf8')
raw_docs = loader.load()
# split is a coroutine function, hence the notebook-level await.
test_docs = await split(raw_docs, chunk_size)
len(test_docs)

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS

# Embed the split chat chunks (test_docs) and index them in FAISS.
embeddings = OpenAIEmbeddings()
db = FAISS.from_documents(test_docs, embeddings)
# NOTE(review): displaying the embeddings object may print its configuration,
# possibly including the API key — confirm before sharing the notebook output.
embeddings

In [None]:
# Persist the index to disk and load it back with the same embeddings object.
# The following cells keep using `db`; `new_db` is the reloaded copy.
db.save_local('./data/faiss_kakao')
new_db = FAISS.load_local("./data/faiss_kakao", embeddings)

In [None]:
# Query used to pick the k chunks most relevant to the summary topic.
topic = 'The major story that happened between Kanghyeon and Soyeon, who are lovers.'
retriever = db.as_retriever(search_kwargs={'k': k})
docs = retriever.get_relevant_documents(topic)

In [None]:
# LLM shared by both summarize chains below; model_name and batch_size come from the settings cell.
# default model text-davinci-003
llm = OpenAI(batch_size=batch_size, model_name=model_name, verbose=True)

In [None]:
%%time

def get_summary_map_reduce_prompt(topic):
    """Build the summarization prompt for a given topic.

    The template is "The topic is '<topic>'." followed by the instructions and
    a ``{text}`` placeholder for the chat excerpt.

    Args:
        topic: Free text describing what the summary should focus on.

    Returns:
        A PromptTemplate whose only input variable is ``text``.
    """
    summary_prompt_template = """Summarize the chat conversation that is in the text below, so that the content presented in the topic above is well represented.
    You must obtain and summarize the necessary data from text so that the content written in topic can be well represented.
    
    The CONVERSATION CONTEXT format is 'speaker: message'.    
    For example, in 'minwook: my name is minwook', the conversation content is 'my name is minwook'. 
    The content of the conversation is the most important.
    
    !IMPORTANT Even if you can't analyze it, guess based on your knowledge. answer unconditionally.
    
    text: {text}
    
    """
    # Braces in the topic would be read as template variables, so escape them.
    safe_topic = topic.replace('{', '{{').replace('}', '}}')
    prefix_summary = f"The topic is '{safe_topic}'."
    suffix_summary = "CONCISE SUMMARY IN 3000 WORDS IN ENGLISH:"
    template = prefix_summary + summary_prompt_template + suffix_summary

    return PromptTemplate(template=template, input_variables=["text"])
    



# One prompt serves both stages: summarizing each chunk (map) and
# combining the partial summaries (combine).
map_reduce_prompt = get_summary_map_reduce_prompt(
    'The major story that happened between Kanghyeon and Soyeon, who are lovers.'
)
chain = load_summarize_chain(
    llm=llm,
    chain_type="map_reduce",
    verbose=True,
    map_prompt=map_reduce_prompt,
    combine_prompt=map_reduce_prompt,
)
summary = chain({"input_documents": docs}, return_only_outputs=True)
summary

In [None]:
%%time
# topic='연인관계인 강현과 소연이의 둘 사이에 일어났던 주요 사건들과 시간에 따른 서로의 감정상태'
def get_summary_refine_prompt(topic='The major story that happened between Kanghyeon and Soyeon, who are lovers.'):
    """Build the initial summarization prompt for the refine chain.

    The template is "The topic is '<topic>'." followed by the instructions and
    a ``{text}`` placeholder for the chat excerpt.

    Args:
        topic: Free text describing what the summary should focus on.

    Returns:
        A PromptTemplate whose only input variable is ``text``.
    """
    summary_prompt_template = """Summarize the chat conversation that is in the text below, so that the content presented in the topic above is well represented.
    You must obtain and summarize the necessary data from text so that the content written in topic can be well represented.
    
    The CONVERSATION CONTEXT format is 'year month day time, speaker: message'.    
    For example, in '2000, May 3, 3:00 AM, A: Hello', the conversation content is Hello. 
    The content of the conversation is the most important.
    
    !IMPORTANT Even if you can't analyze it, guess based on your knowledge. answer unconditionally.
    
    text: {text}
    
    """
    # Braces in the topic would be read as template variables, so escape them.
    safe_topic = topic.replace('{', '{{').replace('}', '}}')
    prefix_summary = f"The topic is '{safe_topic}'."
    suffix_summary = "CONCISE SUMMARY IN 3000 WORDS IN ENGLISH:"
    template = prefix_summary + summary_prompt_template + suffix_summary

    # Return the prompt so callers receive it instead of None.
    return PromptTemplate(template=template, input_variables=["text"])


refine_prompt = get_summary_refine_prompt('The major story that happened between Kanghyeon and Soyeon, who are lovers.')
chain = load_summarize_chain(
        llm=llm,
        chain_type='refine',
        # The custom prompt only has a {text} variable, so it is used for the
        # initial summary; later refinement steps keep the chain's default prompt.
        question_prompt=refine_prompt,
        verbose=True,
)
summary = chain({"input_documents": docs}, return_only_outputs=True)
summary