In [38]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS

import pdfplumber
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
from pathlib import Path

# Embedding model shared by every index builder in this notebook; it is created
# once here and passed to each FAISS.from_documents call.
# NOTE(review): judging by its name the model targets Chinese text — confirm.
embeddings = HuggingFaceEmbeddings(model_name="shibing624/text2vec-base-chinese")

def get_pdf_embedding(pdf_path_list):
    """Build a FAISS index that holds one document per PDF file.

    Args:
        pdf_path_list: list[str] of paths to the PDF files to index.

    Returns:
        A FAISS vector store built with the module-level ``embeddings`` model.
        Each document's content is the text of all pages concatenated without
        a separator, and its metadata carries ``file_name`` (base name of the
        path) and ``pdf_num`` (position of the path in ``pdf_path_list``).
    """
    list_of_documents = []
    for idx, pdf_path in enumerate(pdf_path_list):
        filename = Path(pdf_path).name
        with pdfplumber.open(pdf_path) as pdf:
            # Depending on the pdfplumber release, extract_text() can return
            # None for a page with no extractable text; fall back to "" so a
            # single such page does not abort the whole build with a TypeError.
            text = "".join(page.extract_text() or "" for page in pdf.pages)
        list_of_documents.append(
            Document(page_content=text, metadata=dict(file_name=filename, pdf_num=idx))
        )
    return FAISS.from_documents(list_of_documents, embeddings)


def get_pdf_embedding_paging(pdf_path_list):
    """Build a FAISS index that holds two documents (half-pages) per PDF page.

    Args:
        pdf_path_list: list[str] of paths to the PDF files to index.

    Returns:
        A FAISS vector store built with the module-level ``embeddings`` model.
        Every page contributes two documents: the first and the second half of
        its extracted text, split at ``len(text) // 2``. Metadata keys are
        ``file_name`` (base name of the path), ``pdf_num`` (position of the
        path in ``pdf_path_list``), ``page_num`` (0-based page index) and
        ``chunk_num`` (0 for the first half, 1 for the second).
    """
    list_of_documents = []
    for idx, pdf_path in enumerate(pdf_path_list):
        filename = Path(pdf_path).name
        with pdfplumber.open(pdf_path) as pdf:
            for page_num, page in enumerate(pdf.pages):
                # Depending on the pdfplumber release, extract_text() can
                # return None for a page with no extractable text; treat that
                # as "" so len()/slicing below cannot raise a TypeError.
                page_text = page.extract_text() or ""
                middle = len(page_text) // 2
                halves = (page_text[:middle], page_text[middle:])
                for chunk_num, chunk in enumerate(halves):
                    list_of_documents.append(
                        Document(
                            page_content=chunk,
                            metadata=dict(
                                file_name=filename,
                                pdf_num=idx,
                                page_num=page_num,
                                chunk_num=chunk_num,
                            ),
                        )
                    )
    return FAISS.from_documents(list_of_documents, embeddings)


In [39]:
# PDF files fed to both PDF index builders; the paths are relative ("./pdfs/...").
pdf_path_list = ["./pdfs/公正--该如何做是好.pdf", "./pdfs/功利主义.pdf", "./pdfs/诉公共卫生部案.pdf"]


In [40]:
# Build the per-page (half-page chunk) PDF index and save it locally as "faiss_pdf_embedding_paging".
get_pdf_embedding_paging(pdf_path_list).save_local("faiss_pdf_embedding_paging")


In [41]:
# Build the one-document-per-file PDF index and save it locally as "faiss_pdf_embedding".
get_pdf_embedding(pdf_path_list).save_local("faiss_pdf_embedding")

In [42]:
# results_with_scores = db.similarity_search_with_score("同性恋起诉公共卫生部案")
# for doc, score in results_with_scores:
#     print(f"Content: {doc.page_content}, Metadata: {doc.metadata}, Score: {score}\n\n")

In [43]:
from youtube_transcript_api import YouTubeTranscriptApi
from pytube import YouTube
def chunk_list(input_list, chunk_size=50):
    """Split a sequence into consecutive slices of at most ``chunk_size`` items.

    Args:
        input_list: Any sequence that supports ``len()`` and slicing.
        chunk_size: Maximum number of items per chunk; must be positive.

    Returns:
        A list of slices of ``input_list`` in their original order. Only the
        last slice may be shorter than ``chunk_size``. An empty input yields
        an empty list.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative.
    """
    if chunk_size <= 0:
        # Without this check a negative size makes range() empty, silently
        # dropping every item, and zero fails with an unrelated range() error.
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    return [input_list[i:i + chunk_size] for i in range(0, len(input_list), chunk_size)]

def get_title(video_id):
    """Return the title of the YouTube video identified by ``video_id``.

    The watch URL is built from the id and handed to pytube's ``YouTube``
    object, whose ``title`` attribute is returned.
    """
    return YouTube(f'https://www.youtube.com/watch?v={video_id}').title

def get_youtube_embedding_paging(video_id_list, chunk_size=25):
    """Build a FAISS index of YouTube subtitles, split into fixed-size chunks.

    Only transcripts whose ``language`` is exactly 'English (auto-generated)'
    are used; each one is translated to 'zh-Hans' before being fetched.

    Args:
        video_id_list: list[str] of YouTube video ids.
        chunk_size: Number of transcript entries joined into one document
            (defaults to 25).

    Returns:
        A FAISS vector store built with the module-level ``embeddings`` model.
        Each document's content is the concatenated ``text`` of one chunk of
        entries, and its metadata carries ``video_name`` (title returned by
        ``get_title``), ``start_time`` (the ``start`` value of the chunk's
        first entry) and ``video_id``.
    """
    list_of_documents = []
    for video_id in video_id_list:
        for transcript in YouTubeTranscriptApi.list_transcripts(video_id):
            if transcript.language != 'English (auto-generated)':
                continue
            entries = transcript.translate('zh-Hans').fetch()
            # The title is the same for every chunk of this video, so look it
            # up once here instead of issuing one lookup per chunk.
            video_name = get_title(video_id)
            for chunk in chunk_list(entries, chunk_size):
                text = "".join(entry['text'] for entry in chunk)
                start_time = chunk[0]['start']
                list_of_documents.append(
                    Document(
                        page_content=text,
                        metadata=dict(video_name=video_name, start_time=start_time, video_id=video_id),
                    )
                )
    return FAISS.from_documents(list_of_documents, embeddings)

def get_youtube_embedding(video_id_list):
    """Build a FAISS index of YouTube subtitles, one document per transcript.

    Only transcripts whose ``language`` is exactly 'English (auto-generated)'
    are used; each one is translated to 'zh-Hans' before being fetched.

    Args:
        video_id_list: list[str] of YouTube video ids.

    Returns:
        A FAISS vector store built with the module-level ``embeddings`` model.
        Each document's content is the concatenated ``text`` of every entry of
        the translated transcript, and its metadata carries ``video_name``
        (title returned by ``get_title``) and ``video_id``.
    """
    list_of_documents = []
    for video_id in video_id_list:
        for transcript in YouTubeTranscriptApi.list_transcripts(video_id):
            if transcript.language != 'English (auto-generated)':
                continue
            entries = transcript.translate('zh-Hans').fetch()
            text = "".join(entry['text'] for entry in entries)
            list_of_documents.append(
                Document(
                    page_content=text,
                    # video_id is stored as well so a search hit can be traced
                    # back to its video, matching the chunked index builder.
                    metadata=dict(video_name=get_title(video_id), video_id=video_id),
                )
            )
    return FAISS.from_documents(list_of_documents, embeddings)

In [44]:
# HARDCODED: Harvard course playlist https://www.youtube.com/playlist?list=PL30C13C91CFFEFEA6
# These ids are the input to both YouTube index builders.
video_id_list = ['kBdfcR-8hEY', 
                 '0O2Rq4HJBxw', 
                 'Qw4l1w0rkjs', 
                 'MGyygiXMzRk', 
                 '8yT4RZy1t3s', 
                 '8rv-4aUbZxQ', 
                 'KqzW0eHzDSQ', 
                 'VcL66zx_6No', 
                 'AUhReMT5uqA', 
                 'MuiazbyOSqQ', 
                 'iOotE9_OGGs', 
                 'EzD9P-9sj4M']

In [45]:
# Build the one-document-per-transcript YouTube index and save it locally as "faiss_youtube_embedding".
get_youtube_embedding(video_id_list).save_local("faiss_youtube_embedding")

In [46]:
# Build the chunked YouTube index and save it locally as "faiss_youtube_embedding_paging".
get_youtube_embedding_paging(video_id_list).save_local("faiss_youtube_embedding_paging")

In [47]:
# results_with_scores = db2.similarity_search_with_score("个人权利与政府权力")
# for doc, score in results_with_scores:
#     print(f"Content: {doc.page_content}, Metadata: {doc.metadata}, Score: {score}\n\n")

In [48]:
# Completion marker: printed once execution has reached this cell.
print("done!")

done!
