In [9]:
import os
import json
import requests
import re
import numpy as np
import pickle
import gzip
from bs4 import BeautifulSoup
from tqdm import tqdm
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [4]:
def crawler_page_urls():
    """Collect article URLs from every paginated listing page on gwins.org.

    Visits ``list_2_1.html`` through ``list_2_72.html``, serialises each
    page's ``<a>`` tags and extracts every path of the form
    ``/cn/milesguo/<...>.html`` (the original author's note puts the total at
    2866 links).

    Pages that cannot be fetched (network error, timeout, or a non-200 status)
    are reported on stdout and skipped.

    Returns:
        list[str]: Absolute URLs in discovery order. Duplicates are kept.
    """
    list_pages = [f"https://gwins.org/cn/milesguo/list_2_{i}.html" for i in range(1,73)]
    # NOTE(review): this pattern is not limited to numeric article pages, so
    # pagination anchors such as /cn/milesguo/list_2_3.html match as well --
    # confirm whether they should be filtered out.
    link_pattern = re.compile(r"/cn/milesguo/[\w/]+\.html")
    article_urls = []
    for url in tqdm(list_pages):
        try:
            # requests applies no timeout by default; without one a stalled
            # connection would block the crawl indefinitely.
            response = requests.get(url, timeout=30)
        except requests.RequestException as exc:
            print(f"无法获取页面{url}，错误：{exc}")
            continue
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            link_string = '\n'.join([str(link) for link in soup.find_all('a')])
            matches = link_pattern.findall(link_string)
            article_urls += [f"https://gwins.org{x}" for x in matches]
            print(len(article_urls))
        else:
            print(f"无法获取页面{url}，HTTP状态码：{response.status_code}")
    return article_urls

def download_documents():
    """Download every article page and save its plain text as ``./txts/<id>.txt``.

    The URL list comes from ``crawler_page_urls()``. For each URL the first run
    of digits is used as the document id, the page is fetched, its visible
    text is extracted, all whitespace runs are collapsed to single spaces, and
    the result is written as UTF-8.

    URLs without any digits, pages that fail to download, and pages answering
    with a non-200 status are reported on stdout and skipped. Two URLs that
    yield the same id overwrite each other's file.
    """
    urls1 = crawler_page_urls()  # collect all article links
    out_folder = "./txts"
    # Create the folder named by out_folder (the original hard-coded the
    # literal path a second time) and tolerate it already existing.
    os.makedirs(out_folder, exist_ok=True)
    id_pattern = re.compile(r'\d+')
    for url in tqdm(urls1):
        id_match = id_pattern.search(url)  # page number embedded in the URL
        if id_match is None:
            # Previously this crashed with AttributeError on `.group()`.
            print(f"无法从链接中提取编号，已跳过：{url}")
            continue
        doc_id = id_match.group()
        try:
            # No default timeout in requests: guard against a hung connection.
            response = requests.get(url, timeout=30)
        except requests.RequestException as exc:
            print(f"无法获取页面{url}，错误：{exc}")
            continue
        if response.status_code != 200:
            print(f"无法获取页面{url}，HTTP状态码：{response.status_code}")
            continue
        soup = BeautifulSoup(response.content, 'html.parser')
        html_doc = soup.get_text()  # visible text only
        html_doc = re.sub(r'\s+', ' ', html_doc)  # collapse whitespace runs

        file_path = os.path.join(out_folder, f"{doc_id}.txt")
        # Explicit UTF-8: the locale default may be unable to encode Chinese text.
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(html_doc)

In [5]:
def extract_titles(input_dir="./txts/", out_file="titles.json"):
    """Extract each document's title and write an ``{id: title}`` JSON file.

    The title is the text at the very start of the document up to (not
    including) the first occurrence of "首页" on the first line; documents
    where that marker is not found get the placeholder ``"unknown title"``.
    The id is the file name up to its first dot.

    Args:
        input_dir: Folder holding the ``<id>.txt`` documents. Only regular
            files ending in ``.txt`` are read, so notebook checkpoint folders
            and other stray entries are ignored instead of crashing the run.
        out_file: Path of the JSON file to write.
    """
    files = [
        os.path.join(input_dir, name)
        for name in sorted(os.listdir(input_dir))
        if name.endswith(".txt")
    ]
    files = [path for path in files if os.path.isfile(path)]
    title_pattern = re.compile(r'^(.*?)首页')
    titles = dict()
    for in_file in tqdm(files):
        doc_id = os.path.basename(in_file).split(".")[0]
        # Explicit UTF-8 so decoding does not depend on the machine's locale.
        with open(in_file, "r", encoding="utf-8") as f:
            txt = f.read()
        match = title_pattern.match(txt)
        # "." never matches a newline, so the captured title is always a
        # single line and needs no newline clean-up.
        titles[doc_id] = match.group(1) if match else "unknown title"
    with open(out_file, "w", encoding="utf-8") as f:
        json.dump(titles, f)

def load_titles(file="title.json"):
    """Read a title mapping from a JSON file and return it.

    Args:
        file: Path of the JSON file. The historical default ``"title.json"``
            does not match the ``"titles.json"`` that ``extract_titles``
            writes by default, so when the default is requested and
            ``title.json`` is absent while ``titles.json`` exists, the latter
            is loaded instead. Any other path is used exactly as given.

    Returns:
        The decoded JSON content (for files written by ``extract_titles``, a
        dict mapping document id to title).

    Raises:
        FileNotFoundError: If the resolved file does not exist.
    """
    if file == "title.json" and not os.path.exists(file) and os.path.exists("titles.json"):
        file = "titles.json"
    with open(file, "r", encoding="utf-8") as f:
        return json.load(f)
    
def load_data_to_paragraphs(file, chunk_size=1000, chunk_overlap=0):
    """Read one document and split its body into short chunks.

    Everything on the first line up to and including "内容梗概: " is removed,
    as is the fixed "友情链接" footer when it ends the text. The remainder is
    split with ``RecursiveCharacterTextSplitter``.

    Author's rationale for the default size: OpenAI's text-embedding-ada-002
    accepts about 8000 input tokens (roughly 2000 Chinese characters at most),
    and embedding quality drops as the input approaches that limit.

    Args:
        file: Path of the text document (read as UTF-8).
        chunk_size: Maximum chunk length handed to the splitter.
        chunk_overlap: Overlap between consecutive chunks handed to the splitter.

    Returns:
        The list of text chunks produced by the splitter.
    """
    # Explicit UTF-8 so decoding does not depend on the machine's locale.
    with open(file, "r", encoding="utf-8") as f:
        data = f.read()
    pattern1 = r'^.*?内容梗概: '
    pattern2 = r' 友情链接：Gnews \| Gclubs \| Gfashion \| himalaya exchange \| gettr \| 法治基金 \| 新中国联邦辞典 \| $'
    data = re.sub(pattern1, "", data)  # drop header / navigation text
    data = re.sub(pattern2, "", data)  # drop the site footer
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return text_splitter.split_text(data)

def sentence_embedding_batch(txts, id):
    """Embed a list of texts with OpenAI ``text-embedding-ada-002``.

    Relies on a module-level ``openai_client`` that must already exist when
    this function is called (it is not created in this function).

    Args:
        txts: List of text chunks to embed in a single API request.
        id: Document id used as the prefix of each chunk label.

    Returns:
        list[tuple]: One ``(label, text, embedding)`` tuple per input text,
        where ``label`` is ``"<id>-<position>"`` and ``embedding`` is the
        vector returned by the API (1536-dimensional per the original note).
        An empty input yields an empty list without calling the API.
    """
    if not txts:
        # Nothing to embed; avoid sending an empty request.
        return []
    response = openai_client.embeddings.create(input = txts, model="text-embedding-ada-002")
    # Read the typed response directly instead of serialising it to JSON and
    # parsing it back, and order by each item's `index` so every embedding is
    # paired with the text it was computed from.
    items = sorted(response.data, key=lambda item: item.index)
    embs = [item.embedding for item in items]
    return [(f"{id}-{i}", txt, embs[i]) for i, txt in enumerate(txts)]

def encoding_file(in_file, output_dir):
    """Split one text document, embed its chunks, and store the result on disk.

    The output is written to ``<output_dir>/<id>.npz`` where ``<id>`` is the
    input file name up to its first dot. Despite the ``.npz`` extension the
    file is NOT a NumPy archive: it holds the gzip-compressed pickle of the
    list returned by ``sentence_embedding_batch``.

    Args:
        in_file: Path of the source text document.
        output_dir: Existing directory that receives the output file.
    """
    doc_id = os.path.basename(in_file).partition(".")[0]
    target = os.path.join(output_dir, f"{doc_id}.npz")
    chunks = load_data_to_paragraphs(in_file)
    # pickle first, then gzip the resulting bytes
    payload = gzip.compress(pickle.dumps(sentence_embedding_batch(chunks, doc_id)))
    with open(target, "wb") as sink:
        sink.write(payload)

def encoding_files(input_dir = "./txts/", output_dir = "./emb", skip_existing = False):
    """Embed every ``.txt`` document in a folder via ``encoding_file``.

    Author's note: the full set of 2866 files cost about 3 USD of OpenAI usage.

    Args:
        input_dir: Folder with the source documents. Only regular files ending
            in ``.txt`` are processed, in sorted name order; other entries
            (sub-folders, notebook checkpoints, ...) are ignored so they are
            never sent to the paid embedding step.
        output_dir: Folder receiving one ``<id>.npz`` per document; created
            (including missing parents) if necessary.
        skip_existing: When true, documents whose ``<id>.npz`` already exists
            in ``output_dir`` are skipped, which lets an interrupted run be
            resumed without paying for finished files again. Defaults to
            ``False`` (re-encode everything).
    """
    files = [
        os.path.join(input_dir, name)
        for name in sorted(os.listdir(input_dir))
        if name.endswith(".txt")
    ]
    files = [path for path in files if os.path.isfile(path)]
    os.makedirs(output_dir, exist_ok=True)
    if skip_existing:
        # Same output naming as encoding_file: name up to the first dot + ".npz".
        files = [
            path for path in files
            if not os.path.exists(
                os.path.join(output_dir, os.path.basename(path).split(".")[0] + ".npz")
            )
        ]
    for in_file in tqdm(files):
        encoding_file(in_file, output_dir)

def decoding_file(file):
    """Load an object stored as a gzip-compressed pickle.

    This is the inverse of the on-disk format written by ``encoding_file``:
    the file's bytes are gunzipped and then unpickled.

    Args:
        file: Path of the compressed pickle file.

    Returns:
        The unpickled object.
    """
    # SECURITY: unpickling can execute arbitrary code; only open files that
    # this pipeline produced itself.
    with open(file, "rb") as source:
        raw = source.read()
    return pickle.loads(gzip.decompress(raw))


In [10]:
# download_documents()  # stage 1: crawl the site (left disabled in this run)

# stage 2: embed every document and save the results to files
encoding_files(
    input_dir = "./txts/",
    output_dir = "./emb",
)

# stage 3: extract the titles and save them to a file
extract_titles(
    input_dir="./txts/",
    out_file="titles.json",
)

100%|███████████████████████████████████████████████████████████████████████████████| 2866/2866 [46:16<00:00,  1.03it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 2866/2866 [00:06<00:00, 438.59it/s]
