In [1]:
import torch
from modelscope.utils.hf_util import AutoModelForCausalLM, AutoTokenizer,snapshot_download

import time
import os
import pandas as pd
import numpy as np
import glob
import xml.etree.ElementTree as ET
import json

# Folder holding the locally downloaded models; also forwarded to the tokenizer
# loader by Yi_text_generation.
download_path = r"D:\Jupyter_Code\model"

# Lazy-load flag: 0 until Yi_text_generation has loaded a model, 1 afterwards.
start = 0

# NOTE(review): not referenced by any cell visible here -- presumably the
# ground-truth file used by a later evaluation step; confirm before removing.
standard_answer = "task.csv"

# Model folders selectable through `model_index`.
# Raw strings keep every backslash literal; the previous mix of "\\" and bare
# "\J" / "\m" / "\L" only worked because Python tolerates unknown escapes
# (with a warning on recent versions).
model_dir_list = [
    r"D:\Jupyter_Code\model\Yi-34B-Chat-4bits",           # index 0
    r"D:\Jupyter_Code\model\LLM-Research\Llama-3.2-3B",   # index 1
    r"D:\Jupyter_Code\model\Phi-3-mini-4k-instruct",      # index 2
]

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def Yi_text_generation(input, model_index = 1, max_new_tokens = 230, temperature = 0.3, top_k = 3, do_sample = True, print = True):
    '''
    Generate a text continuation for ``input`` with a locally installed causal LLM.

    The model and tokenizer are loaded lazily on the first call and cached in the
    module-level globals ``model``, ``tokenizer`` and ``model_dir``. The global
    ``start`` flag is set to 1 only after both have loaded successfully, so a
    failed load can simply be retried.

    Later calls reuse the cached objects: passing a different ``model_index``
    afterwards does NOT load another model. Reset ``start`` to 0 (or restart
    the kernel) to switch models.

    Sampling parameters matter a lot for open-source models and the best values
    depend on the task; the defaults are only a starting point. Background on
    the parameters: https://zhuanlan.zhihu.com/p/653926703

    Parameters
    ----------
    input : str
        Prompt text that the model continues.
    model_index : int
        Index into ``model_dir_list`` selecting the model folder. Index 0 is
        loaded with ``device_map="cuda"`` plus an offload folder; every other
        index is loaded with ``device_map="auto"``. Indices <= 0 generate
        without an explicit attention mask, indices >= 1 pass an all-ones
        attention mask and use the EOS token as pad token.
    max_new_tokens : int
        Upper bound on the number of generated tokens.
    temperature : float
        Sampling temperature forwarded to ``model.generate``.
    top_k : int
        Top-k cut-off forwarded to ``model.generate``.
    do_sample : bool
        Whether ``model.generate`` samples instead of decoding greedily.
    print : bool
        When truthy, the decoded response is also written to stdout.

    Returns
    -------
    str
        Decoded first output sequence with special tokens skipped. This is
        whatever ``model.generate`` returned, which for causal LMs normally
        includes the prompt tokens as well.
    '''
    global start, model_dir, model, tokenizer, model_dir_list
    # The `print` parameter shadows the builtin inside this function, so the
    # real print function has to be reached through the builtins module.
    import builtins

    if start == 0:
        selected_dir = model_dir_list[model_index]
        if model_index == 0:
            load_kwargs = dict(device_map="cuda", torch_dtype="auto", offload_folder="offload_folder")
        else:
            load_kwargs = dict(device_map="auto")

        # Load into locals first; the globals and the flag are only updated once
        # everything succeeded, so an exception here leaves a clean state.
        loaded_model = AutoModelForCausalLM.from_pretrained(selected_dir, trust_remote_code=True, **load_kwargs)
        loaded_tokenizer = AutoTokenizer.from_pretrained(selected_dir, trust_remote_code=True, download_path = download_path)

        model_dir, model, tokenizer = selected_dir, loaded_model, loaded_tokenizer
        start = 1

    if model_index <= 0:
        inputs = tokenizer(input, return_tensors="pt")
        outputs = model.generate(inputs.input_ids.cuda(),
                                 temperature = temperature,
                                 top_k = top_k,
                                 do_sample = do_sample,
                                 max_new_tokens = max_new_tokens
                                 )
    else:
        model_inputs = tokenizer(input, return_tensors="pt").to("cuda")
        # Single un-padded prompt: every position is a real token, so the mask
        # is all ones with the same shape, dtype and device as the input ids.
        attention_mask = torch.ones_like(model_inputs.input_ids)
        outputs = model.generate(model_inputs.input_ids,
                                 temperature = temperature,
                                 top_k = top_k,
                                 do_sample = do_sample,
                                 max_new_tokens = max_new_tokens,
                                 attention_mask = attention_mask,
                                 pad_token_id = tokenizer.eos_token_id,
                                 )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    if print:
        builtins.print(response)
    return response

In [3]:
def test_LLM(message, model_index):
    message = message + "\nYour answer:\n###\n"
    str_return = Yi_text_generation(message, model_index = model_index, print = False)
    str_return = str_return[str_return.index("###") + 3:]
    return str_return

In [4]:
def readExtractionPrompt(fileName = r"C:\Users\Administrator\Desktop\3_opensource_llm\extractionPromptPro.txt"):
    """
    Return the full text of the extraction prompt file.

    Parameters
    ----------
    fileName : str
        Path of the prompt file. Defaults to the location on the original
        workstation, so existing calls without arguments keep working.

    Returns
    -------
    str
        The file content, read in text mode with the platform default encoding.

    Raises
    ------
    OSError
        If the file cannot be opened or read.
    """
    # `with` closes the handle even if reading fails; read() yields the same
    # string as concatenating every line from readlines().
    with open(fileName, "r") as f:
        return f.read()

In [5]:
def gptProcessReport_llm(rootPath,targetPath,model_index):
    """
    Run the LLM over every XML report below ``rootPath`` and save one answer file per case.

    For each ``*.xml`` file found recursively, the root element's ``CaseID``
    attribute and the text of the first ``SUMMARY`` element are read. A prompt
    is built from the extraction prompt file plus the summary, sent through
    ``test_LLM``, and the response is written to ``{targetPath}/{CaseID}.txt``.
    The case id, the response and the total elapsed time are printed.

    Parameters
    ----------
    rootPath : str
        Directory that is walked recursively for ``.xml`` files.
    targetPath : str
        Directory receiving the ``<CaseID>.txt`` files; created if missing.
    model_index : int
        Forwarded to ``test_LLM``.

    Raises
    ------
    OSError
        If ``targetPath`` cannot be created. This is raised before any model
        call so that no generation time is wasted.
    KeyError
        If a report's root element has no ``CaseID`` attribute.
    """
    start_time = time.time()
    xml_files = [
        os.path.join(dirpath, filename)
        for dirpath, _dirnames, filenames in os.walk(rootPath)
        for filename in filenames
        if filename.endswith('.xml')
    ]

    # Make sure the output folder exists up front instead of discovering it
    # only after each (slow) generation has already been done.
    if targetPath:
        os.makedirs(targetPath, exist_ok=True)

    # The prompt file does not change between reports: read it once.
    extraction_prompt = readExtractionPrompt() if xml_files else ""

    for xml_file in xml_files:
        root = ET.parse(xml_file).getroot()
        case_id = root.attrib['CaseID']
        summary = root.find('.//SUMMARY')
        # An empty element such as <SUMMARY/> has text None; treat it like a
        # missing summary instead of failing on None.strip().
        if summary is not None and summary.text is not None:
            description = summary.text.strip()
        else:
            description = "No summary found"
        print(case_id)

        prompt = "You should help me process a car accident description. You should analyse each sentence in the description. Once you found a sentence that contains impact actions, then drop all the sentences after." + \
             "Output the processed description." + \
             "The accident description is : " + description
        response = test_LLM(extraction_prompt + prompt, model_index)
        print(response)

        filepath = f"{targetPath}/{case_id}.txt"
        try:
            # `with` flushes and closes the file as soon as it is written.
            with open(filepath, "w") as f:
                f.write(response)
        except (OSError, UnicodeError) as exc:
            # Best effort: keep processing the remaining reports, but say
            # which file failed and why.
            print('dont write txt', filepath, exc)
    end_time = time.time()
    print("Time used:", end_time - start_time)


In [6]:
# Process every XML report under raw_reports with model_index 1 and write one
# <CaseID>.txt answer file per report into the llama_3_2_3b folder.
gptProcessReport_llm("C:\\Users\\Administrator\\Desktop\\3_opensource_llm\\raw_reports","C:\\Users\\Administrator\\Desktop\\3_opensource_llm\\llama_3_2_3b",1)

2005005289123


Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████| 2/2 [00:03<00:00,  1.54s/it]




2005002585683


2005045587801


2005004112761


2005006445022


200501269400


2005004112521

import json
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
from PIL import Image
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import silhouette_samples
from sklearn.metrics import silhouette_score
from sklearn.metrics import silhouette_samples
from sklearn.metrics import silhouette_samples
from sklearn.metrics import silhouette_samples
from sklearn.metrics import silhouette_samples
from sklearn.metrics import silhouet

In [7]:
# Environment check: print the library versions and CUDA status of this kernel.
import torch
print(torch.__version__)            # PyTorch version string
print(torch.version.cuda)           # CUDA version reported by torch
# NOTE(review): current_device() is queried before is_available(); on a machine
# without a usable GPU this presumably raises before the availability line is
# printed -- confirm and consider swapping the two lines.
print(torch.cuda.current_device())  # index of the currently selected CUDA device
print(torch.cuda.is_available())    # whether torch can use CUDA
import modelscope
print(modelscope.__version__)       # ModelScope version string

2.5.1+cu121
12.1
0
True
1.21.1
