In [None]:
!curl -LsSf https://astral.sh/uv/install.sh | sh
!uv init
!uv add torch
!uv add transformers
!uv add numpy
!uv add tqdm
!uv add openai

In [None]:
from datasets import load_dataset

# Download the Rotten Tomatoes movie-review dataset.
data = load_dataset("rotten_tomatoes")
# Show the available splits and their sizes. A bare `data` expression is only
# rendered when it is the last line of a cell, so print it explicitly.
print(data)
# Indexing with several positions selects a batch: the first and the last
# training example.
print(data["train"][0, -1])

In [None]:
from transformers import pipeline
import numpy as np
from tqdm import tqdm
from transformers.pipelines.pt_utils import KeyDataset
from sklearn.metrics import classification_report

def evaluate_performance(y_true, y_pred):
  """Build a classification report for the predictions and print it.

  Args:
    y_true: Ground-truth labels.
    y_pred: Predicted labels, aligned with `y_true`.

  The report rows are named "Negative Review" and "Positive Review".
  Nothing is returned; the report is written to stdout.
  """
  class_names = ["Negative Review", "Positive Review"]
  report = classification_report(y_true, y_pred, target_names=class_names)
  print(report)

# Path of the Hugging Face model
model_path = "cardiffnlp/twitter-roberta-base-sentiment-latest"
# Load the model into a pipeline. `top_k=None` makes the pipeline return the
# score of every label for each input instead of only the best one.
pipe = pipeline(
    model=model_path,
    tokenizer=model_path,
    top_k=None,
    device="cuda:0"
)

# Run inference over the test split
y_pred = []
for output in tqdm(pipe(KeyDataset(data["test"], "text")), total=len(data["test"])):
  # `output` is a list of {"label", "score"} dicts. Position in that list is not
  # tied to a class (with top_k=None the entries come back ranked by score), so
  # look each class up by its label name instead of by index.
  scores = {entry["label"].lower(): entry["score"] for entry in output}
  negative_score = scores["negative"]
  positive_score = scores["positive"]
  # Any other label the model emits (e.g. "neutral", per its model card) is
  # ignored: 0 = negative, 1 = positive, matching the dataset's label ids.
  assignment = np.argmax([negative_score, positive_score])
  y_pred.append(assignment)

evaluate_performance(data["test"]["label"], y_pred)


In [None]:
# 利用嵌入向量的分类任务
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression

# Load the embedding model
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
# Convert the review texts into embedding vectors
train_embeddings = model.encode(data["train"]["text"], show_progress_bar=True)
test_embeddings = model.encode(data["test"]["text"], show_progress_bar=True)
# Show the shape of the training embeddings. A bare expression in the middle of
# a cell is never displayed, so print it explicitly.
print(train_embeddings.shape)

# Fit a logistic regression on the training embeddings
clf = LogisticRegression(random_state=42)
clf.fit(train_embeddings, data["train"]["label"])

# Predict the held-out test examples and report the results
y_pred = clf.predict(test_embeddings)
evaluate_performance(data["test"]["label"], y_pred)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Embed one short description per class; list position is the class id
# (0 = negative, 1 = positive).
label_descriptions = ["A negative review", "A positive review"]
label_embeddings = model.encode(label_descriptions)
# For every test document, pick the label whose embedding is most similar
sim_matrix = cosine_similarity(test_embeddings, label_embeddings)
y_pred = sim_matrix.argmax(axis=1)
evaluate_performance(y_true=data["test"]["label"], y_pred=y_pred)


In [None]:
# Text classification with a generative model
pipe = pipeline(
    "text2text-generation",
    model="google/flan-t5-small",
    device="cuda:0"
)

# Prepare the data: prefix every review with the instruction and store the
# result in a new "t5" column. The prompt must end with "? " -- without it the
# question and the review text run together into one malformed sentence.
prompt = "Is the following sentence positive or negative? "
data = data.map(lambda example: {"t5": prompt + example["text"]})
data

In [None]:
# Run inference: an answer of exactly "negative" maps to class 0, anything
# else maps to class 1.
generated_outputs = pipe(KeyDataset(data["test"], "t5"))
y_pred = [
  0 if output[0]["generated_text"] == "negative" else 1
  for output in tqdm(generated_outputs, total=len(data["test"]))
]

In [None]:
# Print the classification report for the current `y_pred` against the test labels
evaluate_performance(y_true=data["test"]["label"], y_pred=y_pred)

In [None]:
import openai

# Create the client. The API key is deliberately not hard-coded in the
# notebook: when `api_key` is omitted the OpenAI client reads it from the
# OPENAI_API_KEY environment variable and raises an error if it is not set.
client = openai.OpenAI(
    base_url="https://vip.apiyi.com/v1"
)

def chatgpt_generation(prompt, document, model="gpt-3.5-turbo-0125"):
  """Generate a chat completion for a prompt template filled with a document.

  Args:
    prompt: Template text; every "[DOCUMENT]" placeholder is replaced by
      `document`.
    document: Text substituted into the template.
    model: Name of the chat model to query.

  Returns:
    The message content of the first returned choice.
  """
  system_message = {"role": "system", "content": "You are a helpful assistant."}
  user_message = {"role": "user", "content": prompt.replace("[DOCUMENT]", document)}

  # Uses the module-level `client`; temperature is set to 0.
  response = client.chat.completions.create(
      messages=[system_message, user_message],
      model=model,
      temperature=0
  )

  first_choice = response.choices[0]
  return first_choice.message.content

# Base prompt template; "[DOCUMENT]" is the placeholder that
# `chatgpt_generation` replaces with the review text.
prompt = """Predict whether the following document is a positive or negative movie review:
[DOCUMENT]
If it is positive return 1 and if it is negative return 0. Do not give any other answers.
"""

# Try the template on a single example review
document = "unpretentious, charming, quirky, original"
chatgpt_generation(prompt=prompt, document=document)

In [None]:
# Skip this cell if you want to save on model-call costs: it issues one API
# request per test review.
predictions = []
for doc in tqdm(data["test"]["text"]):
  predictions.append(chatgpt_generation(prompt, doc))

# Convert the raw text answers into integer labels
y_pred = list(map(int, predictions))

# Evaluate performance
evaluate_performance(y_true=data["test"]["label"], y_pred=y_pred)