In [2]:
import pandas as pd
import numpy as np

**已经检测完，没有空值/缺失值/非法值**

In [3]:
data = pd.read_csv("./Combined Data.csv")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53043 entries, 0 to 53042
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  53043 non-null  int64 
 1   statement   52681 non-null  object
 2   status      53043 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.2+ MB


In [4]:
data.status.value_counts()

status
Normal                  16351
Depression              15404
Suicidal                10653
Anxiety                  3888
Bipolar                  2877
Stress                   2669
Personality disorder     1201
Name: count, dtype: int64

通过status列生成新列“是否有精神健康问题”（每种心理问题都要抽，一共取2500条），这个新列除了normal是0外，其他心理问题都是1
先去除字符数<250 and > 1500的文本的行，然后按 statement 列文本长度排序，在文本长度降序排列的前提下取最长的前 1000 条（Normal 类别）和各其他类别最长的前 250 条

In [5]:
# 修改第一列列名为 id
data = data.rename(columns={"Unnamed: 0": "id"})

# 生成新列“是否有精神健康问题”，Normal 为 0，其他为 1
data["是否有精神健康问题"] = data["status"].apply(lambda x: 0 if x == "Normal" else 1)

# 计算文本长度并添加字符数在300到1000之间的过滤条件
data["statement_length"] = data["statement"].str.len()
# 修正多条件过滤的语法，使用&而非and，且每个条件需用括号括起
filtered_data = data[(data["statement_length"] > 300) & (data["statement_length"] < 1000)].copy()
print(f"过滤掉字符数<=1000或>=300的样本后，剩余样本数: {len(filtered_data)}")

# 按类别分别取文本最长的指定数量数据
# Normal 类别取文本最长的前 1000 条
normal_data = filtered_data[filtered_data["status"] == "Normal"].copy()
normal_data = normal_data.sort_values(by="statement_length", ascending=False).head(1000)  # 按长度降序排列，取前1000条
normal_data = normal_data.drop(columns=["statement_length"])  # 删除临时长度列
print(f"Normal 类别选取了 {len(normal_data)} 条样本")

# 获取所有非Normal的心理问题类别
other_categories = [cat for cat in filtered_data["status"].unique() if cat != "Normal"]
other_data_list = []

# 为每个非Normal类别取文本最长的前250条
for category in other_categories:
    category_data = filtered_data[filtered_data["status"] == category].copy()
    # 按长度降序排列，取前250条
    category_data = category_data.sort_values(by="statement_length", ascending=False).head(250)
    category_data = category_data.drop(columns=["statement_length"])  # 删除临时长度列
    other_data_list.append(category_data)
    print(f"{category} 类别选取了 {len(category_data)} 条样本")

# 合并所有数据
sampled_data = pd.concat([normal_data] + other_data_list)

# 验证总样本数
total_count = len(sampled_data)
print(f"总样本数: {total_count}")

# 只保留文本（statement 列）和目标标签（“是否有精神健康问题”列）
sampled_data = sampled_data[["statement", "是否有精神健康问题"]]

# 保存处理后的数据
sampled_data.to_csv("./processed_data.csv", index=False, encoding="utf-8")

print("数据处理完成，已保存至 processed_data.csv")
    

过滤掉字符数<=1000或>=300的样本后，剩余样本数: 17749
Normal 类别选取了 1000 条样本
Anxiety 类别选取了 250 条样本
Depression 类别选取了 250 条样本
Suicidal 类别选取了 250 条样本
Stress 类别选取了 250 条样本
Bipolar 类别选取了 250 条样本
Personality disorder 类别选取了 250 条样本
总样本数: 2500
数据处理完成，已保存至 processed_data.csv


In [None]:
# -*- coding: utf-8 -*-
import requests
import random
import json
import csv
from hashlib import md5
from typing import List

# -------------------------- 1. 配置百度翻译API参数 --------------------------
# 替换为你的百度翻译API密钥（必须填写，否则无法调用）
APPID = ""  # 你的AppID
APPKEY = ""  # 你的AppKey

# 语言方向：from_lang=源语言（auto=自动识别），to_lang=目标语言（zh=中文）
FROM_LANG = "auto"
TO_LANG = "zh"

# API接口地址
ENDPOINT = "http://api.fanyi.baidu.com"
PATH = "/api/trans/vip/translate"
API_URL = ENDPOINT + PATH


# -------------------------- 2. 翻译工具函数 --------------------------
def make_md5(s: str, encoding: str = "utf-8") -> str:
    """生成MD5哈希值（API签名必需）"""
    return md5(s.encode(encoding)).hexdigest()


def translate_text(text: str) -> str:
    """
    单条文本翻译函数
    :param text: 待翻译的原文（字符串）
    :return: 翻译后的中文结果（字符串）
    """
    # 处理空文本（避免API报错）
    if not text.strip():
        return ""

    # 生成API请求参数（salt=随机数，sign=签名）
    salt = random.randint(32768, 65536)
    sign = make_md5(APPID + text + str(salt) + APPKEY)

    # 构造请求体
    payload = {
        "appid": APPID,
        "q": text,
        "from": FROM_LANG,
        "to": TO_LANG,
        "salt": salt,
        "sign": sign
    }
    headers = {"Content-Type": "application/x-www-form-urlencoded"}

    # 发送请求并解析结果
    try:
        response = requests.post(API_URL, params=payload, headers=headers, timeout=10)
        response.raise_for_status()  # 若HTTP状态码非200，抛出异常
        result = response.json()

        # 提取翻译结果（API返回格式：{"trans_result": [{"dst": "翻译结果"}]}）
        if "trans_result" in result:
            return result["trans_result"][0]["dst"]
        else:
            # 打印API错误信息（如密钥错误、额度不足）
            print(f"翻译失败（原文：{text}）：{result.get('error_msg', '未知错误')}")
            return ""

    except Exception as e:
        print(f"请求异常（原文：{text}）：{str(e)}")
        return ""


# -------------------------- 3. CSV文件处理（读取→翻译→保存） --------------------------
def process_csv(input_path: str = "test.csv", output_path: str = "final_data.csv"):
    """
    处理CSV文件：读取第一列文本并翻译，保存新CSV
    :param input_path: 输入CSV路径（默认test.csv）
    :param output_path: 输出CSV路径（默认final_data.csv）
    """
    # 步骤1：读取test.csv，提取第一列文本并记录所有行数据
    all_rows: List[List[str]] = []  # 存储原CSV的所有行（含表头）
    first_column_texts: List[str] = []  # 存储第一列的所有文本（待翻译）

    try:
        with open(input_path, mode="r", encoding="utf-8-sig", newline="") as f:
            csv_reader = csv.reader(f)
            for row in csv_reader:
                all_rows.append(row)  # 保存原行数据
                if row:  # 避免空行
                    first_col_text = row[0]  # 提取第一列文本
                    first_column_texts.append(first_col_text)
                else:
                    first_column_texts.append("")  # 空行补空字符串

        print(f"成功读取 {input_path}，共 {len(all_rows)} 行数据，待翻译 {len(first_column_texts)} 条文本")

    except FileNotFoundError:
        print(f"错误：未找到文件 {input_path}，请确认文件路径正确！")
        return
    except Exception as e:
        print(f"读取CSV失败：{str(e)}")
        return


    # 步骤2：批量翻译第一列文本
    translated_results: List[str] = []
    print("开始翻译...")
    for idx, text in enumerate(first_column_texts, 1):
        print(f"正在翻译第 {idx}/{len(first_column_texts)} 条：{text[:50]}...")  # 显示进度
        translated = translate_text(text)
        translated_results.append(translated)


    # 步骤3：保存为final_data.csv（新增列）
    try:
        with open(output_path, mode="w", encoding="utf-8-sig", newline="") as f:
            csv_writer = csv.writer(f)
            # 处理表头：原表头第一列后新增
            for row_idx, row in enumerate(all_rows):
                if row_idx == 0:  # 表头行
                    new_row = row + ["文本内容"]  # 新增翻译列表头
                else:  # 数据行：原行后追加对应翻译结果
                    new_row = row + [translated_results[row_idx]]
                csv_writer.writerow(new_row)

        print(f"\n翻译完成！结果已保存至 {output_path}")

    except Exception as e:
        print(f"保存CSV失败：{str(e)}")
        return


# -------------------------- 4. 执行主程序 --------------------------
if __name__ == "__main__":
    # 确保已替换APPID和APPKEY后再运行！
    if APPID == "INPUT_YOUR_APPID" or APPKEY == "INPUT_YOUR_APPKEY":
        print("错误：请先在代码中填写你的百度翻译API的AppID和AppKey！")
    else:
        process_csv(
            input_path="test_data.csv",  # 可修改为你的输入CSV路径
            output_path="final_data.csv"  # 可修改为你的输出CSV路径
        )

成功读取 test_data.csv，共 23 行数据，待翻译 23 条文本
开始翻译...
正在翻译第 1/23 条：statement...
翻译失败（原文：statement）：UNAUTHORIZED USER
正在翻译第 2/23 条：I won't go into too much detail but what stems off...
翻译失败（原文：I won't go into too much detail but what stems off of that is thoughts of being and feeling worthless like I'm not good enough for anyone and that I'm undeserving of the good things given to me but then another thought stems off of that one which would be something along the lines of me thinking how stupid I was acting and then another thought after that and so on. Another little thing I'd like to add is that that I seriously struggle seeing my own self worth and how much I mean to my friends and family (who are all amazing by the way I love them all so much). Eventually on fairly rare occasions those thoughts tend to lead to depression, paranoia that people around me hate me and don't want anything to do with me, dissociation with reality and then finally the rock bottom of that would be thoughts of sui