In [2]:
import pandas as pd
import numpy as np
import os, json
from core import start_calc, start

# Load the Weibo test split, keeping only the speaker id, the utterance text
# and the five personality-trait label columns.
df = pd.read_csv(
    "../data/weibo/test.csv",
    usecols=[
        "Speaker", "Utterance",
        "Neuroticism", "Extraversion", "Openness",
        "Agreeableness", "Conscientiousness",
    ],
)
# Drop speakers without a personality annotation (Agreeableness is "unknown").
df = df.loc[df["Agreeableness"].ne("unknown")]
df.head(1)

Unnamed: 0,Speaker,Neuroticism,Extraversion,Openness,Agreeableness,Conscientiousness,Utterance
0,6035231493,high,low,low,low,low,🔝置顶这条用于存放个人分享资源和工具理想是传递开源、分享、互助的互联网精神


In [3]:
# Ground truth: one row of personality labels per speaker.
label_columns = ["Speaker",
                 "Neuroticism", "Extraversion", "Openness",
                 "Agreeableness", "Conscientiousness"]
# Speakers ordered by ascending utterance count.
utterance_counts = df.groupby("Speaker")["Utterance"].count().sort_values(ascending=True)

chosen_rows = {}
for speaker, speaker_rows in df.groupby("Speaker"):
    # Keep the second utterance row when there is one (as before) and fall back
    # to the only row for single-utterance speakers, which used to raise IndexError.
    row_position = min(1, len(speaker_rows) - 1)
    # Select the label columns by name instead of "every column but the last",
    # so the result does not depend on the column order of the CSV file.
    chosen_rows[speaker] = speaker_rows.iloc[row_position][label_columns]

# Build the frame once instead of concatenating inside the loop; the index
# keeps the original row label of each selected row.
r_real = pd.DataFrame(
    [chosen_rows[speaker] for speaker in utterance_counts.index],
    columns=label_columns)
r_real.head(3)

Unnamed: 0,Speaker,Neuroticism,Extraversion,Openness,Agreeableness,Conscientiousness
6288,5345320676,low,low,high,high,high
8857,5682320016,high,low,high,high,high
9198,5854265900,high,low,high,high,low


In [3]:
def save_gpt_res(path, v, send, content):
    """Append one result row to the CSV file at ``path``.

    Args:
        path: CSV file to update. It is created when it does not exist yet
            (or is empty) instead of failing inside ``pd.read_csv``.
        v: Value stored in the "Speaker" column.
        send: Text stored in the "Utter" column.
        content: Text stored in the "Res_Utter" column.

    The file is rewritten in full without the index column.
    """
    result = pd.DataFrame(
        [[v, send, content]],
        columns=["Speaker", "Utter", "Res_Utter"])
    # Only read back the previous rows when there is something to read.
    if os.path.exists(path) and os.path.getsize(path) > 0:
        result = pd.concat([pd.read_csv(path), result])
    result.to_csv(path, index=False)

filename = "weibo_form_gpt4_3"
# filename = "w_gpt3.5_4.csv"
path = "../data/self/{}.csv".format(filename)
# Largest prompt size sent in one request, as measured by start_calc
# (presumably a token count -- confirm against core.start_calc).
max_prompt_tokens = 4000

# Start from an empty results file (this overwrites the output of a previous run).
pd.DataFrame([], columns=["Speaker", "Utter", "Res_Utter"]).to_csv(
    path, index=False)

# The ten speakers with the fewest utterances.
speakers = pd.DataFrame(
    df.groupby(["Speaker"])["Utterance"].count()
).sort_values(by=["Utterance"], ascending=True).iloc[0:10, :].index

for v in speakers:
    use_sentence = ""  # prompt accumulated so far for this speaker
    for utterance in df.loc[df["Speaker"] == v, "Utterance"]:
        line = "{}: {}".format(v, utterance)
        # One "speaker: text" line per utterance, joined by newlines (the first
        # chunk no longer starts with a stray newline).
        candidate = "{}\n{}".format(use_sentence, line) if use_sentence else line
        if use_sentence and start_calc(candidate) > max_prompt_tokens:
            # Adding this utterance would exceed the budget: send what has been
            # collected and start a new prompt with the current utterance.
            # The use_sentence guard avoids sending an empty prompt when the
            # very first utterance is already over the budget.
            save_gpt_res(path, v, use_sentence, start(use_sentence))
            use_sentence = line
        else:
            use_sentence = candidate
    if use_sentence:
        # Send whatever is left for this speaker.
        save_gpt_res(path, v, use_sentence, start(use_sentence))


上面调用 GPT 获取了结果并保存为 CSV 文件，下面开始解析这些结果。注意：下面读取的文件（weibo_form_gpt4_2）与上面写入的文件（weibo_form_gpt4_3）不是同一个，运行前请确认文件名。

In [15]:
# filename = "w_gpt3.5_4.csv"
filename = "weibo_form_gpt4_2"
path = "../data/self/{}.csv".format(filename)
import re

# Predicted labels: one row per saved GPT reply that could be parsed.
prediction_columns = ["Speaker",
                      "Neuroticism", "Extraversion", "Openness",
                      "Agreeableness", "Conscientiousness"]
gpt_res = pd.read_csv(path)
parsed_rows = []
for _, row in gpt_res.iterrows():
    reply = row["Res_Utter"]
    # An empty reply is read back as NaN (a float); treat it as "no JSON found"
    # instead of letting re.findall raise outside the try block.
    res_arr = re.findall(r"\{.*?\}", reply, re.S) if isinstance(reply, str) else []
    try:
        # The last {...} object in the reply is taken as the answer.
        arr = json.loads(res_arr[-1])
        parsed_rows.append(
            [row["Speaker"], arr["神经质"], arr["外向性"],
             arr["开放性"], arr["宜人性"], arr["尽责性"]])
    except (IndexError, KeyError, TypeError, ValueError) as e:
        # No JSON object, invalid JSON or a missing trait key: report and skip.
        print(e, len(res_arr), row["Speaker"], reply)
        continue
# Build the frame once so every row gets its own index label (concatenating
# one-row frames in the loop left every row with index 0).
r_pre = pd.DataFrame(parsed_rows, columns=prediction_columns)

In [10]:
# Per-trait accuracy computed over every parsed GPT reply.
trait_columns = ["Neuroticism", "Extraversion", "Openness", "Agreeableness", "Conscientiousness"]
# Work on copies: y_pre used to be an alias of r_pre, so the relabelling and
# re-indexing below silently modified r_pre and made the cell non-idempotent.
y_real = r_real.copy()
y_pre = r_pre.copy()
# Map the Chinese answers to the English labels used by the ground truth.
# "不知道" used to be mapped to the misspelled "unknow", which can never equal
# the "unknown" label of the data set.
label_map = {"低": "low", "高": "high", "中等": "medium", "不知道": "unknown"}
for v in trait_columns:
    y_pre[v] = y_pre[v].replace(label_map)

# Keep only the speakers present on both sides; a prediction without ground
# truth used to raise IndexError in the lookup below.
y_real = y_real[y_real["Speaker"].isin(y_pre["Speaker"])].reset_index(drop=True)
y_pre = y_pre[y_pre["Speaker"].isin(y_real["Speaker"])].reset_index(drop=True)

# Ground-truth labels keyed by speaker (the first row wins if a speaker repeats),
# so each prediction is checked with one dictionary lookup.
truth_by_speaker = (
    y_real.drop_duplicates("Speaker").set_index("Speaker")[trait_columns].to_dict("index"))
# One row per reply, one 0/1 hit flag per trait.
r = [
    [1 if truth_by_speaker[row["Speaker"]][v] == row[v] else 0 for v in trait_columns]
    for _, row in y_pre.iterrows()
]

pd.DataFrame(r, columns=trait_columns, dtype=float).mean()

Neuroticism          0.739130
Extraversion         0.586957
Openness             0.891304
Agreeableness        0.586957
Conscientiousness    0.434783
dtype: float64

In [64]:
# Drop every value that is not a decisive label.
def judge1(x):
    """Return ``x`` when it is a high/low label (English or Chinese), else NaN."""
    return x if x in ("high", "low", "高", "低") else np.nan
# Series.map is applied column by column because DataFrame.applymap is
# deprecated since pandas 2.1.
y_real = r_real.apply(lambda column: column.map(judge1))
y_pre = r_pre.apply(lambda column: column.map(judge1))
# Convert labels to numbers
def tran1(x):
    """Map a "high" label to 1 and a "low" label to 0 (English or Chinese).

    Any other value is returned unchanged.
    """
    if x in ("high", "高"):
        return 1
    if x in ("low", "低"):
        return 0
    return x
# Convert the labels element-wise; Series.map per column replaces
# DataFrame.applymap, which is deprecated since pandas 2.1.
y_real = y_real.apply(lambda column: column.map(tran1))
y_pre = y_pre.apply(lambda column: column.map(tran1))
# Restore the speaker ids, which the label filter turned into NaN.
y_real["Speaker"] = r_real["Speaker"]
y_pre["Speaker"] = r_pre["Speaker"]
# Average the votes of a speaker to remove duplicates
def tran2(x):
    """Turn an averaged 0/1 vote into a single decision.

    Returns NaN when ``x`` is NaN or exactly 0.5 (a tie), 1 when ``x`` is
    above 0.5 and 0 otherwise.
    """
    undecided = np.isnan(x) or x == 0.5
    if undecided:
        return np.nan
    return 1 if x > 0.5 else 0
# Majority vote per speaker: average the 0/1 votes of all replies, then tran2
# turns each mean into 1, 0 or NaN. Series.map per column replaces
# DataFrame.applymap, which is deprecated since pandas 2.1.
y_pre = y_pre.groupby(by=["Speaker"]).mean().apply(lambda column: column.map(tran2))
y_real = y_real.groupby(by=["Speaker"]).mean()
# Keep only the ground-truth speakers that also have a prediction.
y_real = y_real.loc[y_real.index.isin(y_pre.index), :]

for v in ["Neuroticism", "Extraversion", "Openness", "Agreeableness", "Conscientiousness"]:
    # Align the truth with the predicted speakers; a speaker without ground
    # truth becomes NaN instead of raising KeyError.
    real_v = y_real[v].reindex(y_pre.index)
    has_truth = real_v.notna()
    # An undecided prediction (NaN) counts as a full error.
    errors = (real_v - y_pre[v]).abs().where(y_pre[v].notna(), 1.0)
    # Speakers without a usable ground-truth label are left out of the score;
    # previously a single one turned the whole trait accuracy into NaN.
    n_scored = int(has_truth.sum())
    print(v, end="\t")
    print(1 - errors[has_truth].sum() / n_scored if n_scored else np.nan)

Neuroticism	0.6
Extraversion	0.5
Openness	1.0
Agreeableness	0.8
Conscientiousness	0.6
