In [1]:
import pandas as pd

# Load the tab-separated source table.
df = pd.read_csv("../data/wrime.tsv", sep="\t")

# Row-wise statistics over the three readers' joy ratings.
# (pandas' std defaults to the sample standard deviation, ddof=1.)
reader_cols = ["Reader1_Joy", "Reader2_Joy", "Reader3_Joy"]
reader_joy = df[reader_cols]
df["Readers_mean"] = reader_joy.mean(axis=1)
df["Readers_std"] = reader_joy.std(axis=1)

# Positive when the writer's own rating exceeds the readers' average.
df["Difference"] = df["Writer_Joy"] - df["Readers_mean"]

# Label scheme (later assignments override earlier ones):
#   0 - default; stays on rows whose Readers_std is not below 1
#       (the three readers disagree strongly)
#   1 - readers agree (std < 1) and the row is not overridden below,
#       i.e. writer and readers differ by at most 1
#   2 - readers agree and the writer rates more than 1 above the readers
#   3 - readers agree and the writer rates more than 1 below the readers
readers_agree = df["Readers_std"] < 1
writer_higher = df["Difference"] > 1
writer_lower = df["Difference"] < -1

df["Label"] = 0
df.loc[readers_agree, "Label"] = 1
df.loc[readers_agree & writer_higher, "Label"] = 2
df.loc[readers_agree & writer_lower, "Label"] = 3

# Columns kept in the exported splits, in output order.
columns = [
    "Sentence",
    "Writer_Joy",
    *reader_cols,
    "Readers_mean",
    "Readers_std",
    "Difference",
    "Label",
]

# Partition rows using the split marker stored in the source table.
split = df["Train/Dev/Test"]
train_df = df.loc[split == "train", columns]
valid_df = df.loc[split == "dev", columns]
test_df = df.loc[split == "test", columns]

# Report the size of each split (captions: training / validation / evaluation data).
for caption, frame in (
    ("訓練用データ", train_df),
    ("検証用データ", valid_df),
    ("評価用データ", test_df),
):
    print(caption, len(frame))

# Write each split next to the source file as TSV, without the index column.
for out_path, frame in (
    ("../data/train.tsv", train_df),
    ("../data/valid.tsv", valid_df),
    ("../data/test.tsv", test_df),
):
    frame.to_csv(out_path, sep="\t", index=False)

訓練用データ 40000
検証用データ 1200
評価用データ 2000


In [2]:
# Label distribution over the whole dataset, listed in label order (0-3).
all_label_counts = df["Label"].value_counts()
all_label_counts.sort_index()

0     6641
1    31653
2     4386
3      520
Name: Label, dtype: int64

In [3]:
# Label distribution within the training split, listed in label order (0-3).
train_label_counts = train_df["Label"].value_counts()
train_label_counts.sort_index()

0     6123
1    29356
2     4031
3      490
Name: Label, dtype: int64

In [4]:
# Label distribution within the validation split, listed in label order (0-3).
valid_label_counts = valid_df["Label"].value_counts()
valid_label_counts.sort_index()

0    221
1    759
2    202
3     18
Name: Label, dtype: int64

In [5]:
# Label distribution within the test split, listed in label order (0-3).
test_label_counts = test_df["Label"].value_counts()
test_label_counts.sort_index()

0     297
1    1538
2     153
3      12
Name: Label, dtype: int64