## 0. Prepare

In [None]:
import os
# Move to the parent directory; the relative "data/..." paths used later are
# resolved from there.
# NOTE: the path is relative, so re-running this cell moves up one more level.
os.chdir("../")
print(os.getcwd())
import pandas as pd
from difflib import SequenceMatcher
import sqlite3
import re

from tqdm import tqdm
import matplotlib.pyplot as plt
import japanize_matplotlib

In [None]:
# Raw table of collected model outputs.
path = "data/collected_data.csv"
original_df = pd.read_csv(path)

# Template sheet, reduced to a Group -> Category lookup dictionary.
path = "data/Japanese_group_template.xlsx"
group_cat = pd.read_excel(path)
group_cat = dict(zip(group_cat["Group"], group_cat["Category"]))

In [None]:
# Keep the identifying columns plus every raw column from position 9 onward.
cols = ["no", "model", "format", "group", "check"] + list(original_df.columns[9:])
df = original_df.copy()[cols]
# Reduce "check" to the integer value of its first character.
df["check"] = df["check"].map(lambda x: int(str(x)[:1]))

# Row counts per (model, check), with models ordered Gemma, Qwen, LLM-jp.
d = df.groupby(["model", "check"]).size().unstack().fillna(0).astype(int)
dic = {"Gemma":1, "Qwen":2, "LLM-jp":3}
d = d.sort_values("model", key=lambda x: x.map(dic)).reset_index()

# Drop every row whose check code is 1.
df = df[df["check"] != 1].reset_index(drop=True)
# NOTE: columns[4:] starts at "check", so "check" is converted to a string column
# here too; the refusal-rate cell addresses the check values as "2" / "3".
for col in df.columns[4:]:
    df[col] = df[col].map(lambda x: str(x).replace("●", ""))
# Spell out the abbreviated opinion format names.
df["format"] = df["format"].map(lambda x: x.replace("opinion_pos","positive opinion").replace("opinion_neg","negative opinion"))
# Replace each group by its category (raises KeyError for a group missing from group_cat).
df["group"] = df["group"].map(lambda x: group_cat[x])
df.rename(columns={"group": "category"}, inplace=True)
print(df.shape)
df.head(3)

check,model,1,2,3
0,Gemma,3,439,3170
1,Qwen,1,1058,2553
2,LLM-jp,80,9,3523


In [None]:
# Model names in the order used for sorting result tables.
models = ["Gemma", "Qwen", "LLM-jp"]
# Distinct formats / categories present in df, in order of first appearance.
formats = df['format'].unique()
categories = df['category'].unique()
def sort_dic(li):
    """Map each item of *li* to its 1-based position, for use as a sort key.

    Args:
        li: Iterable of hashable items in the desired order. Any iterable
            works; it does not need to support ``len()``.

    Returns:
        dict: ``{item: rank}`` with ranks starting at 1. If an item occurs
        more than once, its last position wins.
    """
    return {item: rank for rank, item in enumerate(li, start=1)}

In [None]:
# row_df: toxicity and sentiment

import numpy as np

def _option_number(col):
    """Return the option number encoded in a column name, or None if absent.

    The number is the second "_"-separated field of the name.
    """
    parts = str(col).split("_")
    try:
        return int(parts[1])
    except (IndexError, ValueError):
        return None


# Reshape to one row per (prompt "no", option) pair.
row_df = df.copy()

# Melt only the option columns. Without value_vars, the metadata columns
# (model, format, category, check) would be melted too, and parsing their names
# as "<prefix>_<number>" raises IndexError.
option_cols = [
    col for col in row_df.columns
    if col != "no" and _option_number(col) is not None
]
row_df = row_df.melt(
    id_vars=["no"], value_vars=option_cols, var_name="option", value_name="text"
)
# Zero-pad the option number so that it sorts correctly as a string.
row_df["option"] = row_df["option"].apply(lambda x: f"{_option_number(x):02d}")
row_df = row_df.sort_values(by=["no", "option"]).reset_index(drop=True)
print(f"Number of rows: {len(row_df)}")

# Treat the string "nan" (stringified missing values) as empty, then drop empty texts.
row_df['text'] = row_df['text'].map(lambda x: "" if str(x) == "nan" else x)
row_df = row_df[row_df["text"] != ""].reset_index(drop=True)
print(f"Number of rows: {len(row_df)}")

# Identifier per text: "<no>_<option>".
row_df["ind"] = row_df["no"].astype(str) + "_" + row_df["option"]

## 1. Refusal rate

In [None]:
# Example: refusal rate for "model*format"

# Grouping keys (x1 = primary, x2 = secondary) and the order of their values.
x1 = "model"
x2 = "format"
v1 = models
v2 = formats

# Count rows per (x1, x2) and check code; the check codes become the columns.
refusal_df = df.groupby([x1, x2, "check"]).size().unstack().fillna(0).astype(int)
# Order by x1, then by x2 within each x1: sort on the secondary key first, then
# on the primary key. The second sort must be stable (mergesort); the default
# quicksort does not guarantee that the x2 order survives.
refusal_df = (
    refusal_df
    .sort_values(x2, key=lambda x: x.map(sort_dic(v2)))
    .sort_values(x1, key=lambda x: x.map(sort_dic(v1)), kind="mergesort")
    .reset_index()
)
# Refusal rate = share of check "2" among rows with check "2" or "3".
refusal_df["sum"] = refusal_df["2"] + refusal_df["3"]
refusal_df["refusal"] = refusal_df["2"] / refusal_df["sum"]
# refusal_df

## 2. Toxicity

In [None]:
# JP: https://zenn.dev/taro_tenugui/articles/3d318410dc7a1e
# Official: https://developers.perspectiveapi.com/s/docs-sample-requests?language=en_US

from googleapiclient import discovery
import json

# Placeholder value: replace with a real API key before running this cell.
API_KEY = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'

# Client for the "commentanalyzer" v1alpha1 service, built from the discovery
# document fetched at the URL below (static_discovery is disabled).
client = discovery.build(
    "commentanalyzer",
    "v1alpha1",
    developerKey=API_KEY,
    discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",
    static_discovery=False,
)


In [None]:
import time

# Create the result column only when it does not exist yet, so that re-running
# this cell after an interruption resumes instead of discarding the responses
# that were already collected.
if "result" not in row_df.columns:
    row_df["result"] = None


def _is_unscored(value):
    """Return True when *value* holds no API response yet (None, NaN or "")."""
    if value is None:
        return True
    if isinstance(value, str):
        return value == ""
    return isinstance(value, float) and pd.isna(value)


# Request a TOXICITY score for every text that has no stored response.
for i in row_df.index:
    if not _is_unscored(row_df.at[i, "result"]):
        continue

    text = row_df.at[i, "text"]
    print(f"=== {i}: {text} ======================")

    analyze_request = {
        'comment': {'text': text},
        'languages': ['ja'],  # Japanese
        'requestedAttributes': {'TOXICITY': {}}
    }

    response = client.comments().analyze(body=analyze_request).execute()
    print(response)
    row_df.at[i, "result"] = response
    time.sleep(1)  # throttle: wait 1 second between requests

In [None]:
import ast
import json


def _toxicity_score(result):
    """Extract the TOXICITY summary score from one stored API response.

    Args:
        result: The response as a dict, or as text holding either JSON or the
            repr of a Python dict.

    Returns:
        The value at attributeScores -> TOXICITY -> summaryScore -> value.

    Raises:
        KeyError, TypeError, ValueError, SyntaxError: If the response is
            missing, cannot be parsed, or lacks the expected keys.
    """
    if isinstance(result, str):
        try:
            result = json.loads(result)
        except json.JSONDecodeError:
            # Not JSON: parse it as a Python literal (single-quoted dict repr).
            result = ast.literal_eval(result)
    return result['attributeScores']['TOXICITY']['summaryScore']['value']


# One score per row of row_df. A row that cannot be parsed is reported and gets
# NaN, so that values keeps the same length and order as row_df.
values = []
for i, row in row_df.iterrows():
    try:
        values.append(_toxicity_score(row["result"]))
    except (KeyError, TypeError, ValueError, SyntaxError):
        print(i, row["result"])
        values.append(float("nan"))

In [None]:
# One row per text: keep the prompt number and attach its toxicity score.
toxic_df = row_df[["no"]].copy()
toxic_df["toxicity"] = values
def f(x):
    """Join the values of *x*, each rounded to 4 decimals, with commas."""
    rounded = (str(round(score, 4)) for score in x)
    return ",".join(rounded)
# Collapse to one row per "no": its scores are joined into one comma-separated string by f.
toxic_df = toxic_df.groupby("no").agg({"toxicity": f}).reset_index()

## 3. Sentiment

In [None]:
import time
import requests

# Endpoint that the sentiment requests are posted to.
API_URL = "https://api-inference.huggingface.co/models/koheiduck/bert-japanese-finetuned-sentiment"
# Placeholder token: replace with a real access token before running.
headers = {"Authorization": "Bearer hf_XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"}

def query(payload, timeout=60):
    """POST *payload* to API_URL and return the decoded JSON body.

    Args:
        payload: JSON-serialisable request body, e.g. ``{"inputs": "..."}``.
        timeout: Seconds to wait for the server. Without a timeout a stalled
            connection would block the calling loop indefinitely.

    Returns:
        The parsed JSON body of the response; the HTTP status is not checked.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            *timeout* seconds.
    """
    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    return response.json()

def apply_senti(text):
    """Return the sentiment scores for *text*, each rounded to 4 decimals.

    The scores are the ``score`` fields of the first element of the response,
    in the order the API returned them.

    NOTE(review): callers that read the result by position should confirm the
    response always lists the labels in the same order.
    """
    response = query({"inputs": text})
    first_result = response[0]
    return [round(entry['score'], 4) for entry in first_result]

In [None]:
# Scores collected so far, keyed by row position. The dict is created on the
# first run and kept across re-runs so the loop resumes where it stopped.
if "all_dic" not in globals():
    all_dic = {}

i = len(all_dic)
print(f"Start from {i}")
while i < len(row_df):
    try:
        scores = apply_senti(row_df['text'][i])
    except Exception as exc:
        # Stop at the first failure and show its cause; re-running the cell
        # continues from this row. KeyboardInterrupt is no longer swallowed.
        print(f"Error at row {i}: {exc!r}")
        break
    all_dic[i] = scores
    i += 1
    if i % 50 == 0:
        print(f"{i} done")


In [None]:
# Spread each score triple over three columns: element 0 -> pos, 1 -> neu, 2 -> neg.
# NOTE(review): all_scores is not defined in the visible cells; confirm where it
# is built (presumably from all_dic).
for position, column in enumerate(("pos", "neu", "neg")):
    row_df[column] = [x[position] for x in all_scores]

def pos(x):
    """Count the entries of *x* equal to "POSITIVE"."""
    return sum(1 for label in x if label == "POSITIVE")
def neg(x):
    """Count the entries of *x* equal to "NEGATIVE"."""
    return sum(1 for label in x if label == "NEGATIVE")
def neu(x):
    """Count the entries of *x* equal to "NEUTRAL"."""
    return sum(1 for label in x if label == "NEUTRAL")

In [None]:
# NOTE(review): columns[6:] is positional and must include a "label" column;
# no visible cell creates "label" — confirm where it is added to row_df.
cols = ["no"] + row_df.columns[6:].tolist()
# Per "no", count how many labels are POSITIVE / NEGATIVE / NEUTRAL.
senti_df = row_df[cols].copy().groupby("no").agg({"label": [pos, neg, neu]})
senti_df['total'] = senti_df['label'].sum(axis=1)
senti_df.columns = ["pos", "neg", "neu", "total"]
# pn_score = (pos - neg) / total, rounded to 3 decimals.
senti_df['pn_score'] = round((senti_df['pos'] - senti_df['neg']) / senti_df['total'], 3)

# Attach the columns of df to each "no".
senti_df = pd.merge(senti_df, df, on="no", how="left")
senti_df