# Analyse the [Jigsaw](https://www.kaggle.com/competitions/jigsaw-toxic-comment-classification-challenge/data) toxic comment data

## Libraries

In [1]:
import pandas as pd
from pathlib import Path
import os
import openai
from utils.api_batch_handler import OpenAIResponseHandler

## Global variables

In [2]:
# Paths are built relative to the notebook's working directory.
ROOT = Path('.')
API_KEY_PATH = ROOT / ".." / ".." / "confidential" / "GPT_API.txt"
TRAIN_PATH = ROOT / "train.csv"

# Settings forwarded to OpenAIResponseHandler further down
# (USE_PROXY, MODEL, MAX_CONTENTS and max_tokens respectively).
proxy = "socks5h://127.0.0.1:1080"
model = "gpt-4o-mini"
max_contents = 2000
max_tokens = 200

# The system prompt is kept in a text file. Decode it explicitly as UTF-8 so
# the prompt does not depend on the platform's default encoding (which would
# garble any accented characters on a non-UTF-8 locale).
system_prompt_path = ROOT / "utils" / "system_prompt.txt"
system_prompt = system_prompt_path.read_text(encoding="utf-8").strip()

# Exclusive word-count bounds applied to comments when the data is loaded.
min_length = 5
max_length = 25

## Load data

In [3]:
# Load the raw training set, then keep only comments whose whitespace-separated
# word count lies strictly between min_length and max_length.
df_train = pd.read_csv(TRAIN_PATH)
# Count words once per comment with the vectorised string accessor. A missing
# comment yields NaN, which fails both comparisons and is dropped instead of
# raising as a Python-level `x.split()` would.
word_counts = df_train['comment_text'].str.split().str.len()
df_train = df_train[(word_counts > min_length) & (word_counts < max_length)]

In [4]:
# The six label columns used to partition the comments.
label_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# One subset per label: the rows where that label equals 1. A comment can
# carry several labels, so these subsets are not mutually exclusive.
df_toxic = df_train.loc[df_train["toxic"].eq(1)]
df_severe_toxic = df_train.loc[df_train["severe_toxic"].eq(1)]
df_obscene = df_train.loc[df_train["obscene"].eq(1)]
df_threat = df_train.loc[df_train["threat"].eq(1)]
df_insult = df_train.loc[df_train["insult"].eq(1)]
df_identity_hate = df_train.loc[df_train["identity_hate"].eq(1)]

# Non-toxic comments: every one of the six labels is 0.
df_non_toxic = df_train.loc[df_train[label_columns].eq(0).all(axis=1)]

# Report the size of each subset, one "name: count" line per subset.
subset_sizes = {
    "df_toxic": len(df_toxic),
    "df_severe_toxic": len(df_severe_toxic),
    "df_obscene": len(df_obscene),
    "df_threat": len(df_threat),
    "df_insult": len(df_insult),
    "df_identity_hate": len(df_identity_hate),
    "df_non_toxic": len(df_non_toxic),
}
print("\n".join(f"{name}: {size}" for name, size in subset_sizes.items()))

df_toxic: 7096
df_severe_toxic: 842
df_obscene: 4100
df_threat: 246
df_insult: 3849
df_identity_hate: 698
df_non_toxic: 43302


## Take `200` samples from each type of toxicity (and from the non-toxic comments)

In [5]:
# Subsets to draw from, in the order they are stacked before shuffling.
label_subsets = [
    df_toxic,
    df_severe_toxic,
    df_obscene,
    df_threat,
    df_insult,
    df_identity_hate,
    df_non_toxic,
]

# Draw 200 rows from every subset with a fixed seed, stack them, shuffle the
# stacked frame (again seeded), and rename the id/text columns.
# NOTE(review): each subset is sampled independently and a comment can carry
# several labels, so the same comment may be drawn more than once - confirm
# whether duplicate msg_id values are acceptable downstream.
df_merged = (
    pd.concat(
        [subset.sample(200, random_state=42) for subset in label_subsets],
        ignore_index=True,
    )
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
    .rename(columns={"id": "msg_id", "comment_text": "text"})
)
df_merged

Unnamed: 0,msg_id,text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,7169c1b94061458a,I Hate You\n\nAnd I hope you die in a fire. 12...,1,0,0,1,1,0
1,f6e6eb2e0e6dafa6,this guys a fuckin prick and if i ever come cr...,1,0,1,1,1,0
2,43330593d10e5dc6,"Fucking lying nigger, fes up you peice of shit...",1,1,1,0,1,1
3,620fa4109a8ef333,You fucking piece of shit \n\nI'm going to hun...,1,0,1,1,1,1
4,ae3eff588eca865c,Fucking white trash cunt\nCould you also be mo...,1,1,1,0,1,1
...,...,...,...,...,...,...,...,...
1395,870b32770738ee23,== YOU ARE THE GAYEST KID IN THE FUCKING WORLD...,1,0,1,0,1,1
1396,493d65b86c3554c0,all chinks and gooks should be deported,1,0,1,0,1,1
1397,d4dc6fde411bd4c1,color of plates \n\nwhy does my state have two...,0,0,0,0,0,0
1398,1181b4b95a4daeb9,lame \n\nYou fuckin queer. Don't you have anyt...,1,1,1,0,1,0


In [6]:
def _translation_prompt(comment):
    """Wrap one comment in the English -> French prompt template."""
    return f"English: {comment}\n\nFrench:"


# Attach the shared system prompt and the per-comment user prompt as columns.
df_merged = df_merged.assign(
    system_prompt=system_prompt,
    prompt=df_merged["text"].map(_translation_prompt),
)

In [None]:
# The batch request file and the response file both live under ROOT / "data".
data_dir = ROOT / "data"

# Collect the handler configuration in one place, then build the handler.
handler_settings = {
    "df": df_merged,
    "USE_PROXY": proxy,
    "MODEL": model,
    "MAX_CONTENTS": max_contents,
    "batch_path": data_dir / "batch.jsonl",
    "max_tokens": max_tokens,
    "path_api_key": API_KEY_PATH,
    "output_path": data_dir / "output.jsonl",
}
handler = OpenAIResponseHandler(**handler_settings)

In [8]:
# Run the handler's pipeline on df_merged. The recorded output shows that it
# writes a batch file; the remaining steps are defined in
# utils.api_batch_handler and are not visible from this notebook.
handler.run_pipeline()

Batch file written to batch.jsonl


## Data cleaning

When GPT refuses to answer because of moderation, its reply contains something like "je suis désolé" or "sorry". We remove every answer that contains such phrases.

In [12]:
# Load the translated comments from CSV.
# NOTE(review): the handler above is configured to write data/output.jsonl and
# no visible cell converts that file into Jigsaw_French.csv - confirm where
# this CSV is produced before relying on Restart & Run All.
df_results = pd.read_csv(ROOT / 'Jigsaw_French.csv')

In [13]:
# Lower-case phrases that mark a moderation refusal rather than a translation.
# NOTE(review): matching is by exact substring, so variants such as "I'm sorry"
# or a typographic apostrophe in "can't" are not caught - confirm this is enough.
refusal_markers = (
    "je suis désolé",
    "i am sorry",
    "i can't assist",
    "je ne peux pas aider",
)


def _is_refusal(response):
    """Return True when the lower-cased response contains any refusal marker."""
    lowered = str(response).lower()
    return any(marker in lowered for marker in refusal_markers)


# Keep only the rows whose response contains none of the markers.
df_results = df_results[~df_results['response'].apply(_is_refusal)]

In [14]:
# Spot-check a few translations next to their originals. The sample is seeded
# like every other sample in this notebook so the preview is reproducible, and
# capped at the number of remaining rows so it cannot fail on a small frame.
n_preview = min(10, len(df_results))
for _, row in df_results.sample(n_preview, random_state=42).iterrows():
    print('-'*40)
    print(f"msg_id: {row['msg_id']}")
    print(f"original: {row['text']}")
    print(f"translated: {row['response']}")
    

----------------------------------------
msg_id: c9c70092a460548a
original: You're sick. I have the right to edit pages about retards like Dora and Barney.
translated: Tu es malade. J'ai le droit de modifier des pages sur des débiles comme Dora et Barney.
----------------------------------------
msg_id: 0cff83251f6a4cab
original: You are my bitch! 

I own your ass.
translated: Tu es ma pute ! 

Je possède ton cul.
----------------------------------------
msg_id: a62dfd807b61b58f
original: I'll be back whenever I want to.  and then I'll bang your mom.
translated: Je reviendrai quand je veux. Et ensuite, je vais baiser ta mère.
----------------------------------------
msg_id: ffde5b109fa7a06a
original: Hi Pete 

You're a prick!

Yours sincerely, someone who would never dream of buying such a rag as the MoS.
translated: Salut Pete,

Tu es un con !

Cordialement, quelqu'un qui ne rêve même pas d'acheter une merde pareille comme le MoS.
----------------------------------------
msg_id: 60b1e

In [15]:
# Write the filtered frame back to the same CSV it was read from, without the
# index column. This overwrites the file, so the rows removed by the refusal
# filter are no longer available from it afterwards.
df_results.to_csv(ROOT / 'Jigsaw_French.csv', index=False)

In [17]:
# Columns that are not part of the exported subset.
columns_to_drop = ["system_prompt", "prompt", "prompt_tokens", "completion_tokens", "total_tokens"]

# The export folders sit two levels above the notebook directory.
project_root = ROOT / ".." / ".."

# Drop the request columns and rename the text columns: the French response
# becomes "content" and the English source becomes "original_text".
df_results = (
    df_results
    .drop(columns=columns_to_drop)
    .rename(columns={"response": "content", "text": "original_text"})
)

# Write the same frame to both destinations.
# NOTE(review): the second file name mentions "gpt-4.1-mini" while the model
# configured in this notebook is "gpt-4o-mini" - confirm the name is intended.
export_targets = (
    project_root / "subsets_Di" / "subset_jigsaw.csv",
    project_root / "subsets_Di_annotated" / "subset_jigsaw_gpt-4.1-mini.csv",
)
for target in export_targets:
    df_results.to_csv(target, index=False)

In [18]:
# Display the exported frame (label columns, original_text and content) as a
# final visual check.
df_results

Unnamed: 0,msg_id,original_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,content
0,74795a18fe96b5cb,"""\nDude, you keep vandalizing it by adding spe...",0,0,0,0,0,0,"Mec, tu continues à le vandaliser en ajoutant ..."
1,d42e86cbdec5bd87,shut the fuck up u big dumb bitch ass pussy. i...,1,1,1,0,1,1,"ferme ta putain de gueule, espèce de grande co..."
2,a056d6e5e9475bc2,JD IS A RAGING HOMOSEXUAL FAGGOT WHO SHOULD BE...,1,0,1,1,1,1,JD EST UN PETIT P*DÉ EN COLÈRE QUI DEVRAIT ÊTR...
3,5f38031eb97e7a27,"the earth is 6000 years old, don't be stupid",1,0,0,0,1,0,"la Terre a 6000 ans, ne sois pas idiot"
4,669b4154433e1608,Why did you remove the information about the t...,0,0,0,0,0,0,Pourquoi as-tu enlevé l'information sur les de...
...,...,...,...,...,...,...,...,...,...
1395,122a4b51efb9f1f8,I will kill you with my gun asshole!,1,0,1,1,1,0,"Je vais te tuer avec mon flingue, espèce de co..."
1396,facd3642fb412268,Hey\nYou're a fucking cunt. Go kill yourself.,1,1,1,0,1,0,Hé \nTu es une putain de salope. Va te suicider.
1397,b21072bff8b47665,a crazy nerd who dislikes women,1,0,0,0,0,0,un geek fou qui n'aime pas les femmes
1398,1ce7b9a3f3d85631,"No, really check the fucking diffs you trigger...",1,0,1,0,1,0,"Non, vérifie vraiment les putains de diffs, es..."
