<a href="https://colab.research.google.com/github/oumlk/french-wino-what/blob/main/french_wino_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### French Winograd Schemas dataset (Amsili & Seminck, 2017) XML




In [None]:
# Mount Google Drive into the Colab runtime so the files under
# /content/drive/MyDrive are reachable from the cells below.
from google.colab import drive
drive.mount('/content/drive')

# Location of the French Winograd schema XML on the mounted Drive.
# NOTE(review): the cells visible below define their own XML_PATH with the same
# value and do not reference `xml_path` — confirm whether this name is still needed.
xml_path = "/content/drive/MyDrive/Thesis_2025/French_Wino_Schemas.xml"


Mounted at /content/drive


In [None]:
import xml.etree.ElementTree as ET

XML_PATH = "/content/drive/MyDrive/Thesis_2025/French_Wino_Schemas.xml"

# Read the schema collection from Drive and keep a handle on its root element.
tree = ET.parse(XML_PATH)
root = tree.getroot()

# Sanity check: dump the raw XML of the first five <schema> elements,
# numbered from 1, to see the structure before flattening it.
print("First 5 schemas from the XML file:")
for i, schema in enumerate(root.findall('.//schema')[:5]):
    print(f"Schema {i+1}:\n{ET.tostring(schema, encoding='unicode')}\n")

First 5 schemas from the XML file:
Schema 1:
<schema id="1" engn="02">
  <text>
    <txt1> La coupe n'entre pas dans la valise marron, car elle est trop </txt1>
    <wordA>grande</wordA>
    <wordB>petite</wordB>
    <txt2>.</txt2>
  </text>
  <question>
    <qn1>Qu'est-ce qui est trop </qn1>
    <qwordA>grand</qwordA>
    <qwordB>petit</qwordB>
    <qn2> ?</qn2>
  </question>
  <answer1>la coupe</answer1>
  <answer2>la valise</answer2>
</schema>



Schema 2:
<schema id="2" engn="04">
  <text>
    <txt1> Paul a essayé de joindre Georges sur son téléphone, mais il </txt1>
    <wordA>n'a pas réussi</wordA>
    <wordB>n'a pas répondu</wordB>
    <txt2>.</txt2>
  </text>
  <question>
    <qn1>Qui </qn1>
    <qwordA>n'a pas réussi</qwordA>
    <qwordB>n'a pas répondu</qwordB>
    <qn2> ?</qn2>
  </question>
  <answer1>Paul</answer1>
  <answer2>Georges</answer2>
</schema>



Schema 3:
<schema id="3" engn="05">
  <text>
    <txt1> L'avocat a posé une question au témoin, mais il a refusé </txt

In [None]:
import xml.etree.ElementTree as ET
import pandas as pd

XML_PATH = "/content/drive/MyDrive/Thesis_2025/French_Wino_Schemas.xml"
CSV_PATH = "/content/drive/MyDrive/Thesis_2025/french_winograd_schemas.csv"


def schema_to_row(schema):
    """Flatten one <schema> element into a dict with one value per CSV column.

    Args:
        schema: an ElementTree element with an ``id`` attribute, an optional
            ``<text>`` child (holding ``txt1``, ``wordA``, ``wordB``, ``txt2``)
            and ``<answer1>`` / ``<answer2>`` children.

    Returns:
        dict with keys ``id``, ``sentence``, ``option1``, ``option2``,
        ``answer1``, ``answer2``. ``sentence`` is the text before and after the
        alternating word with ``_`` standing in for that word. Missing
        elements yield empty strings (``id`` is ``None`` when the attribute
        is absent).
    """
    sentence = wordA = wordB = ""

    # findtext with a default avoids AttributeError when a child is missing.
    text_element = schema.find("text")
    if text_element is not None:
        txt1 = text_element.findtext("txt1", default="")
        txt2 = text_element.findtext("txt2", default="")
        wordA = text_element.findtext("wordA", default="").strip()
        wordB = text_element.findtext("wordB", default="").strip()
        # The XML carries its own spacing around the alternating word
        # (e.g. "... est trop " followed by "."). Keep it as authored and trim
        # only the outer whitespace; forcing " _ " between stripped halves
        # would put a stray space before the closing punctuation ("trop _ .").
        sentence = f"{txt1}_{txt2}".strip()

    return {
        "id": schema.attrib.get("id"),
        "sentence": sentence,
        "option1": wordA,
        "option2": wordB,
        "answer1": schema.findtext("answer1", default="").strip(),
        "answer2": schema.findtext("answer2", default="").strip(),
    }


tree = ET.parse(XML_PATH)
root = tree.getroot()

# One record per <schema>; build the frame once from the full list.
rows = [schema_to_row(schema) for schema in root.findall(".//schema")]

df = pd.DataFrame(rows)

df.head()

Unnamed: 0,id,sentence,option1,option2,answer1,answer2
0,1,"La coupe n'entre pas dans la valise marron, ca...",grande,petite,la coupe,la valise
1,2,Paul a essayé de joindre Georges sur son télép...,n'a pas réussi,n'a pas répondu,Paul,Georges
2,3,"L'avocat a posé une question au témoin, mais i...",de la répéter,d'y répondre,l'avocat,le témoin
3,4,Nicolas n'a pas pu soulever son fils car il ét...,faible,lourd,Nicolas,son fils
4,5,"Les lycéens harcelaient les collégiens, donc o...",punis,défendus,les lycéens,les collégiens


In [None]:
# Persist the flattened schemas to Drive; index=False leaves the pandas row
# index out of the file so only the data columns are written.
df.to_csv(CSV_PATH, index=False)

# Report how many rows were written and where.
print(f"Saved {len(df)} schemas to {CSV_PATH}")

Saved 107 schemas to /content/drive/MyDrive/Thesis_2025/french_winograd_schemas.csv


After converting the original XML file into a tabular format, I performed an intermediate normalization step to make the structure compatible with the WinoWhat framework. In this step, the placeholder `_` in each sentence is temporarily filled with the first candidate completion (`option1`), and the original option columns (the two alternative completions) are removed. The two candidate antecedents provided in the dataset (`answer1` and `answer2`) are then stored as `option1` and `option2`, corresponding to the entities implied by each alternative completion. A new column `answer` is added to encode the correct choice as a binary label (1 or 2); it is created empty here and filled in manually. In a subsequent step, the placeholder token `_` is reintroduced into the sentence to match the surface format used in the English WinoWhat dataset.

In [None]:
import pandas as pd

# This cell rebinds `df` from the "filler" layout
# (sentence with "_", option1/option2 = alternative words, answer1/answer2 = entities)
# to the "entity" layout
# (sentence with the first word filled in, option1/option2 = entities, empty answer).
#
# The guard makes the cell safe to re-run: once the answer1/answer2 columns
# are gone the frame is already converted, and running the steps again would
# drop the entity columns that now live in option1/option2.
if {"answer1", "answer2"}.issubset(df.columns):
    # 1. Replace the first "_" in each sentence with that row's original option1.
    #    A plain zip also works on an empty frame, where a row-wise apply
    #    would hand back a DataFrame instead of a column.
    filled_sentences = [
        sentence.replace("_", filler, 1)
        for sentence, filler in zip(df["sentence"], df["option1"])
    ]

    df = (
        df.assign(sentence=filled_sentences)
        # 2. Drop the original option1 and option2 columns (the fillers)
        .drop(columns=["option1", "option2"])
        # 3. Rename answer columns to option columns (entities)
        .rename(columns={"answer1": "option1", "answer2": "option2"})
        # 4. Add an empty answer column (to be filled manually)
        .assign(answer="")
    )

# Inspect result
df.head()


Unnamed: 0,id,sentence,option1,option2,answer
0,1,"La coupe n'entre pas dans la valise marron, ca...",la coupe,la valise,
1,2,Paul a essayé de joindre Georges sur son télép...,Paul,Georges,
2,3,"L'avocat a posé une question au témoin, mais i...",l'avocat,le témoin,
3,4,Nicolas n'a pas pu soulever son fils car il ét...,Nicolas,son fils,
4,5,"Les lycéens harcelaient les collégiens, donc o...",les lycéens,les collégiens,


In [None]:
# Save to CSV
# Writes the entity-layout frame (id, sentence, option1, option2, answer) to
# Drive; index=False keeps the pandas row index out of the file.
# NOTE(review): the randomization cell further down reads
# "french_wino_unrandomized.csv", not this file — confirm how that file is
# derived from this one.
OUTPUT_CSV_PATH = "/content/drive/MyDrive/Thesis_2025/french_winograd_intermediate_entities.csv"
df.to_csv(OUTPUT_CSV_PATH, index=False)

print(f"Saved transformed dataset to: {OUTPUT_CSV_PATH}")


Saved transformed dataset to: /content/drive/MyDrive/Thesis_2025/french_winograd_intermediate_entities.csv


Unnamed: 0,id,sentence,option1,option2,answer
0,1,"La coupe n'entre pas dans la valise marron, ca...",la coupe,la valise,
1,2,Paul a essayé de joindre Georges sur son télép...,Paul,Georges,
2,3,"L'avocat a posé une question au témoin, mais i...",l'avocat,le témoin,
3,4,Nicolas n'a pas pu soulever son fils car il ét...,Nicolas,son fils,
4,5,"Les lycéens harcelaient les collégiens, donc o...",les lycéens,les collégiens,


The initial version of the dataset (french_wino_unrandomized.csv) was constructed such that the correct answer was consistently stored as the first candidate option. While this representation preserves the intended commonsense interpretation of each instance, it may introduce positional bias during model evaluation.

To mitigate this effect, we perform an option randomization step. For each instance independently, the order of the two candidate options is randomly swapped with a probability of 0.5. When a swap occurs, the gold label is updated accordingly (i.e., labels 1 and 2 are exchanged). This procedure preserves the semantic correctness of each instance while ensuring that the position of the correct answer is not systematically correlated with a fixed option index.

The randomized dataset is stored as `french_wino_randomized.csv`. A fixed random seed is used to ensure reproducibility.

In [None]:
import pandas as pd
import numpy as np

# Paths
# NOTE(review): INPUT_CSV is not written by any cell visible in this notebook —
# presumably it is the manually labelled version of the intermediate file; confirm.
INPUT_CSV = "/content/drive/MyDrive/Thesis_2025/french_wino_unrandomized.csv"
OUTPUT_CSV = "/content/drive/MyDrive/Thesis_2025/french_wino_randomized.csv"

# Load data (replaces whatever `df` held from earlier cells)
df = pd.read_csv(INPUT_CSV)

# Set seed for reproducibility: fixes NumPy's global random state, which the
# np.random.rand() call in randomize_row draws from.
np.random.seed(42)

def randomize_row(row):
    """Return a copy of ``row`` with its two options swapped half of the time.

    One number is drawn from NumPy's global random state per call. When it is
    below 0.5, ``option1`` and ``option2`` trade places and an ``answer`` of 1
    becomes 2 (and vice versa). Any other ``answer`` value is left as it is,
    even when the options are swapped.

    Args:
        row: mapping-like record (e.g. a ``pandas.Series`` passed by
            ``DataFrame.apply(..., axis=1)``) with the keys ``option1``,
            ``option2`` and ``answer``.

    Returns:
        A new record of the same type; the input object is never modified.
    """
    # Work on a copy: functions given to DataFrame.apply must not mutate the
    # object they receive.
    row = row.copy()

    # 50% chance to swap
    if np.random.rand() < 0.5:
        # Swap options
        row["option1"], row["option2"] = row["option2"], row["option1"]

        # Flip answer so it keeps pointing at the same entity
        if row["answer"] == 1:
            row["answer"] = 2
        elif row["answer"] == 2:
            row["answer"] = 1

    return row

# Apply randomization
# axis=1 calls randomize_row once per row; the returned rows are reassembled
# into a new frame that replaces `df`.
df = df.apply(randomize_row, axis=1)

# Save randomized dataset (index=False: no row-index column in the file)
df.to_csv(OUTPUT_CSV, index=False)

print(f"Randomized dataset saved to: {OUTPUT_CSV}")
# Show the first rows to check the swaps and flipped labels.
df.head()


Randomized dataset saved to: /content/drive/MyDrive/Thesis_2025/french_wino_randomized.csv


Unnamed: 0,id,sentence,option1,option2,answer
0,1,"La coupe n'entre pas dans la valise marron, ca...",la valise,la coupe,2
1,2,Paul a essayé de joindre Georges sur son télép...,Paul,Georges,1
2,3,"L'avocat a posé une question au témoin, mais _...",l'avocat,le témoin,1
3,4,Nicolas n'a pas pu soulever son fils car _ éta...,Nicolas,son fils,1
4,5,"Les lycéens harcelaient les collégiens, donc o...",les collégiens,les lycéens,2


In [None]:
# Make sure an (initially empty) column for paraphrases exists. The membership
# check keeps a re-run from wiping text that has already been filled in.
if "paraphrased_sentence" not in df:
    df["paraphrased_sentence"] = ""

# Write the frame back to the randomized CSV, now including the extra column.
df.to_csv(OUTPUT_CSV, index=False)

df.head()

Unnamed: 0,id,sentence,option1,option2,answer,paraphrased_sentence
0,1,"La coupe n'entre pas dans la valise marron, ca...",la valise,la coupe,2,
1,2,Paul a essayé de joindre Georges sur son télép...,Paul,Georges,1,
2,3,"L'avocat a posé une question au témoin, mais _...",l'avocat,le témoin,1,
3,4,Nicolas n'a pas pu soulever son fils car _ éta...,Nicolas,son fils,1,
4,5,"Les lycéens harcelaient les collégiens, donc o...",les collégiens,les lycéens,2,
