In [None]:
import re

import pandas as pd

# 1. Load the raw tweet export.
input_csv = "/content/Russia-Ukraine_War_(2024).csv"
df = pd.read_csv(input_csv, encoding="utf-8")  # switch the encoding here if the file is not UTF-8

# 2-3. Keep only the three columns used downstream and give them their final
#      names in one chained step. rename() returns a new frame, so later
#      assignments to df_subset do not touch df.
df_subset = df[["target", "stance", "rawContent"]].rename(
    columns={"target": "Target", "stance": "Stance", "rawContent": "Text"}
)

# 4. Map stance from {"Against", "Favor"} to { -1, 1 }
def map_stance(value: object) -> int:
    """Convert a stance label to its numeric code.

    Args:
        value: Raw stance label. Matching ignores letter case and
            surrounding whitespace.

    Returns:
        -1 for "against", 1 for "favor", and 0 for anything else, including
        missing values (NaN/None) and other non-string entries.
    """
    # Empty CSV cells reach this function as float NaN, which has no .lower();
    # send them to the same fallback as unrecognised labels instead of crashing.
    if not isinstance(value, str):
        return 0
    return {"against": -1, "favor": 1}.get(value.strip().lower(), 0)

# Translate every stance label into its numeric code, element by element.
df_subset["Stance"] = df_subset["Stance"].map(map_stance)

# 5. Preprocess Turkish swears in the "Text" column
TURKISH_SWEARS = ["siktir", "orospu", "amına", "sikeyim"]

def clean_swears(text: str) -> str:
    """Censor every known swear word in ``text`` with "***".

    Matching is case-insensitive and works on substrings, so a listed word
    is replaced in any letter-case spelling and also when it appears inside
    a longer word.

    Args:
        text: The text to clean. Non-string values (e.g. NaN from an empty
            CSV cell) are passed through untouched.

    Returns:
        The text with each occurrence of a word from ``TURKISH_SWEARS``
        replaced by "***", or the original value if it is not a string.
    """
    if not isinstance(text, str):
        return text  # NaN or another non-text value: nothing to censor

    for swear in TURKISH_SWEARS:
        # Replacing only the lower/Capitalized/UPPER spellings missed mixed
        # case such as "SiKtiR"; a case-insensitive regex covers every
        # spelling. re.escape keeps list entries literal.
        # NOTE(review): per the `re` docs, IGNORECASE also treats i/ı/İ as
        # equivalent, so ASCII-typed variants of a listed word match too.
        text = re.sub(re.escape(swear), "***", text, flags=re.IGNORECASE)
    return text

# Censor the listed swear words in every row of the "Text" column.
df_subset["Text"] = df_subset["Text"].map(clean_swears)

# 6. Write the cleaned frame to disk (UTF-8 with BOM, no index column)
#    and report how many rows were saved.
output_csv = "Russia_Ukraine_War_Stance.csv"
df_subset.to_csv(output_csv, encoding="utf-8-sig", index=False)
print(f"Done! Created {output_csv} with {len(df_subset)} rows.")

Done! Created Russia_Ukraine_War_Stance.csv with 8215 rows.


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# 1. Read the processed CSV.
#    The preprocessing step saves its result as "Russia_Ukraine_War_Stance.csv"
#    (underscore between "Russia" and "Ukraine"). The hyphenated name that was
#    read here before is not written by any visible cell, so a fresh
#    Restart & Run All would stop with FileNotFoundError.
#    NOTE(review): assumes the notebook's working directory is /content — confirm.
df = pd.read_csv("/content/Russia_Ukraine_War_Stance.csv", encoding="utf-8")

# 2. Split DF into train (70%) and temp (30%)
train_df, temp_df = train_test_split(
    df,
    test_size=0.30,   # 30% goes to temp
    random_state=42,  # for reproducible splits
    shuffle=True
)

# 3. Split temp_df into test (15%) and dev (15%)
#    Since temp_df is 30% of the total, half of it is 15% of the total.
test_df, dev_df = train_test_split(
    temp_df,
    test_size=0.5,    # half of temp => 15% of entire dataset
    random_state=42,
    shuffle=True
)

# 4. Save each split (UTF-8 with BOM, no index column)
train_df.to_csv("Russia-Ukraine_War_train.csv", index=False, encoding="utf-8-sig")
test_df.to_csv("Russia-Ukraine_War_test.csv", index=False, encoding="utf-8-sig")
dev_df.to_csv("Russia-Ukraine_War_dev.csv", index=False, encoding="utf-8-sig")

print(f"Train size: {len(train_df)} rows")
print(f"Test size: {len(test_df)} rows")
print(f"Dev size: {len(dev_df)} rows")


Train size: 5750 rows
Test size: 1232 rows
Dev size: 1233 rows


In [None]:
import pandas as pd
import re

# Split files to reprocess; each one is read, censored and written back in place.
csv_files = [
    "/content/Russia-Ukraine_War_train.csv",
    "/content/Russia-Ukraine_War_dev.csv",
    "/content/Russia-Ukraine_War_test.csv",
]

# Additional Turkish swear words censored in this second pass.
# NOTE: this rebinds the TURKISH_SWEARS name that the first preprocessing cell
# also defines, so re-running that earlier cleaning step after this cell would
# pick up this list instead of its own.
TURKISH_SWEARS = [
    "sikim", "amk", "piç", "pezevenk", "oç"
]

def censor_swears(text: str, swears=None) -> str:
    """
    Replace known swear words in the given text with asterisks (***).

    Matching is case-insensitive and works on substrings, so a listed word
    is also replaced when it appears inside a longer word.

    Args:
        text: The text to censor. Non-string values (e.g. NaN) are returned
            unchanged.
        swears: Optional iterable of words to censor. Defaults to the
            module-level ``TURKISH_SWEARS`` list.

    Returns:
        The censored text, or the original value if it is not a string.
    """
    if not isinstance(text, str):
        return text  # If the entry is NaN or something else

    if swears is None:
        swears = TURKISH_SWEARS

    for swear in swears:
        if not swear:
            # An empty pattern matches at every position and would scatter
            # "***" between all characters.
            continue
        # re.escape treats each entry as literal text, so obfuscated spellings
        # containing regex metacharacters (e.g. "s*k", "a.q") are matched
        # verbatim instead of being interpreted as patterns.
        text = re.sub(re.escape(swear), "***", text, flags=re.IGNORECASE)

    return text

# Run the second censoring pass over every split file.
for file_path in csv_files:
    # Load the split as it currently exists on disk.
    df = pd.read_csv(file_path, encoding="utf-8")

    # Censor each entry of the "Text" column, element by element.
    df["Text"] = df["Text"].map(censor_swears)

    # Write back to the same path, overwriting the previous version
    # (UTF-8 with BOM, no index column).
    df.to_csv(file_path, encoding="utf-8-sig", index=False)

    print(f"Processed swear words in {file_path} - updated file saved.")


Processed swear words in /content/Russia-Ukraine_War_train.csv - updated file saved.
Train size: 37 rows
Processed swear words in /content/Russia-Ukraine_War_dev.csv - updated file saved.
Train size: 35 rows
Processed swear words in /content/Russia-Ukraine_War_test.csv - updated file saved.
Train size: 36 rows
