In [None]:
# pandas is imported in this cell so that it runs on a fresh kernel; without
# it the cell depends on an import that only appears in a later cell.
import pandas as pd

# Load the translated dev split with every column read as a string, then list
# the column headers to inspect the translated column names.
text_df = pd.read_excel("/content/vast_dev_translated.xlsx", dtype=str)
print(text_df.columns)

Index(['yazar', ' postalamak', ' ori_konu', ' ori_id', ' yeni_konu', ' etiket',
       ' tür_idx', ' yeni_kimlik', ' yay_kimliği', ' metin', ' konum_metni',
       ' metin_ler', ' başlık', ' konu_str', ' görülen?', ' içerik_konu?'],
      dtype='object')


author,post,ori_topic,ori_id,new_topic,label,type_idx,new_id,arc_id,text,pos_text,text_s,topic,topic_str,seen?,contains_topic?



*   author: username of the comment author
*   post: original comment, unprocessed
*   ori_id: id generated to link post and heuristically extracted topics
*   new_id: unique id for every comment-topic-label pair
*   arc_id: id of the original article on NYT
*   text: sentence and word tokenized and lowercased text, with punctuation and stopwords removed
*   change_lst: list of swapped words (unique to vast_test-sentswap.csv)
*   change_type: type of sentiment swapping
*   LexSim: a list of lexically similar training topics (if a zero-shot topic)
*   Qte: whether the example contains quotes (1=yes, 0=no)
*   Sarc: whether the example contains sarcasm (1=yes, 0=no)
*   Imp: whether the text contains the topic and the label is non-neutral (1=yes, 0=no)
*  mlS: whether there are other examples with the same document and different, non-neutral, stance labels (1=yes, 0=no)
*  mlT: whether there are other examples with the same document and different topics (1=yes, 0=no)
*   ori_topic: heuristically extracted topic
*   new_topic: updated topic from crowdsourced annotations
*   type_idx: type number, 1=HeurTopic, 2=CorrTopic, 3=ListTopic, 4=Synthetic neutral
*   topic: tokenized and lowercased version of the topic, with punctuation and stopwords removed
*   seen?: indicator for zero-shot or few-shot example, 0=zero-shot, 1=few-shot
*   contains_topic?: indicator for whether topic is contained in the text, 0=no, 1=yes


*   **label: stance label, 0=con, 1=pro, 2=neutral**
*   **text_s: string version of text**
*   **topic_str: string version of topic**




In [None]:
import pandas as pd

# Translated VAST splits (dev, train, test) that are merged into one stance file.
xlsx_files = [
    "/content/vast_dev_translated.xlsx",
    "/content/vast_train_translated.xlsx",
    "/content/vast_test_translated.xlsx",
]

# Original numeric label -> new stance scheme:
#   0 (against) -> -1
#   1 (favor)   ->  1
#   2 (none)    ->  0
STANCE_MAP = {0: -1, 1: 1, 2: 0}


def map_stance(value):
    """Translate one numeric stance label into the new scheme.

    Args:
        value: Numeric label (0, 1 or 2); NaN when the cell could not be
            parsed as a number.

    Returns:
        -1, 1 or 0 according to STANCE_MAP, or None for any other value.
    """
    return STANCE_MAP.get(value)


# Collects one three-column frame per input file.
all_data = []

for xlsx_path in xlsx_files:
    # 1) Read the XLSX into a DataFrame, every column as a string.
    df = pd.read_excel(xlsx_path, dtype=str)

    # The translated headers carry stray leading spaces (e.g. " etiket").
    # Strip them so the lookups below work with or without that artifact.
    df = df.rename(columns=lambda c: c.strip() if isinstance(c, str) else c)

    # 2) 'etiket' (label) holds "0", "1" or "2" as text; convert it to
    #    numeric, turning anything unparsable into NaN.
    df["etiket"] = pd.to_numeric(df["etiket"], errors="coerce")

    # 3) Map the numeric stance to the new scheme (see STANCE_MAP).
    df["etiket_mapped"] = df["etiket"].apply(map_stance)

    # Rows without a valid label are kept (as before) but no longer pass
    # silently: report how many there are.
    unmapped = int(df["etiket_mapped"].isna().sum())
    if unmapped:
        print(f"Warning: {unmapped} rows in {xlsx_path} have no valid stance label.")

    # 4) Keep topic string, mapped stance and the 'postalamak' (post) column,
    # 5) renamed to [Target, Stance, Text].
    #    NOTE(review): an earlier comment named text_s as the text column, but
    #    the code has always selected 'postalamak' (post) -- confirm which one
    #    is intended.
    subset_df = df[["konu_str", "etiket_mapped", "postalamak"]].rename(
        columns={
            "konu_str": "Target",
            "etiket_mapped": "Stance",
            "postalamak": "Text",
        }
    )

    # 6) Append to all_data
    all_data.append(subset_df)

# 7) Concatenate the per-file dataframes into one
final_df = pd.concat(all_data, ignore_index=True)

# 8) Save to CSV (utf-8-sig writes a BOM at the start of the file)
output_csv = "VAST_stance.csv"
final_df.to_csv(output_csv, index=False, encoding="utf-8-sig")

print(f"Done! Created {output_csv} with {len(final_df)} rows.")


Done! Created VAST_stance.csv with 18545 rows.


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the combined stance CSV.
df = pd.read_csv("/content/VAST_stance.csv", encoding="utf-8")

# One seed shared by both splits so the partition is reproducible.
split_seed = 42

# First cut: 70% of the rows become the training set, 30% are held out.
train_df, temp_df = train_test_split(
    df, test_size=0.30, random_state=split_seed, shuffle=True
)

# Second cut: halve the held-out 30% into test and dev, i.e. 15% of the
# full dataset each.
test_df, dev_df = train_test_split(
    temp_df, test_size=0.5, random_state=split_seed, shuffle=True
)

# Write each split to its own CSV file.
for frame, filename in (
    (train_df, "VAST_train.csv"),
    (test_df, "VAST_test.csv"),
    (dev_df, "VAST_dev.csv"),
):
    frame.to_csv(filename, index=False, encoding="utf-8-sig")

# Report the size of each split.
for summary_line in (
    f"Train size: {len(train_df)} rows",
    f"Test size: {len(test_df)} rows",
    f"Dev size: {len(dev_df)} rows",
):
    print(summary_line)


Train size: 12981 rows
Test size: 2782 rows
Dev size: 2782 rows
