***PUBLIC 70% / PRIVATE: 50%***

In [None]:
# =========================================
# STEP 1: Upload datacards (CSV files)
# =========================================
from google.colab import files
import os
import shutil

print("Please upload your CSV files (datacards).")
# Colab upload widget; the keys of the returned mapping are used below as file names.
uploaded = files.upload()

# Relocate every upload from the working directory into ./datacards
os.makedirs("datacards", exist_ok=True)
for uploaded_name in uploaded:
    destination = os.path.join("datacards", uploaded_name)
    shutil.move(uploaded_name, destination)
print(f" Moved {len(uploaded)} files into /content/datacards")

# =========================================
# STEP 2: Import libraries
# =========================================
import pandas as pd
import random

# =========================================
# STEP 3: Create output folder
# =========================================
os.makedirs("datacards_split", exist_ok=True)

# =========================================
# STEP 4: Split each CSV into 50% private
# =========================================
source_folder = "datacards"

for file in os.listdir(source_folder):
    if file.endswith(".csv"):
        file_path = os.path.join(source_folder, file)
        print(f"Processing {file} ...")

        # Try UTF-8 first; fall back to Latin-1 when the bytes are not valid UTF-8.
        # dtype=str loads every column as text so numeric-looking values are not reformatted.
        try:
            df = pd.read_csv(file_path, encoding='utf-8', dtype=str)
        except UnicodeDecodeError:
            df = pd.read_csv(file_path, encoding='latin1', dtype=str)

        # Randomly draw 50% of the rows for the private set. The fixed seed makes
        # re-runs pick the same rows. The sampled rows come back in shuffled order
        # (the original row order is NOT preserved) and reset_index renumbers them from 0.
        private_df = df.sample(frac=0.5, random_state=42).reset_index(drop=True)

        # Save without altering delimiter: the file was parsed with pandas' default
        # comma separator, and to_csv writes with the same default. index=False keeps
        # the synthetic row index out of the output file.
        private_df.to_csv(os.path.join("datacards_split", file), index=False)


Please upload your CSV files (datacards).


Saving Fante-2024.csv to Fante-2024.csv
Saving Twi-2024_val.csv to Twi-2024_val.csv
Saving Kusaal-2024_val.csv to Kusaal-2024_val.csv
Saving Ewe-2024_val.csv to Ewe-2024_val.csv
Saving Ga-2024_val.csv to Ga-2024_val.csv
✅ Moved 5 files into /content/datacards
Processing Twi-2024_val.csv ...
Processing Ga-2024_val.csv ...
Processing Ewe-2024_val.csv ...
Processing Kusaal-2024_val.csv ...
Processing Fante-2024.csv ...


In [None]:
# =========================================
# STEP 5: Pack the split folder into a zip archive
# =========================================
# Produces datacards_split.zip in the working directory from the contents of ./datacards_split
shutil.make_archive(
    base_name="datacards_split",
    format="zip",
    root_dir="datacards_split",
)
print("✅ Zipped to datacards_split.zip")

# =========================================
# STEP 6: Send the archive to the browser
# =========================================
files.download("datacards_split.zip")


✅ Zipped to datacards_split.zip


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


***PUBLIC 70% / PRIVATE: 30%***

In [1]:
# =========================================
# STEP 1: Upload datacards (CSV files)
# =========================================
from google.colab import files
import os
import shutil

print("Please upload your CSV files (datacards).")
# Colab upload widget; the keys of the returned mapping are used below as file names.
uploaded = files.upload()

# Relocate every upload from the working directory into ./datacards
os.makedirs("datacards", exist_ok=True)
for uploaded_name in uploaded:
    destination = os.path.join("datacards", uploaded_name)
    shutil.move(uploaded_name, destination)
print(f" Moved {len(uploaded)} files into /content/datacards")

# =========================================
# STEP 2: Import libraries
# =========================================
import pandas as pd
import random

# =========================================
# STEP 3: Create output folders
# =========================================
os.makedirs("datacards_split/public", exist_ok=True)
os.makedirs("datacards_split/private", exist_ok=True)

# =========================================
# STEP 4: Split each CSV into 70% public / 30% private
# =========================================
source_folder = "datacards"

for file in os.listdir(source_folder):
    if file.endswith(".csv"):
        file_path = os.path.join(source_folder, file)
        print(f"Processing {file} ...")

        # Try UTF-8 first; fall back to Latin-1 when the bytes are not valid UTF-8.
        # dtype=str loads every column as text so numeric-looking values are not reformatted.
        try:
            df = pd.read_csv(file_path, encoding='utf-8', dtype=str)
        except UnicodeDecodeError:
            df = pd.read_csv(file_path, encoding='latin1', dtype=str)

        # Shuffle rows for randomness; the fixed seed makes the split repeatable.
        df = df.sample(frac=1, random_state=42).reset_index(drop=True)

        # Split 70/30.
        # Use exact integer arithmetic for floor(0.7 * n): 0.7 is not exactly
        # representable as a float, so int(0.7 * 90) evaluates to 62 rather than 63.
        split_index = len(df) * 7 // 10
        public_df = df.iloc[:split_index].reset_index(drop=True)
        private_df = df.iloc[split_index:].reset_index(drop=True)

        # The two slices must partition the shuffled frame: no row lost or duplicated.
        assert len(public_df) + len(private_df) == len(df), f"row count mismatch for {file}"

        # Save splits under the same file name; index=False keeps the synthetic
        # row index out of the output files.
        public_df.to_csv(f"datacards_split/public/{file}", index=False)
        private_df.to_csv(f"datacards_split/private/{file}", index=False)

print(" All files split into 70% public and 30% private")

# =========================================
# STEP 5: Pack the split folder into a zip archive
# =========================================
# Produces datacards_split.zip in the working directory from the contents of ./datacards_split
shutil.make_archive(
    base_name="datacards_split",
    format="zip",
    root_dir="datacards_split",
)
print(" Zipped to datacards_split.zip")

# =========================================
# STEP 6: Send the archive to the browser
# =========================================
files.download("datacards_split.zip")


Please upload your CSV files (datacards).


Saving All-Data - Fante.csv to All-Data - Fante.csv
Saving All-Data - Twi[En-Twi].csv to All-Data - Twi[En-Twi].csv
Saving All-Data - Twi[Twi-En].csv to All-Data - Twi[Twi-En].csv
Saving All-Data - GA.csv to All-Data - GA.csv
Saving All-Data - Ewe.csv to All-Data - Ewe.csv
Saving All-Data - Kusaal.csv to All-Data - Kusaal.csv
✅ Moved 6 files into /content/datacards
Processing All-Data - Twi[Twi-En].csv ...
Processing All-Data - Fante.csv ...
Processing All-Data - Twi[En-Twi].csv ...
Processing All-Data - GA.csv ...
Processing All-Data - Kusaal.csv ...
Processing All-Data - Ewe.csv ...
✅ All files split into 70% public and 30% private
✅ Zipped to datacards_split.zip


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>