# Import libraries

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split

# Set paths

In [2]:
# Root of the historical_mt project: two directory levels above the current
# working directory, resolved to an absolute path
base_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))

# Directory holding every dataset artefact
dataset_path = os.path.join(base_path, 'data', 'datasets')

# Source workbook with the cleaned parallel corpus
file_path = os.path.join(dataset_path, "cleaned_historical_dataset.xlsx")

# Text files that record which row positions belong to which split
splits_path = os.path.join(dataset_path, "splits")
train_indices_path, validation_indices_path, test_indices_path = (
    os.path.join(splits_path, indices_file)
    for indices_file in (
        "train_indices.txt",
        "validation_indices.txt",
        "test_indices.txt",
    )
)

# Workbooks that will receive the materialised train/validation/test subsets
train_save_path, validation_save_path, test_save_path = (
    os.path.join(dataset_path, workbook_file)
    for workbook_file in (
        "train_dataset.xlsx",
        "validation_dataset.xlsx",
        "test_dataset.xlsx",
    )
)

# Echo every resolved path (one per line) so a wrong working directory is
# spotted before any file is read or written
print(
    f"Dataset file path: {file_path}",
    f"Train indices path: {train_indices_path}",
    f"Validation indices path: {validation_indices_path}",
    f"Test indices path: {test_indices_path}",
    f"Train dataset save path: {train_save_path}",
    f"Validation dataset save path: {validation_save_path}",
    f"Test dataset save path: {test_save_path}",
    sep="\n",
)

Dataset file path: /cs/student/msc/csml/2023/ngriessh/historical_mt/data/datasets/cleaned_historical_dataset.xlsx
Train indices path: /cs/student/msc/csml/2023/ngriessh/historical_mt/data/datasets/splits/train_indices.txt
Validation indices path: /cs/student/msc/csml/2023/ngriessh/historical_mt/data/datasets/splits/validation_indices.txt
Test indices path: /cs/student/msc/csml/2023/ngriessh/historical_mt/data/datasets/splits/test_indices.txt
Train dataset save path: /cs/student/msc/csml/2023/ngriessh/historical_mt/data/datasets/train_dataset.xlsx
Validation dataset save path: /cs/student/msc/csml/2023/ngriessh/historical_mt/data/datasets/validation_dataset.xlsx
Test dataset save path: /cs/student/msc/csml/2023/ngriessh/historical_mt/data/datasets/test_dataset.xlsx


# Load dataset

In [3]:
# Read the cleaned corpus workbook into a DataFrame and preview its first rows
df = pd.read_excel(io=file_path)
df.head(n=5)

Unnamed: 0,Early Modern Bohemian German,English
0,Schwert hat zuegesagt das was er dieß mahl ver...,Schwert promises that he will not commit the o...
1,Jacob Nünner der Müllner. Demnach den Wolgebor...,"Jacob Nünner der Müllner. Since Jacob Nünner, ..."
2,derer von Schwantz vnterthanen Zum Neundorff. ...,The sers of the von Schwantz in Neundorff. Sin...
3,Melchior Rössels Zue Rückersdorff handtgelübni...,Melchior Rössel from Rückersdorf's hand-promis...
4,Hanß Nicht vnnd Mathes weber von Raspenaw. Dem...,Hanß Nicht and Mathes weber from Raspenaw. Bec...


# Generate training, validation and test splits

In [4]:
# Set a random seed for reproducibility
seed = 17

# Split sizes, named once here instead of repeating the literals below
n_test_samples = 1000       # exact number of rows reserved for the test set
validation_fraction = 0.1   # share of the remaining rows used for validation

# Determine the length of the dataset
data_length = len(df)

# Fail early with a clear message: holding out the test set must leave rows
# over for training and validation (this also covers an empty dataset, which
# would otherwise divide by zero below)
if data_length <= n_test_samples:
    raise ValueError(
        f"Dataset has {data_length} rows; more than {n_test_samples} are "
        "required to hold out the test set and still have training data."
    )

# Express the test set as a fraction of the dataset. scikit-learn converts the
# fraction back into a sample count, which may not be exactly n_test_samples,
# hence the correction step after the split.
# NOTE(review): passing test_size=n_test_samples as an int would request the
# exact count directly, but may change which rows land in each split compared
# with splits saved earlier, so the original approach is kept.
test_size = n_test_samples / data_length

# Perform the train-test split on row positions (not on the rows themselves)
train_indices, test_indices = train_test_split(
    np.arange(data_length), test_size=test_size, random_state=seed
)

# Ensure we have exactly n_test_samples test samples
if len(test_indices) > n_test_samples:
    # Move excess samples to train set
    excess = len(test_indices) - n_test_samples
    train_indices = np.concatenate((train_indices, test_indices[:excess]))
    test_indices = test_indices[excess:]
elif len(test_indices) < n_test_samples:
    # Move samples from train set to test set
    shortage = n_test_samples - len(test_indices)
    test_indices = np.concatenate((test_indices, train_indices[:shortage]))
    train_indices = train_indices[shortage:]

# Calculate the validation size as a fraction of the training data (rounded down)
validation_size = int(validation_fraction * len(train_indices))
if validation_size < 1:
    raise ValueError(
        f"Only {len(train_indices)} rows remain after the test split; too few "
        f"to carve out a {validation_fraction:.0%} validation set."
    )

# Perform the train-validation split
train_indices, validation_indices = train_test_split(
    train_indices, test_size=validation_size, random_state=seed
)

# Sanity checks: the test set has the requested size, and the three splits are
# pairwise disjoint and together cover every row exactly once
split_indices = np.concatenate((train_indices, validation_indices, test_indices))
assert len(test_indices) == n_test_samples, "test split has the wrong size"
assert len(split_indices) == data_length, "splits do not cover the dataset"
assert len(np.unique(split_indices)) == data_length, "splits overlap"

# Verify the split
print(f"Number of training samples: {len(train_indices)}")
print(f"Number of validation samples: {len(validation_indices)}")
print(f"Number of test samples: {len(test_indices)}")

Number of training samples: 842
Number of validation samples: 93
Number of test samples: 1000


# Save training, validation and test splits

In [5]:
# Make sure the splits directory exists: np.savetxt does not create missing
# parent directories and would fail with FileNotFoundError on a fresh checkout
os.makedirs(splits_path, exist_ok=True)

# Save train, validation, and test indices to text files (one integer per line)
np.savetxt(train_indices_path, train_indices, fmt='%d')
np.savetxt(test_indices_path, test_indices, fmt='%d')
np.savetxt(validation_indices_path, validation_indices, fmt='%d')

# Save training, validation and test datasets

In [6]:
# Materialise the train, validation and test subsets by selecting rows of the
# full dataset by position
train_data, validation_data, test_data = (
    df.iloc[row_positions]
    for row_positions in (train_indices, validation_indices, test_indices)
)

# Write each subset to its own Excel workbook, leaving the DataFrame index out
# of the file
train_data.to_excel(excel_writer=train_save_path, index=False)
validation_data.to_excel(excel_writer=validation_save_path, index=False)
test_data.to_excel(excel_writer=test_save_path, index=False)

In [7]:
# Preview the first rows of the training subset; the index shows each row's
# label in the full dataset
train_data.head(n=5)

Unnamed: 0,Early Modern Bohemian German,English
59,"Der Schulteß Zu Mildenaw, hatt Hanß Nichten vn...",The village headman of Mildenau names Hans Nic...
219,"Jacob Seidel von hainerßdorff, demnach er Mich...","Jacob Seidel from Hainersdorf, since he attack..."
1187,"Auf Grundtmans Klage antwortett der Scholtz, d...",To Grundtman's complaint the village headman a...
380,"Hans Nicht Von Mildenaw, demnach er mit seiner...","Hans Nicht von Mildenau, since he got into gre..."
1833,"Richter Sagett Vnd clagett, dz Joachim Jacobiz...",The magistrate complains that Joachim Jacobiz ...


# Create JSON Lines files

In [8]:
# Export every split as JSON Lines (.jsonl): one record per line, written to
# the current working directory with non-ASCII characters kept as-is
for jsonl_name, split_frame in (
    ("train.jsonl", train_data),
    ("validation.jsonl", validation_data),
    ("test.jsonl", test_data),
):
    split_frame.to_json(
        path_or_buf=jsonl_name,
        orient="records",
        lines=True,
        force_ascii=False,
    )