In [4]:
from sklearn.datasets import make_swiss_roll
from sklearn.model_selection import train_test_split

def generate_and_split_data(test_size, n_samples, noise, random_state):
    """
    Sample a Swiss roll point cloud, attach binary labels, and split it into
    training and test subsets.

    A point is labelled -1 when its first coordinate is below 5 and 1
    otherwise. The same ``random_state`` is handed to both the data generator
    and the splitter.

    Args:
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        n_samples: Forwarded to ``make_swiss_roll`` as ``n_samples``.
        noise: Forwarded to ``make_swiss_roll`` as ``noise``.
        random_state: Seed forwarded to both ``make_swiss_roll`` and
            ``train_test_split``.

    Returns:
        The four values ``X_train, X_test, y_train, y_test`` produced by
        ``train_test_split``, in that order.
    """
    # Only the point coordinates are used; the second value returned by
    # make_swiss_roll is discarded.
    points, _ = make_swiss_roll(n_samples=n_samples, noise=noise, random_state=random_state)

    # Labels are collected in a plain Python list, one entry per point.
    labels = []
    for point in points:
        labels.append(-1 if point[0] < 5 else 1)

    points_train, points_test, labels_train, labels_test = train_test_split(
        points, labels, test_size=test_size, random_state=random_state
    )
    return points_train, points_test, labels_train, labels_test

# Usage example: 1000 noisy Swiss roll points, half of them held out for testing
X_train, X_test, y_train, y_test = generate_and_split_data(
    test_size=0.5,
    n_samples=1000,
    noise=0.2,
    random_state=42,
)

In [5]:
def generated_data_to_AMPL(file_path, A, y, train=True, nu=1):
    """
    Write a data matrix and its label vector to a file in AMPL data (.dat) syntax.

    The file at ``file_path`` is opened in write mode, so any existing content
    is replaced. All matrix entries and labels are written with three decimal
    places, and rows/columns are numbered starting from 1.

    Args:
        file_path: Path of the .dat file to create.
        A: Two-dimensional collection of points. It must expose ``.shape``
            returning ``(m, n)`` and support ``A[i][j]`` indexing.
        y: Labels, indexable by ``0 .. m-1``; one label per row of ``A``.
        train: When truthy, the scalar parameters ``nu``, ``m`` and ``n`` are
            written first and the data is stored as ``A_train`` / ``y_train``.
            Otherwise only ``A_test`` / ``y_test`` are written.
        nu: Value written for ``param nu`` in the training file. Defaults to 1,
            the value that was previously hard-coded. Ignored when ``train``
            is falsy.
    """
    m, n = A.shape  # Number of data points and number of features

    # The train and test layouts differ only in this suffix and in the scalar
    # header below, so both share a single code path.
    suffix = "train" if train else "test"

    with open(file_path, 'w') as file:
        if train:
            # Scalar parameters accompany the training data only.
            file.write("param nu := {};\n".format(nu))  # Definition of parameter nu.
            file.write("param m := {};\n".format(m))  # Number of data points.
            file.write("param n := {};\n".format(n))  # Number of features.
            file.write("\n")

        # Matrix in tabular form: a header row of column numbers, then one
        # line per point prefixed with its row number.
        file.write("param A_{}:\n".format(suffix))
        file.write("\t" + "\t".join(str(j + 1) for j in range(n)) + " :=\n")
        for i in range(m):
            file.write("\t{}\t".format(i + 1))
            file.write("\t".join("{:.3f}".format(A[i][j]) for j in range(n)))
            file.write("\n")
        file.write(";\n\n")

        # Label vector: one "<row number> <label>" pair per line.
        file.write("param y_{} :=".format(suffix))
        for i in range(m):
            file.write("\n\t{} {:.3f}".format(i + 1, y[i]))
        file.write("\n;")

In [6]:
# Export both splits as AMPL data files; only the training file carries nu, m and n.
generated_data_to_AMPL(file_path='train.dat', A=X_train, y=y_train, train=True)
generated_data_to_AMPL(file_path='test.dat', A=X_test, y=y_test, train=False)