# Imports & settings

In [11]:
import numpy as np
import os
import re

In [12]:
def load_generated_data(file_path):
    """
    Loads a generated (SVM) dataset from a whitespace-separated .dat file so that it can later be
    converted to the input format of the AMPL optimization problem solver.

    Every non-blank line is split on whitespace: the first four fields are read as the sample
    features and the fifth field as the sample label. A trailing '*' on the label field, if present,
    is stripped before the conversion to float. Blank (or whitespace-only) lines are skipped.

    Parameters:
        file_path: path to the .dat file to read.

    Returns:
        A: numpy array of shape (m, 4) holding the samples (shape (0, 4) when the file has no data).
        y: numpy array of shape (m,) holding the labels.

    Raises:
        IndexError: if a non-blank line has fewer than five fields.
        ValueError: if a field cannot be converted to float.
    """

    # Number of feature columns read from every line; the label is the field right after them.
    n_features = 4

    # Data structures initialization.
    data = []
    labels = []

    # Reads the file and processes it.
    with open(file_path, 'r') as file:
        # Reads each line of the dataset.
        for line in file:
            values = line.split()

            # Blank lines carry no sample; skipping them avoids an IndexError on the label lookup.
            if not values:
                continue

            # Extracts the sample label first, so a short line fails before anything is stored.
            label = float(values[n_features].rstrip('*'))

            # Extracts the sample features.
            point = [float(value) for value in values[:n_features]]

            data.append(point)
            labels.append(label)

    # Converts the data to numpy arrays. The reshape keeps A two-dimensional even when no samples
    # were read, so callers can always unpack A.shape into (m, n).
    A = np.array(data, dtype=float).reshape(-1, n_features)
    y = np.array(labels, dtype=float)

    return A, y

In [13]:
def generated_data_to_AMPL(file_path, A, y, train=True):
    """
    Given a path to a new file, the A matrix (samples), and the y vector (labels), this function generates
    the code to express these mathematical objects in AMPL format and stores them in a .dat file.

    The file contains a two-dimensional table `param A_<split>` (rows and columns indexed from 1) followed
    by a one-dimensional `param y_<split>`, where <split> is "train" or "test". All values are written
    with three decimals. For the training split only, the file starts with the scalar parameters
    `nu` (always written as 1), `m` (number of samples) and `n` (number of features).

    Parameters:
        file_path: path of the .dat file to create (an existing file is overwritten).
        A: two-dimensional array of shape (m, n) with one sample per row.
        y: sequence with at least m labels, one per sample.
        train: when true the training layout is written, otherwise the test layout.
    """

    # Number of samples and features.
    m, n = A.shape

    # Both layouts share the same tables; only the parameter-name suffix and the header differ.
    split = "train" if train else "test"

    # The whole text is assembled before the file is opened, so a formatting error does not
    # leave a truncated file behind.
    chunks = []

    if train:
        chunks.append("param nu := 1;\n")
        chunks.append("param m := {};\n".format(m))
        chunks.append("param n := {};\n".format(n))
        chunks.append("\n")

    # Table header: the 1-based column indices.
    chunks.append("param A_{}:\n".format(split))
    chunks.append("\t" + "\t".join(str(j + 1) for j in range(n)) + " :=\n")

    # One line per sample: the 1-based row index followed by its features.
    for i in range(m):
        row = "\t".join("{:.3f}".format(A[i][j]) for j in range(n))
        chunks.append("\t{}\t{}\n".format(i + 1, row))

    chunks.append(";\n")
    chunks.append("\n")

    # Labels: one "index value" pair per line; the closing ';' has no trailing newline.
    chunks.append("param y_{} :=".format(split))
    chunks.extend("\n\t{} {:.3f}".format(i + 1, y[i]) for i in range(m))
    chunks.append("\n;")

    with open(file_path, 'w') as file:
        file.writelines(chunks)

In [14]:
def process_generated_data(file_path, train=True):
    """
    Loads the generated dataset stored at `file_path` and writes it back in AMPL format, producing a
    .dat file that can be used as input for the AMPL optimization problem solver.

    The output file name is built from the first run of digits found in the base name of `file_path`:
    "svm_train_data_<digits>.dat" when `train` is true, "svm_test_data_<digits>.dat" otherwise. The
    file is written using that bare name (no directory component). If the base name contains no
    digits, the data is still loaded but no file is written.
    """

    # Loads the samples and labels before anything else, so a bad input file fails early.
    samples, targets = load_generated_data(file_path)

    # Looks for the numeric tag of the dataset in the file name only, not in its directories.
    digits = re.search(r'\d+', os.path.basename(file_path))

    # Without a numeric tag there is no output name to build.
    if not digits:
        return

    size_tag = digits.group()

    # Writes the AMPL input file matching the requested split.
    if train:
        generated_data_to_AMPL(f"svm_train_data_{size_tag}.dat", samples, targets, train=True)
    else:
        generated_data_to_AMPL(f"svm_test_data_{size_tag}.dat", samples, targets, train=False)

In [15]:
# Processes train datasets of different sizes.
# Each entry is the name of a raw generated training file; the names carry no directory,
# so they are opened relative to the current working directory.
file_names = ['svm_raw_train_data_100.dat', 'svm_raw_train_data_1000.dat', 'svm_raw_train_data_10000.dat']

for file_name in file_names:
    # The bare file name is used directly as the path.
    file_path = file_name
    # train=True selects the training layout and the "svm_train_data_<digits>.dat" output name.
    process_generated_data(file_path, train=True)

In [None]:
# Processes test datasets of different sizes.
# Each entry is the name of a raw generated test file; the names carry no directory,
# so they are opened relative to the current working directory.
file_names = ['svm_raw_test_data_100.dat', 'svm_raw_test_data_1000.dat', 'svm_raw_test_data_10000.dat']

for file_name in file_names:
    # The bare file name is used directly as the path.
    file_path = file_name
    # train=False selects the test layout and the "svm_test_data_<digits>.dat" output name.
    process_generated_data(file_path, train=False)