In [None]:
import random
import os
import shutil
import re
import csv
from itertools import product
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path


In [None]:
#mount Google Drive at /content/drive so the /content/drive/MyDrive/... paths used in the cells below resolve
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#split the scattering profiles into train and test
random.seed(51)
#xy folder contains 3000 scattering profile files
input_folder = "/content/drive/MyDrive/CREASE-2D/xy"
#output paths to separate train and test
output_train_folder = "/content/drive/MyDrive/CREASE-2D/Crease_2400_126/train"
output_test_folder = "/content/drive/MyDrive/CREASE-2D/Crease_2400_126/test"
#os.listdir returns entries in arbitrary (filesystem dependent) order, so the listing is
#sorted first; without this the seeded shuffle below would not give the same split on every run
all_files = sorted(os.listdir(input_folder))
random.shuffle(all_files)
total_files = len(all_files)
train_size = 2400
test_size = total_files - train_size
#the first train_size shuffled files form the train set, the remainder the test set
train_files = all_files[:train_size]
test_files = all_files[train_size:]
#filled by collect_sample_ids with one {'Sample ID', 'Folder'} record per file
sample_id_info = []
def copy_files(files, source_folder, destination_folder):
    '''
    Copy every named file from source_folder into destination_folder.

    files: iterable of file names (not full paths) located in source_folder
    source_folder: directory the files are read from
    destination_folder: directory the files are copied into; it is created
        (together with any missing parent directories) when it does not exist

    A file that already exists under the same name in destination_folder is overwritten.
    '''
    source_dir = Path(source_folder)
    destination_dir = Path(destination_folder)
    #create the destination up front so the function also works on a fresh output tree
    destination_dir.mkdir(parents=True, exist_ok=True)
    for file_name in files:
        #Here original scattering profiles are copied to create train and test folders.
        #We can also move them instead of copying to make it faster
        shutil.copy2(source_dir / file_name, destination_dir / file_name)
#each scattering profile is identified by a unique sample ID embedded in its file name
def collect_sample_ids(files, destination_folder):
    '''
    Record the sample ID of every file together with the folder it was assigned to.

    The sample ID is the second underscore-separated token of the file name.
    One {'Sample ID', 'Folder'} dict per file is appended to the module-level
    sample_id_info list, in the order the file names are given; nothing is returned.
    '''
    sample_id_info.extend(
        {'Sample ID': name.split('_')[1], 'Folder': destination_folder}
        for name in files
    )
#make sure both destination folders exist before anything is copied
for split_dir in (output_train_folder, output_test_folder):
    os.makedirs(split_dir, exist_ok=True)
#each split is described by its file names and the folder they belong in (train first, then test)
split_assignments = (
    (train_files, output_train_folder),
    (test_files, output_test_folder),
)
#copy the profiles of each split from the input folder into its own folder
for split_files, split_dir in split_assignments:
    copy_files(split_files, input_folder, split_dir)
#record which sample ID ended up in which folder
for split_files, split_dir in split_assignments:
    collect_sample_ids(split_files, split_dir)
#persist the sample ID -> folder table as a csv file
csv_file_path = "/content/drive/MyDrive/CREASE-2D/Crease_2400_126/sample_id_info.csv"
sample_id_df = pd.DataFrame(sample_id_info)
sample_id_df.to_csv(csv_file_path, index=False)

In [None]:
#This contains 6 input features used to generate 3D structure
df1 = pd.read_csv('/content/drive/MyDrive/CREASE-2D/Features_6.csv')
#This csv file contains all the sample IDs and their locations
df2 = pd.read_csv('/content/drive/MyDrive/CREASE-2D/Crease_2400_126/sample_id_info.csv')
unique_folders = df2['Folder'].unique()
#for every folder keep the feature rows whose Sample ID was assigned to that folder
dfs_by_folder = {
    folder_path: df1[df1['Sample ID'].isin(df2.loc[df2['Folder'] == folder_path, 'Sample ID'])]
    for folder_path in unique_folders
}
#pick out the train and test feature tables by their folder path
df_train = dfs_by_folder['/content/drive/MyDrive/CREASE-2D/Crease_2400_126/train']
df_test = dfs_by_folder['/content/drive/MyDrive/CREASE-2D/Crease_2400_126/test']
#store each feature table next to the scattering profiles of its split
df_train.to_csv('/content/drive/MyDrive/CREASE-2D/Crease_2400_126/train/train_features.csv',index=False)
df_test.to_csv('/content/drive/MyDrive/CREASE-2D/Crease_2400_126/test/test_features.csv',index=False)

In [None]:
#build the (log_q, theta) grid on which every intensity value is located
theta = np.linspace(0, np.pi, num=37)
log_q = np.linspace(-2, 3, num=126)
#every (log_q, theta) pair, with log_q varying slowest
combinations = [(log_q_value, theta_value) for log_q_value in log_q for theta_value in theta]
log_q_mesh, theta_mesh = np.meshgrid(log_q, theta, indexing='ij')
#repeat each training feature row once per grid point ...
rows_per_sample = len(combinations)
n_samples = len(df_train)
repeated_index = np.repeat(df_train.index.values, rows_per_sample)
combined_data = df_train.loc[repeated_index]
#... and attach the flattened grid coordinates, tiled once per sample
combined_data['log_q'] = np.tile(log_q_mesh.ravel(), n_samples)
combined_data['theta'] = np.tile(theta_mesh.ravel(), n_samples)
combined_data.shape

(11188800, 9)

In [None]:
#grid based sub-sampling on scattering profiles
input_ = '/content/drive/MyDrive/CREASE-2D/Crease_2400_126/train'
output_ = '/content/drive/MyDrive/CREASE-2D/Crease_2400_126/downsampled_train_126/'
os.makedirs(output_, exist_ok=True)
#indices kept from each full-resolution profile:
#every 4th row (126 rows) and every 5th column (37 columns)
row_idx = np.arange(126) * 4
col_idx = np.arange(37) * 5
#sorted so the files are processed (and logged) in the same order on every run
for f in sorted(os.listdir(input_)):
    if not f.endswith('xy.txt'):
        #skip anything that is not a scattering profile (e.g. train_features.csv)
        continue
    inp_path = os.path.join(input_, f)
    print(f"Processing: {inp_path}")
    out_path = os.path.join(output_, f)
    image_data = np.loadtxt(inp_path, delimiter=',')
    #single vectorised selection of the 126 x 37 sub-grid; like the element-wise loop it
    #replaces, it raises IndexError when the profile has fewer than 501 rows or 181 columns
    downsampled = image_data[np.ix_(row_idx, col_idx)]
    np.savetxt(out_path, downsampled, delimiter=',')
    print(f"Saved: {out_path}")

In [None]:
#extract intensities with (q,theta) from subsampled files
data_folder = "/content/drive/MyDrive/CREASE-2D/Crease_2400_126/downsampled_train_126/"
#the (log_q, theta) grid is identical for every file, so it is built once here instead of
#once per file; log_q varies slowest, which matches the row-by-row flattening used below
combinations = [(log_q_value, theta_value) for log_q_value in log_q for theta_value in theta]
n_points = len(combinations)
grid_log_q = np.repeat(log_q, len(theta))
grid_theta = np.tile(theta, len(log_q))
#one DataFrame per file; they are concatenated once after the loop
data = []
for filename in sorted(os.listdir(data_folder)):
    if not filename.endswith(".txt"):
        continue
    #the first run of digits in the file name is used as the unique sample ID
    match = re.search(r'\d+', filename)
    if match is None:
        #without an ID these rows could never match a feature row in the merge below
        print(f"Skipping {filename}: no sample ID found in file name")
        continue
    first_number = int(match.group())
    file_path = os.path.join(data_folder, filename)
    #flatten the matrix row by row, so that the value at row i / column j is paired
    #with log_q[i] and theta[j]
    flattened_data = np.loadtxt(file_path, delimiter=",").ravel()
    if flattened_data.size < n_points:
        raise ValueError(
            f"{file_path} holds {flattened_data.size} values, expected at least {n_points}"
        )
    data.append(pd.DataFrame({
        "log_q": grid_log_q,
        "theta": grid_theta,
        #values beyond the grid size are ignored
        "I_q": flattened_data[:n_points],
        "Sample ID": first_number,
    }))
#create a dataframe of location q and theta with intensity values
if data:
    df1 = pd.concat(data, ignore_index=True)
else:
    #no usable file found: keep an empty table with the expected columns
    df1 = pd.DataFrame(columns=["log_q", "theta", "I_q", "Sample ID"])
#combine the features, q, theta and intensity to form train dataset
train_df = pd.merge(combined_data, df1, on=['Sample ID','log_q','theta'], sort=True, copy=False)
print(train_df.shape)

(11188800, 10)


In [None]:
#the same processing applied to the train set in the cells above is repeated here for the test set
#(log_q, theta) pairs with log_q varying slowest; rebuilt here so this cell does not depend
#on whichever earlier cell last assigned `combinations`
combinations = [(log_q_value, theta_value) for log_q_value in log_q for theta_value in theta]
n_points = len(combinations)
#repeat each test feature row once per grid point and attach the grid coordinates
test_data = df_test.loc[np.repeat(df_test.index.values, n_points)]
test_data['log_q'] = np.tile(log_q_mesh.ravel(), len(df_test))
test_data['theta'] = np.tile(theta_mesh.ravel(), len(df_test))
#grid based sub-sampling on the test scattering profiles
input_ = '/content/drive/MyDrive/CREASE-2D/Crease_2400_126/test/'
output_ = '/content/drive/MyDrive/CREASE-2D/Crease_2400_126/downsampled_test_126/'
os.makedirs(output_, exist_ok=True)
#indices kept from each full-resolution profile:
#every 4th row (126 rows) and every 5th column (37 columns)
row_idx = np.arange(126) * 4
col_idx = np.arange(37) * 5
for f in sorted(os.listdir(input_)):
    if not f.endswith('xy.txt'):
        #skip anything that is not a scattering profile (e.g. test_features.csv)
        continue
    inp_path = os.path.join(input_, f)
    print(f"Processing: {inp_path}")
    out_path = os.path.join(output_, f)
    image_data = np.loadtxt(inp_path, delimiter=',')
    #single vectorised selection of the 126 x 37 sub-grid; like the element-wise loop it
    #replaces, it raises IndexError when the profile has fewer than 501 rows or 181 columns
    downsampled = image_data[np.ix_(row_idx, col_idx)]
    np.savetxt(out_path, downsampled, delimiter=',')
    print(f"Saved: {out_path}")
#extract intensities with (q,theta) from the subsampled test files
data_folder = "/content/drive/MyDrive/CREASE-2D/Crease_2400_126/downsampled_test_126/"
#flattened grid coordinates, shared by every file
grid_log_q = np.repeat(log_q, len(theta))
grid_theta = np.tile(theta, len(log_q))
#one DataFrame per file; they are concatenated once after the loop
data = []
for filename in sorted(os.listdir(data_folder)):
    if not filename.endswith(".txt"):
        continue
    #the first run of digits in the file name is used as the unique sample ID
    match = re.search(r'\d+', filename)
    if match is None:
        #without an ID these rows could never match a feature row in the merge below
        print(f"Skipping {filename}: no sample ID found in file name")
        continue
    first_number = int(match.group())
    file_path = os.path.join(data_folder, filename)
    #flatten the matrix row by row, so that the value at row i / column j is paired
    #with log_q[i] and theta[j]
    flattened_data = np.loadtxt(file_path, delimiter=",").ravel()
    if flattened_data.size < n_points:
        raise ValueError(
            f"{file_path} holds {flattened_data.size} values, expected at least {n_points}"
        )
    data.append(pd.DataFrame({
        "log_q": grid_log_q,
        "theta": grid_theta,
        #values beyond the grid size are ignored
        "I_q": flattened_data[:n_points],
        "Sample ID": first_number,
    }))
if data:
    df2 = pd.concat(data, ignore_index=True)
else:
    #no usable file found: keep an empty table with the expected columns
    df2 = pd.DataFrame(columns=["log_q", "theta", "I_q", "Sample ID"])
#combine the features, q, theta and intensity to form test dataset
test_df = pd.merge(test_data, df2, on=['Sample ID','log_q','theta'], sort=True, copy=False)
print(test_df.shape)

In [None]:
#write the merged test and train tables to csv (test first, then train), without the index column
for dataset_frame, dataset_csv_path in (
    (test_df, '/content/drive/MyDrive/CREASE-2D/Crease_2400_126/test_dataset_2400_126.csv'),
    (train_df, '/content/drive/MyDrive/CREASE-2D/Crease_2400_126/train_dataset_2400_126.csv'),
):
    dataset_frame.to_csv(dataset_csv_path, index=False)