In [15]:
import os
import re
import shutil
import zipfile

import pandas as pd

from em import COLUMNS_2P, compute_all_lq

### Load all SRI files into a pandas DataFrame and save them as CSV

In [16]:
# Column-name lists used when reading a sample manifest.
# names_a: the manifest columns without the SRF column.
# names_z: the SRF column on its own.
# names_t: names_a followed by names_z (the default for load()).
names_a = ["ID", "Nt", "Din", "W"]
names_z = ["SRF"]
names_t = names_a + names_z


def line_split(line):
    """Split a text line into tokens, keeping double-quoted runs together.

    A token is either a run of non-whitespace characters that does not start
    with a double quote, or a double-quoted run (which may contain spaces).
    Quoted tokens are returned with their surrounding quotes.
    """
    token_pattern = re.compile(r'[^"\s]\S*|".+?"')
    return [match.group(0) for match in token_pattern.finditer(line)]

def load(zip_file, folder, manifest_fname="manifest", names=names_t):
    """Read a sample manifest and the per-sample ``.sri`` sweeps from a zip archive.

    The manifest is the archive member ``manifest_fname``: a space-separated
    table without a header row whose columns are labelled with ``names`` and
    whose first column becomes the index. For every index value ``i`` the
    member ``<folder>/<int(i)>.sri`` is parsed: lines starting with ``#`` or
    ``!`` and blank lines are skipped, every other line is split on whitespace
    and converted to floats. The k-th value of the n-th data line is stored in
    the column ``COLUMNS_2P[k] + "_" + str(n)``.

    Parameters
    ----------
    zip_file : zipfile.ZipFile
        Archive opened for reading.
    folder : str
        Path inside the archive that holds the ``.sri`` members.
    manifest_fname : str
        Path inside the archive of the manifest table.
    names : list of str
        Column names for the manifest; the first one names the index.

    Returns
    -------
    tuple
        ``(frame, n_freq)``. ``frame`` is the manifest joined column-wise with
        the sweep columns; samples whose ``.sri`` member is missing keep NaN in
        the sweep columns. ``n_freq`` is the largest number of data lines found
        in any sweep (0 when no sweep could be read).
    """
    # Close the manifest member as soon as pandas has parsed it.
    with zip_file.open(manifest_fname) as manifest:
        df = pd.read_csv(manifest, sep=" ", index_col=0,
                         names=names, engine="python")

    n_cols = len(COLUMNS_2P)
    records = []  # one dict per sample whose sweep file was found
    n_freq = 0    # bound up front so the return value exists even if nothing is read

    for index in df.index:
        member = folder + "/" + str(int(index)) + ".sri"
        try:
            sweep = zip_file.open(member)
        except KeyError as e:
            # Listed in the manifest but absent from the archive: report it and
            # move on; the sample ends up with NaN sweep columns after the concat.
            print(e)
            continue

        record = {'ID': index}
        freq_idx = 0
        with sweep:
            for raw in sweep:
                line = raw.decode().strip()
                # Blank lines carry no data; '#' and '!' mark comment lines.
                if not line or line.startswith(('#', '!')):
                    continue
                token = line.split()
                if len(token) != n_cols:
                    print("Invalid header!")
                if len(token) >= n_cols:
                    values = [float(x) for x in token]
                    # zip() stops at n_cols, so surplus values are ignored.
                    for col, value in zip(COLUMNS_2P, values):
                        record[col + "_" + str(freq_idx)] = value
                # A row with too few values cannot be stored, but it still
                # occupies its frequency slot (left as NaN) so that later rows
                # are not shifted onto the wrong frequency index.
                freq_idx = freq_idx + 1

        records.append(record)
        n_freq = max(n_freq, freq_idx)

    if records:
        # Keys missing from a record (shorter sweep) become NaN in the frame.
        sweeps = pd.DataFrame(records).set_index("ID")
    else:
        # Nothing was read: an empty, column-less frame on a matching index type.
        sweeps = pd.DataFrame(index=df.index[:0])

    return (pd.concat((df, sweeps), axis=1), n_freq)



In [17]:
# One file per turn per frequency
save_data = True

data_folder = '../data/inductGR_350nm_200GHz/'
data_file = '../data/inductGR_350nm_200GHz.zip'

# Start from an empty output folder: anything left there by a previous run is deleted.
if os.path.exists(data_folder):
    shutil.rmtree(data_folder)
os.mkdir(data_folder)

# The archive is only needed while the two sample sets are parsed, so it is
# opened in a context manager and released as soon as loading is done.
with zipfile.ZipFile(data_file, mode="r") as zf_data:
    ind_TEST, freq_points_test = load(
        zf_data, "inductGR_350nm_200GHz/indGR_test",
        manifest_fname="inductGR_350nm_200GHz/input_samples_test.in",
        names=names_a)

    ind_TRAIN, freq_points_train = load(
        zf_data, "inductGR_350nm_200GHz/indGR_train",
        manifest_fname="inductGR_350nm_200GHz/input_samples_training.in",
        names=names_a)

# Initial SRF value for every sample.
# NOTE(review): the test set starts at 20e9 while the training set starts at
# 200e9 -- confirm that this difference is intended.
ind_TEST['SRF'] = [20e9]*len(ind_TEST)
ind_TRAIN['SRF'] = [200e9]*len(ind_TRAIN)

# Drop training samples with missing values, e.g. samples whose .sri file
# is absent from the archive. The test set is left untouched.
ind_TRAIN.dropna(inplace=True)


"There is no item named 'inductGR_350nm_200GHz/indGR_train/321.sri' in the archive"


In [18]:
def _lq_per_frequency(frame, n_points):
    """Call compute_all_lq once per frequency index and collect the results.

    For frequency index f the columns used are the COLUMNS_2P names with
    "_<f>" appended: the first column is passed as the first argument and the
    remaining columns, as one array, as the second argument.
    """
    results = []
    for f in range(n_points):
        c = [col+"_"+str(f) for col in COLUMNS_2P]
        results.append(compute_all_lq(frame[c[0]].values, frame[c[1:]].values))
    return results


# One compute_all_lq result per frequency point, for each data set.
lq_test = _lq_per_frequency(ind_TEST, freq_points_test)
lq_train = _lq_per_frequency(ind_TRAIN, freq_points_train)



In [19]:
def _assign_srf(frame, lq, n_points):
    """Overwrite 'SRF' in `frame` for rows whose lq value changes sign.

    For the r-th row, column 0 of lq[i] is scanned over i = 0 .. n_points - 1
    (presumably the inductance, going by the l_* names -- confirm against
    em.compute_all_lq). At the first index i where the value at index 0 is
    positive and the value at index i is negative, 'SRF' is set to the mean
    of that row's 'freq_<i>' and 'freq_<i-1>' and the scan stops. Rows
    without such an index keep their current 'SRF'.

    NOTE(review): the reference value is the one at the first frequency point;
    it is not updated to the previous point while scanning.
    """
    for r, index in enumerate(frame.index):
        l_first = lq[0][r, 0]
        for i in range(n_points):
            l_curr = lq[i][r, 0]
            if l_first > 0 and l_curr < 0:
                upper = frame.loc[index, 'freq_'+str(i)]
                lower = frame.loc[index, 'freq_'+str(i-1)]
                frame.loc[index, 'SRF'] = (upper + lower)/2
                break


_assign_srf(ind_TEST, lq_test, freq_points_test)
_assign_srf(ind_TRAIN, lq_train, freq_points_train)



In [20]:
# Split both data sets by number of turns (Nt) and by frequency index.
#
# dfs_train[f] / dfs_test[f] collect, for frequency index f, one frame per
# turn count (in increasing Nt order). Each frame holds the names_t[1:]
# columns plus the COLUMNS_2P columns of that frequency with the "_<f>"
# suffix removed. When save_data is set, every turn count also gets one zip
# archive per data set containing one CSV per frequency index.
MAX_TURNS = 5  # turn counts 1..MAX_TURNS are exported; other Nt values are left out

dfs_train = [[] for _ in range(freq_points_train)]
dfs_test = [[] for _ in range(freq_points_test)]

for turns in range(1, MAX_TURNS + 1):

    ind_nturn_TRAIN = ind_TRAIN[ind_TRAIN.Nt == turns]
    ind_nturn_TEST = ind_TEST[ind_TEST.Nt == turns]

    for f in range(freq_points_train):
        suffixed = [col+"_"+str(f) for col in COLUMNS_2P]
        c = names_t[1:] + suffixed
        # Map "<col>_<f>" back to "<col>" so all per-frequency tables share column names.
        rm = dict(zip(suffixed, COLUMNS_2P))

        dfs_train[f].append(ind_nturn_TRAIN[c].rename(columns=rm))
        # Test frames only exist for frequency indices below freq_points_test.
        if f < freq_points_test:
            dfs_test[f].append(ind_nturn_TEST[c].rename(columns=rm))

    if save_data:
        test_zip = data_folder + f'test_dataset_{turns}T.csv.zip'
        train_zip = data_folder + f'train_dataset_{turns}T.csv.zip'
        # Archives are created only when saving is enabled, and the context
        # managers finalise them even if a write fails part-way through.
        with zipfile.ZipFile(test_zip, mode="w", compression=zipfile.ZIP_DEFLATED) as zf_test, \
             zipfile.ZipFile(train_zip, mode="w", compression=zipfile.ZIP_DEFLATED) as zf_train:
            for f in range(freq_points_train):
                # The last frame appended for index f belongs to the current turn count.
                if f < freq_points_test:
                    zf_test.writestr("test_dataset_"+str(f)+".csv", dfs_test[f][-1].to_csv())
                zf_train.writestr("training_dataset_"+str(f)+".csv", dfs_train[f][-1].to_csv())

In [21]:
# One file per frequency
save_data = True

# df_allT_train[f] / df_allT_test[f]: all per-turn frames of frequency index f
# stacked into one frame, with an outer index level counting the frames from 1.
df_allT_train = [None]*freq_points_train
df_allT_test = [None]*freq_points_test

for f in range(freq_points_train):
    # The keys are derived from the number of frames being concatenated so
    # that they can never outnumber (or fall short of) the frames.
    df_allT_train[f] = pd.concat(dfs_train[f], keys=range(1, len(dfs_train[f]) + 1))

    # Test frames only exist for frequency indices below freq_points_test.
    if f < freq_points_test:
        df_allT_test[f] = pd.concat(dfs_test[f], keys=range(1, len(dfs_test[f]) + 1))

if save_data:
    test_zip = data_folder + 'test_dataset_allT.csv.zip'
    train_zip = data_folder + 'train_dataset_allT.csv.zip'
    # The context managers finalise both archives even if a write fails.
    with zipfile.ZipFile(test_zip, mode="w", compression=zipfile.ZIP_DEFLATED) as zf_test, \
         zipfile.ZipFile(train_zip, mode="w", compression=zipfile.ZIP_DEFLATED) as zf_train:
        for f in range(freq_points_train):
            zf_train.writestr("training_dataset_"+str(f)+".csv", df_allT_train[f].to_csv())
            if f < freq_points_test:
                zf_test.writestr("test_dataset_"+str(f)+".csv", df_allT_test[f].to_csv())

In [22]:
# One file per data set with every turn count and every frequency point:
# the per-frequency frames are stacked with the frequency index as the
# outermost index level and written to a single CSV (training set first).
# After the loop df_allFT holds the test-set frame.
for frames, n_points, out_name in (
    (df_allT_train, freq_points_train, 'train_dataset_allTF.csv.zip'),
    (df_allT_test, freq_points_test, 'test_dataset_allTF.csv.zip'),
):
    df_allFT = pd.concat(frames, keys=range(n_points))
    df_allFT.to_csv(data_folder + out_name)

###### Copyright (C) 2022 Instituto de Telecomunicações & IMSE CSIC