## Data preprocessing for LSTM

In [None]:
import pandas as pd
from sklearn.utils import shuffle

In [None]:
# Benchmark designs, in the order the ground-truth files are processed below.
vgraphs = [
    'adder', 'arbiter', 'bar', 'div', 'log2', 'max',
    'multiplier', 'sin', 'sqrt', 'square', 'voter',
]
# Short token prepended to every flow of the design at the same position in
# `vgraphs`.
new_name = ['a' + str(k) for k in (*range(10), 99)]

In [None]:
# Load the four synthesis-flow tables (flow_10 / 15 / 20 / 25); the CSV files
# carry no header row.
f10, f15, f20, f25 = (
    pd.read_csv('dataset-generation/flow_%d.csv' % flow_id, header=None)
    for flow_id in (10, 15, 20, 25)
)

In [None]:
# Per-design accumulators, kept as plain Python lists of DataFrames.
# `flow` is created alongside the other two, but none of the cells shown
# here append to it.
flow, area, delay = [], [], []

In [None]:
data_path = 'dataset-ground-truth/'

# Flow tables keyed by the number used in the matching ground-truth file
# names (`..._flow_<n>.csv`). The dict order fixes the stacking order, so the
# flows and the ground-truth values are stacked identically.
flow_tables = {10: f10, 15: f15, 20: f20, 25: f25}


def _load_ground_truth(metric, design_name):
    """Read and stack the ground-truth CSVs of one metric for one design.

    Args:
        metric: File-name prefix, 'area' or 'delay'.
        design_name: Design name as listed in `vgraphs`.

    Returns:
        DataFrame with the per-flow files stacked in `flow_tables` order and
        a fresh 0..n-1 index.
    """
    parts = [
        pd.read_csv(
            data_path + metric + '_ground_truth_' + design_name
            + '_flow_' + str(flow_id) + '.csv',
            header=None,
        )
        for flow_id in flow_tables
    ]
    return pd.concat(parts, ignore_index=True)


def _pair_and_shuffle(flows, values):
    """Put flows and their values side by side and shuffle the rows.

    Args:
        flows: Stacked flow strings, one row per flow.
        values: Stacked ground-truth values in the same row order.

    Returns:
        Two-column DataFrame with shuffled rows and a fresh 0..n-1 index.

    Raises:
        ValueError: If the two inputs do not have the same number of rows;
            a column-wise concat would otherwise pad the shorter one with NaN.
    """
    if len(flows) != len(values):
        raise ValueError(
            'flow rows (%d) and ground-truth rows (%d) differ'
            % (len(flows), len(values))
        )
    paired = pd.concat([flows, values], axis=1)
    return shuffle(paired).reset_index(drop=True)


for design_name, design_token in zip(vgraphs, new_name):
    # Prefix every flow with the design token. A scalar prefix adapts to
    # however many rows each flow table has, instead of relying on
    # hard-coded row counts that must match the CSV sizes.
    prefix = design_token + ';'
    local_flow = pd.concat(
        [prefix + table for table in flow_tables.values()],
        ignore_index=True,
    )

    # Area is loaded and shuffled before delay, one design at a time.
    local_area = _load_ground_truth('area', design_name)
    area.append(_pair_and_shuffle(local_flow, local_area))

    local_delay = _load_ground_truth('delay', design_name)
    delay.append(_pair_and_shuffle(local_flow, local_delay))

In [None]:
# `flow`, `area` and `delay` are Python lists, so assigning `.columns` on them
# raises AttributeError. Label the columns of each per-design DataFrame
# instead: the first column holds the flow string, the second the metric.
# `flow` holds no frames, so there is nothing to label for it.
for frame in area:
    frame.columns = ['flow', 'area']
for frame in delay:
    frame.columns = ['flow', 'delay']

In [None]:
# concatenate to form training dataset
# The first `n_seen_designs` designs contribute their first
# `train_rows_per_design` rows (already shuffled per design) to the training
# pool. Everything else goes to the held-out pool.
n_seen_designs = 6
n_designs = 11
train_rows_per_design = 110000

# One concat per output avoids re-copying the accumulated rows on every
# iteration and avoids seeding the result with an empty DataFrame.
area_train = pd.concat(
    [frame[:train_rows_per_design] for frame in area[:n_seen_designs]]
)
delay_train = pd.concat(
    [frame[:train_rows_per_design] for frame in delay[:n_seen_designs]]
)

# Held-out pool: leftover rows of the seen designs first, followed by every
# row of the unseen designs.
area_rest = pd.concat(
    [frame[train_rows_per_design:] for frame in area[:n_seen_designs]]
    + area[n_seen_designs:n_designs]
)
delay_rest = pd.concat(
    [frame[train_rows_per_design:] for frame in delay[:n_seen_designs]]
    + delay[n_seen_designs:n_designs]
)

In [None]:
# Shuffle each pool on its own, then stack the training pool ahead of the
# held-out pool so the two stay contiguous in the combined frame.
area_pools = [shuffle(area_train), shuffle(area_rest)]
lstm_area = pd.concat(area_pools)

delay_pools = [shuffle(delay_train), shuffle(delay_rest)]
lstm_delay = pd.concat(delay_pools)

In [None]:
# Label the two columns of each combined frame: the flow string and its metric.
for frame, metric in ((lstm_area, 'area'), (lstm_delay, 'delay')):
    frame.columns = ['flow', metric]

In [None]:
# Scale both targets by the same fixed factor, then print the largest scaled
# value of each as a quick range check.
scale = 5000
lstm_area['area'] = lstm_area['area'] / scale
lstm_delay['delay'] = lstm_delay['delay'] / scale
for frame, metric in ((lstm_area, 'area'), (lstm_delay, 'delay')):
    print(frame[metric].max())

In [None]:
# update the names in synthesis flows
# 'resub -z' must be rewritten before plain 'resub'; otherwise the shorter
# pattern would match first and leave 'rs -z' behind.
for frame in (lstm_delay, lstm_area):
    renamed = frame['flow'].str.replace('resub -z', 'rsz')
    frame['flow'] = renamed.str.replace('resub', 'rs')

In [None]:
# Write both combined tables into the working directory. No index is
# requested, so the files hold only the flow and metric columns.
for frame, csv_name in ((lstm_area, 'lstm_area.csv'), (lstm_delay, 'lstm_delay.csv')):
    frame.to_csv(csv_name, index=None)

## Split into train, valid, and test

In [None]:
### split into train, valid, and test
from sklearn.model_selection import train_test_split

# Input CSV to split, and the folder that receives train/valid/test CSVs.
raw_data_path = 'lstm_area.csv'
destination_folder = 'data_area'

# Despite the names, both values are passed as `train_size` to
# train_test_split below: 25% of all rows form the train+valid portion, and
# 80% of that portion becomes the final training set.
train_test_ratio = 0.25
train_valid_ratio = 0.80

# Maximum number of ';'-separated tokens that trim_string keeps per flow.
first_n_words = 26

def trim_string(x, n_words=None):
    """Keep the leading tokens of a ';'-separated flow, joined by spaces.

    Args:
        x: Flow string whose tokens are separated by ';'.
        n_words: Maximum number of tokens to keep. When omitted, the
            module-level `first_n_words` is used, so existing single-argument
            calls such as `Series.apply(trim_string)` behave as before.

    Returns:
        The first `n_words` tokens of `x` joined with single spaces.
    """
    if n_words is None:
        n_words = first_n_words
    # maxsplit stops scanning once enough tokens exist; the slice then drops
    # the unsplit remainder that split() leaves as the last element.
    tokens = x.split(';', maxsplit=n_words)
    return ' '.join(tokens[:n_words])

In [None]:
from pathlib import Path

# Read raw data
df_raw = pd.read_csv(raw_data_path)

# Keep at most first_n_words tokens per flow and turn ';' separators into spaces
df_raw['flow'] = df_raw['flow'].apply(trim_string)

# Train-test split. shuffle=False is deliberate: the split is positional, so
# the leading rows of the raw file become the train+valid portion.
# NOTE(review): this relies on the raw file listing the training pool before
# the held-out pool, and on the two ratios lining up with the pool sizes
# (0.25 * 0.80 = first 20% of rows end up in df_train) - confirm.
df_full_train, df_test = train_test_split(df_raw, train_size=train_test_ratio, shuffle=False)

# Train-valid split (positional as well)
df_train, df_valid = train_test_split(df_full_train, train_size=train_valid_ratio, shuffle=False)

# Write preprocessed data. The folder is created first because to_csv does
# not create missing directories.
output_dir = Path(destination_folder)
output_dir.mkdir(parents=True, exist_ok=True)
df_train.to_csv(output_dir / 'train.csv', index=False)
df_valid.to_csv(output_dir / 'valid.csv', index=False)
df_test.to_csv(output_dir / 'test.csv', index=False)