## Import packages

In [None]:
!pip install sdv
from sdv.tabular import GaussianCopula, CTGAN, CopulaGAN, TVAE
from sdv.evaluation import evaluate
import pandas as pd
import tensorflow as tf

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sdv
  Downloading sdv-0.15.0-py2.py3-none-any.whl (102 kB)
[K     |████████████████████████████████| 102 kB 4.4 MB/s 
Collecting graphviz<1,>=0.13.2
  Downloading graphviz-0.20-py3-none-any.whl (46 kB)
[K     |████████████████████████████████| 46 kB 4.5 MB/s 
Collecting deepecho<0.4,>=0.3.0.post1
  Downloading deepecho-0.3.0.post1-py2.py3-none-any.whl (26 kB)
Collecting copulas<0.8,>=0.7.0
  Downloading copulas-0.7.0-py2.py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 kB 1.7 MB/s 
Collecting rdt<0.7,>=0.6.2
  Downloading rdt-0.6.4-py2.py3-none-any.whl (45 kB)
[K     |████████████████████████████████| 45 kB 3.8 MB/s 
[?25hCollecting Faker<10,>=3.0.0
  Downloading Faker-9.9.1-py3-none-any.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 32.8 MB/s 
[?25hCollecting ctgan<0.6,>=0.5.1
  Downloading ctgan-0.5.1-py2.py3-none-any.whl (24 kB)
Col

## Load the data from CSV

In [None]:
def load_dataset(benchmark, algo):
  """Load one benchmark CSV and strip the columns that are not modelled.

  The file ``drive_location+algo+".csv"`` is read, a benchmark-specific set
  of columns is dropped, and every row that still contains a missing value
  is removed.

  Args:
    benchmark: benchmark suite name. "SPEC2006"/"SPEC2017" share one column
      list, "NPB" has its own, any other value uses the fallback list.
    algo: file stem of the CSV located under the notebook-level
      ``drive_location``.

  Returns:
    A new DataFrame with the selected columns and no NaN rows.
  """
  spec_columns = ['arch','ld_shared_by_cores','l2_shared_by_cores','no_of_threads','system_name','bus_speed_qpi','bus_speed_dmi','l1_ins_assoc','l1_data_assoc','l2_assoc','l3_assoc','raw_bus_speed','converted_bus_speed','ddr_type','runtime']
  npb_columns = ['sys','sysname','arch','l1d_assoc','l1d_cache_lines','l1d_shared_by_threads','l2_assoc','l2_cache_lines','l2_shared_by_threads','l3_assoc','l3_cache_lines','l3_shared_by_threads','PS','runtime','executable','system']
  fallback_columns = ['sys','arch','l1d_assoc','l1d_cache_lines','l1d_shared_by_threads','l2_assoc','l2_cache_lines','l2_shared_by_threads','l3_assoc','l3_cache_lines','l3_shared_by_threads','runtime']

  raw_frame = pd.read_csv(drive_location+algo+".csv")

  # Pick the drop-list that matches the benchmark suite.
  if benchmark in ("SPEC2006", "SPEC2017"):
    unwanted = spec_columns
  elif benchmark == "NPB":
    unwanted = npb_columns
  else:
    unwanted = fallback_columns

  # drop() raises if a listed column is absent, exactly like the per-branch
  # calls it replaces; dropna() then discards incomplete rows.
  return raw_frame.drop(columns=unwanted).dropna()

## Implementation of Vanilla GAN (Generator and Discriminator)

In [None]:
def make_generator(gentype='vanila'):
  """Return a generator network for the requested generator type.

  Args:
    gentype: 'vanila' builds the Keras model from ``vanila_model()``;
      'ctgan' is a placeholder that currently yields ``None``.

  Returns:
    The generator model for 'vanila', ``None`` for 'ctgan'.

  Raises:
    ValueError: if ``gentype`` is not one of the known generator types.
  """
  if gentype == 'vanila':
    return vanila_model()
  elif gentype == 'ctgan':
    # No hand-written CTGAN generator exists yet; keep the empty placeholder.
    return None
  raise ValueError("unknown generator type: %r" % (gentype,))
    

In [None]:
def vanila_model():
    """Build the fully connected generator network of the vanilla GAN.

    The layer sizes are derived from the column count of a notebook-level
    DataFrame named ``df``: the input width is twice that count and the
    sigmoid output layer has exactly that many units. Three bias-free ReLU
    layers of 7 units sit in between.

    NOTE(review): ``df`` is not defined in the cells visible here - confirm
    it exists before calling this function.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model.
    """
    n_features = df.shape[1]
    dense = tf.keras.layers.Dense

    generator = tf.keras.Sequential()
    # First hidden layer also declares the input width.
    generator.add(dense(7, activation='relu', use_bias=False, input_dim=n_features*2))
    for _ in range(2):
        generator.add(dense(7, activation='relu', use_bias=False))
    # One sigmoid output per feature column.
    generator.add(dense(n_features, activation='sigmoid', use_bias=False))
    return generator

## TVAE

In [None]:
def tvae_generator(tvae_model, real_data=None):
  """Fit ``tvae_model`` on the real data and sample 100 fake rows from it.

  Args:
    tvae_model: model object exposing ``fit(dataframe)`` and ``sample(n)``.
    real_data: optional DataFrame to train on. When omitted, the data is
      read from ``drive_location+algo+"_real_data.csv"`` using the
      notebook-level ``drive_location`` and ``algo`` (the legacy behaviour).

  Returns:
    Tuple ``(tvae_model, fake_data)``: the fitted model and the sampled rows.
  """
  if real_data is None:
    # Legacy path: depends on notebook globals rather than on the caller.
    real_data = pd.read_csv(drive_location+algo+"_real_data.csv")
  tvae_model.fit(real_data)
  fake_data = tvae_model.sample(100)
  return tvae_model, fake_data

In [None]:
def evolutionary_tvae(algo):
  """Generate fake data with TVAE for ``algo``, score it and save it.

  Reads ``drive_location+algo+"_real_data.csv"``, obtains fake rows from
  ``tvae_generator(TVAE())``, scores them against the real data with
  ``evaluate`` using the 'CSTest' and 'KSTest' metrics, writes the fake rows
  to ``drive_location+algo+"_tvae_fake_data.csv"`` and prints the score.

  Args:
    algo: dataset name used to build the CSV file names.
  """
  real_data = pd.read_csv(drive_location+algo+"_real_data.csv")
  # tvae_generator is given only the model here, so it loads the training
  # data on its own.
  tvae_model, fake_data = tvae_generator(TVAE())
  score = evaluate(fake_data, real_data, metrics=['CSTest', 'KSTest'])
  # Baseline for the (currently disabled) refinement loop; printed as-is.
  prev_score = 0
  fake_data.to_csv(drive_location+algo+"_tvae_fake_data.csv",index=False)
  # The label previously said "copulagan", which mislabelled the TVAE score.
  print("tvae fake vs real score=",score," prev_score=",prev_score)
  # A refinement loop (re-fit and re-score while the score keeps improving)
  # was sketched here but is intentionally left disabled.

## CopulaGAN

In [None]:
def copulagan_generator(copulagan_model, real_data=None):
  """Fit ``copulagan_model`` on the real data and sample 100 fake rows.

  Args:
    copulagan_model: model object exposing ``fit(dataframe)`` and
      ``sample(n)``.
    real_data: optional DataFrame to train on. When omitted, the data is
      read from ``drive_location+algo+"_real_data.csv"`` using the
      notebook-level ``drive_location`` and ``algo`` (the legacy behaviour).

  Returns:
    Tuple ``(copulagan_model, fake_data)``: the fitted model and the sampled
    rows.
  """
  if real_data is None:
    # Legacy path: depends on notebook globals rather than on the caller.
    real_data = pd.read_csv(drive_location+algo+"_real_data.csv")
  copulagan_model.fit(real_data)
  fake_data = copulagan_model.sample(100)
  return copulagan_model, fake_data

In [None]:
def evolutionary_copulagan(algo):
  """Generate fake data with CopulaGAN for ``algo``, score it and save it.

  Reads ``drive_location+algo+"_real_data.csv"``, obtains fake rows from
  ``copulagan_generator(CopulaGAN())``, scores them against the real data
  with ``evaluate`` using the 'CSTest' and 'KSTest' metrics, writes the fake
  rows to ``drive_location+algo+"_copulagan_fake_data.csv"`` and prints the
  score.

  Args:
    algo: dataset name used to build the CSV file names.
  """
  real_data = pd.read_csv(drive_location+algo+"_real_data.csv")
  # Use the CopulaGAN model: this previously instantiated GaussianCopula, so
  # the "copulagan" files and scores were really a second Gaussian copula run.
  copulagan_model, fake_data = copulagan_generator(CopulaGAN())
  score = evaluate(fake_data, real_data, metrics=['CSTest', 'KSTest'])
  # Baseline for the (currently disabled) refinement loop; printed as-is.
  prev_score = 0
  fake_data.to_csv(drive_location+algo+"_copulagan_fake_data.csv",index=False)
  print("copulagan fake vs real score=",score," prev_score=",prev_score)
  # A refinement loop (re-fit and re-score while the score keeps improving)
  # was sketched here but is intentionally left disabled.

## GaussianCopula

In [None]:
def gaussiancopula_generator(gaussiancopula_model, real_data=None):
  """Fit ``gaussiancopula_model`` on the real data and sample 100 fake rows.

  Args:
    gaussiancopula_model: model object exposing ``fit(dataframe)`` and
      ``sample(n)``.
    real_data: optional DataFrame to train on. When omitted, the data is
      read from ``drive_location+algo+"_real_data.csv"`` using the
      notebook-level ``drive_location`` and ``algo`` (the legacy behaviour).

  Returns:
    Tuple ``(gaussiancopula_model, fake_data)``: the fitted model and the
    sampled rows.
  """
  if real_data is None:
    # Legacy path: depends on notebook globals rather than on the caller.
    real_data = pd.read_csv(drive_location+algo+"_real_data.csv")
  gaussiancopula_model.fit(real_data)
  fake_data = gaussiancopula_model.sample(100)
  return gaussiancopula_model, fake_data

In [None]:
def evolutionary_gaussiancopula(algo):
  """Generate fake data with GaussianCopula for ``algo``, score and save it.

  Reads ``drive_location+algo+"_real_data.csv"``, obtains fake rows from
  ``gaussiancopula_generator(GaussianCopula())``, scores them against the
  real data with ``evaluate`` using the 'CSTest' and 'KSTest' metrics, writes
  the fake rows to ``drive_location+algo+"_gaussiancopula_fake_data.csv"``
  and prints the score.

  Args:
    algo: dataset name used to build the CSV file names.
  """
  reference = pd.read_csv(drive_location+algo+"_real_data.csv")
  # Only the sampled rows are needed; the fitted model is not used afterwards.
  _, generated = gaussiancopula_generator(GaussianCopula())
  current_score = evaluate(generated, reference, metrics=['CSTest', 'KSTest'])
  # Baseline for the (currently disabled) refinement loop; printed as-is.
  baseline_score = 0
  output_path = drive_location+algo+"_gaussiancopula_fake_data.csv"
  generated.to_csv(output_path,index=False)
  print("gaussiancopula fake vs real score=",current_score," prev_score=",baseline_score)
  # A refinement loop (re-fit and re-score while the score keeps improving)
  # was sketched here but is intentionally left disabled.

## CTGAN

In [None]:
def ctgan_generator(ctgan_model, real_data=None):
  """Fit ``ctgan_model`` on the real data and sample 100 fake rows from it.

  Args:
    ctgan_model: model object exposing ``fit(dataframe)`` and ``sample(n)``.
    real_data: optional DataFrame to train on. When omitted, the data is
      read from ``drive_location+algo+"_real_data.csv"`` using the
      notebook-level ``drive_location`` and ``algo`` (the legacy behaviour).

  Returns:
    Tuple ``(ctgan_model, fake_data)``: the fitted model and the sampled rows.
  """
  if real_data is None:
    # Legacy path: depends on notebook globals rather than on the caller.
    real_data = pd.read_csv(drive_location+algo+"_real_data.csv")
  ctgan_model.fit(real_data)
  fake_data = ctgan_model.sample(100)
  return ctgan_model, fake_data

In [None]:
def evolutionary_ctgan(algo):
  """Generate fake data with CTGAN for ``algo``, score it and save it.

  Reads ``drive_location+algo+"_real_data.csv"``, obtains fake rows from
  ``ctgan_generator(CTGAN())``, scores them against the real data with
  ``evaluate`` using the 'CSTest' and 'KSTest' metrics, writes the fake rows
  to ``drive_location+algo+"_ctgan_fake_data.csv"`` and prints the score.

  Args:
    algo: dataset name used to build the CSV file names.
  """
  reference = pd.read_csv(drive_location+algo+"_real_data.csv")
  # Only the sampled rows are needed; the fitted model is not used afterwards.
  _, generated = ctgan_generator(CTGAN())
  current_score = evaluate(generated, reference, metrics=['CSTest', 'KSTest'])
  # Baseline for the (currently disabled) refinement loop; printed as-is.
  baseline_score = 0
  output_path = drive_location+algo+"_ctgan_fake_data.csv"
  generated.to_csv(output_path,index=False)
  print("CTGAN fake vs real score=",current_score," prev_score=",baseline_score)
  # A refinement loop (re-fit and re-score while the score keeps improving)
  # was sketched here but is intentionally left disabled.

## Common Function to Generate Synthetic Data from Fake Data

In [None]:
def generate_synthetic_from_fake(algo, gan_name):
  """Snap every fake value to the closest value observed in the real data.

  Loads ``<algo>_<gan_name>_fake_data.csv`` and ``<algo>_real_data.csv`` from
  ``drive_location``. For each fake row and each column other than 'isa' and
  'mem_type', the fake value is replaced by the real-data value of that same
  column with the smallest absolute difference. The result is written to
  ``<algo>_<gan_name>_synthetic_data.csv`` and its ``evaluate`` score against
  the real data ('CSTest', 'KSTest') is printed.

  Args:
    algo: dataset name used to build the CSV file names.
    gan_name: generator tag used in the fake/synthetic file names.
  """
  file_prefix = drive_location+algo+"_"
  fake_data = pd.read_csv(file_prefix+gan_name+"_fake_data.csv")
  real_data = pd.read_csv(drive_location+algo+"_real_data.csv")

  # 'isa' and 'mem_type' are copied through untouched.
  snapped_columns = [name for name in fake_data.columns if name not in ('isa', 'mem_type')]

  synthetic_data = fake_data.copy()
  for row_label, fake_row in fake_data.iterrows():
    for name in snapped_columns:
      # Label of the real row whose value in this column is nearest.
      nearest_label = (real_data[name] - fake_row[name]).abs().idxmin()
      synthetic_data.loc[row_label, name] = real_data.loc[nearest_label, name]

  synthetic_data.to_csv(file_prefix+gan_name+"_synthetic_data.csv",index=False)
  similarity = evaluate(synthetic_data, real_data, metrics=['CSTest', 'KSTest'])
  print(gan_name,' synthetic vs real score',similarity)

## Main Code

In [None]:
# Folder on the mounted Google Drive that holds every input/output CSV.
drive_location = r"/content/drive/MyDrive/Summer_Internship/CodeFiles/"

# --- Single-run selection -------------------------------------------------
# Exactly one (benchmark, algo, algo_fname) triple should be active; the
# commented triples are alternative selections.
# Mantevo Suite
#benchmark = "Mantevo"
#algo="mantevo_miniFE"
#algo_fname="runtimes_final_mantevo_miniFE"
# NPB Suite
#benchmark = "NPB"
#algo = "npb_ep"
#algo_fname="runtimes_final_npb_ep"
#benchmark = "NPB"
#algo = "npb_mg"
#algo_fname="runtimes_final_npb_mg"
# SPEC 2006 Float
#benchmark = "SPEC2006"
#algo = "leslie3d"
#algo_fname = "437.leslie3d"
# SPEC 2006 Int
#benchmark = "SPEC2006"
#algo = "perlbench"
#algo_fname = "400.perlbench"
benchmark = "SPEC2017"
algo = "603.bwaves_s"
# NOTE(review): algo_fname is not read by any cell visible here.
algo_fname = "603.bwaves_s"

# --- Batch lists (benchmark suite / algo / file stem, index-aligned) --------
spec_float_algo_list = [
    "603.bwaves_s","607.cactuBSSN_s","619.lbm_s","621.wrf_s","627.cam4_s",
    "628.pop2_s","638.imagick_s","644.nab_s","649.fotonik3d_s","654.roms_s",
    "416.gamess","433.milc","434.zeusmp","435.gromacs","437.leslie3d",
    "447.dealII","450.soplex","453.povray","454.calculix",
]
spec_float_benchmark_list = ["SPEC2017"]*10 + ["SPEC2006"]*9
spec_float_algo_fname_list = list(spec_float_algo_list)

spec_int_algo_list = [
    "600.perlbench_s","602.gcc_s","605.mcf_s","620.omnetpp_s","623.xalancbmk_s",
    "625.x264_s","631.deepsjeng_s","641.leela_s","648.exchange2_s","657.xz_s",
    "401.bzip2","456.hmmer","458.sjeng","462.libquantum","473.astar",
]
spec_int_benchmark_list = ["SPEC2017"]*10 + ["SPEC2006"]*5
spec_int_algo_fname_list = list(spec_int_algo_list)

npb_other_algo_list = ["npb_ep","npb_mg","npb_sp","npb_sp-mz","matmul","montecarlo","quicksort"]
npb_other_benchmark_list = ["NPB"]*4 + ["OTHER"]*3
npb_other_algo_fname_list = list(npb_other_algo_list)

# Assign variable with which list is to be processed
# NOTE(review): these three lists are not iterated in the cells visible here;
# the loop header below is still commented out.
benchmark_list = npb_other_benchmark_list
algo_list = npb_other_algo_list
algo_fname_list = npb_other_algo_fname_list
#for algo_idx, algo in enumerate(algo_list):

pd.set_option("display.max_columns", None)

# Load and preprocess real dataset, then persist it for the generator cells.
real_data = load_dataset(benchmark, algo)
real_data.to_csv(drive_location+algo+"_real_data.csv",index=False)

# For every generator, in order GaussianCopula, CTGAN, CopulaGAN, TVAE:
#   1. generate fake data and save it;
#   2. convert the fake data into synthetic data (each fake feature value is
#      replaced by the nearest value of that feature found in the real data).
for gan_name, make_fake_data in (
    ('gaussiancopula', evolutionary_gaussiancopula),
    ('ctgan', evolutionary_ctgan),
    ('copulagan', evolutionary_copulagan),
    ('tvae', evolutionary_tvae),
):
  make_fake_data(algo)
  generate_synthetic_from_fake(algo, gan_name)

# Vanilla GAN
#generator_model = make_generator(gentype='vanila')
#print(generator_model)

#noise = tf.random.normal([1, 42])
#generated_data = generator_model(noise, training=False)
#print(generated_data)


gaussiancopula fake vs real score= 0.8089085125814354  prev_score= 0
gaussiancopula  synthetic vs real score 0.9164052756371108


  .fit(X)
  .fit(X)
  .fit(X)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten(

CTGAN fake vs real score= 0.8915900766999835  prev_score= 0
ctgan  synthetic vs real score 0.9359577935752532
copulagan fake vs real score= 0.8031908543418569  prev_score= 0
copulagan  synthetic vs real score 0.9072607723392673


  .fit(X)
  .fit(X)
  .fit(X)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten(

copulagan fake vs real score= 0.9182521563937891  prev_score= 0
tvae  synthetic vs real score 0.9402935892811435


In [None]:
# Mount Google Drive at /content/drive so that the CSV paths built from
# `drive_location` ("/content/drive/MyDrive/...") can be read and written.
# NOTE(review): this cell sits after the main cell that uses those paths; on a
# fresh kernel it has to be executed first.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
