In [22]:
# If restarting the script, ensure Runtime is set to "TPU".
import sys
from os import path
from multiprocessing import Pool
import numpy as np
import pandas as pd
from scipy.stats import gaussian_kde
from google.colab import files
import time as time

In [23]:
# Open Colab's upload widget; the call returns a dict mapping each uploaded
# filename to its raw bytes.
# NOTE(review): when a file with the same name already exists, the upload is
# saved as "<name> (1).csv" (see the "Saving ... to ..." lines in the output),
# so cells that read the plain name may still pick up the older copy - confirm
# the intended files are the ones being read.
files.upload()

Saving original.csv to original (1).csv
Saving paramX1wrtTime.csv to paramX1wrtTime (1).csv
Saving paramX2wrtTime.csv to paramX2wrtTime (1).csv


{'original.csv': b'paramX1,paramX2\r\n199.84,20.4856\r\n199.14,20.48414545\r\n199.96,20.48269091\r\n200.14,20.48123636\r\n199.94,20.4808\r\n199.66,20.47973019\r\n200.1,20.47820189\r\n199.96,20.47667358\r\n200.59,20.4754\r\n199.73,20.47474848\r\n200.61,20.47083939\r\n200.98,20.4669303\r\n200.27,20.46302121\r\n200.01,20.4635\r\n200.63,20.46801132\r\n200.61,20.47283208\r\n199.18,20.47765283\r\n199,20.4842\r\n200.06,20.49264\r\n199.88,20.49796\r\n199.31,20.50328\r\n199.85,20.5086\r\n200.01,20.51497736\r\n',
 'paramX1wrtTime.csv': b'TimeStamp,paramX1\r\n06/08/2015 12:33:00,199.84\r\n06/08/2015 12:33:30,199.14\r\n06/08/2015 12:34:00,199.96\r\n06/08/2015 12:34:30,200.14\r\n06/08/2015 12:35:00,199.94\r\n06/08/2015 12:35:30,199.66\r\n06/08/2015 12:36:00,200.1\r\n06/08/2015 12:36:30,199.96\r\n06/08/2015 12:37:00,200.59\r\n06/08/2015 12:37:30,199.73\r\n06/08/2015 12:38:00,200.61\r\n06/08/2015 12:38:30,200.98\r\n06/08/2015 12:39:00,200.27\r\n06/08/2015 12:39:30,200.01\r\n06/08/2015 12:40:00,200.63

In [24]:
# Task 1 - Define a function to compute the rate of change of a time series data
def find_rate(data=None):
    """Add a per-second rate-of-change column to a time series frame, in place.

    Column 0 of ``data`` is read as timestamps and column 1 as the measured
    values. Two columns are added to ``data``:

    * ``index_col`` - the timestamp as whole seconds (integer view of the
      timestamp floor-divided by 10**9)
    * ``rate`` - difference to the previous value divided by the difference
      to the previous ``index_col``, rounded to 5 decimals

    After the rate is computed, every missing value in the frame (including
    the undefined rate of the first row) is replaced with 0.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame to annotate. Column 0 is assumed to be ``datetime64[ns]``
        (the division by 10**9 only yields seconds for nanosecond data).

    Returns
    -------
    None
        ``data`` is modified in place. ``None`` is also returned, after a
        printed message, when no data is passed or when the argument does
        not provide the attributes used here (``AttributeError``).

    Raises
    ------
    Exception
        Any failure other than ``AttributeError`` is printed and re-raised.
    """
    if data is None:
        print("Error. Please pass time-dependent data.")
        return None

    try:
        # integer nanoseconds -> whole seconds
        seconds = data.iloc[:, 0].values.astype(np.int64) // 10**9
        data['index_col'] = seconds

        values = pd.Series(data.iloc[:, 1].values, index=seconds)
        # change in value over elapsed seconds; the first row has no
        # predecessor and is therefore NaN until the fill below
        rate = values.diff() / data['index_col'].diff().values
        data['rate'] = rate.values

        # NOTE: fills NaN in every column of the frame, not only in 'rate'
        data.fillna(0, inplace=True)
        data['rate'] = data['rate'].round(5)
    except AttributeError as error:
        # str() is required: concatenating the exception object itself to a
        # string raises TypeError and hides the original problem
        print("Attribute error: " + str(error))
    except Exception as error:
        print("Unexpected error: " + type(error).__name__ + ": " + str(error))
        raise


In [25]:
# Task 2 - Calculate rate of time-dependent parameters
# apply the above function on time series data files
FILENAMES = ["paramX1wrtTime.csv",
             "paramX2wrtTime.csv", # additional data file (add your own)
             ]

for file in FILENAMES:
    # the parameter name is the part of the file name before 'wrt'
    name = file.split('wrt')

    # print parameter name
    print(name[0])

    # get full file path
    filepath = path.join("./", file)

    # open the file
    data_file = pd.read_csv(filepath, header=0, index_col=False)

    print(data_file.dtypes)
    # Replace the timestamp column by label so it really becomes datetime64.
    # `data_file.iloc[:, 0] = ...` writes into the existing object-dtype column
    # on pandas >= 2.0, which leaves find_rate() without a datetime64 column.
    # NOTE(review): the default parser reads 06/08/2015 as month-first
    # (2015-06-08 in the printed output) - confirm that matches the data.
    time_col = data_file.columns[0]
    data_file[time_col] = pd.to_datetime(data_file[time_col])
    print(data_file.dtypes)

    # adds 'index_col' and 'rate' to data_file in place
    find_rate(data_file)
    print(data_file.head(5))

    # keep the parameter value and its rate; drop timestamp (0) and index_col (2)
    data_file = data_file.drop(columns=data_file.columns[[0, 2]])
    out_path = path.join("./", name[0] + "_rate.csv")
    # the row index is written too, as an unnamed first column
    data_file.to_csv(out_path)


paramX1
TimeStamp     object
paramX1      float64
dtype: object
TimeStamp    datetime64[ns]
paramX1             float64
dtype: object
            TimeStamp  paramX1   index_col     rate
0 2015-06-08 12:33:00   199.84  1433766780  0.00000
1 2015-06-08 12:33:30   199.14  1433766810 -0.02333
2 2015-06-08 12:34:00   199.96  1433766840  0.02733
3 2015-06-08 12:34:30   200.14  1433766870  0.00600
4 2015-06-08 12:35:00   199.94  1433766900 -0.00667
paramX2
TimeStamp     object
paramX2      float64
dtype: object
TimeStamp    datetime64[ns]
paramX2             float64
dtype: object
            TimeStamp    paramX2   index_col     rate
0 2015-06-08 12:33:00  20.485600  1433766780  0.00000
1 2015-06-08 12:33:30  20.484145  1433766810 -0.00005
2 2015-06-08 12:34:00  20.482691  1433766840 -0.00005
3 2015-06-08 12:34:30  20.481236  1433766870 -0.00005
4 2015-06-08 12:35:00  20.480800  1433766900 -0.00001


In [26]:
# List the working directory to check which CSV files are present.
!ls

'original (1).csv'	   paramX1wrtTime.csv	     sample_data
 original.csv		   paramX2_rate.csv	     simulated_rate.csv
 paramX1_rate.csv	  'paramX2wrtTime (1).csv'   simulation_final_all.csv
'paramX1wrtTime (1).csv'   paramX2wrtTime.csv


In [27]:
# Task 3 - Generate rate samples for each column
# simulate new values of rates using Pool's starmap function
filenames = ["paramX1_rate.csv",
             "paramX2_rate.csv", # add more files here
            ]

# initialize variables
samples = 10000 # number of rate samples required
simulated = {} # parameter name -> array of sampled rates

# The context manager terminates the worker processes on exit, including
# when a file is missing or the density estimate fails part-way through.
with Pool(processes=1) as pool: # define number of parallel processes required

    # Generate time series
    for file in filenames:

        # split filename to get parameter name
        name = file.split('_')

        # print parameter name
        print(name[0])

        # get full file path
        filepath = path.join("./", file)

        # read file; only the 'rate' column is used below
        data_file = pd.read_csv(filepath, header=0, index_col=False)

        # store 'rate' column as 'data'
        data = data_file['rate']

        # uniform random draws in [0, 1), one per requested sample
        values = np.random.rand(samples)

        # evenly spaced grid spanning the observed rates
        x_grid = pool.starmap(np.linspace, ((min(data), max(data), samples),))
        # kernel density estimate of the observed rates (Scott bandwidth)
        kde = pool.starmap(gaussian_kde, ((data, "scott"),))
        kdepdf = kde[0].evaluate(x_grid[0])

        # normalised cumulative sum of the density, used as a discrete CDF
        cdf = np.cumsum(kdepdf)
        cdf = cdf/cdf[-1]

        # map each uniform draw to the grid position where the CDF reaches it
        value_bins = pool.starmap(np.searchsorted, ((cdf, values),))
        random_rate = x_grid[0][value_bins[0]]
        simulated[name[0]] = np.asarray(random_rate)

# one column per parameter, built in a single step
rate_df = pd.DataFrame(simulated)

# write 'rate_df' to file
rate_df.to_csv("simulated_rate.csv", index=False)


paramX1
paramX2


In [28]:
# Tasks 4-5 - Define a function to simulate parameter values
def simulate_corr(temp):
    """
    Simulate original dataset, using rates and correlation. Pick a random
    column (X), simulate other column values based on rate of change of X and
    correlation between X and other columns. Repeat for samples count.

    Reads these module-level names (none of them is reassigned here):
    ``orig_df`` (original data), ``rate`` (rate samples, one column per
    parameter), ``sdev`` (per-column standard deviation), ``freq`` (factor
    applied to each sampled rate) and ``samples`` (rows to generate).

    Parameters
    ----------
    temp : any
        Label of the current run; only used in the progress message.

    Returns
    -------
    pandas.DataFrame
        ``samples`` rows with the columns of ``orig_df``, rounded to 3
        decimals. Each row continues from the previous one; the walk starts
        at the column means of ``orig_df``.
    """
    data = orig_df
    colms = data.columns.values.tolist() # list of columns
    start_vals = data.mean() # running value per column, starts at the mean
    data_corr = data.corr() # get correlation between all columns

    # Private generator seeded from OS entropy: forked Pool workers start with
    # a copy of the parent's global NumPy random state and would otherwise
    # replay the same sequence of draws in every worker.
    rng = np.random.default_rng()

    # hoisted out of the loops: the candidate rates of each column
    rate_pool = {col: rate[col].to_numpy() for col in colms}

    # rows are collected in a list and turned into a frame once at the end;
    # growing a DataFrame row by row is quadratic and DataFrame.append no
    # longer exists in pandas >= 2.0
    rows = []
    for i in range(samples):

        if (i+1) % 500 == 0:
            print("Simulated", (i+1), "of", samples, " samples in chunk:", temp)

        # reference column for this step
        randomCol = colms[int(rng.integers(len(colms)))]

        row = []
        for col in colms:
            start = start_vals[col]
            corr = data_corr.at[col, randomCol]
            candidates = rate_pool[col]
            delta = candidates[rng.integers(len(candidates))]
            nextValue = start + (sdev[col]/sdev[randomCol])*corr*delta*freq
            row.append(nextValue)
            start_vals[col] = nextValue
        rows.append(row)

    return pd.DataFrame(rows, columns=colms).round(3)


In [29]:
# Task 6 - Initialize variables to start the parallel simulation

# rate samples read back from disk
rate = pd.read_csv("./simulated_rate.csv", header=0, index_col=False)
print("Read rate file\n")

# empty placeholder frames
concat_df = pd.DataFrame()
backup_df = pd.DataFrame()

# original observations
orig_df = pd.read_csv("./original.csv", index_col=False)
print("Read original df\n")

sdev = orig_df.std()   # per-column standard deviation
runs = [*range(1, 11)] # run labels 1..10
samples = 1000         # rows generated by each simulate_corr call
freq = 30              # factor applied to each sampled rate in simulate_corr


Read rate file

Read original df



In [30]:
# Task 7 - Initiate and track the simulation using 2 parallel processes
start_time = time.time()
print("Simulating...\n")
# The context manager terminates the workers on exit, including when
# simulate_corr raises, so no processes are left behind in the kernel.
with Pool(2) as pool:
    # one call per entry of `runs`, handed to the workers five at a time
    sim_df = pool.map(simulate_corr, runs, chunksize=5)
print("Pooling took", time.time() - start_time, "sec to run")


Simulating...

Simulated 500 of 1000  samples in chunk: 1
Simulated 500 of 1000  samples in chunk: 6
Simulated 1000 of 1000  samples in chunk: 6
Simulated 1000 of 1000  samples in chunk: 1
Simulated 500 of 1000  samples in chunk: 7
Simulated 500 of 1000  samples in chunk: 2
Simulated 1000 of 1000  samples in chunk: 7
Simulated 1000 of 1000  samples in chunk: 2
Simulated 500 of 1000  samples in chunk: 8
Simulated 500 of 1000  samples in chunk: 3
Simulated 1000 of 1000  samples in chunk: 8
Simulated 1000 of 1000  samples in chunk: 3
Simulated 500 of 1000  samples in chunk: 9
Simulated 500 of 1000  samples in chunk: 4
Simulated 1000 of 1000  samples in chunk: 9
Simulated 1000 of 1000  samples in chunk: 4
Simulated 500 of 1000  samples in chunk: 10
Simulated 500 of 1000  samples in chunk: 5
Simulated 1000 of 1000  samples in chunk: 10
Simulated 1000 of 1000  samples in chunk: 5
Pooling took 11.059087991714478 sec to run


In [31]:
# Task 8 - Create the final dataframe containing a time column
# stack the per-run frames and renumber the rows 0..n-1
concat_df = pd.concat(sim_df, ignore_index=True)
print("Total number of samples = ", len(concat_df))

# One timestamp per simulated row (not one extra), spaced `freq` seconds apart
# so the time axis uses the same step as the simulation. A Timedelta is used
# instead of the '30S' alias, whose upper-case form is deprecated in recent
# pandas releases.
concat_df.index = pd.date_range('1/1/1700 12:00:00', periods=len(concat_df),
                                freq=pd.Timedelta(seconds=freq), name='Time')
concat_df.to_csv('simulation_final_all.csv')


Total number of samples =  10000


In [32]:
# Preview the first ten rows of the simulated, time-indexed frame.
concat_df.head(10)

Unnamed: 0_level_0,paramX1,paramX2
Time,Unnamed: 1_level_1,Unnamed: 2_level_1
1700-01-01 12:00:00,193.258,20.479
1700-01-01 12:00:30,194.498,20.484
1700-01-01 12:01:00,194.442,20.484
1700-01-01 12:01:30,193.98,20.484
1700-01-01 12:02:00,187.533,20.49
1700-01-01 12:02:30,187.481,20.49
1700-01-01 12:03:00,195.595,20.488
1700-01-01 12:03:30,199.934,20.493
1700-01-01 12:04:00,202.044,20.491
1700-01-01 12:04:30,198.734,20.493
