# Convert From The CDALab Format
This notebook is for internal use in our lab.

The data frame of the heterogeneous multivariate temporal data contains the variable ids and their values for each time point series at different timestamps.

The class of each series is represented by the variable id -1 and variable value of 1 or 0.
If the class is 1, an event of interest is created at the end of the entity.
Please, note that is not always correct to convert the class to an event of interest, only if the class represents an event that occurs at the end of the entity. 

**Input -** 

*INPUT_TEMPORAL_DB* contains the heterogeneous multivariate temporal data with these columns:

*   'EntityID' - series is the same as the entity
*   'TemporalPropertyID' - variable id 
*   'TimeStamp' - time 
*   'TemporalPropertyValue' - variable value

**Output -** 

*   *FINAL_INTERVAL_TRAIN* - contains a training set with STIs
*   *FINAL_INTERVAL_TEST* - same, for testing
*   *FINAL_KL_TRAIN* - same STIs for TIRPs discovery (the KarmaLego format)
*   *FINAL_RAW_TRAIN*  - contains a raw training set
*   *FINAL_RAW_TEST* - same, for testing
*   *FINAL_CLASS_TRAIN*  - contains the classes for the raw training set
*   *FINAL_CLASS_TEST* - same, for testing

# Import packages

In [36]:
import pandas as pd
import random
import csv

# CONSTANTS

In [60]:
# parameters
EVENT_OF_INTEREST_ID = 999.00
VAR_ID_FOR_CLASS = -1
SYMBOLS_PER_VARIABLE = 3
PREC_TO_TRAIN = 0.7

# file names
INPUT_TEMPORAL_DB = 'het_mul_tmp_conf_data.csv'
INPUT_PATTERNS = 'from_kl.csv'
FINAL_INTERVAL_TRAIN = 'sti_train.csv'
FINAL_INTERVAL_TEST = 'sti_test.csv'
FINAL_KL_TRAIN = 'to_kl.csv'
FINAL_RAW_TRAIN = 'raw_train.csv'
FINAL_RAW_TEST = 'raw_test.csv'
FINAL_CLASS_TRAIN = 'train_class.csv'
FINAL_CLASS_TEST = 'test_class.csv'
FINAL_PATTERNS_NAME = 'patterns.csv'

# column names 
OLD_ENTITY_COL_NAME = 'EntityID'
OLD_TEMPORAL_VAR_ID = 'TemporalPropertyID'
OLD_TEMPORAL_VAR_VAL = 'TemporalPropertyValue'
ENTITY_COL_NAME = 'SeriesID'
TEMPORAL_VAR_ID = 'VarID'
TEMPORAL_VAR_VAL = 'VarVal'
TIME_COL_NAME = 'TimeStamp'
SYMBOL_COL_NAME = 'Symbol'
SYMBOL_ID_COL_NAME = 'SymbolID'
INTERVAL_START_TIME = 'StartTime'
INTERVAL_END_TIME = 'EndTime'
CLASS_COL_NAME = 'Class'

In [3]:
df = pd.read_csv(INPUT_TEMPORAL_DB)
print(df.columns)
df.rename(columns = {OLD_ENTITY_COL_NAME:ENTITY_COL_NAME,
                     OLD_TEMPORAL_VAR_ID:TEMPORAL_VAR_ID,
                     OLD_TEMPORAL_VAR_VAL:TEMPORAL_VAR_VAL}, inplace = True)
df.head(5)

Index(['EntityID', 'TemporalPropertyID', 'TimeStamp', 'TemporalPropertyValue'], dtype='object')


Unnamed: 0,SeriesID,VarID,TimeStamp,VarVal
0,5,44,6,1.0
1,5,44,7,1.0
2,5,44,8,1.0
3,5,44,9,1.1
4,5,44,10,1.1


In [4]:
df.dtypes

SeriesID       int64
VarID          int64
TimeStamp      int64
VarVal       float64
dtype: object

get all unique series ids

In [5]:
series_id_set = df[ENTITY_COL_NAME].unique()
series_id_set

array([   5,    6,    8, ...,  626, 1872, 1297])

# Create an event and reset the series index
This code resets the series indexes. The first one gets 1 the second 2 and so on...

This code also takes the class of the series and creates an event at the maximum timestamp


In [6]:
series_index = 1 # used to reset the series indexes 
dict_to_df = {
  ENTITY_COL_NAME: [],
  TEMPORAL_VAR_ID: [],
  TIME_COL_NAME: [],
  TEMPORAL_VAR_VAL: []
}

for s_id in series_id_set:  # iterate over the different time point series
  seires_df = df[df[ENTITY_COL_NAME]==s_id]  # creates a df for this series
  max_ts = seires_df[TIME_COL_NAME].max()  # gets the maximum time stamp

  for index, row in seires_df.iterrows():  # iterates over the time points
    row_var_id = row[TEMPORAL_VAR_ID]
    row_var_val = row[TEMPORAL_VAR_VAL]
    if row_var_id == VAR_ID_FOR_CLASS:  # if it's the class variable
      if row_var_val == 1.0:  # if the class is positive
        dict_to_df[ENTITY_COL_NAME].append(series_index)
        dict_to_df[TEMPORAL_VAR_ID].append(EVENT_OF_INTEREST_ID)
        dict_to_df[TIME_COL_NAME].append(max_ts)
        dict_to_df[TEMPORAL_VAR_VAL].append(row[TEMPORAL_VAR_VAL])
    else:
      dict_to_df[ENTITY_COL_NAME].append(series_index)
      dict_to_df[TEMPORAL_VAR_ID].append(row_var_id)
      dict_to_df[TIME_COL_NAME].append(row[TIME_COL_NAME])
      dict_to_df[TEMPORAL_VAR_VAL].append(row[TEMPORAL_VAR_VAL])

  series_index += 1  # increase for the next series (i.e., entity)

df_with_event = pd.DataFrame.from_dict(dict_to_df)

In [7]:
df_with_event.head(5)

Unnamed: 0,SeriesID,VarID,TimeStamp,VarVal
0,1,44.0,6.0,1.0
1,1,44.0,7.0,1.0
2,1,44.0,8.0,1.0
3,1,44.0,9.0,1.1
4,1,44.0,10.0,1.1


gets all unique variables ids

In [8]:
var_id_set = df_with_event[TEMPORAL_VAR_ID].unique()
var_id_set

array([ 44.,   1.,   2.,   4.,   5.,   6.,  40.,  41.,  42.,  43.,  39.,
         3., 999.])

# Reset variable ID and State Abstraction with Equal Frequency discretization

In [9]:
var_index = 1
symbol_index = 1  # for the symbol id

dict_to_df = {
  ENTITY_COL_NAME: [],
  TEMPORAL_VAR_ID: [],
  TIME_COL_NAME: [],
  TEMPORAL_VAR_VAL: [],
  SYMBOL_COL_NAME: []
}

for v_id in var_id_set:  # iterate over the temporal variables
  var_df = df_with_event[df_with_event[TEMPORAL_VAR_ID]==v_id]   # create df from each variable
  if v_id != EVENT_OF_INTEREST_ID:  # state abstraction is not relevant for the event variable
    var_df[TEMPORAL_VAR_ID] = var_index
    var_index += 1
    var_df[SYMBOL_COL_NAME]= pd.qcut(x=var_df[TEMPORAL_VAR_VAL].rank(method='first'), q=SYMBOLS_PER_VARIABLE, labels=[symbol_index,symbol_index+1,symbol_index+2])
    symbol_index += SYMBOLS_PER_VARIABLE
  
  for index, row in var_df.iterrows():  # write the new df with symbols to the dict
    row_var_id = row[TEMPORAL_VAR_ID]
    row_var_val = row[TEMPORAL_VAR_VAL]
    dict_to_df[ENTITY_COL_NAME].append(row[ENTITY_COL_NAME])
    dict_to_df[TEMPORAL_VAR_ID].append(row_var_id)
    dict_to_df[TIME_COL_NAME].append(row[TIME_COL_NAME])
    dict_to_df[TEMPORAL_VAR_VAL].append(row[TEMPORAL_VAR_VAL])
    if v_id != EVENT_OF_INTEREST_ID:
      dict_to_df[SYMBOL_COL_NAME].append(row[SYMBOL_COL_NAME])
    else: 
      dict_to_df[SYMBOL_COL_NAME].append(EVENT_OF_INTEREST_ID)
df_with_symbols = pd.DataFrame.from_dict(dict_to_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  var_df[TEMPORAL_VAR_ID] = var_index
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  var_df[SYMBOL_COL_NAME]= pd.qcut(x=var_df[TEMPORAL_VAR_VAL].rank(method='first'), q=SYMBOLS_PER_VARIABLE, labels=[symbol_index,symbol_index+1,symbol_index+2])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  var_df[TEM

In [10]:
df_with_symbols.head(5)

Unnamed: 0,SeriesID,VarID,TimeStamp,VarVal,Symbol
0,1.0,1.0,6.0,1.0,3.0
1,1.0,1.0,7.0,1.0,3.0
2,1.0,1.0,8.0,1.0,3.0
3,1.0,1.0,9.0,1.1,3.0
4,1.0,1.0,10.0,1.1,3.0


# Create Raw Data
Standartize the data for each temporal variable and then merge the dataframes

In [11]:
var_id_set = df_with_symbols[TEMPORAL_VAR_ID].unique()
var_id_set

array([  1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,  10.,  11.,
        12., 999.])

In [12]:
raw_df_with_symbols = df_with_symbols[[ENTITY_COL_NAME, TEMPORAL_VAR_ID, TIME_COL_NAME, TEMPORAL_VAR_VAL]]
new_raw_df = []
for v_id in var_id_set:
  raw_var_df = raw_df_with_symbols[raw_df_with_symbols[TEMPORAL_VAR_ID]==v_id]
  raw_var_df[TEMPORAL_VAR_VAL] = (raw_var_df[TEMPORAL_VAR_VAL] - raw_var_df[TEMPORAL_VAR_VAL].mean()) / raw_var_df[TEMPORAL_VAR_VAL].std()
  new_raw_df.append(raw_var_df)

merged_raw_df = pd.concat(new_raw_df)
merged_raw_df.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  raw_var_df[TEMPORAL_VAR_VAL] = (raw_var_df[TEMPORAL_VAR_VAL] - raw_var_df[TEMPORAL_VAR_VAL].mean()) / raw_var_df[TEMPORAL_VAR_VAL].std()


Unnamed: 0,SeriesID,VarID,TimeStamp,VarVal
0,1.0,1.0,6.0,0.159086
1,1.0,1.0,7.0,0.159086
2,1.0,1.0,8.0,0.159086
3,1.0,1.0,9.0,0.393966
4,1.0,1.0,10.0,0.393966


In [13]:
symbol_set = df_with_symbols[SYMBOL_COL_NAME].unique()
symbol_set

array([  3.,   2.,   1.,   4.,   5.,   6.,   8.,   7.,   9.,  12.,  10.,
        11.,  13.,  15.,  14.,  18.,  16.,  17.,  20.,  21.,  19.,  23.,
        24.,  22.,  26.,  25.,  27.,  30.,  29.,  28.,  33.,  32.,  31.,
        36.,  34.,  35., 999.])

In [14]:
series_id_set_new = df_with_symbols[ENTITY_COL_NAME].unique()
series_id_set_new

array([1.000e+00, 2.000e+00, 3.000e+00, ..., 1.484e+03, 1.512e+03,
       1.563e+03])

# Concat to intervals
for each symbol, if the time interval is continuous it should be concatenated to an interval

In [15]:
dict_to_df = {
  ENTITY_COL_NAME: [],
  TEMPORAL_VAR_ID: [],
  SYMBOL_ID_COL_NAME: [],
  INTERVAL_START_TIME: [],
  INTERVAL_END_TIME: [],
}
for s_id in series_id_set_new:
  for sym_id in symbol_set:
    # for each symbol id of a series
    df_s_sym = df_with_symbols[(df_with_symbols[ENTITY_COL_NAME]==s_id) & (df_with_symbols[SYMBOL_COL_NAME]==sym_id)].sort_values(by=TIME_COL_NAME, ascending=True)

    if len(df_s_sym) == 0:  # no time points with this symbols
      continue

    start_interval_time = None
    prev_interval_time = None
    for index, row in df_s_sym.iterrows():  # concatination process
      curr_time = row[TIME_COL_NAME]
      if prev_interval_time == None:  # no interval exist yet
        start_interval_time = curr_time
        prev_interval_time = curr_time
      elif curr_time == prev_interval_time + 1:  # occuring interval
        prev_interval_time = curr_time
      else: # finished interval
        if start_interval_time == prev_interval_time:
          prev_interval_time += 1
        dict_to_df[ENTITY_COL_NAME].append(row[ENTITY_COL_NAME])
        dict_to_df[TEMPORAL_VAR_ID].append(row[TEMPORAL_VAR_ID])
        dict_to_df[SYMBOL_ID_COL_NAME].append(row[SYMBOL_COL_NAME])
        dict_to_df[INTERVAL_START_TIME].append(start_interval_time)
        dict_to_df[INTERVAL_END_TIME].append(prev_interval_time)
        
        # init again
        start_interval_time = curr_time
        prev_interval_time = curr_time

    # for the last point
    if start_interval_time == prev_interval_time:
      prev_interval_time += 1
    if start_interval_time == -1:
      break
    dict_to_df[ENTITY_COL_NAME].append(row[ENTITY_COL_NAME])
    dict_to_df[TEMPORAL_VAR_ID].append(row[TEMPORAL_VAR_ID])
    dict_to_df[SYMBOL_ID_COL_NAME].append(row[SYMBOL_COL_NAME])
    dict_to_df[INTERVAL_START_TIME].append(start_interval_time)
    dict_to_df[INTERVAL_END_TIME].append(prev_interval_time)

df_c = pd.DataFrame.from_dict(dict_to_df)

# Split to train and test

In [16]:
entities_with_event = df_c[df_c[SYMBOL_ID_COL_NAME]==EVENT_OF_INTEREST_ID][ENTITY_COL_NAME].unique()
entities_wihout_event = list(set(df_c[ENTITY_COL_NAME])-set(entities_with_event))

In [17]:
entities_with_event = sorted(entities_with_event, key = lambda x: random.random())
entities_wihout_event = sorted(entities_wihout_event, key = lambda x: random.random())

number of entities with and without the event

In [18]:
len(entities_with_event), len(entities_wihout_event)

(235, 1495)

entities with and without the event in the training set

In [19]:
with_event_to_train = int(len(entities_with_event) * PREC_TO_TRAIN)
without_event_to_train = int(len(entities_wihout_event) * PREC_TO_TRAIN)

with_event_to_train, without_event_to_train

(164, 1046)

define list of entities with and without events and create new train df and test df

In [20]:
train_entities = entities_with_event[:with_event_to_train] + entities_wihout_event[:without_event_to_train]
test_entities = entities_with_event[with_event_to_train:] + entities_wihout_event[without_event_to_train:]

In [21]:
train_df = df_c[df_c[ENTITY_COL_NAME].isin(train_entities)].reset_index(drop=True)
test_df = df_c[df_c[ENTITY_COL_NAME].isin(test_entities)].reset_index(drop=True)

In [22]:
train_df.to_csv(FINAL_INTERVAL_TRAIN, index=False)
test_df.to_csv(FINAL_INTERVAL_TEST, index=False)

# Write train to KarmaLego format
This script creates an input file for KarmaLego only from the entities with the event

In [23]:
train_entities_with_events = entities_with_event[:with_event_to_train]
train_with_event_df = df_c[df_c[ENTITY_COL_NAME].isin(train_entities_with_events)].reset_index(drop=True)

In [24]:
entities_num = len(set(train_with_event_df[ENTITY_COL_NAME]))
list_to_write = ['startToncepts', f'numberOfEntities,{entities_num}']
entities_set = set(train_with_event_df[ENTITY_COL_NAME])
for entity in entities_set:
    list_to_write.append(f'{int(entity)},{int(entity)};')
    entity_df = train_with_event_df[train_with_event_df[ENTITY_COL_NAME] == entity]
    entity_df = entity_df.sort_values(by=[INTERVAL_START_TIME,
                                          INTERVAL_END_TIME,
                                          SYMBOL_ID_COL_NAME])
    intervals_entity = ''
    for index, row in entity_df.iterrows():
        s_time = int(row[INTERVAL_START_TIME])
        e_time = int(row[INTERVAL_END_TIME])
        s_id = int(row[SYMBOL_ID_COL_NAME])
        p_id = int(row[TEMPORAL_VAR_ID])
        intervals_entity += f'{s_time},{e_time},{s_id},{p_id};'
    list_to_write.append(intervals_entity)
with open(FINAL_KL_TRAIN, 'w') as f:
    for item in list_to_write:
        f.write("%s\n" % item)

check there is no overlap in the series

In [25]:
set(train_df[ENTITY_COL_NAME].unique()).intersection(set(test_df[ENTITY_COL_NAME].unique()))

set()

# Write raw data with the class

In [26]:
class_df = merged_raw_df[merged_raw_df[TEMPORAL_VAR_ID]==EVENT_OF_INTEREST_ID]
no_class = merged_raw_df[merged_raw_df[TEMPORAL_VAR_ID]!=EVENT_OF_INTEREST_ID]
raw_train = no_class[no_class[ENTITY_COL_NAME].isin(train_entities)].reset_index(drop=True)
raw_test = no_class[no_class[ENTITY_COL_NAME].isin(test_entities)].reset_index(drop=True)
raw_train.to_csv(FINAL_RAW_TRAIN, index=False)
raw_test.to_csv(FINAL_RAW_TEST, index=False)

In [27]:
train_entities_with = entities_with_event[:with_event_to_train]
train_entities_without = entities_wihout_event[:without_event_to_train]
test_entities_with = entities_with_event[with_event_to_train:]
test_entities_without = entities_wihout_event[without_event_to_train:]

create the y classes for train and then test

In [28]:
dict_y_train = {
    ENTITY_COL_NAME: [],
    CLASS_COL_NAME: []
}
for t in train_entities_with:
  dict_y_train[ENTITY_COL_NAME].append(t)
  dict_y_train[CLASS_COL_NAME].append(1)
for t in train_entities_without:
  dict_y_train[ENTITY_COL_NAME].append(t)
  dict_y_train[CLASS_COL_NAME].append(0)

y_train_df= pd.DataFrame.from_dict(dict_y_train)
y_train_df.to_csv(FINAL_CLASS_TRAIN, index=False)

In [29]:
dict_y_test = {
    ENTITY_COL_NAME: [],
    CLASS_COL_NAME: []
}
for t in test_entities_with:
  dict_y_test[ENTITY_COL_NAME].append(t)
  dict_y_test[CLASS_COL_NAME].append(1)
for t in test_entities_without:
  dict_y_test[ENTITY_COL_NAME].append(t)
  dict_y_test[CLASS_COL_NAME].append(0)

y_test_df= pd.DataFrame.from_dict(dict_y_test)
y_test_df.to_csv(FINAL_CLASS_TEST, index=False)

# Create patterns files

In [58]:
pat_dict = {'STIs': [], 'TempRels': [], 'VerSupp': [], 'HorSupp': []}
tot_entities = None
frs_row = True
with open(INPUT_PATTERNS, 'r') as file:
  csvreader = csv.reader(file)
  for row in csvreader:
    if frs_row:
      tot_entities = int(row[0].split(';')[-1].split('=')[-1])
      print(tot_entities)
      frs_row = False
    else:
      print(row)
      row_val = row[0].split(' ')
      stis = row_val[1].split('-')[:-1]
      pat_dict['STIs'].append(stis)
      rels = row_val[2].split('.')[:-1]
      pat_dict['TempRels'].append(rels)
      vs = int(row_val[3])/tot_entities
      pat_dict['VerSupp'].append(vs)
      hs = float(row_val[4])
      pat_dict['HorSupp'].append(hs)

164
['1 1-  82 1.2317073170731707 1538 [14-29] 1541 [25-29] 1550 [24-32] 1553 [10-32] 1556 [13-16] 1556 [23-29] 1558 [5-33] 1559 [12-33] 1560 [1-33] 1561 [7-34] 1562 [25-34] 1564 [25-34] 1565 [3-34] 1568 [35-36] 1569 [13-35] 1573 [19-20] 1573 [26-32] 1573 [36-37] 1577 [24-31] 1579 [5-37] 1582 [27-37] 1591 [15-30] 1594 [24-27] 1596 [13-25] 1596 [29-39] 1606 [38-42] 1608 [32-42] 1611 [8-9] 1611 [13-20] 1611 [22-42] 1616 [13-19] 1616 [27-31] 1616 [36-41] 1617 [20-28] 1620 [11-12] 1620 [25-28] 1620 [37-44] 1621 [8-26] 1621 [29-44] 1623 [8-24] 1623 [31-44] 1624 [37-44] 1625 [36-44] 1626 [12-37] 1628 [20-24] 1631 [25-45] 1632 [17-28] 1632 [42-45] 1639 [3-46] 1640 [5-46] 1641 [15-46] 1643 [26-47] 1646 [19-47] 1649 [13-29] 1650 [2-48] 1651 [10-48] 1653 [25-48] 1654 [12-49] 1655 [36-49] 1656 [5-49] 1660 [18-50] 1669 [27-33] 1672 [8-53] 1673 [11-32] 1674 [5-53] 1675 [3-53] 1684 [12-34] 1685 [34-42] 1687 [11-12] 1688 [38-57] 1690 [24-57] 1693 [10-58] 1694 [12-58] 1695 [40-58] 1699 [26-56] 1700 [2

In [61]:
pat_df = pd.DataFrame.from_dict(pat_dict)
pat_df.to_csv(FINAL_PATTERNS_NAME, index=False)
pat_df

Unnamed: 0,STIs,TempRels,VerSupp,HorSupp
0,[1],[],0.500000,1.231707
1,[2],[],0.652439,1.532710
2,[3],[],0.554878,1.406593
3,[4],[],0.457317,2.946667
4,[5],[],0.579268,4.242105
...,...,...,...,...
387,"[33, 27, 999]","[<, <, m]",0.274390,1.355556
388,"[33, 27]",[f],0.219512,1.000000
389,"[33, 30]",[<],0.207317,1.323529
390,"[33, 999]",[<],0.445122,1.315068
