In [1]:
import numpy as np
import pandas as pd
import itertools
import pickle as pickle
from collections import Counter

In [2]:
# Load the case description table: a semicolon-delimited CSV whose rows are
# indexed by the 'Case Number' column.
raw_case_description = pd.read_csv(
    'data/case_vectors.csv',
    delimiter=';',
    index_col='Case Number',
)
raw_case_description.head()

Unnamed: 0_level_0,Skill:Reach,Skill:Fixate,Skill:Push,Skill:Slide,Skill:Pick&Place,Skill:Orient,Condition:Moving_Target,Condition:With_Collision,Condition:Keep_in_Position,Action:Continious,Action:Discrete,Observation:Scalar,Observation:Visual(2D),Condition:Not_fully_observable)
Case Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,1,0,0,0,0,0,1,0,0,4,0,26,0,0
2,1,0,0,0,0,0,0,0,0,0,2,1,0,0
3,0,1,0,0,0,0,0,0,1,2,0,8,0,0
4,1,0,0,0,0,0,0,0,0,0,4,0,1,0
5,1,0,1,0,0,0,0,0,0,0,6,70,1,0


In [3]:
# Column labels of the raw table, in their original order
col_name = raw_case_description.columns.tolist()

# One integer code per column label, counting upwards from 5
name_int = [5 + offset for offset in range(len(col_name))]

# Word index: maps every column label to its integer code
col_values_dict = {label: code for label, code in zip(col_name, name_int)}
col_values_dict

{'Skill:Reach': 5,
 'Skill:Fixate': 6,
 'Skill:Push': 7,
 'Skill:Slide': 8,
 'Skill:Pick&Place': 9,
 'Skill:Orient': 10,
 'Condition:Moving_Target': 11,
 'Condition:With_Collision': 12,
 'Condition:Keep_in_Position': 13,
 'Action:Continious': 14,
 'Action:Discrete': 15,
 'Observation:Scalar': 16,
 'Observation:Visual(2D)': 17,
 'Condition:Not_fully_observable)': 18}

In [4]:
# Swap every column label for the integer code assigned to it in col_values_dict
case_description_col_values = raw_case_description.rename(columns=col_values_dict)
case_description_col_values.head()

Unnamed: 0_level_0,5,6,7,8,9,10,11,12,13,14,15,16,17,18
Case Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,1,0,0,0,0,0,1,0,0,4,0,26,0,0
2,1,0,0,0,0,0,0,0,0,0,2,1,0,0
3,0,1,0,0,0,0,0,0,1,2,0,8,0,0
4,1,0,0,0,0,0,0,0,0,0,4,0,1,0
5,1,0,1,0,0,0,0,0,0,0,6,70,1,0


In [5]:
# Raw cell values of the dataframe as a NumPy array (one row per case)
df_values = case_description_col_values.values

# Transposed copy of the view: one row per column, so that each row can later
# be mapped to the integer code assigned to that column
transposed_df_values = np.transpose(df_values)
transposed_df_values

array([[ 1,  1,  0,  1,  1,  1,  1,  1,  1,  1,  0,  1,  1,  0,  1,  1,
         0,  1,  1,  1,  1,  1,  1,  0,  0,  1,  0,  1,  0,  1,  1,  1,
         0,  0,  0,  1,  1,  1,  1,  0,  0,  1],
       [ 0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  1,  0,  0,
         1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,
         0,  1,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,
         0,  0,  1,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  1,  0,  0,  0,
         1,  0,  0,  0,  0,  0,  0,  1,  1,  0],
       [ 0,  0,  0,  0,  0,  0,  0,

In [6]:
def replace_list_elements(array_2D, start=5):
    '''
    Replace every non-zero element of each row with that row's integer code.

    Row 0 receives the code ``start``, row 1 receives ``start + 1`` and so on;
    zeros are left untouched. With the default ``start=5`` the codes match the
    numbering used when col_values_dict is built.

    Parameters
    ----------
    array_2D : 2D array or sequence of sequences
        One row per column of the original table (i.e. the transposed values).
    start : int, optional
        Code assigned to the first row (default 5).

    Returns
    -------
    list of numpy.ndarray
        One array per input row, holding the row's code where the input was
        non-zero and the original (zero) value elsewhere.
    '''
    replaced_element_list = []

    for code, row in enumerate(array_2D, start=start):
        # Coerce to an array first: comparing a plain Python list with 0
        # yields a single boolean instead of an element-wise mask, which
        # would overwrite the zeros as well.
        row = np.asarray(row)
        replaced_element_list.append(np.where(row != 0, code, row))

    return replaced_element_list

In [7]:
# Apply the replace_list_elements function, then transpose the result back to
# the original orientation
replaced_integer_values = replace_list_elements(transposed_df_values)
new_df_values = np.transpose(np.array(replaced_integer_values))

In [8]:
def repeat_elem(list_1, list_2):
    '''
    Expand list_1 by repeating each element according to list_2.

    Input = two lists (list_1, list_2), paired position by position
    Output = a flat list in which every element of list_1 appears as many
             times as the number at the same position in list_2
             (a count of 0 drops the element)
    '''
    paired = zip(list_1, list_2)
    runs = (itertools.repeat(value, count) for value, count in paired)
    return list(itertools.chain.from_iterable(runs))

In [9]:
def create_repeated_list(list_of_lists, repeat_ref):
    '''
    Apply repeat_elem row by row.

    Each list in list_of_lists is paired with the list at the same position in
    repeat_ref, and its elements are repeated as many times as the
    corresponding reference values.

    Elements whose reference count is zero are repeated zero times, so they do
    not appear in the output.
    '''
    return [
        repeat_elem(values, counts)
        for values, counts in zip(list_of_lists, repeat_ref)
    ]

In [10]:
# Repeat each code in new_df_values as many times as the matching entry of df_values
repeated_df_values = create_repeated_list(
    list_of_lists=new_df_values,
    repeat_ref=df_values,
)

In [11]:
# Collect the repeated values in a dataframe (one row per case) and fill the
# missing cells with 0
encoded_df = (
    pd.DataFrame(data=repeated_df_values)
    .fillna(value=0)
)
encoded_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,74,75,76,77,78,79,80,81,82,83
0,5,11,14,14,14.0,14.0,16.0,16.0,16.0,16.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5,15,15,16,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,6,13,14,14,16.0,16.0,16.0,16.0,16.0,16.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5,15,15,15,15.0,17.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,7,15,15,15.0,15.0,15.0,15.0,16.0,16.0,...,16.0,16.0,16.0,16.0,17.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Full contents of encoded_df as a NumPy array, followed by its dimensions
encoded_cases = encoded_df.values
print(encoded_cases.shape)

(42, 84)


In [13]:
# Integer-typed copy of every encoded case row
encoded_cases_int = [case_row.astype(int) for case_row in encoded_cases]

np.shape(encoded_cases_int)

(42, 84)

In [14]:
#Creating a dictionary where each key is the case no. and the key value is the respective case encoding
encoded_case_vector_dict = {}
for x in range(len(encoded_cases_int)):
    encoded_case_vector_dict["case_{0}".format(x+1)] = encoded_cases_int[x]

In [15]:
# All encoded case vectors, in the dictionary's insertion order (case_1, case_2, ...).
# NOTE(review): in Python 3, dict.values() returns a live view rather than a list —
# wrap it in list(...) if a snapshot or positional indexing is needed downstream.
all_case_vectors_padded = encoded_case_vector_dict.values()

In [16]:
# Every unordered pair of cases (self-pairs included), each case given as a
# (key, value) item of encoded_case_vector_dict
case_vector_pairs_items = list(
    itertools.combinations_with_replacement(encoded_case_vector_dict.items(), 2)
)

# The same pairs reduced to their case names
case_vector_pairs_keys = [
    (first[0], second[0]) for first, second in case_vector_pairs_items
]

# The same pairs reduced to their encoded vectors
case_vector_pairs_values = [
    (first[1], second[1]) for first, second in case_vector_pairs_items
]

In [17]:
# Number of case pairs. combinations_with_replacement also yields each case paired
# with itself, so n cases produce n * (n + 1) / 2 pairs.
print(len(case_vector_pairs_values))

903


In [18]:
# Preview the first five case-name pairs
case_vector_pairs_keys[:5]

[('case_1', 'case_1'),
 ('case_1', 'case_2'),
 ('case_1', 'case_3'),
 ('case_1', 'case_4'),
 ('case_1', 'case_5')]

In [19]:
# Build the case-pair dataset: one row per case pair, where 'Input 1' holds the
# encoded vector of the first case of the pair and 'Input 2' holds the encoded
# vector of the second case of the pair.
# NOTE(review): the list of key tuples is passed wrapped in an extra list
# (index=[...]) — presumably so that each (case_a, case_b) tuple ends up as a
# single index label, which is what lets a one-element names list be assigned
# below; confirm against the installed pandas version before simplifying.

case_vector_pair_dataset = pd.DataFrame(case_vector_pairs_values, 
                                        columns = ['Input 1','Input 2'], 
                                        index = [case_vector_pairs_keys])

# Label the index so rows are identified as case pairs
case_vector_pair_dataset.index.names = ["Case Pairs"]

In [20]:
# Storing all variables to be used in other notebooks

# Each target file paired with the object written to it, in write order
_pickle_targets = [
    ('data/case_vector_pairs_keys.pickle', case_vector_pairs_keys),
    ('data/case_vector_pairs_values.pickle', case_vector_pairs_values),
    ('data/encoded_case_vector_dict.pickle', encoded_case_vector_dict),
    ('data/encoded_cases.pickle', encoded_cases),
    ('data/encoded_cases_int.pickle', encoded_cases_int),
    ('data/col_values_dict.pickle', col_values_dict),
]

# Serialise every object with the highest available pickle protocol
for _pickle_path, _pickle_obj in _pickle_targets:
    with open(_pickle_path, 'wb') as handle:
        pickle.dump(_pickle_obj, handle, protocol=pickle.HIGHEST_PROTOCOL)