In [25]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
import os
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import KFold
import torch

In [27]:
def load_csv(filename, dropna=True, fillna=False):
    """
    Gets a filename (csv) and returns a pandas dataframe without NAs
    """
    data = pd.read_csv(filename)
    data['orig_choice_num'] = data.index % 150
    if fillna is True:
        return data.fillna(-1)
    if dropna is True:
        return data.dropna()
    
    return data

create saving dirs for different sequence lengths experiments

In [28]:
saving_dir = "cross_validation/diff_seq_lengths/"
if not os.path.exists(saving_dir):
    os.makedirs(saving_dir)
    
seq_lengths_to_produce = [2,3,4,5,6,7,8,9,10,11,12]

saving_dirs = []
for s in seq_lengths_to_produce:
    cur_saving_dir = os.path.join(saving_dir, str(s))
    saving_dirs.append(cur_saving_dir)
    if not os.path.exists(cur_saving_dir):
        os.makedirs(cur_saving_dir)

 load task data

In [29]:
filename = 'Data/DataAllSubjectsRewards.csv'
data = load_csv(filename)
data_withNA = load_csv(filename, fillna=True)
data_type = "original"

orig_choice_num -> ranges from 0 to 149

In [30]:
data.choice = data.choice.astype(int)
data.reward = data.reward.astype(int)
data.user = data.user.astype(int)

payoff2_users = data[data['payoff_structure']==2].user.unique()
payoff3_users = data[data['payoff_structure']==3].user.unique()
payoff4_users = data[data['payoff_structure']==4].user.unique()
print(saving_dir)

cross_validation/diff_seq_lengths/


In [34]:
def get_kfold_participants(payoff_number, payoff_pars, k=5, save_pickle=True):
    """
    Getting the payoff's participants and its number and saves the kfold split as a list of participants lists. Each saved as pickle file.
    Return values are for debugging
    """
    kf = KFold(n_splits=k)
    train_pars = []
    test_pars = []
    for train_index, test_index in kf.split(payoff_pars):
    #     print("TRAIN:", train_index, "TEST:", test_index)
        train_pars.append(payoff_pars[train_index])
        test_pars.append(payoff_pars[test_index])
    
    if save_pickle:
        filename = 'payoff{}_train_participants_{}fold_list.pkl'.format(payoff_number, k)
        with open(os.path.join(saving_dir,filename), 'wb') as f:
            pickle.dump(train_pars, f)
        print(os.path.join(saving_dir,filename))

        filename = 'payoff{}_test_participants_{}fold_list.pkl'.format(payoff_number, k)
        with open(os.path.join(saving_dir,filename), 'wb') as f:
            pickle.dump(test_pars, f)
        print(os.path.join(saving_dir,filename))
            
    return train_pars, test_pars

payoff2_trains, payoff2_tests = get_kfold_participants(payoff_number=2, payoff_pars=payoff2_users)
payoff3_trains, payoff3_tests = get_kfold_participants(payoff_number=3, payoff_pars=payoff3_users)
payoff4_trains, payoff4_tests = get_kfold_participants(payoff_number=4, payoff_pars=payoff4_users)

cross_validation/diff_seq_lengths/payoff2_train_participants_5fold_list.pkl
cross_validation/diff_seq_lengths/payoff2_test_participants_5fold_list.pkl
cross_validation/diff_seq_lengths/payoff3_train_participants_5fold_list.pkl
cross_validation/diff_seq_lengths/payoff3_test_participants_5fold_list.pkl
cross_validation/diff_seq_lengths/payoff4_train_participants_5fold_list.pkl
cross_validation/diff_seq_lengths/payoff4_test_participants_5fold_list.pkl


-----------------

In [35]:
def create_participants_dataframes(data, withNA=False, drop_first_choice=True, is_generated=False, save=False):
    """
    Gets a dataframe and returns a list of dataframes - each is one participant's trial, with the prev_choice and prev_reward columns - updated per participant
    ChangeLog:
    29.8.19 - added drop_first_choice - so I can create a pd_list with original data without dropping anything
    29.8.19 - added withNA - so I can save the csvs to a different location
    25.1.20 - added the option to save generated data
    """
    # pd_list is a list of dataframes - each is one participant's trial
    pd_list = []
    user_list = data.user.unique()
    for user_id in user_list:
        user_data = data[data['user']==user_id].copy().reset_index(drop=True)

        # create prev_choice
        current_user_choice = [int(x) for x in list(user_data.choice)]
        if drop_first_choice is True:
            current_user_choice.insert(0,None)
        user_data['prev_choice'] = pd.Series(current_user_choice)
        if drop_first_choice is True:
            user_data.drop(index=1)

        # create prev_reward
        current_user_reward = [int(x) for x in list(user_data.reward)]
        if drop_first_choice is True:
            current_user_reward.insert(0,None)
        user_data['prev_reward'] = pd.Series(current_user_reward)

        # get rid of the None values - meaning the first instance is dropped
        if drop_first_choice is True:
            user_data = user_data.dropna()
        user_data['prev_choice'] = pd.Series(user_data.prev_choice, dtype='int64')
        user_data['prev_reward'] = pd.Series(user_data.prev_reward, dtype='int64')
        
        if save:
            if withNA is False:
                if is_generated:
                    user_data.to_csv("generated_data/"+str(user_id)+".csv", index=False)
                else:
                    user_data.to_csv("data/"+str(user_id)+".csv", index=False)
            else:
                user_data.to_csv("data_original_all_user_withNA/"+str(user_id)+".csv", index=False)


        pd_list.append(user_data.copy())
    return pd_list

In [36]:
def create_participants_dataframes_generated(data, drop_first_choice=True):
    """
    Gets a generated dataframe and returns a list of dataframes - each is one participant's trial, with the prev_choice and prev_reward columns - updated per participant 
    notice : generated means that there won't be any unexpected Nones or gaps in data
    """
    # pd_list is a list of dataframes - each is one participant's trial
    pd_list = []
    user_list = data.user.unique()
    for user_id in user_list:
        user_data = data[data['user']==user_id].copy().reset_index(drop=True)

        # create prev_choice
        current_user_choice = [int(x) for x in list(user_data.choice)]

        current_user_choice.insert(0,None)
        current_user_choice = np.delete(current_user_choice,-1)
        user_data['prev_choice'] = pd.Series(current_user_choice)
        
        
        # create prev_reward
        current_user_reward = [int(x) for x in list(user_data.reward)]
        
        current_user_reward.insert(0,None)
        current_user_reward = np.delete(current_user_reward,-1)
        user_data['prev_reward'] = pd.Series(current_user_reward)

        # get rid of the None values - meaning the first instance is dropped
        user_data = user_data.dropna()
        user_data['prev_choice'] = pd.Series(user_data.prev_choice, dtype='int64')
        user_data['prev_reward'] = pd.Series(user_data.prev_reward, dtype='int64')
        

        user_data.to_csv("generated_data/"+str(user_id)+".csv", index=False)


        pd_list.append(user_data.copy())
    return pd_list

In [37]:
pd_list = create_participants_dataframes(data, withNA=False, drop_first_choice=True)

In [38]:
data[data['user']==76].head(50)

Unnamed: 0,user,choice,reward,time,payoff_structure,reward_1,reward_2,reward_3,reward_4,orig_choice_num
11250,76,1,84,2207.0,2,84,87,42,23,0
11251,76,2,90,1059.0,2,90,90,46,18,1
11252,76,4,28,1267.0,2,80,84,53,28,2
11253,76,2,81,1482.0,2,87,81,50,24,3
11254,76,1,86,1162.0,2,86,92,61,28,4
11255,76,3,55,1556.0,2,75,78,55,30,5
11256,76,3,50,926.0,2,71,78,50,34,6
11257,76,2,83,1287.0,2,75,83,58,43,7
11258,76,2,80,912.0,2,77,80,52,30,8
11259,76,2,80,903.0,2,70,80,43,28,9


Optional: save this dataframe, for debugging mostly

In [39]:
# filename = 'pd_list_full_withNA_rewards_{}_seq{}.pkl'.format(data_type,seq_length)
# with open(filename, 'wb') as f:
#     pickle.dump(pd_list, f)

In [40]:
seq_length = 4
filename = 'pd_list_full_with_rewards_{}_seq{}.pkl'.format(data_type,seq_length)
with open(filename, 'wb') as f:
    pickle.dump(pd_list, f)

In [14]:
#filename = os.path.join('explain/','pd_list_full_with_rewards_{}_seq{}.pkl'.format(data_type,seq_length))
#with open(filename, 'wb') as f:
#    pickle.dump(new_pd_list, f)

### Create sliding windows + shuffle

In [15]:
# in case needs to load
# with open('pd_list_full_with_rewards_original_seq4_SHUFFLED_retry.pkl', 'rb') as f:
#     pd_list = pickle.load(f)

In [41]:
with open('pd_list_full_with_rewards_original_seq4.pkl', 'rb') as f:
    pd_list = pickle.load(f)

In [17]:
# pd_list = [pd.reset_index(drop=True) for pd in pd_list]

In [42]:
def arrange_moving_window(cur_data,seq_length):
    """
    Gets a pd with current user data and returns a list of indices with a moving window of seq_length - to later be reshaped
    """
    indexer = np.arange(seq_length)[None, :] + np.arange(cur_data.shape[0])[:, None]
    list_of_indices = []
    for i in indexer.flatten():
        if (i+1)==cur_data.shape[0]:
            list_of_indices.append(i+1)
            break
        else:
            list_of_indices.append(i+1)
    return list_of_indices

In [43]:
def combine_with_moving_window(pd_list, seq_length=4):
    """
    Gets the df list and returns a full dataset after creating sliding windows per user and concatenating all together
    """
    new_pd_list = []
    for user in pd_list:
        new_pd = pd.DataFrame(user,index=arrange_moving_window(user,seq_length)).reset_index()
        new_pd_list.append(new_pd)
    huge_pd = pd.concat(new_pd_list)
    
    return huge_pd, new_pd_list

In [44]:
huge_pd, pd_list_windowed = combine_with_moving_window(pd_list)
# huge_pd, pd_list_windowed = combine_with_moving_window(new_pd_list)

In [45]:
# filename_huge_pd = 'huge_pd_with_rewards_{}_seq{}_retry.pkl'.format(data_type,seq_length)
# with open(filename_huge_pd, 'wb') as f:
#     pickle.dump(huge_pd, f)

# filename_huge_pd = 'huge_pd_with_rewards_{}_seq{}.pkl'.format(data_type,seq_length)
# with open(filename_huge_pd, 'wb') as f:
#     pickle.dump(huge_pd, f)

# filename_pd_list_windowed = 'pd_list_windowed_full_with_rewards_{}_seq{}.pkl'.format(data_type,seq_length)
# with open(filename_pd_list_windowed, 'wb') as f:
#     pickle.dump(pd_list_windowed, f)

filename_pd_list_windowed =  os.path.join('explain/','pd_list_windowed_full_with_rewards_{}_seq{}.pkl'.format(data_type,seq_length))
with open(filename_pd_list_windowed, 'wb') as f:
    pickle.dump(pd_list_windowed, f)

filename_huge_pd = os.path.join('explain/','huge_pd_with_rewards_{}_seq{}.pkl'.format(data_type,seq_length))
with open(filename_huge_pd, 'wb') as f:
    pickle.dump(huge_pd, f)

In [46]:
def get_last_seq(par_df, choice_number, seq_len):
    """
    Gets a participant's data and a choice number and returns a sequence of previous 4 choices made (maybe not 4 consecutives)
    """
    if choice_number < seq_len :
        print("must be {} and more to get a sequence".format(seq_len))
        return -1
    ind = np.where(t["orig_choice_num"] == choice_number)
    if len(ind[0]) == 0 :
        print("choice number {} not found".format(choice_number))
        return -1
    elif len(ind[0]) > 1:
        print("FOR SOME REASON THERE'S MORE THAN ONE CHOICE NUMBER LIKE {}".format(choice_number))
        return -1
    # got here so I found the choice number , now lets check if we have 4 previous choices
    seq = []
    for i in range(ind[0][0]-seq_len+1, ind[0][0]+1,1):
        seq.append(i)
    
    return par_df.iloc[seq] 

In [47]:
def create_data_sequence(seq_df,seq_len=4):
    """
    Gets a SINGLE sequence in df form and returns it in a train/test ready form (numpy) X and y
    """
    cur_data = seq_df.copy()
    cur_data['choice'] = cur_data.choice.apply(lambda x: x-1)
    cur_data['prev_choice'] = cur_data.prev_choice.apply(lambda x: x-1)
    X = cur_data.drop(columns = ['choice','user','time','reward','payoff_structure','reward_1','reward_2','reward_3','reward_4','orig_choice_num'])
    X_prev = torch.nn.functional.one_hot(X.prev_choice, num_classes=num_classes).to(torch.int64)
 
    y = cur_data.choice
    num_of_classes = len(y.unique())
    y = to_categorical(y,dtype='int64')

    new_X = []
    for prev_choice,prev_reward in zip(X_prev,X.prev_reward):
        new_i = np.append(prev_choice,prev_reward)
        new_X.append(new_i)
    new_X = np.array(new_X)
    
    # reshape X to be [samples, time steps, features]
    X_reshaped = np.reshape(new_X, (1, seq_length, new_X.shape[1]))
    y_reshaped = np.reshape(y, (1, seq_length, y.shape[1]))
    
    return X_reshaped, y_reshaped

In [48]:
with open('huge_pd_with_rewards_original_seq4_SHUFFLED.pkl', 'rb') as f:
    huge_pd_4_shuffled = pickle.load(f)

FileNotFoundError: [Errno 2] No such file or directory: 'huge_pd_with_rewards_original_seq4_SHUFFLED.pkl'

In [None]:
# with open('pd_list_full_with_rewards_original_seq4.pkl', 'rb') as f:
#     pd_list_without_na = pickle.load(f)

In [49]:
def create_new_shuffled_pd_list(pd_list_without_na, seq_length=4):
    new_pd_list = []
    for df in pd_list_without_na:
        # this gives array of 1 to length(regular_data)
        length_array = np.arange((df.shape[0]//seq_length)*seq_length)
        regular_data = df.head((df.shape[0]//seq_length)*seq_length)
        d={x : y for x, y in regular_data.groupby(length_array//seq_length)}
        new_df = pd.concat([d.get(x) for x in np.random.choice(len(d),len(d.keys()),replace=False)])
        new_pd_list.append(new_df)
    
    return new_pd_list

In [35]:
# new_pd_list = create_new_shuffled_pd_list(pd_list_without_na)

In [51]:
def create_new_shuffled_huge_pd(huge_pd, seq_length=4):
    # this gives array of 1 to length(regular_data)
    length_array = np.arange((huge_pd.shape[0]//seq_length)*seq_length)
    regular_data = huge_pd.head((huge_pd.shape[0]//seq_length)*seq_length)
    d={x : y for x, y in regular_data.groupby(length_array//seq_length)}
    new_df = pd.concat([d.get(x) for x in np.random.choice(len(d),len(d.keys()),replace=False)])
    
    return new_df

In [52]:
huge_pd_shuffled = create_new_shuffled_huge_pd(huge_pd)

In [53]:
filename_huge_pd_shuffled = 'huge_pd_shuffled_with_rewards_{}_seq{}_SHUFFLED.pkl'.format(data_type,seq_length)
with open(filename_huge_pd_shuffled, 'wb') as f:
    pickle.dump(huge_pd_shuffled, f)

In [45]:
# filename_huge_pd_shuffled = os.path.join('explain/','huge_pd_shuffled_with_rewards_{}_seq{}_SHUFFLED.pkl'.format(data_type,seq_length))
# with open(filename_huge_pd_shuffled, 'wb') as f:
#     pickle.dump(huge_pd_shuffled, f)

### Creating sequences in different sizes

In [54]:
with open('pd_list_full_with_rewards_original_seq4.pkl', 'rb') as f:
    ready_pd_list = pickle.load(f)

In [55]:
all_huge_pds = []
for i, seq_len in enumerate(seq_lengths_to_produce):
    print(f"creating data with seq of {seq_len}")
    cur_huge_pd, cur_pd_list_windowed = combine_with_moving_window(ready_pd_list, seq_length=seq_len)
    cur_huge_pd_shuffled = create_new_shuffled_huge_pd(cur_huge_pd, seq_length=seq_len)
    
    filename_huge_pd_shuffled = os.path.join(saving_dirs[i], 'huge_pd_shuffled_with_rewards_{}_seq{}_SHUFFLED.pkl'.format(data_type,seq_len))
    print(f"saving at {filename_huge_pd_shuffled}")
    with open(filename_huge_pd_shuffled, 'wb') as f:
        pickle.dump(cur_huge_pd_shuffled, f)
    all_huge_pds.append(cur_huge_pd_shuffled)

creating data with seq of 2
saving at cross_validation/diff_seq_lengths/2/huge_pd_shuffled_with_rewards_original_seq2_SHUFFLED.pkl
creating data with seq of 3
saving at cross_validation/diff_seq_lengths/3/huge_pd_shuffled_with_rewards_original_seq3_SHUFFLED.pkl
creating data with seq of 4
saving at cross_validation/diff_seq_lengths/4/huge_pd_shuffled_with_rewards_original_seq4_SHUFFLED.pkl
creating data with seq of 5
saving at cross_validation/diff_seq_lengths/5/huge_pd_shuffled_with_rewards_original_seq5_SHUFFLED.pkl
creating data with seq of 6
saving at cross_validation/diff_seq_lengths/6/huge_pd_shuffled_with_rewards_original_seq6_SHUFFLED.pkl
creating data with seq of 7
saving at cross_validation/diff_seq_lengths/7/huge_pd_shuffled_with_rewards_original_seq7_SHUFFLED.pkl
creating data with seq of 8
saving at cross_validation/diff_seq_lengths/8/huge_pd_shuffled_with_rewards_original_seq8_SHUFFLED.pkl
creating data with seq of 9
saving at cross_validation/diff_seq_lengths/9/huge_pd_s

In [56]:
filename_all_huge_pd_shuffled = os.path.join(saving_dir, 'all_huge_pd_shuffled_with_rewards_{}_seq{}_to_seq{}_SHUFFLED_LIST.pkl'.format(data_type,seq_lengths_to_produce[0],seq_lengths_to_produce[-1]))
print(f"saving at {filename_all_huge_pd_shuffled}")
with open(filename_all_huge_pd_shuffled, 'wb') as f:
    pickle.dump(all_huge_pds, f)

saving at cross_validation/diff_seq_lengths/all_huge_pd_shuffled_with_rewards_original_seq2_to_seq12_SHUFFLED_LIST.pkl


In [57]:
cur_huge_pd_shuffled

Unnamed: 0,index,user,choice,reward,time,payoff_structure,reward_1,reward_2,reward_3,reward_4,orig_choice_num,prev_choice,prev_reward
1140,96,122,3,68,757.0,2,33,56,68,40,96,4,39
1141,97,122,3,71,745.0,2,37,55,71,48,97,3,68
1142,98,122,3,78,711.0,2,36,51,78,45,98,3,71
1143,99,122,3,79,1025.0,2,27,54,79,47,99,3,78
1144,100,122,3,79,749.0,2,35,59,79,46,100,3,79
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1159,104,281,3,85,930.0,2,35,58,85,38,105,3,83
1160,105,281,3,71,799.0,2,35,55,71,41,106,3,85
1161,106,281,3,74,906.0,2,34,52,74,42,107,3,71
1162,107,281,3,83,725.0,2,23,59,83,46,108,3,74
