In [3]:
# Randomly sample sessions from the existing (real) design; the semantic matrix is a self-generated random matrix.
import numpy as np
import pandas as pd
import pickle

### Load Real Design

In [4]:
# Load the raw recognition data from a whitespace-delimited text file; the
# column names are supplied explicitly through `names`.
# The separator is written as a raw string: "\s" inside a plain string literal
# is an invalid escape sequence (DeprecationWarning, and a SyntaxWarning on
# Python >= 3.12). The regex passed to pandas is unchanged.
df = pd.read_table(
    "simu2_data/RN2_Pix.dat",
    sep=r"\s+",
    names=[
        'subject', 'session', 'list', 'recog_pos', 'picture', 'category',
        'study_pos', 'old_lag', 'study_lag', 'confidence', 'rt',
    ],
)
df

Unnamed: 0,subject,session,list,recog_pos,picture,category,study_pos,old_lag,study_lag,confidence,rt
0,103,1,0,1,VA084,_OLD_,6,-999,59,6,705
1,103,1,0,2,GBEN253,_OLD_,5,-1,61,6,936
2,103,1,0,3,ONTO011,_OLD_,3,-2,64,6,678
3,103,1,0,4,ISBO1,_NEW_,-99,-999,-9999,4,1461
4,103,1,0,5,PAGETT11,_OLD_,38,-999,31,5,1799
...,...,...,...,...,...,...,...,...,...,...,...
69883,99,1,5,124,CHN639,_NEW_,-99,-999,-9999,4,1460
69884,99,1,5,125,CT173,_OLD_,39,-999,150,2,1040
69885,99,1,5,126,GRTO2,_NEW_,-99,-999,-9999,2,1269
69886,99,1,5,127,GR022,_OLD_,46,-999,145,1,887


In [5]:
# Give every distinct picture name a 1-based item number, following the sorted
# order that np.unique returns, then look that number up for each row.
pics = np.unique(df.picture)
pic2itemno = {pic: itemno for itemno, pic in enumerate(pics, start=1)}
itemnos = [pic2itemno[pic] for pic in df.picture]

df['itemno'] = itemnos
# `old` is True exactly where the category column equals '_OLD_'.
df['old'] = df.category.eq('_OLD_')
# Order rows by subject, then list, then position in the recognition test.
df = df.sort_values(by=['subject', 'list', 'recog_pos'])
df

Unnamed: 0,subject,session,list,recog_pos,picture,category,study_pos,old_lag,study_lag,confidence,rt,itemno,old
36096,37,1,0,1,CHN211,_OLD_,9,-999,56,6,1053,195,True
36097,37,1,0,2,GBEN655,_OLD_,55,46,11,2,2306,493,True
36098,37,1,0,3,ECU104,_OLD_,52,-3,15,3,1457,376,True
36099,37,1,0,4,MXACP4,_OLD_,47,-5,21,1,2333,998,True
36100,37,1,0,5,CHN118,_NEW_,-99,-999,-9999,1,1704,184,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
36091,172,1,5,124,JEROCTWL,_NEW_,-99,-999,-9999,2,1076,801,False
36092,172,1,5,125,THA064,_NEW_,-99,-999,-9999,6,808,1335,False
36093,172,1,5,126,RI003,_OLD_,28,-999,162,4,999,1245,True
36094,172,1,5,127,IND117,_NEW_,-99,-999,-9999,2,1114,747,False


In [7]:
# Organize: keep only the columns used downstream, in a fixed order.
# A single column selection replaces the previous drop-then-select. The drop
# was redundant (the selection already omits session, picture, category,
# confidence and rt) and made the cell raise KeyError when re-run, because the
# dropped columns were already gone from `df`.
df = df[['subject', 'list', 'recog_pos', 'itemno', 'old', 'old_lag', 'study_pos', 'study_lag']]
df

Unnamed: 0,subject,list,recog_pos,itemno,old,old_lag,study_pos,study_lag
36096,37,0,1,195,True,-999,9,56
36097,37,0,2,493,True,46,55,11
36098,37,0,3,376,True,-3,52,15
36099,37,0,4,998,True,-5,47,21
36100,37,0,5,184,False,-999,-99,-9999
...,...,...,...,...,...,...,...,...
36091,172,5,124,801,False,-999,-99,-9999
36092,172,5,125,1335,False,-999,-99,-9999
36093,172,5,126,1245,True,-999,28,162
36094,172,5,127,747,False,-999,-99,-9999


### Design

In [8]:
# Simulation size. simu_old_num / simu_new_num are not used in this cell;
# presumably they are the per-list old/new item counts -- confirm at the use site.
simu_sess_num = 1000
simu_old_num = 64
simu_new_num = 64

# Fixed seed so the sampled sessions are reproducible.
rng = np.random.default_rng(seed=42)

# Draw one real subject id per simulated session. rng.choice samples with
# replacement by default, so the same subject can be picked more than once.
subj_unique = np.unique(df.subject)
sess = rng.choice(subj_unique, size=simu_sess_num)
sess

array([ 50, 137, 123,  95,  95, 153,  49, 127,  65,  50, 109, 169, 131,
       136, 130, 138, 105,  56, 147,  96, 104,  86,  63, 164, 138, 122,
        90, 143, 112,  96,  96,  68,  50, 113, 157,  46, 153, 146,  74,
       121,  62, 134, 127,  85,  47, 169,  96, 158, 125, 137, 136,  64,
        86,  98, 104,  41, 112,  60, 133, 126, 163, 133,  86, 169,  92,
        81, 160,  86,  47,  98, 140,  64,  98,  56, 126,  99,  82,  68,
       114, 124, 165,  95,  60, 146, 121, 127,  50,  80, 136, 146,  95,
       141, 147,  89, 158,  75,  69, 126, 121,  57, 146,  65, 141,  37,
       140, 138, 138, 124,  98, 128,  74, 138, 113,  97, 105, 114,  41,
        57,  70,  54,  96, 124, 123,  98, 151, 114,  49, 136, 116, 121,
       114, 113,  50, 113, 140,  79, 118,  39,  84,  95, 171,  67,  74,
        92, 172, 151,  41,  69, 143,  46, 151,  74, 163,  75,  95, 124,
        56, 113, 104, 138, 172, 124,  92,  90,  94, 143,  81,  62,  82,
        39,  51,  50, 137, 130, 127,  98, 130,  60, 158, 104, 16

In [9]:
# Build the simulated design: for every sampled session, copy the chosen
# subject's rows from `df` (test design) and derive the study design from the
# rows flagged `old`.
# The per-session frames are collected in lists and concatenated once at the
# end. Calling pd.concat inside the loop re-copies all accumulated rows on
# every iteration, which is quadratic in the number of sessions.
test_frames = []
study_frames = []

for session_id, subject in enumerate(sess):
    # Rows of the sampled subject, labelled with the simulated session index.
    session_test = df.loc[df.subject == subject].assign(session=session_id)
    test_frames.append(session_test)

    # Study rows are the old items, ordered by list and then study position.
    session_study = session_test.loc[session_test.old].sort_values(by=['list', 'study_pos'])
    study_frames.append(session_study)

# ignore_index=True produces a fresh 0..n-1 index, the same result as the
# former reset_index(drop=True). pd.concat raises on an empty list, so fall
# back to an empty frame, which is what the loop-based version produced.
df_study = pd.concat(study_frames, ignore_index=True) if study_frames else pd.DataFrame()
df_test = pd.concat(test_frames, ignore_index=True) if test_frames else pd.DataFrame()

In [10]:
# Keep only the study-phase columns. Selecting them explicitly (instead of
# dropping recog_pos, old, old_lag and study_lag) yields the same frame but
# lets the cell be re-run: a second drop raises KeyError because the columns
# are already gone from `df_study`.
df_study = df_study[['subject', 'list', 'itemno', 'study_pos', 'session']]
df_study

Unnamed: 0,subject,list,itemno,study_pos,session
0,50,0,1328,1,0
1,50,0,768,2,0
2,50,0,518,3,0
3,50,0,21,4,0
4,50,0,914,5,0
...,...,...,...,...,...
383995,105,5,485,60,999
383996,105,5,744,61,999
383997,105,5,484,62,999
383998,105,5,508,63,999


In [11]:
# Display the concatenated test design (rows of every sampled session, with the
# added `session` column) for a visual check.
df_test

Unnamed: 0,subject,list,recog_pos,itemno,old,old_lag,study_pos,study_lag,session
0,50,0,1,490,True,-999,52,13,0
1,50,0,2,541,True,9,61,5,0
2,50,0,3,1066,False,-999,-99,-9999,0
3,50,0,4,942,True,-999,34,34,0
4,50,0,5,1464,False,-999,-99,-9999,0
...,...,...,...,...,...,...,...,...,...
767995,105,5,124,371,True,18,45,143,999
767996,105,5,125,1259,False,-999,-99,-9999,999
767997,105,5,126,382,True,-999,46,144,999
767998,105,5,127,1295,True,-45,1,190,999


In [12]:
# Persist both design tables in one pickle file. They are written as two
# consecutive pickles (df_study first, then df_test), so a reader has to call
# pickle.load twice on the same file handle, in that order.
with open('simu2_data/simu2_design.pkl', 'wb') as design_file:
    for design_table in (df_study, df_test):
        pickle.dump(design_table, design_file, protocol=pickle.HIGHEST_PROTOCOL)