In [22]:
# randomly pick the existing design, self-generated semantic matrix
import pandas as pd
import numpy as np
import pickle

### Load Real Design

In [23]:
# Load the preprocessed data table; the path is relative to the working directory.
real_design_csv = "simu1_data/cr_preproc_data_mturk.csv"
df = pd.read_csv(real_design_csv)
df

Unnamed: 0,rt,time_elapsed,subject_ID,item,lag,category,confidence,correct,correct_num,category_label,...,position,old,yes,block_type,prev_cat,prev_cat_match,prev_cat_label,prev_cat_label_match,curr_cat_length,curr_cat_label_length
0,1640.345,71781.0,120,ARM,0,Uncategorized,1.0,True,1,BodyParts,...,0,False,False,Uncategorized,,,,False,,0
1,845.750,71783.0,422,CELLO,0,Uncategorized,5.0,False,0,Instruments,...,0,False,True,Uncategorized,,,,False,,0
2,,72233.0,52,HALLWAY,0,Building,,False,0,Building,...,0,False,,Categorized,,False,,False,0.0,0
3,,72813.0,20,MOUNTAIN,0,Uncategorized,,False,0,Landscapes,...,0,False,,Uncategorized,,,,False,,0
4,,73029.0,108,ROBIN,0,Uncategorized,,False,0,Birds,...,0,False,,Uncategorized,,,,False,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
381055,,12140412.0,228,CARROT,163,Vegetables,,False,0,Vegetables,...,575,True,,Categorized,Landscapes,False,Landscapes,False,20.0,20
381056,,12144498.0,228,GOOSE,112,Uncategorized,,False,0,Birds,...,576,True,,Uncategorized,Vegetables,,Vegetables,False,,21
381057,,12148572.0,228,PEAS,68,Vegetables,,False,0,Vegetables,...,577,True,,Categorized,Uncategorized,False,Birds,False,21.0,21
381058,,12152653.0,228,SAW,159,Tools,,False,0,Tools,...,578,True,,Categorized,Vegetables,False,Vegetables,False,22.0,22


In [24]:
# Drop the redundant columns. After this step only subject_ID, item, lag,
# category_label, position, old and yes remain (see the displayed frame).
redundant_columns = [
    "rt",
    "time_elapsed",
    "correct",
    "correct_num",
    "block_type",
    "item_name",
    "prev_cat",
    "prev_cat_match",
    "prev_cat_label",
    "prev_cat_label_match",
    "curr_cat_length",
    "curr_cat_label_length",
    "confidence",
    "category",
]
df = df.drop(columns=redundant_columns)
df

Unnamed: 0,subject_ID,item,lag,category_label,position,old,yes
0,120,ARM,0,BodyParts,0,False,False
1,422,CELLO,0,Instruments,0,False,True
2,52,HALLWAY,0,Building,0,False,
3,20,MOUNTAIN,0,Landscapes,0,False,
4,108,ROBIN,0,Birds,0,False,
...,...,...,...,...,...,...,...
381055,228,CARROT,163,Vegetables,575,True,
381056,228,GOOSE,112,Birds,576,True,
381057,228,PEAS,68,Vegetables,577,True,
381058,228,SAW,159,Tools,578,True,


In [25]:
# Drop subjects with more than MAX_NO_RESPONSE unanswered trials.
# A trial counts as unanswered when its `yes` value is missing (NaN).
MAX_NO_RESPONSE = 250
subjlist = np.unique(df["subject_ID"].to_numpy())

# Count the missing responses of every subject in a single grouped pass
# rather than re-filtering the whole frame once per subject.
no_ans_per_subj = df["yes"].isna().groupby(df["subject_ID"]).sum()
discard = no_ans_per_subj.index[no_ans_per_subj > MAX_NO_RESPONSE].tolist()
discard.append(200)  # additional, see David

# `~` negates the boolean mask; `drop` returns a new frame, so `df` is an
# independent object that later cells can safely add columns to.
df = df.loc[~df["subject_ID"].isin(discard)].drop(columns="yes")

In [26]:
# Number of distinct subjects left after the exclusion step.
subjlist = np.unique(df["subject_ID"].to_numpy())
subjlist.size

548

In [27]:
# Add itemno: the 1-based rank of each item within the sorted set of unique items.
items = np.unique(df["item"])
item2no = {item: no for no, item in enumerate(items, start=1)}
# Vectorized dictionary lookup; avoids calling a Python lambda once per row.
# Every item is a key of item2no by construction, so no value is left unmapped.
df["itemno"] = df["item"].map(item2no)
df

Unnamed: 0,subject_ID,item,lag,category_label,position,old,itemno
4,108,ROBIN,0,Birds,0,False,220
6,536,WALL,0,Building,0,False,288
15,108,TURKEY,0,FarmAnimals,1,False,280
17,469,TABLET,0,Electronics,0,False,263
18,452,NOTEBOOK,0,OfficeSupplies,0,False,177
...,...,...,...,...,...,...,...
381055,228,CARROT,163,Vegetables,575,True,43
381056,228,GOOSE,112,Birds,576,True,119
381057,228,PEAS,68,Vegetables,577,True,194
381058,228,SAW,159,Tools,578,True,228


In [28]:
# Order the rows by subject and then by position, renumber the index from 0,
# and put the columns into a fixed order.
ordered_columns = [
    "subject_ID",
    "position",
    "item",
    "itemno",
    "category_label",
    "lag",
    "old",
]
df = (
    df.sort_values(by=["subject_ID", "position"])
    .reset_index(drop=True)[ordered_columns]
)
df

Unnamed: 0,subject_ID,position,item,itemno,category_label,lag,old
0,0,0,CHICKEN,53,FarmAnimals,0,False
1,0,1,BLUEBERRY,26,Fruit,0,False
2,0,2,BUS,33,Vehicles,0,False
3,0,3,LEMON,151,Fruit,0,False
4,0,4,OYSTER,187,OceanAnimals,0,False
...,...,...,...,...,...,...,...
317835,656,575,COLA,60,Beverages,37,True
317836,656,576,BASEBALL,10,Toys,115,True
317837,656,577,TEA,265,Beverages,24,True
317838,656,578,RADIO,213,Electronics,41,True


### Design

In [29]:
# Seeded generator so the sampled sessions are reproducible.
rng = np.random.default_rng(42)
simu_sess_num = 1000  # number of simulated sessions to draw

# Each simulated session reuses the design of one subject, drawn with
# replacement from the retained subjects (the same ID may appear repeatedly).
subjectlist = np.unique(df["subject_ID"].to_numpy())
sess = rng.choice(subjectlist, size=simu_sess_num)
sess

array([ 64, 518, 432, 285, 280, 572,  63, 466, 134,  67, 344, 642, 493,
       510, 480, 526, 337,  90, 561, 292, 328, 243, 123, 611, 524, 426,
       262, 551, 357, 287, 292, 148,  66, 362, 590,  47, 572, 554, 180,
       419, 111, 508, 467, 232,  50, 639, 289, 593, 451, 521, 509, 130,
       237, 307, 326,  32, 358, 105, 499, 454, 609, 501, 239, 638, 267,
       214, 600, 243,  55, 309, 533, 126, 304,  91, 457, 312, 216, 148,
       369, 446, 620, 282, 109, 557, 417, 467,  69, 207, 514, 557, 281,
       538, 562, 254, 596, 189, 157, 454, 421,  96, 557, 133, 538,   5,
       533, 527, 523, 441, 310, 470, 180, 523, 363, 301, 332, 371,  27,
        96, 161,  79, 285, 445, 432, 310, 570, 369,  58, 513, 377, 420,
       369, 362,  65, 366, 532, 199, 397,  19, 227, 282, 646, 141, 180,
       265, 652, 568,  23, 153, 551,  43, 570, 184, 606, 192, 281, 438,
        89, 365, 331, 525, 654, 441, 266, 264, 271, 545, 212, 112, 219,
        14,  74,  65, 516, 482, 465, 304, 478, 109, 597, 328, 61

In [30]:
# Build the simulated design: for session i, take the rows of subject sess[i]
# and tag them with the session number.
# The per-session frames are collected first and concatenated once. Growing a
# DataFrame with pd.concat inside the loop re-copies all accumulated rows on
# every iteration, and seeding it with an empty DataFrame depends on concat's
# deprecated handling of empty entries.
session_frames = [
    df.loc[df.subject_ID == subj].assign(session=i)
    for i, subj in enumerate(sess)
]
df_test = pd.concat(session_frames, ignore_index=True)

# Study and test item columns both mirror itemno; the second slot of each is
# filled with -1 (presumably "no second item" -- confirm against the consumer).
df_test["study_itemno1"] = df_test["itemno"]
df_test["study_itemno2"] = -1
df_test["test_itemno1"] = df_test["itemno"]
df_test["test_itemno2"] = -1
df_test

Unnamed: 0,subject_ID,position,item,itemno,category_label,lag,old,session,study_itemno1,study_itemno2,test_itemno1,test_itemno2
0,64,0,ERASER,96,OfficeSupplies,0,False,0,96,-1,96,-1
1,64,1,PEN,196,OfficeSupplies,0,False,0,196,-1,196,-1
2,64,2,PENCIL,197,OfficeSupplies,0,False,0,197,-1,197,-1
3,64,3,VAN,285,Vehicles,0,False,0,285,-1,285,-1
4,64,4,LETTUCE,153,Vegetables,0,False,0,153,-1,153,-1
...,...,...,...,...,...,...,...,...,...,...,...,...
579995,335,575,PAPER,190,OfficeSupplies,0,False,999,190,-1,190,-1
579996,335,576,FORK,110,KitchenTools,0,False,999,110,-1,110,-1
579997,335,577,NOTEBOOK,177,OfficeSupplies,0,False,999,177,-1,177,-1
579998,335,578,DOLLHOUSE,81,Toys,0,False,999,81,-1,81,-1


In [31]:
# Write the simulated design to disk using the highest available pickle protocol.
design_pkl = "simu1_data/simu1_design.pkl"
with open(design_pkl, "wb") as outp:
    pickle.dump(df_test, outp, protocol=pickle.HIGHEST_PROTOCOL)