In [1]:
import os 
import os.path as osp
import pandas as pd
import torch
import joblib

# Initial Settings

In [2]:
DATA_DIR = '/mnt/sda/hong01-data/MART_DATA/OUTPUT_MERGED'
CSV_DIR = osp.join(DATA_DIR, 'PANDAS')
IMG_DIR = osp.join(DATA_DIR, 'AUTOGRAPHER')
SCREENSHOT_DIR = osp.join(DATA_DIR, 'SCREENSHOTS')

In [3]:
events_text = {'act00': 'calibration',
                'act01': 'write an email',
                'act02': 'read on screen',
                'act03': 'edit/create presentation',
                'act04': 'zone out and fixate',
                'act05': 'use a calculator to add up numbers on sheet',
                'act06': 'physical precision task',
                'act07': 'put documents in order',
                'act08': 'read text/numbers on page',
                'act09': 'arrange money in change jar',
                'act10': 'write on paper with pen',
                'act11': 'watch a youtube video',
                'act12': 'go to a news website and browse',
                'act13': 'have conversation with experimenter in room',
                'act14': 'make a telephone call',
                'act15': 'drink/eat for 2 minutes',
                'act16': 'close eyes and sit still',
                'act17': 'clean e.g. sweaping the floor, wipe, ...',
                'act18': 'exercise: sit up/stand down repeatedly',
                'act19': 'hand-eye coordination (tennis ball)',
                'act20': 'pace the room',
                }

# Data Preparation

In [4]:
dfA = pd.read_csv(osp.join(CSV_DIR, 'trainA.csv'), index_col=0)
dfB = pd.read_csv(osp.join(CSV_DIR, 'trainB.csv'), index_col=0)
df = pd.concat([dfA, dfB])
df = df.sort_values(by=['event_id', 'sub_id'])

In [5]:
df

Unnamed: 0,sub_id,event_id,source,data_HR_activity_median,data_HR_activity_min,data_HR_activity_max,data_HR_activity_average,data_HR_activity_std,data_HR_activity_len,data_LEFT_ACC_MAG_median,...,"data_AUTOGRAPHER_RESNET_max_buckeye, horse chestnut, conker",data_AUTOGRAPHER_RESNET_max_coral fungus,data_AUTOGRAPHER_RESNET_max_agaric,data_AUTOGRAPHER_RESNET_max_gyromitra,"data_AUTOGRAPHER_RESNET_max_stinkhorn, carrion fungus",data_AUTOGRAPHER_RESNET_max_earthstar,"data_AUTOGRAPHER_RESNET_max_hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa",data_AUTOGRAPHER_RESNET_max_bolete,"data_AUTOGRAPHER_RESNET_max_ear, spike, capitulum","data_AUTOGRAPHER_RESNET_max_toilet tissue, toilet paper, bathroom tissue"
0,1001,act01,trainA,84.507042,76.923077,100.000000,84.577495,3.933112,90,0.000500,...,0.000728,0.000090,0.000920,0.000048,0.000100,0.000287,0.000317,0.000290,0.001594,0.001767
0,1001,act01,trainB,85.714286,81.081081,92.307692,86.525331,2.709545,90,0.000850,...,0.001672,0.000244,0.003576,0.000128,0.000239,0.000618,0.000757,0.000928,0.005534,0.009232
0,1002,act01,trainA,68.181818,60.606061,75.000000,67.920213,3.254272,90,-0.002704,...,0.000050,0.000004,0.000071,0.000002,0.000003,0.000009,0.000100,0.000031,0.000402,0.000734
0,1002,act01,trainB,71.428571,67.415730,77.922078,71.630717,2.240689,90,-0.003506,...,0.001901,0.000151,0.000966,0.000082,0.000151,0.000299,0.000926,0.000762,0.008168,0.005427
0,1003,act01,trainA,74.074074,66.666667,78.947368,73.875941,2.166754,90,0.021029,...,0.006064,0.000295,0.007819,0.000449,0.000692,0.003246,0.004795,0.002219,0.007301,0.066273
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19,1005,act20,trainB,98.360656,84.507042,105.263158,98.556521,3.391250,89,0.006852,...,0.002078,0.000269,0.000881,0.000034,0.000167,0.000108,0.000597,0.000377,0.001633,0.750323
19,1006,act20,trainA,92.307692,85.714286,100.000000,91.705138,2.563889,90,0.003494,...,0.001213,0.001381,0.001589,0.000075,0.000245,0.000297,0.001900,0.000491,0.003848,0.422092
19,1006,act20,trainB,93.750000,84.507042,103.448276,94.153427,4.012861,89,0.005112,...,0.001420,0.001601,0.001030,0.000053,0.000363,0.000531,0.001294,0.000652,0.005349,0.296006
19,1007,act20,trainA,89.552239,74.074074,96.774194,87.873433,4.945947,89,0.021470,...,0.000450,0.003216,0.001190,0.000040,0.000681,0.000378,0.000685,0.000530,0.010341,0.711271


In [6]:
imgs = sorted(os.listdir(IMG_DIR))
imgs = [item[:5]+'test_'+item[5:] if 'pred' in item else item for item in imgs]

In [7]:
df_image = pd.DataFrame([item.split('_')+[item] for item in imgs], columns=['sub_id', 'source', 'event_id', 'img_order', 'image_path'])
df_image = df_image.astype({'sub_id': 'int64'})

In [8]:
df_train = pd.merge(df, df_image, how='inner', on=['sub_id', 'event_id', 'source'])
df_train['label'] = df_train.apply(lambda row: int(row['event_id'][-2:]), axis=1)

In [10]:
df_train

Unnamed: 0,sub_id,event_id,source,data_HR_activity_median,data_HR_activity_min,data_HR_activity_max,data_HR_activity_average,data_HR_activity_std,data_HR_activity_len,data_LEFT_ACC_MAG_median,...,data_AUTOGRAPHER_RESNET_max_gyromitra,"data_AUTOGRAPHER_RESNET_max_stinkhorn, carrion fungus",data_AUTOGRAPHER_RESNET_max_earthstar,"data_AUTOGRAPHER_RESNET_max_hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa",data_AUTOGRAPHER_RESNET_max_bolete,"data_AUTOGRAPHER_RESNET_max_ear, spike, capitulum","data_AUTOGRAPHER_RESNET_max_toilet tissue, toilet paper, bathroom tissue",img_order,image_path,label
0,1001,act01,trainA,84.507042,76.923077,100.000000,84.577495,3.933112,90,0.000500,...,0.000048,0.000100,0.000287,0.000317,0.00029,0.001594,0.001767,0.jpg,1001_trainA_act01_0.jpg,1
1,1001,act01,trainA,84.507042,76.923077,100.000000,84.577495,3.933112,90,0.000500,...,0.000048,0.000100,0.000287,0.000317,0.00029,0.001594,0.001767,1.jpg,1001_trainA_act01_1.jpg,1
2,1001,act01,trainA,84.507042,76.923077,100.000000,84.577495,3.933112,90,0.000500,...,0.000048,0.000100,0.000287,0.000317,0.00029,0.001594,0.001767,2.jpg,1001_trainA_act01_2.jpg,1
3,1001,act01,trainA,84.507042,76.923077,100.000000,84.577495,3.933112,90,0.000500,...,0.000048,0.000100,0.000287,0.000317,0.00029,0.001594,0.001767,3.jpg,1001_trainA_act01_3.jpg,1
4,1001,act01,trainA,84.507042,76.923077,100.000000,84.577495,3.933112,90,0.000500,...,0.000048,0.000100,0.000287,0.000317,0.00029,0.001594,0.001767,4.jpg,1001_trainA_act01_4.jpg,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1825,1007,act20,trainB,90.909091,85.714286,96.774194,91.244665,2.822364,89,0.025573,...,0.000078,0.002761,0.000641,0.000591,0.00196,0.004449,0.147818,0.jpg,1007_trainB_act20_0.jpg,20
1826,1007,act20,trainB,90.909091,85.714286,96.774194,91.244665,2.822364,89,0.025573,...,0.000078,0.002761,0.000641,0.000591,0.00196,0.004449,0.147818,1.jpg,1007_trainB_act20_1.jpg,20
1827,1007,act20,trainB,90.909091,85.714286,96.774194,91.244665,2.822364,89,0.025573,...,0.000078,0.002761,0.000641,0.000591,0.00196,0.004449,0.147818,2.jpg,1007_trainB_act20_2.jpg,20
1828,1007,act20,trainB,90.909091,85.714286,96.774194,91.244665,2.822364,89,0.025573,...,0.000078,0.002761,0.000641,0.000591,0.00196,0.004449,0.147818,3.jpg,1007_trainB_act20_3.jpg,20


# Training and Predicting

In [9]:
from fastai.tabular import *
from fastai.vision import *
from fastai.metrics import *
from fastai.callbacks import *
from fastai.metrics import error_rate, accuracy
import os
import random

In [10]:
valid_pct = 0.2

valid_index = []
start_idx = 0

for idx, key in enumerate(events_text.keys()): 
    num_entries = df_train[df_train['event_id']==key].shape[0]
    num_valid_idx = round(num_entries * valid_pct)
    idx = random.sample(range(start_idx, start_idx+num_entries), num_valid_idx)
    valid_index += idx
    start_idx += num_entries

In [11]:
dep_var = 'label'
cat_names = []

data_columns = df_train.columns.str.startswith("data_")
data_autographer = [not item for item in df_train.columns.str.startswith("data_AUTOGRAPHER")]
data_mouse = [not item for item in df_train.columns.str.startswith("data_MOUSE")]
cont_names = list(df_train.loc[:, data_columns & data_autographer].columns)

procs = [FillMissing, Categorify, Normalize]

In [34]:
data_autographer

[True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,

In [33]:
cont_names

['data_HR_activity_median',
 'data_HR_activity_min',
 'data_HR_activity_max',
 'data_HR_activity_average',
 'data_HR_activity_std',
 'data_HR_activity_len',
 'data_LEFT_ACC_MAG_median',
 'data_LEFT_ACC_MAG_min',
 'data_LEFT_ACC_MAG_max',
 'data_LEFT_ACC_MAG_average',
 'data_LEFT_ACC_MAG_std',
 'data_LEFT_ACC_MAG_len',
 'data_LEFT_ACC_X_median',
 'data_LEFT_ACC_X_min',
 'data_LEFT_ACC_X_max',
 'data_LEFT_ACC_X_average',
 'data_LEFT_ACC_X_std',
 'data_LEFT_ACC_X_len',
 'data_LEFT_ACC_Y_median',
 'data_LEFT_ACC_Y_min',
 'data_LEFT_ACC_Y_max',
 'data_LEFT_ACC_Y_average',
 'data_LEFT_ACC_Y_std',
 'data_LEFT_ACC_Y_len',
 'data_LEFT_ACC_Z_median',
 'data_LEFT_ACC_Z_min',
 'data_LEFT_ACC_Z_max',
 'data_LEFT_ACC_Z_average',
 'data_LEFT_ACC_Z_std',
 'data_LEFT_ACC_Z_len',
 'data_RIGHT_ACC_MAG_median',
 'data_RIGHT_ACC_MAG_min',
 'data_RIGHT_ACC_MAG_max',
 'data_RIGHT_ACC_MAG_average',
 'data_RIGHT_ACC_MAG_std',
 'data_RIGHT_ACC_MAG_len',
 'data_RIGHT_ACC_X_median',
 'data_RIGHT_ACC_X_min',
 'dat

In [35]:
len(cont_names)

108

In [39]:
import numpy as np
len(data_autographer) - np.sum(np.asarray(data_autographer))

3000

In [14]:
# data = (TabularList.from_df(df_train, cat_names=cat_names, cont_names=cont_names, procs=procs)
#                            .split_by_rand_pct(valid_pct=0.2, seed=43)
#                            .label_from_df(cols=dep_var)
#                            .add_test(test)
#                            .databunch())
train_tabular_list = TabularList.from_df(df_train, cont_names=cont_names, procs=procs, path='./')
train_image_list = ImageList.from_df(df_train, path=IMG_DIR, cols='image_path')

# test_tabular_list = TabularList.from_df(df_test, cat_names=cat_names, cont_names=cont_names, procs=procs, path='./')
# test_image_list = ImageList.from_df(df_test, path=IMG_DIR, cols='spectrogram_path')

In [15]:
#test_mixed_list = MixedItemList([test_image_list, test_tabular_list], path='./')
        
train_mixed_list = (MixedItemList([train_image_list, train_tabular_list], path='./', inner_df=train_tabular_list.inner_df)
.split_by_idx(valid_index)
.label_from_df(cols=dep_var, label_cls=CategoryList))
#.add_test(test_mixed_list))

In [22]:
data = train_mixed_list.databunch(bs=4)

In [2]:
data.train_ds.x.item_lists[1]

NameError: name 'data' is not defined

In [13]:
class ImageTabularModel(nn.Module):
    "Basic model for tabular data."
    def __init__(self, emb_szs:ListSizes, n_cont:int, layers:Collection[int], ps:Collection[float]=None):
        super().__init__()
        self.cnn = create_body(models.resnet34,pretrained = True) #resnet34 for images
        self.tab = TabularModel(emb_szs, n_cont, 128, layers, use_bn = False, emb_drop = ps) #Tabular model for metadata

        self.reduce = nn.Sequential(*([AdaptiveConcatPool2d(), Flatten()] + bn_drop_lin((1024), 512, bn=True, p=0.3, actn=nn.ReLU(inplace=True)))) #Use this FC layers to reduce nodes
        self.merge = nn.Sequential(*bn_drop_lin(512 + 128, 128, bn=True, actn=nn.ReLU(inplace=True))) #Merge 2 models together
        self.final = nn.Sequential(*bn_drop_lin(128, 20, bn=True, p=0.)) # Last FC layer for regression

    def forward(self, img:Tensor, x:Tensor) -> Tensor:
        imgLatent = self.reduce(self.cnn(img))
        tabLatent = self.tab(x[0],x[1])
        cat = torch.cat([imgLatent, tabLatent], dim=1) #re-define forward func for concat model
        merge = self.merge(cat)
        final = self.final(merge)
        return final 

In [16]:
data = train_mixed_list.databunch(bs=4)
emb = data.train_ds.x.item_lists[1].get_emb_szs()
model = ImageTabularModel(emb, 0, [1000,500], ps=0.2)

learn = Learner(data, model, metrics=[accuracy], loss_func=torch.nn.CrossEntropyLoss())

Downloading: "https://download.pytorch.org/models/resnet34-333f7ec4.pth" to /home/hong01/.cache/torch/checkpoints/resnet34-333f7ec4.pth
100%|██████████| 83.3M/83.3M [00:02<00:00, 30.8MB/s]


In [17]:
data.train_ds.y[555]

Category 7

In [18]:
#learn = tabular_learner(data, layers=[2000,1000], metrics=accuracy)

In [19]:
learn.fit_one_cycle(250, 1e-3, callbacks = SaveModelCallback(learn))

epoch,train_loss,valid_loss,accuracy,time
0,2.647188,2.576297,0.147541,05:52


KeyboardInterrupt: 

In [None]:
learn.save('stage-1')

In [None]:
learn.load('stage-1')
learn.lr_find()
learn.recorder.plot(suggestion=True)

In [None]:
# learn.unfreeze()
# learn.fit_one_cycle(5, slice(1e-3), callbacks = SaveModelCallback(learn))

In [None]:
# predictions, *_ = learn.get_preds(DatasetType.Test)
# labels = np.argmax(predictions, 1)
# df_test['label'] = labels

In [None]:
pred_test, y_test = learn.get_preds()

In [None]:
pred_test

In [None]:
torch.cuda.empty_cache()

In [14]:
import joblib

learn = joblib.load('learn.joblib')

EOFError: 