In [2]:
%%time
reload_data = False

import joblib, time, os, copy, datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torchvision

#enable it when running in google cloud, to upload/download file from/to VM to/from google cloud bucket
from google.cloud import storage

bucket_root_path = "dataproc-6ca41800-27b4-47d5-abee-55c011dfa389-asia-southeast1"
project_data_folder = "data/breakfast-img-data/"

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 30 µs


In [3]:
def download_all_data_from_bucket():
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_root_path)
    file_id = project_data_folder
    blobs=bucket.list_blobs(prefix=file_id)
    for blob in blobs:        
        file_names = blob.name.split("/")
        if len(file_names) == 3:
            if file_names[2] != "":
                file_name = file_names[2]            
                blob.download_to_filename(file_name)
                print("Download from {0} to local {1}".format(blob.name, file_name))
                

def download_file_from_bucket(file):
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_root_path)
    butcketFile = project_data_folder + file
    blob = bucket.blob(butcketFile)
    blob.download_to_filename(file)
    print("Download from {0} to local {1}".format(butcketFile, file))
    
                
def upload_files(files):
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_root_path)
    for file in files:
        butcketFile = project_data_folder + file
        blob = bucket.blob(butcketFile)
        blob.upload_from_filename(file)
        print("Upload from local {0} to {1}".format(file, butcketFile))

def flatten(ls):
     for item in ls:
            for child in list(item):
                yield child

def get_segment_positions(x):
    ps = x.segment.split()
    ls = []
    for i in range(len(ps)-1):
        if i == 0:
            ls.append([int(ps[i]), int(ps[i+1])])
        else:
            ls.append([int(ps[i])+1, int(ps[i+1])])        
    return ls

def get_segment_features(x):
    ls = []
    for rg in x.positions:
        ls.append(x.feature[rg[0]:rg[1]])            
    return ls

def splitDataFrameList(df,target_column):
    def splitListToRows(row,row_accumulator,target_column):
        split_row = row[target_column]
        for s in split_row:
            new_row = row.to_dict()
            new_row[target_column] = s
            row_accumulator.append(new_row)
    new_rows = []
    df.apply(splitListToRows,axis=1,args = (new_rows,target_column))
    new_df = pd.DataFrame(new_rows)
    return new_df
                
def get_train_data():
    training_segment = pd.read_csv('training_segment.txt', header=None, names = ['segment'])
    training_segment['feature'] = joblib.load('train_feature.joblib')
    training_segment['positions'] = training_segment.apply(lambda x: get_segment_positions(x), axis=1)
    training_segment['feature'] = training_segment.apply(lambda x: get_segment_features(x), axis=1)
    training_segment = splitDataFrameList(training_segment, 'feature')
    training_segment['label'] = list(flatten(joblib.load('train_label.joblib')))
    training_segment = training_segment.drop(['segment','positions'], axis = 1)
    return training_segment

def get_test_data():
    test_segment = pd.read_csv('test_segment.txt', header=None, names = ['segment'])
    test_segment['feature'] = joblib.load('test_feature.joblib')
    test_segment['positions'] = test_segment.apply(lambda x: get_segment_positions(x), axis=1)
    test_segment['feature'] = test_segment.apply(lambda x: get_segment_features(x), axis=1)
    test_segment = splitDataFrameList(test_segment, 'feature')
    test_segment['ID'] = test_segment.index
    test_segment = test_segment.drop(['segment','positions'], axis = 1)
    return test_segment

def set_parameter_requires_grad(model, feature_extracting):
    if feature_extracting:
        for param in model.parameters():
            param.requires_grad = False

def train_model(model, dataloaders, criterion, optimizer, num_epochs=25, is_inception=False):
    since = time.time()

    val_acc_history = []

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    
    for epoch in range(num_epochs):
        ms = time.time()
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()
            else:
                model.eval()

            running_loss = 0.0
            running_corrects = 0

            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)
                optimizer.zero_grad()
                with torch.set_grad_enabled(phase == 'train'):
                    if is_inception and phase == 'train':
                        outputs, aux_outputs = model(inputs)
                        loss1 = criterion(outputs, labels)
                        loss2 = criterion(aux_outputs, labels)
                        loss = loss1 + 0.4*loss2
                    else:
                        outputs = model(inputs)
                        loss = criterion(outputs, labels)

                    _, preds = torch.max(outputs, 1)

                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

            epoch_loss = running_loss / len(dataloaders[phase].dataset)
            epoch_acc = running_corrects.double() / len(dataloaders[phase].dataset)           

            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
            if phase == 'val':
                val_acc_history.append(epoch_acc)            
            print('{} Loss: {:.4f}, {} Acc: {:.4f}'.format(phase, epoch_loss, phase, epoch_acc))
        
        time_taken = str(datetime.timedelta(seconds=time.time() - ms))
        print('time taken: {}'.format(time_taken))
            

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    model.load_state_dict(best_model_wts)
    return model, val_acc_history

class MyNet(torch.nn.Module):
    def __init__(self, num_classes):
        super(MyNet, self).__init__()
        self.conv1 = torch.nn.Conv2d(1, 64, 3, padding=1)
        self.conv2 = torch.nn.Conv2d(64, 128, 3, padding=1)
        self.conv3 = torch.nn.Conv2d(128, 256, 3, padding=1)
        self.pool = torch.nn.MaxPool2d(2, 2)
        self.linear1 = torch.nn.Linear(1024, 512)
        self.linear2 = torch.nn.Linear(512, num_classes)
    
    def forward(self, x):
        x = self.pool(torch.nn.functional.relu(self.conv1(x)))
        x = self.pool(torch.nn.functional.relu(self.conv2(x)))
        x = self.pool(torch.nn.functional.relu(self.conv3(x)))
        x = x.view(-1, 1024)
        x = torch.nn.functional.relu(self.linear1(x))
        #x = torch.nn.functional.softmax(self.linear2(x))
        x = self.linear2(x)
        return x 

In [4]:
%%time
if reload_data:
    download_all_data_from_bucket()

Download from data/breakfast-img-data/test_feature.joblib to local test_feature.joblib
Download from data/breakfast-img-data/test_segment.txt to local test_segment.txt
Download from data/breakfast-img-data/train_feature.joblib to local train_feature.joblib
Download from data/breakfast-img-data/train_label.joblib to local train_label.joblib
Download from data/breakfast-img-data/training_segment.txt to local training_segment.txt
CPU times: user 1min 1s, sys: 18.8 s, total: 1min 20s
Wall time: 2min 53s


In [5]:
%%time
train_df = get_train_data()
print(train_df.feature[0].shape)
train_df = splitDataFrameList(train_df, 'feature')
print(train_df.info())

torch.Size([260, 400])
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2855174 entries, 0 to 2855173
Data columns (total 2 columns):
 #   Column   Dtype 
---  ------   ----- 
 0   feature  object
 1   label    int64 
dtypes: int64(1), object(1)
memory usage: 43.6+ MB
None
CPU times: user 42.5 s, sys: 11.5 s, total: 54 s
Wall time: 1min 5s


In [6]:
%%time
train_sample_ratio = 1.0
train_data = train_df.sample(frac=train_sample_ratio)
train_data['feature'] = train_data.apply(lambda x : x["feature"].view(1, 20, 20), axis = 1)
train_data['label'] = train_data.apply(lambda x : x["label"] - 1, axis = 1)
print(train_data.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2855174 entries, 26595 to 1781487
Data columns (total 2 columns):
 #   Column   Dtype 
---  ------   ----- 
 0   feature  object
 1   label    int64 
dtypes: int64(1), object(1)
memory usage: 65.3+ MB
None
CPU times: user 1min 40s, sys: 752 ms, total: 1min 40s
Wall time: 1min 40s


In [7]:
%%time
num_classes = 47
batch_size = 64
num_epochs = 10
feature_extract = True

train_feature = torch.stack(train_data['feature'].tolist())
train_label = torch.tensor(train_data['label'].values.astype(np.long))

train_dataset = torch.utils.data.TensorDataset(train_feature, train_label)

train_size = int(0.98 * len(train_data))
val_size = int(0.01 * len(train_data))
test_size = len(train_data) - train_size - val_size
train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(train_dataset, [train_size, val_size, test_size])

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size)

dataloaders_dict = {'train':train_loader, 'val':test_loader}
print("len(train_loader.dataset) = ", len(train_loader.dataset))
print("len(val_loader.dataset) = ", len(val_loader.dataset))
print("len(test_loader.dataset) = ", len(test_loader.dataset))

len(train_loader.dataset) =  2798070
len(val_loader.dataset) =  28551
len(test_loader.dataset) =  28553
CPU times: user 9.54 s, sys: 3.9 s, total: 13.4 s
Wall time: 12 s


In [9]:
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
#turn on cpu for troubleshooting as gpu doesnt throw proper error message, runing in cpu shows more specific error message
#device = torch.device("cpu") 
print(device)
    
model_ft = MyNet(num_classes).double()
print(model_ft)
model_ft = model_ft.to(device)

params_to_update = model_ft.parameters()
print("Params to learn:")
if feature_extract:
    params_to_update = []
    for name,param in model_ft.named_parameters():
        if param.requires_grad == True:
            params_to_update.append(param)
            print("\t",name)
else:
    for name,param in model_ft.named_parameters():
        if param.requires_grad == True:
            print("\t",name)

optimizer_ft = torch.optim.SGD(params_to_update, lr=0.001, momentum=0.9)
criterion = torch.nn.CrossEntropyLoss()
model_ft, hist = train_model(model_ft, dataloaders_dict, criterion, optimizer_ft, num_epochs=num_epochs, is_inception=False)

cuda:0
MyNet(
  (conv1): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv3): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (linear1): Linear(in_features=1024, out_features=512, bias=True)
  (linear2): Linear(in_features=512, out_features=47, bias=True)
)
Params to learn:
	 conv1.weight
	 conv1.bias
	 conv2.weight
	 conv2.bias
	 conv3.weight
	 conv3.bias
	 linear1.weight
	 linear1.bias
	 linear2.weight
	 linear2.bias
Epoch 0/9
----------
train Loss: 1.1740, train Acc: 0.6519
val Loss: 0.7040, val Acc: 0.7775
time taken: 0:34:56.849685
Epoch 1/9
----------
train Loss: 0.5160, train Acc: 0.8353
val Loss: 0.4248, val Acc: 0.8634
time taken: 0:34:54.456245
Epoch 2/9
----------
train Loss: 0.3287, train Acc: 0.8925
val Loss: 0.3323, val Acc: 0.8922
time taken: 0:34:54.009128
Epoch 3/9

In [None]:
%%time
correct = 0
total = 0
with torch.no_grad():
    for data in test_loader:
        inputs, labels = data[0].to(device), data[1].to(device)
        outputs = model_ft(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print('Accuracy of test data: {0}'.format(100 * correct / total))

Accuracy of test data: 94.20025916716283
CPU times: user 4.6 s, sys: 1.6 s, total: 6.2 s
Wall time: 6.2 s


In [None]:
%%time
model_name = "model_1.0.model"
torch.save(model_ft.state_dict(), model_name)
upload_files([model_name])

Upload from local model_1.0.model to data/breakfast-img-data/model_1.0.model
CPU times: user 52 ms, sys: 36 ms, total: 88 ms
Wall time: 553 ms


In [14]:
%%time
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
#turn on cpu for troubleshooting as gpu doesnt throw proper error message, runing in cpu shows more specific error message
#device = torch.device("cpu") 
print(device)

model_name = "model_1.0.model"
model = MyNet(47).double().to(device)
model.load_state_dict(torch.load(model_name, map_location=device)) 

cuda:0
CPU times: user 72 ms, sys: 4 ms, total: 76 ms
Wall time: 18 ms


<All keys matched successfully>

In [15]:
%%time
test_df = get_test_data()
test_df = splitDataFrameList(test_df, 'feature')
test_df

CPU times: user 6.63 s, sys: 1.69 s, total: 8.32 s
Wall time: 8.32 s


Unnamed: 0,feature,ID
0,"[tensor(-15.8486, dtype=torch.float64), tensor...",0
1,"[tensor(-15.7037, dtype=torch.float64), tensor...",0
2,"[tensor(-15.1998, dtype=torch.float64), tensor...",0
3,"[tensor(-15.2123, dtype=torch.float64), tensor...",0
4,"[tensor(-16.0185, dtype=torch.float64), tensor...",0
...,...,...
468908,"[tensor(-16.3263, dtype=torch.float64), tensor...",1283
468909,"[tensor(-14.9555, dtype=torch.float64), tensor...",1283
468910,"[tensor(-16.1000, dtype=torch.float64), tensor...",1283
468911,"[tensor(-15.5954, dtype=torch.float64), tensor...",1283


In [16]:
%%time
test_data = test_df.sample(frac = 1.0)
print(test_data.info())
test_data['feature'] = test_data.apply(lambda x : x["feature"].view(1, 20, 20), axis = 1)
test_feature = torch.stack(test_data['feature'].tolist())
test_dataset = torch.utils.data.TensorDataset(test_feature)
predict_loader = torch.utils.data.DataLoader(test_dataset, batch_size=64)
result = []
with torch.no_grad():
    for data in predict_loader:
        inputs = data[0].to(device)
        outputs = model(inputs)  
        prop = torch.nn.functional.softmax(outputs).cpu().tolist()
        result = np.append(result, prop)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 468913 entries, 427583 to 313403
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   feature  468913 non-null  object
 1   ID       468913 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 10.7+ MB
None


  if sys.path[0] == '':


CPU times: user 3min 48s, sys: 1min 25s, total: 5min 13s
Wall time: 5min 13s


In [23]:
%%time
test_result = pd.DataFrame(result.reshape(-1,47))
test_result['ID'] = test_data['ID'].values
test_data_mean = test_result.groupby('ID')[list(range(47))].agg(['mean'])
test_data_mean.columns = list(range(47))
test_data_mean["Category"] = test_data_mean.idxmax(axis = 1)
test_data_mean['Category'] = test_data_mean.apply(lambda x : x["Category"] + 1, axis = 1).astype('int') 
test_data_mean["ID"] = test_data_mean.index
test_data_group = test_data_mean[['ID','Category']]
print(test_data_group.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1284 entries, 0 to 1283
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   ID        1284 non-null   int64
 1   Category  1284 non-null   int64
dtypes: int64(2)
memory usage: 30.1 KB
None
CPU times: user 536 ms, sys: 68 ms, total: 604 ms
Wall time: 600 ms


In [27]:
len(test_data_group.Category.unique())

37

In [26]:
submission_name = "submission_1.0_mean.csv"
test_data_group.to_csv(submission_name, index=False)

In [20]:
upload_files([submission_name])

Upload from local submission_1_baseline_mean to data/breakfast-img-data/submission_1_baseline_mean


In [30]:
%%time
test_df = get_test_data()
test_df = splitDataFrameList(test_df, 'feature')
test_df['feature'] = test_df.apply(lambda x : x["feature"].view(1, 20, 20), axis = 1)
test_feature = torch.stack(test_df['feature'].tolist())
test_dataset = torch.utils.data.TensorDataset(test_feature)
predict_loader = torch.utils.data.DataLoader(test_dataset, batch_size=64, shuffle=False)
result = []
with torch.no_grad():
    for data in predict_loader:
        inputs = data[0].to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        result = np.append(result, predicted.cpu())

test_df['label'] = np.transpose(result).astype('int') 
test_df

CPU times: user 1min 33s, sys: 29.3 s, total: 2min 2s
Wall time: 2min 2s


Unnamed: 0,feature,ID,label
0,"[[[tensor(-15.8486, dtype=torch.float64), tens...",0,16
1,"[[[tensor(-15.7037, dtype=torch.float64), tens...",0,13
2,"[[[tensor(-15.1998, dtype=torch.float64), tens...",0,13
3,"[[[tensor(-15.2123, dtype=torch.float64), tens...",0,16
4,"[[[tensor(-16.0185, dtype=torch.float64), tens...",0,34
...,...,...,...
468908,"[[[tensor(-16.3263, dtype=torch.float64), tens...",1283,10
468909,"[[[tensor(-14.9555, dtype=torch.float64), tens...",1283,10
468910,"[[[tensor(-16.1000, dtype=torch.float64), tens...",1283,10
468911,"[[[tensor(-15.5954, dtype=torch.float64), tens...",1283,44


In [31]:
%%time
test_data_group = test_df.groupby('ID')['label'].apply(list).reset_index(name='labels')
test_data_group['Category'] = test_data_group.apply(lambda x : np.argmax(np.bincount(x.labels)) + 1, axis = 1)
test_data_group = test_data_group[['ID','Category']]
test_data_group

CPU times: user 212 ms, sys: 4 ms, total: 216 ms
Wall time: 218 ms


Unnamed: 0,ID,Category
0,0,14
1,1,2
2,2,2
3,3,2
4,4,14
...,...,...
1279,1279,44
1280,1280,11
1281,1281,15
1282,1282,2


In [32]:
test_data_group.Category.unique()

array([14,  2,  5, 10, 11, 13, 15, 21, 32, 22, 34, 44, 37, 26, 39, 29, 20,
        1,  3, 46, 25, 19, 27,  4, 35, 36,  6, 33, 17, 12, 30, 38, 24, 45,
       28, 43, 42])

In [35]:
len(test_data_group.Category.unique())

37

In [33]:
submission_name = "submission_1.0_max.csv"
test_data_group.to_csv(submission_name, index=False)

In [34]:
upload_files([submission_name])

Upload from local submission_1_baseline_max.csv to data/breakfast-img-data/submission_1_baseline_max.csv
