In [2]:
%%time
reload_data = False

!pip install --upgrade pip
!pip install joblib
!pip install gcsfs

import joblib, time, os, copy, datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torchvision

#enable it when running in google cloud, to upload/download file from/to VM to/from google cloud bucket
from google.cloud import storage

bucket_root_path = "dataproc-6ca41800-27b4-47d5-abee-55c011dfa389-asia-southeast1"
project_data_folder = "data/breakfast-img-data/"

Requirement already up-to-date: pip in /opt/conda/lib/python3.7/site-packages (20.0.2)
CPU times: user 456 ms, sys: 68 ms, total: 524 ms
Wall time: 6.24 s


In [3]:
def download_all_data_from_bucket():
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_root_path)
    file_id = project_data_folder
    blobs=bucket.list_blobs(prefix=file_id)
    for blob in blobs:        
        file_names = blob.name.split("/")
        if len(file_names) == 3:
            if file_names[2] != "":
                file_name = file_names[2]            
                blob.download_to_filename(file_name)
                print("Download from {0} to local {1}".format(blob.name, file_name))
                

def download_files_from_bucket(files):
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_root_path)
    for file in files:
        butcketFile = project_data_folder + file
        blob = bucket.blob(butcketFile)
        blob.download_to_filename(file)
        print("Download from {0} to local {1}".format(butcketFile, file))
    
                
def upload_files(files):
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_root_path)
    for file in files:
        butcketFile = project_data_folder + file
        blob = bucket.blob(butcketFile)
        blob.upload_from_filename(file)
        print("Upload from local {0} to {1}".format(file, butcketFile))

def flatten(ls):
     for item in ls:
            for child in list(item):
                yield child

def get_segment_positions(x):
    ps = x.segment.split()
    ls = []
    for i in range(len(ps)-1):
        if i == 0:
            ls.append([int(ps[i]), int(ps[i+1])])
        else:
            ls.append([int(ps[i])+1, int(ps[i+1])])        
    return ls

def get_segment_features(x):
    ls = []
    for rg in x.positions:
        ls.append(x.feature[rg[0]:rg[1]])            
    return ls

def splitDataFrameList(df,target_column):
    def splitListToRows(row,row_accumulator,target_column):
        split_row = row[target_column]
        for s in split_row:
            new_row = row.to_dict()
            new_row[target_column] = s
            row_accumulator.append(new_row)
    new_rows = []
    df.apply(splitListToRows,axis=1,args = (new_rows,target_column))
    new_df = pd.DataFrame(new_rows)
    return new_df

def get_test_data():
    test_segment = pd.read_csv('test_segment.txt', header=None, names = ['segment'])
    test_segment['feature'] = joblib.load('test_feature.joblib')
    test_segment['positions'] = test_segment.apply(lambda x: get_segment_positions(x), axis=1)
    test_segment['feature'] = test_segment.apply(lambda x: get_segment_features(x), axis=1)
    test_segment = splitDataFrameList(test_segment, 'feature')
    test_segment['ID'] = test_segment.index
    test_segment = test_segment.drop(['segment','positions'], axis = 1)
    return test_segment


In [5]:
%%time
if reload_data:
    download_files_from_bucket(['test_feature.joblib','test_segment.txt'])

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 5.01 µs


In [8]:
%%time
class MyNet(torch.nn.Module):
    def __init__(self, num_classes):
        super(MyNet, self).__init__()
        self.conv1 = torch.nn.Conv2d(1, 64, 3, padding=1)
        self.conv2 = torch.nn.Conv2d(64, 128, 3, padding=1)
        self.conv3 = torch.nn.Conv2d(128, 256, 3, padding=1)
        self.pool = torch.nn.MaxPool2d(2, 2)
        self.linear1 = torch.nn.Linear(1024, 512)
        self.linear2 = torch.nn.Linear(512, num_classes)
    
    def forward(self, x):
        x = self.pool(torch.nn.functional.relu(self.conv1(x)))
        x = self.pool(torch.nn.functional.relu(self.conv2(x)))
        x = self.pool(torch.nn.functional.relu(self.conv3(x)))
        x = x.view(-1, 1024)
        x = torch.nn.functional.relu(self.linear1(x))
        #x = torch.nn.functional.softmax(self.linear2(x))
        x = self.linear2(x)
        return x

use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
#turn on cpu for troubleshooting as gpu doesnt throw proper error message, runing in cpu shows more specific error message
#device = torch.device("cpu") 
print(device)

train_sample_ratio = 1.0
model_name = "model_"  + str(train_sample_ratio) + ".model"
model = MyNet(47).double().to(device)
model.load_state_dict(torch.load(model_name, map_location=device)) 

cuda:0
CPU times: user 60 ms, sys: 8 ms, total: 68 ms
Wall time: 14.9 ms


<All keys matched successfully>

In [9]:
%%time
test_df = get_test_data()
test_df = splitDataFrameList(test_df, 'feature')
test_df['feature'] = test_df.apply(lambda x : x["feature"].view(1, 20, 20), axis = 1)
test_feature = torch.stack(test_df['feature'].tolist())
test_dataset = torch.utils.data.TensorDataset(test_feature)
predict_loader = torch.utils.data.DataLoader(test_dataset, batch_size=64, shuffle=True)
result = []
with torch.no_grad():
    for data in predict_loader:
        inputs = data[0].to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        result = np.append(result, predicted.cpu())

test_df['label'] = np.transpose(result).astype('int') 
test_df

CPU times: user 1min 34s, sys: 29.2 s, total: 2min 3s
Wall time: 2min 2s


Unnamed: 0,feature,ID,label
0,"[[[tensor(-15.8486, dtype=torch.float64), tens...",0,28
1,"[[[tensor(-15.7037, dtype=torch.float64), tens...",0,42
2,"[[[tensor(-15.1998, dtype=torch.float64), tens...",0,14
3,"[[[tensor(-15.2123, dtype=torch.float64), tens...",0,0
4,"[[[tensor(-16.0185, dtype=torch.float64), tens...",0,10
...,...,...,...
468908,"[[[tensor(-16.3263, dtype=torch.float64), tens...",1283,45
468909,"[[[tensor(-14.9555, dtype=torch.float64), tens...",1283,1
468910,"[[[tensor(-16.1000, dtype=torch.float64), tens...",1283,16
468911,"[[[tensor(-15.5954, dtype=torch.float64), tens...",1283,28


In [18]:
%%time
test_data_group = test_df.groupby('ID')['label'].apply(list).reset_index(name='labels')
test_data_group['Category'] = test_data_group.apply(lambda x : np.argmax(np.bincount(x.labels)) + 1, axis = 1)
test_data_group = test_data_group[['ID','Category']]
test_data_group

CPU times: user 220 ms, sys: 0 ns, total: 220 ms
Wall time: 217 ms


Unnamed: 0,ID,Category
0,0,11
1,1,29
2,2,11
3,3,15
4,4,32
...,...,...
1279,1279,2
1280,1280,11
1281,1281,2
1282,1282,13


In [19]:
test_data_group.Category.unique()

array([11, 29, 15, 32, 13,  2, 20, 17, 34, 27, 24, 14, 37])

In [20]:
submission_name = "submission_"  + str(train_sample_ratio) + "_max.csv"
test_data_group.to_csv(submission_name, index=False)

In [21]:
upload_files([submission_name])

Upload from local submission_1.0_max.csv to data/breakfast-img-data/submission_1.0_max.csv
