In [1]:
%%time
import joblib, time, os, copy, datetime, sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torchvision

#enable it when running in google cloud, to upload/download file from/to VM to/from google cloud bucket
from google.cloud import storage

reload_data = False
bucket_root_path = "weicheng30417"
project_data_folder = "data/breakfast-img-data/"
torch.manual_seed(0)

CPU times: user 388 ms, sys: 36 ms, total: 424 ms
Wall time: 428 ms


<torch._C.Generator at 0x7fe8440b0d10>

In [2]:
def download_all_data_from_bucket():
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_root_path)
    file_id = project_data_folder
    blobs=bucket.list_blobs(prefix=file_id)
    for blob in blobs:        
        file_names = blob.name.split("/")
        if len(file_names) == 3:
            if file_names[2] != "":
                file_name = file_names[2]            
                blob.download_to_filename(file_name)
                print("Download from {0} to local {1}".format(blob.name, file_name))
                

def download_file_from_bucket(file):
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_root_path)
    butcketFile = project_data_folder + file
    blob = bucket.blob(butcketFile)
    blob.download_to_filename(file)
    print("Download from {0} to local {1}".format(butcketFile, file))
    
                
def upload_files(files):
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_root_path) 
    for file in files:
        butcketFile = project_data_folder + file
        blob = bucket.blob(butcketFile)
        blob.upload_from_filename(file)
        print("Upload from local {0} to {1}".format(file, butcketFile))
  
        
def upload_all_files():    
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_root_path)
    
    directory = os.getcwd()
    for file in os.listdir(directory):
        filename = os.fsdecode(file)
        if filename.endswith(".csv") or filename.endswith(".ipynb") or filename.endswith(".model"):             
            butcketFile = project_data_folder + file
            blob = bucket.blob(butcketFile)
            blob.upload_from_filename(filename)
            print("Upload from local {0} to {1}".format(filename, butcketFile))
        
def flatten(ls):
     for item in ls:
            for child in list(item):
                yield child

def get_segment_positions(x):
    ps = x.segment.split()
    ls = []
    for i in range(len(ps)-1):
        if i == 0:
            ls.append([int(ps[i]), int(ps[i+1])])
        else:
            ls.append([int(ps[i])+1, int(ps[i+1])])        
    return ls

def get_segment_features(x):
    ls = []
    for rg in x.positions:
        ls.append(x.feature[rg[0]:rg[1]])            
    return ls

def splitDataFrameList(df,target_column):
    def splitListToRows(row,row_accumulator,target_column):
        split_row = row[target_column]
        for s in split_row:
            new_row = row.to_dict()
            new_row[target_column] = s
            row_accumulator.append(new_row)
    new_rows = []
    df.apply(splitListToRows,axis=1,args = (new_rows,target_column))
    new_df = pd.DataFrame(new_rows)
    return new_df
                
def get_train_data():
    training_segment = pd.read_csv('training_segment.txt', header=None, names = ['segment'])
    training_segment['feature'] = joblib.load('train_feature.joblib')
    training_segment['positions'] = training_segment.apply(lambda x: get_segment_positions(x), axis=1)
    training_segment['feature'] = training_segment.apply(lambda x: get_segment_features(x), axis=1)
    training_segment = splitDataFrameList(training_segment, 'feature')
    training_segment['label'] = list(flatten(joblib.load('train_label.joblib')))
    training_segment = training_segment.drop(['segment','positions'], axis = 1)
    return training_segment

def get_test_data():
    test_segment = pd.read_csv('test_segment.txt', header=None, names = ['segment'])
    test_segment['feature'] = joblib.load('test_feature.joblib')
    test_segment['positions'] = test_segment.apply(lambda x: get_segment_positions(x), axis=1)
    test_segment['feature'] = test_segment.apply(lambda x: get_segment_features(x), axis=1)
    test_segment = splitDataFrameList(test_segment, 'feature')
    test_segment['ID'] = test_segment.index
    test_segment = test_segment.drop(['segment','positions'], axis = 1)
    return test_segment

def set_parameter_requires_grad(model, feature_extracting):
    if feature_extracting:
        for param in model.parameters():
            param.requires_grad = False

def train_model(model, dataloaders, criterion, optimizer, num_epochs=25, is_inception=False):
    since = time.time()

    val_acc_history = []

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    
    for epoch in range(num_epochs):
        ms = time.time()
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()
            else:
                model.eval()

            running_loss = 0.0
            running_corrects = 0

            for inputs, labels in dataloaders[phase]:                
                inputs = inputs.to(device)
                labels = labels.to(device)
                
                optimizer.zero_grad()
                with torch.set_grad_enabled(phase == 'train'):
                    if is_inception and phase == 'train':
                        outputs, aux_outputs = model(inputs)
                        loss1 = criterion(outputs, labels)
                        loss2 = criterion(aux_outputs, labels)
                        loss = loss1 + 0.4*loss2
                    else:
                        outputs = model(inputs)
                        loss = criterion(outputs, labels)

                    _, preds = torch.max(outputs, 1)

                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

            epoch_loss = running_loss / len(dataloaders[phase].dataset)
            epoch_acc = running_corrects.double() / len(dataloaders[phase].dataset)           

            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
            if phase == 'val':
                val_acc_history.append(epoch_acc)            
            print('{} Loss: {:.4f}, {} Acc: {:.4f}'.format(phase, epoch_loss, phase, epoch_acc))
        
        time_taken = str(datetime.timedelta(seconds=time.time() - ms))
        print('time taken: {}'.format(time_taken))
            

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    model.load_state_dict(best_model_wts)
    return model, val_acc_history

In [3]:
%%time
from torch.nn import *
from torch.nn.functional import *

class MyNet(Module):
    def __init__(self, batch_size, num_classes):
        super(MyNet, self).__init__()
        self.conv1 = Conv2d(in_channels=1, out_channels=batch_size, kernel_size=3, padding=1)
        self.conv1_bn = BatchNorm2d(num_features=batch_size)
        self.conv2 = Conv2d(in_channels=batch_size, out_channels=batch_size*2, kernel_size=3, padding=1)
        self.conv2_bn = BatchNorm2d(num_features=batch_size*2)
        self.conv3 = Conv2d(in_channels=batch_size*2, out_channels=batch_size*4, kernel_size=3, padding=1)
        self.conv3_bn = BatchNorm2d(num_features=batch_size*4)
        self.conv4 = Conv2d(in_channels=batch_size*4, out_channels=batch_size*8, kernel_size=3, padding=1)
        self.conv4_bn = BatchNorm2d(num_features=batch_size*8)
        self.pool = MaxPool2d(2, 2)
        self.drop1 = Dropout(0.5)
        self.linear1 = Linear(in_features=batch_size*8, out_features=batch_size*4)
        self.linear2 = Linear(in_features=batch_size*4, out_features=num_classes)
    
    def forward(self, x):
        x = self.pool(relu(self.conv1_bn(self.conv1(x))))
        #print("conv1", x.shape)
        x = self.pool(relu(self.conv2_bn(self.conv2(x))))
        #print("conv2",x.shape)
        x = self.pool(relu(self.conv3_bn(self.conv3(x))))
        #print("conv3",x.shape)
        x = self.pool(relu(self.conv4_bn(self.conv4(x))))
        #print("conv4",x.shape)
        x = self.drop1(x)
        x = x.view(-1, 512) 
        #print("x.view",x.shape)
        x = relu(self.linear1(x))
        #print("linear1",x.shape)
        x = self.drop1(x) 
        #x = softmax(self.linear2(x))
        x = self.linear2(x)
        return x

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 129 µs


In [4]:
%%time
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
#turn on cpu for troubleshooting as gpu doesnt throw proper error message, runing in cpu shows more specific error message
#device = torch.device("cpu") 
print(device)

model_version = '1.2'
model_name = "model_" + model_version +".model"

num_classes = 47
batch_size = 64

model = MyNet(batch_size, num_classes).double().to(device)

model_dict = model.state_dict()
trained_model_dict = torch.load(model_name, map_location=device)
for k in model_dict:
    model_dict[k] = trained_model_dict["module." + k]
    
model.load_state_dict(model_dict)

cuda:0
CPU times: user 3.16 s, sys: 1.08 s, total: 4.24 s
Wall time: 2.87 s


<All keys matched successfully>

In [5]:
%%time
test_df = get_test_data()
test_df = splitDataFrameList(test_df, 'feature')
test_df['feature'] = test_df.apply(lambda x : x["feature"].view(1, 20, 20), axis = 1)

test_feature = torch.stack(test_df['feature'].tolist())
test_dataset = torch.utils.data.TensorDataset(test_feature)
predict_loader = torch.utils.data.DataLoader(test_dataset, batch_size=64)

CPU times: user 17.4 s, sys: 2.13 s, total: 19.5 s
Wall time: 19.5 s


In [6]:
%%time
result_max = []
result_mean = []
with torch.no_grad():
    for data in predict_loader:
        inputs = data[0].to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        result_max = np.append(result_max, predicted.cpu())
        
        prop = torch.nn.functional.softmax(outputs).cpu().tolist()
        result_mean = np.append(result_mean, prop)

  # Remove the CWD from sys.path while we load stuff.


CPU times: user 3min 47s, sys: 2min 32s, total: 6min 20s
Wall time: 6min 20s


In [7]:
%%time
test_data_count = test_df.sample(frac = 1.0)
test_data_count['label'] = np.transpose(result_max).astype('int') 
test_data_count = test_data_count.groupby('ID')['label'].apply(list).reset_index(name='labels')
test_data_count['Category'] = test_data_count.apply(lambda x : np.argmax(np.bincount(x.labels)) + 1, axis = 1)
test_data_count = test_data_count[['ID','Category']]
print(len(test_data_count.Category.unique()))
test_data_count

7
CPU times: user 300 ms, sys: 4 ms, total: 304 ms
Wall time: 306 ms


Unnamed: 0,ID,Category
0,0,32
1,1,29
2,2,29
3,3,29
4,4,13
...,...,...
1279,1279,29
1280,1280,32
1281,1281,29
1282,1282,29


In [8]:
%%time
submission_max = "submission_"  + model_version + "_count_test.csv"
test_data_count.to_csv(submission_max, index=False)

CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 4.01 ms


In [9]:
%%time
test_data_mean = pd.DataFrame(result_mean.reshape(-1,47))
test_data_mean['ID'] = test_df['ID'].values
test_data_mean = test_data_mean.groupby('ID')[list(range(47))].agg(['mean'])
test_data_mean.columns = list(range(47))
test_data_mean["Category"] = test_data_mean.idxmax(axis = 1)
test_data_mean['Category'] = test_data_mean.apply(lambda x : x["Category"] + 1, axis = 1).astype('int') 
test_data_mean["ID"] = test_data_mean.index
test_data_mean = test_data_mean[['ID','Category']]
print(len(test_data_mean.Category.unique()))
test_data_mean

25
CPU times: user 512 ms, sys: 84 ms, total: 596 ms
Wall time: 597 ms


Unnamed: 0_level_0,ID,Category
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0,29
1,1,32
2,2,2
3,3,11
4,4,29
...,...,...
1279,1279,44
1280,1280,17
1281,1281,15
1282,1282,2


In [10]:
submission_mean = "submission_"  + model_version + "_mean_test.csv"
test_data_mean.to_csv(submission_mean, index=False)

In [11]:
%%time
test_data_max = pd.DataFrame(result_mean.reshape(-1,47))
test_data_max['ID'] = test_df['ID'].values
test_data_max = test_data_max.groupby('ID')[list(range(47))].agg(['sum'])
test_data_max.columns = list(range(47))
test_data_max["Category"] = test_data_max.idxmax(axis = 1)
test_data_max['Category'] = test_data_max.apply(lambda x : x["Category"] + 1, axis = 1).astype('int') 
test_data_max["ID"] = test_data_max.index
test_data_max = test_data_max[['ID','Category']]
print(len(test_data_max.Category.unique()))
test_data_max

35
CPU times: user 544 ms, sys: 68 ms, total: 612 ms
Wall time: 611 ms


Unnamed: 0_level_0,ID,Category
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0,14
1,1,1
2,2,11
3,3,21
4,4,22
...,...,...
1279,1279,44
1280,1280,17
1281,1281,13
1282,1282,2


In [12]:
%%time
submission_max = "submission_"  + model_version + "_max_test.csv"
test_data_max.to_csv(submission_max, index=False)

CPU times: user 0 ns, sys: 4 ms, total: 4 ms
Wall time: 2.44 ms


In [13]:
%%time
upload_all_files()

Upload from local submission_1.2_max.csv to data/breakfast-img-data/submission_1.2_max.csv
Upload from local submission_1.2_max_test.csv to data/breakfast-img-data/submission_1.2_max_test.csv
Upload from local submission_1.2_mean.csv to data/breakfast-img-data/submission_1.2_mean.csv
Upload from local submission_2.1_mean.csv to data/breakfast-img-data/submission_2.1_mean.csv
Upload from local submission_2.0_mean.csv to data/breakfast-img-data/submission_2.0_mean.csv
Upload from local model_2.0.model to data/breakfast-img-data/model_2.0.model
Upload from local submission_2.0_max.csv to data/breakfast-img-data/submission_2.0_max.csv
Upload from local submission_1.2_mean_test.csv to data/breakfast-img-data/submission_1.2_mean_test.csv
Upload from local submission_1.0_mean.csv to data/breakfast-img-data/submission_1.0_mean.csv
Upload from local model_2.1.model to data/breakfast-img-data/model_2.1.model
Upload from local model_1.0.model to data/breakfast-img-data/model_1.0.model
Upload from