In [1]:
""" REFERENCES
https://github.com/ChiWeiHsiao/DeepVO-pytorch

Original pipeline:

- Download data
- Preprocessing
- Load pretrained model
- Change hyperparams
- Train (run main)
- Evaluate (run test)
- Visualise results
"""

# Data Download

Already downloaded from the AISG-SLA website, located in `data`

In [3]:
import os
from params import par
import pandas as pd

# Get the start and end indices for each sequence from the "train_labels.csv" file
def get_sequence_ranges():
    """Map each trajectory id to the first and last row label of its image rows.

    Reads ``train_labels.csv`` from ``par.data_dir``, keeps only the rows whose
    ``Filename`` ends in ``.jpg`` and groups them by ``TrajectoryId``.

    Returns:
        dict: ``{trajectory_id: [lowest_row_label, highest_row_label]}``. The
        labels are index values of the CSV as loaded (i.e. before the ``.jpg``
        filter), not positions inside the filtered frame.
    """
    labels = pd.read_csv(os.path.join(par.data_dir, 'train_labels.csv'))
    # Parse the column so the sort below compares datetimes, not strings
    labels['Timestamp'] = pd.to_datetime(labels['Timestamp'])

    # Image rows only, ordered by timestamp
    is_image_row = labels['Filename'].str.endswith('.jpg')
    frames = labels[is_image_row].sort_values(by='Timestamp')

    # min/max do not depend on row order, so each pair spans the whole group
    return {
        trajectory_id: [min(rows.index.values), max(rows.index.values)]
        for trajectory_id, rows in frames.groupby('TrajectoryId')
    }

# Build {trajectory_id: [first_row_label, last_row_label]} from train_labels.csv
sequence_ranges_dict = get_sequence_ranges()

# Show the ranges so they can be checked against the dataset
print(sequence_ranges_dict)

{1: [0, 1398]}


# Preprocessing

In [6]:
import glob
import numpy as np
import time
from helper import R_to_angle
from params import par
from torchvision import transforms
from PIL import Image
import torch
import math


def create_pose_data():
    """Export mean-centred ground-truth poses, one ``.npy`` file per trajectory.

    For every trajectory id returned by ``get_sequence_ranges()``, the rows of
    ``train_labels.csv`` with that ``TrajectoryId`` are selected and the columns
    ``Easting, Northing, Height, Roll, Pitch, Yaw`` are extracted in that order.
    Each column then has its per-trajectory mean subtracted, and the resulting
    ``(n_rows, 6)`` float array is written to
    ``<par.data_dir>/pose_GT/poses_<trajectory_id>.npy``.

    No rotation-matrix conversion happens here; the values are taken from the
    CSV as they are.

    NOTE(review): the saved layout is position first, angles last. Confirm that
    whatever loads these files expects that order.
    NOTE(review): rows are selected by ``TrajectoryId`` only, whereas
    ``get_sequence_ranges()`` additionally keeps ``.jpg`` rows only. Verify the
    two agree for this dataset.
    NOTE(review): subtracting the mean from Roll/Pitch/Yaw ignores angle
    wrap-around. Confirm this is intended.

    Returns:
        None. Progress and the elapsed time are printed.
    """
    sequence_ranges = get_sequence_ranges()
    start_t = time.time()

    # Loop-invariant work: read the labels once and make sure the output folder exists
    data = pd.read_csv(os.path.join(par.data_dir, 'train_labels.csv'))
    pose_columns = ['Easting', 'Northing', 'Height', 'Roll', 'Pitch', 'Yaw']
    output_dir = os.path.join(par.data_dir, 'pose_GT')
    os.makedirs(output_dir, exist_ok=True)

    for trajectory_id in sequence_ranges:
        # This path only appears in the progress message; the file is never opened
        fn = os.path.join(par.pose_dir, '{:02d}.txt'.format(trajectory_id))
        print('Transforming {}...'.format(fn))

        # Read this trajectory's poses from the labels table. The explicit float
        # copy keeps the in-place centring below from truncating when a column
        # was parsed as integers.
        trajectory_rows = data[data['TrajectoryId'] == trajectory_id]
        pose_data = trajectory_rows[pose_columns].values.astype(np.float64)

        # Normalise the poses: centre each column on its per-trajectory mean
        pose_data[:, :3] = pose_data[:, :3] - pose_data[:, :3].mean(axis=0)
        pose_data[:, 3:] = pose_data[:, 3:] - pose_data[:, 3:].mean(axis=0)

        # Save the centred poses as a .npy file
        np.save(os.path.join(output_dir, 'poses_{}.npy'.format(trajectory_id)), pose_data)

        print('Trajectory {}: shape={}'.format(trajectory_id, pose_data.shape))
    print('elapsed time = {}'.format(time.time() - start_t))

def _load_rgb_views(img_path, to_tensor, minus_point_5):
    """Load one image and return it as ``(tensor_view, numpy_view)``, both channel-first."""
    # The context manager releases the file handle once the pixels are copied out.
    # convert('RGB') guarantees exactly three channels for the indexing below.
    with Image.open(img_path) as img_file:
        img_as_img = img_file.convert('RGB')
    img_as_tensor = to_tensor(img_as_img)
    if minus_point_5:
        img_as_tensor = img_as_tensor - 0.5
    # HWC -> CHW so index c selects one colour channel, as in the tensor view
    img_as_np = np.rollaxis(np.array(img_as_img), 2, 0)
    return img_as_tensor, img_as_np


def calculate_rgb_mean_std(image_path_list, minus_point_5=False):
    """Print the per-channel mean and standard deviation of a set of images.

    Two sets of statistics are printed: one computed on the output of
    ``transforms.ToTensor()`` (optionally shifted by -0.5) and one on the raw
    pixel array from ``np.array(image)``. The standard deviation is the
    population form (divides by the total pixel count). Images are read twice,
    once for the means and once for the squared deviations, so only one image
    is held at a time.

    Args:
        image_path_list: Sized collection of image file paths.
        minus_point_5: If true, subtract 0.5 from the tensor view before
            accumulating. The numpy view is never shifted.

    Returns:
        None. The statistics are printed.

    Raises:
        ValueError: If ``image_path_list`` is empty.
    """
    n_images = len(image_path_list)
    if n_images == 0:
        # With no images the pixel count stays at zero and every division below would fail
        raise ValueError('image_path_list is empty: no images to compute statistics from')
    print('Numbers of frames in training dataset: {}'.format(n_images))

    to_tensor = transforms.ToTensor()
    cnt_pixels = 0

    # First pass: per-channel sums -> means
    mean_np = [0, 0, 0]
    mean_tensor = [0, 0, 0]
    for idx, img_path in enumerate(image_path_list):
        print('{} / {}'.format(idx, n_images), end='\r')
        img_as_tensor, img_as_np = _load_rgb_views(img_path, to_tensor, minus_point_5)
        cnt_pixels += img_as_np.shape[1]*img_as_np.shape[2]
        for c in range(3):
            mean_tensor[c] += float(torch.sum(img_as_tensor[c]))
            mean_np[c] += float(np.sum(img_as_np[c]))
    mean_tensor = [v / cnt_pixels for v in mean_tensor]
    mean_np = [v / cnt_pixels for v in mean_np]
    print('mean_tensor = ', mean_tensor)
    print('mean_np = ', mean_np)

    # Second pass: per-channel sums of squared deviations -> standard deviations
    std_tensor = [0, 0, 0]
    std_np = [0, 0, 0]
    for idx, img_path in enumerate(image_path_list):
        print('{} / {}'.format(idx, n_images), end='\r')
        img_as_tensor, img_as_np = _load_rgb_views(img_path, to_tensor, minus_point_5)
        for c in range(3):
            std_tensor[c] += float(torch.sum((img_as_tensor[c] - mean_tensor[c])**2))
            std_np[c] += float(np.sum((img_as_np[c] - mean_np[c])**2))
    std_tensor = [math.sqrt(v / cnt_pixels) for v in std_tensor]
    std_np = [math.sqrt(v / cnt_pixels) for v in std_np]
    print('std_tensor = ', std_tensor)
    print('std_np = ', std_np)

In [7]:
if __name__ == '__main__':
    # Gather the .jpg frames of every training video folder
    image_path_list = []
    train_video = par.train_video
    for folder in train_video:
        frame_pattern = os.path.join(par.image_dir, str(folder), '*.jpg')
        image_path_list.extend(glob.glob(frame_pattern))

    # Write the mean-centred pose files first, then print the RGB statistics
    # of the training frames (tensor view shifted by -0.5)
    create_pose_data()
    calculate_rgb_mean_std(image_path_list, minus_point_5=True)

Transforming C:/Users/Omar/Documents/Tarecda/aisg-sla/data//pose_GT/01.txt...
Trajectory 1: shape=(1399, 6)
elapsed time = 0.009997844696044922
Numbers of frames in training dataset: 1399
mean_tensor =  [0.051340546093462204, 0.058663660712425306, 0.036986723011486415]
mean_np =  [140.59183676930283, 142.45923109812955, 136.93161197517873]
std_tensor =  [0.26167191129873923, 0.26125147532781323, 0.30125377150941884]
std_np =  [66.72633676808925, 66.61912628967855, 76.81971117246313]


# Main

In [None]:
import torch
from torch.utils.data import DataLoader
import numpy as np
import os
import time
import pandas as pd
from params import par
from model import DeepVO
from data_helper import get_data_info, SortedRandomBatchSampler, ImageSequenceDataset, get_partition_data_info


# Dump every attribute of par to the record file, framed by separator lines.
# Resumed runs append to the existing record; fresh runs overwrite it.
mode = 'a' if par.resume else 'w'
banner = '\n' + '='*50 + '\n'
hyperparam_lines = ["%s: %s" % (name, value) for name, value in vars(par).items()]
with open(par.record_path, mode) as f:
    # Layout: banner, one "name: value" line per attribute, banner
    f.write(banner + '\n'.join(hyperparam_lines) + banner)

# Prepare Data
# The cached data-info pickles are reused whenever both files exist. Only their
# existence is checked, so remove them after changing the sequence/partition
# settings in par, otherwise the old sample lists are loaded again.
if os.path.isfile(par.train_data_info_path) and os.path.isfile(par.valid_data_info_path):
    print('Load data info from {}'.format(par.train_data_info_path))
    train_df = pd.read_pickle(par.train_data_info_path)
    valid_df = pd.read_pickle(par.valid_data_info_path)
else:
    print('Create new data info')
    if par.partition is not None:
        # Split the training videos themselves into train/validation parts
        partition = par.partition
        train_df, valid_df = get_partition_data_info(partition, par.train_video, par.seq_len, overlap=1, sample_times=par.sample_times, shuffle=True, sort=True)
    else:
        # Separate video lists for training and validation
        train_df = get_data_info(folder_list=par.train_video, seq_len_range=par.seq_len, overlap=1, sample_times=par.sample_times)
        valid_df = get_data_info(folder_list=par.valid_video, seq_len_range=par.seq_len, overlap=1, sample_times=par.sample_times)
    # save the data info
    train_df.to_pickle(par.train_data_info_path)
    valid_df.to_pickle(par.valid_data_info_path)

train_sampler = SortedRandomBatchSampler(train_df, par.batch_size, drop_last=True)
train_dataset = ImageSequenceDataset(train_df, par.resize_mode, (par.img_w, par.img_h), par.img_means, par.img_stds, par.minus_point_5)
train_dl = DataLoader(train_dataset, batch_sampler=train_sampler, num_workers=par.n_processors, pin_memory=par.pin_mem)

valid_sampler = SortedRandomBatchSampler(valid_df, par.batch_size, drop_last=True)
valid_dataset = ImageSequenceDataset(valid_df, par.resize_mode, (par.img_w, par.img_h), par.img_means, par.img_stds, par.minus_point_5)
valid_dl = DataLoader(valid_dataset, batch_sampler=valid_sampler, num_workers=par.n_processors, pin_memory=par.pin_mem)

print('Number of samples in training dataset: ', len(train_df.index))
print('Number of samples in validation dataset: ', len(valid_df.index))
print('='*50)

# The per-epoch loss means divide by len(train_dl) / len(valid_dl). Fail here
# with a clear message instead of a ZeroDivisionError after a full training pass.
# (Presumably drop_last=True discards incomplete batches, so a small split can
# end up with none. Verify against SortedRandomBatchSampler.)
if len(train_dl) == 0 or len(valid_dl) == 0:
    raise ValueError(
        'Empty data loader (train batches: {}, valid batches: {}); '
        'check par.batch_size against the number of samples'.format(len(train_dl), len(valid_dl)))


# Model
M_deepvo = DeepVO(par.img_h, par.img_w, par.batch_norm)
use_cuda = torch.cuda.is_available()
if use_cuda:
    print('CUDA used.')
    M_deepvo = M_deepvo.cuda()


# Load FlowNet weights pretrained with FlyingChairs
# NOTE: the pretrained model assumes image rgb values in range [-0.5, 0.5]
# Skipped when resuming, because the resumed checkpoint replaces the weights anyway.
if par.pretrained_flownet and not par.resume:
    # Same checkpoint path on both devices. Without CUDA the tensors are remapped
    # to the CPU; map_location=None is torch.load's default placement.
    pretrained_w = torch.load(par.pretrained_flownet, map_location=None if use_cuda else 'cpu')
    print('Load FlowNet pretrained model')
    # Use only conv-layer-part of FlowNet as CNN for DeepVO:
    # copy just the entries whose names also exist in the DeepVO state dict
    model_dict = M_deepvo.state_dict()
    update_dict = {k: v for k, v in pretrained_w['state_dict'].items() if k in model_dict}
    model_dict.update(update_dict)
    M_deepvo.load_state_dict(model_dict)


# Create optimizer
# par.optim is indexed like a mapping: 'opt' selects the branch, 'lr' (and 'T'
# for 'Cosine') supply its settings.
if par.optim['opt'] == 'Adam':
    # NOTE(review): the learning rate is fixed at 0.001 here; par.optim['lr'] is not read
    optimizer = torch.optim.Adam(M_deepvo.parameters(), lr=0.001, betas=(0.9, 0.999))
elif par.optim['opt'] == 'Adagrad':
    optimizer = torch.optim.Adagrad(M_deepvo.parameters(), lr=par.optim['lr'])
elif par.optim['opt'] == 'Cosine':
    optimizer = torch.optim.SGD(M_deepvo.parameters(), lr=par.optim['lr'])
    # The scheduler is stepped per batch, so its period is T epochs worth of iterations
    T_iter = par.optim['T']*len(train_dl)
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_iter, eta_min=0, last_epoch=-1)
else:
    # Without this branch an unknown name would only surface later as a NameError on `optimizer`
    raise ValueError("Unsupported optimizer {!r}; expected 'Adam', 'Adagrad' or 'Cosine'".format(par.optim['opt']))

# Load trained DeepVO model and optimizer
# Only the model and optimizer states are restored; the cosine scheduler, if any, starts fresh.
if par.resume:
    # Remap checkpoint tensors to the CPU when CUDA is unavailable (None = torch.load default)
    resume_map_location = None if use_cuda else 'cpu'
    M_deepvo.load_state_dict(torch.load(par.load_model_path, map_location=resume_map_location))
    optimizer.load_state_dict(torch.load(par.load_optimizer_path, map_location=resume_map_location))
    print('Load model from: ', par.load_model_path)
    print('Load optimizer from: ', par.load_optimizer_path)


# Train
# Each epoch runs one pass over train_dl and one over valid_dl, appends the loss
# statistics to par.record_path, and checkpoints the model and optimizer
# whenever the validation or the training mean loss reaches a new minimum.
print('Record loss in: ', par.record_path)
min_loss_t = 1e10
min_loss_v = 1e10
check_interval = 1  # only consider saving on epochs that are a multiple of this
M_deepvo.train()
for ep in range(par.epochs):
    st_t = time.time()
    print('='*50)
    # Train
    M_deepvo.train()
    loss_mean = 0
    t_loss_list = []
    for _, t_x, t_y in train_dl:
        if use_cuda:
            t_x = t_x.cuda(non_blocking=par.pin_mem)
            t_y = t_y.cuda(non_blocking=par.pin_mem)
        ls = M_deepvo.step(t_x, t_y, optimizer).item()
        t_loss_list.append(ls)
        loss_mean += ls
        # par.optim is a mapping, so the optimizer name has to be read from its
        # 'opt' entry; the cosine schedule advances once per batch
        if par.optim['opt'] == 'Cosine':
            lr_scheduler.step()
    print('Train take {:.1f} sec'.format(time.time()-st_t))
    loss_mean /= len(train_dl)

    # Validation
    st_t = time.time()
    M_deepvo.eval()
    loss_mean_valid = 0
    v_loss_list = []
    # No gradients are needed for evaluation; skipping them avoids building autograd graphs
    with torch.no_grad():
        for _, v_x, v_y in valid_dl:
            if use_cuda:
                v_x = v_x.cuda(non_blocking=par.pin_mem)
                v_y = v_y.cuda(non_blocking=par.pin_mem)
            v_ls = M_deepvo.get_loss(v_x, v_y).item()
            v_loss_list.append(v_ls)
            loss_mean_valid += v_ls
    print('Valid take {:.1f} sec'.format(time.time()-st_t))
    loss_mean_valid /= len(valid_dl)

    # Append the epoch summary to the record; the file is closed even if a later save fails
    epoch_summary = 'Epoch {}\ntrain loss mean: {}, std: {:.2f}\nvalid loss mean: {}, std: {:.2f}\n'.format(ep+1, loss_mean, np.std(t_loss_list), loss_mean_valid, np.std(v_loss_list))
    with open(par.record_path, 'a') as f:
        f.write(epoch_summary)
    print(epoch_summary)

    # Save model
    # NOTE(review): the attribute is spelled `save_optimzer_path`; presumably that
    # matches its definition in params, so it is left untouched. Verify.
    # save if the valid loss decrease
    if loss_mean_valid < min_loss_v and ep % check_interval == 0:
        min_loss_v = loss_mean_valid
        print('Save model at ep {}, mean of valid loss: {}'.format(ep+1, loss_mean_valid))  # use 4.6 sec
        torch.save(M_deepvo.state_dict(), par.save_model_path+'.valid')
        torch.save(optimizer.state_dict(), par.save_optimzer_path+'.valid')
    # save if the training loss decrease
    if loss_mean < min_loss_t and ep % check_interval == 0:
        min_loss_t = loss_mean
        print('Save model at ep {}, mean of train loss: {}'.format(ep+1, loss_mean))
        torch.save(M_deepvo.state_dict(), par.save_model_path+'.train')
        torch.save(optimizer.state_dict(), par.save_optimzer_path+'.train')
    # Printed at the end of every epoch (this statement is inside the loop)
    print("fin")