# Data loader
12 users geo-spatial and kinematic features

## Load libraries

In [None]:
# use SWAG
!pip install torchcontrib


Collecting torchcontrib
  Downloading https://files.pythonhosted.org/packages/72/36/45d475035ab35353911e72a03c1c1210eba63b71e5a6917a9e78a046aa10/torchcontrib-0.0.2.tar.gz
Building wheels for collected packages: torchcontrib
  Building wheel for torchcontrib (setup.py) ... [?25l[?25hdone
  Created wheel for torchcontrib: filename=torchcontrib-0.0.2-cp36-none-any.whl size=7531 sha256=addf5de478747c7dcbf9c966c767e5a213a90110ffd3b0dee1e2a8a692d947e1
  Stored in directory: /root/.cache/pip/wheels/06/06/7b/a5f5920bbf4f12a2c927e438fac17d4cd9560f8336b00e9a99
Successfully built torchcontrib
Installing collected packages: torchcontrib
Successfully installed torchcontrib-0.0.2


In [None]:
%load_ext autoreload
%autoreload 2
import os
import sys
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchcontrib.optim import SWA     # use SWAG
from torch.autograd import Variable
from collections import OrderedDict
torch.manual_seed(1) # fix the random seed so that runs are repeatable
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # use the GPU when CUDA is available, otherwise the CPU

from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import KFold

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay, roc_auc_score, roc_curve, auc
from sklearn.metrics import recall_score,precision_score,f1_score

from sklearn.preprocessing import label_binarize
from scipy import interp
from itertools import cycle

import matplotlib
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator, FormatStrFormatter
matplotlib.rcParams['figure.figsize'] = [10, 10] # for square canvas




In [None]:
# from google.colab import drive
# drive.mount('/content/drive')
!pip install GPUtil

def check_GPU():
    gpu_info = !nvidia-smi
    gpu_info = '\n'.join(gpu_info)
    if gpu_info.find('failed') >= 0: 
        print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
        print('and then re-execute this cell.')
    else:
        print(gpu_info)

    from psutil import virtual_memory
    ram_gb = virtual_memory().total / 1e9
    print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

    if ram_gb < 20:
        print('To enable a high-RAM runtime, select the Runtime > "Change runtime type"')
        print('menu, and then select High-RAM in the Runtime shape dropdown. Then, ')
        print('re-execute this cell.')
    else:
        print('You are using a high-RAM runtime!')

# Import packages

import os,sys,humanize,psutil,GPUtil

# Define function
def mem_report():
  """Print the free CPU RAM and one status line per GPU reported by GPUtil.

  Each GPU line shows the free and total memory in MB; the 'Utilization'
  figure is the GPU's `memoryUtil` value multiplied by 100.
  """
  free_ram = psutil.virtual_memory().available
  print("CPU RAM Free: " + humanize.naturalsize(free_ram))

  gpu_line = 'GPU {:d} ... Mem Free: {:.0f}MB / {:.0f}MB | Utilization {:3.0f}%'
  for index, card in enumerate(GPUtil.getGPUs()):
    print(gpu_line.format(index, card.memoryFree, card.memoryTotal, card.memoryUtil*100))
    
# Run both reports once: memory summary first, then the GPU / RAM check.

mem_report()
check_GPU()

Collecting GPUtil
  Downloading https://files.pythonhosted.org/packages/ed/0e/5c61eedde9f6c87713e89d794f01e378cfd9565847d4576fa627d758c554/GPUtil-1.4.0.tar.gz
Building wheels for collected packages: GPUtil
  Building wheel for GPUtil (setup.py) ... [?25l[?25hdone
  Created wheel for GPUtil: filename=GPUtil-1.4.0-cp36-none-any.whl size=7411 sha256=f42ed613a1f6be4e35b15d82d882de05318255c6f5da84c9504c46e18619aceb
  Stored in directory: /root/.cache/pip/wheels/3d/77/07/80562de4bb0786e5ea186911a2c831fdd0018bda69beab71fd
Successfully built GPUtil
Installing collected packages: GPUtil
Successfully installed GPUtil-1.4.0
CPU RAM Free: 26.2 GB
GPU 0 ... Mem Free: 16120MB / 16130MB | Utilization   0%
Wed Jan 20 08:43:50 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persist

#### Check whether the folder is already present before download and unzip

In [None]:
working_path = '/content/drive/My Drive/s182190/data'
try:
  os.chdir(f'{working_path}')
except:
  print('Download and unzip folder')
  !wget https://staticavantipw.s3.eu-west-1.amazonaws.com/DeepLearning2020data/MMM_DataLoaderForStudentsC.zip
  !unzip '/content/MMM_DataLoaderForStudentsC.zip' -d '/content/drive/My Drive/'

# sys.path.append(f'{working_path}/src')
# print(sys.path)
from data_utils_conda import *
clear_output(wait=False)


## Load Data

In [None]:
# Load the merged dataset DataFrame from the `processed` folder under working_path.
# NOTE(review): unpickling can execute arbitrary code -- only load pickles from a trusted source.
dataset = pd.read_pickle(f'{working_path}/processed/dataset_merged.pkl')

### Load GPS+GIS fusion tensors - dimensionality 9x9x11 - ([see the explanation here](https://www.sciencedirect.com/science/article/pii/S0968090X20307385?dgcid=author#bfn4))

Remember to delete "image_data" after training, before loading "image_data" with the user_test partition, so that the dataloader can perform the test on your network, with weights and biases resulting from the training.

In [None]:
# Load the image tensors; ImageTensorDataset later indexes this object as image_data[user][image_ix].
# NOTE(review): unpickling can execute arbitrary code -- only load pickles from a trusted source.
image_data = pd.read_pickle(f'{working_path}/processed/image_data.pkl')

User 1, image shape (17990, 9, 9, 11)
User 2, image shape (20132, 9, 9, 11)
User 3, image shape (83326, 9, 9, 11)
User 4, image shape (667160, 9, 9, 11)
User 5, image shape (14112, 9, 9, 11)
User 6, image shape (2344, 9, 9, 11)
User 7, image shape (22616, 9, 9, 11)
User 9, image shape (425710, 9, 9, 11)
User 10, image shape (86277, 9, 9, 11)
User 11, image shape (28845, 9, 9, 11)
User 12, image shape (117610, 9, 9, 11)


## Create train validation and test partitions

Since users 4 and 9 have many more points than the others, we use the `k` parameter to select a specific, predefined split into train, validation and test users. To learn more, see `train_test_data_split` in the data utilities module (`data_utils_conda`).

For the formal test of the algorithm later on, the `Random` parameter is set to `True`, in which case `k` is not used.

In [None]:
# Deterministic user partition (Random=False); see train_test_data_split for what k selects.
# The short names train / val / test are kept as aliases of the user_* lists.
user_train, user_val, user_test = train, val, test = train_test_data_split(
    dataset.user.unique(), Random=False, k=9
)
print(f'partition: train {user_train}, validation {user_val}, test {user_test}')

# For each partition, stack that partition's users in the listed order and
# renumber the rows from 0.
data_train, data_val, data_test = (
    pd.concat([dataset[dataset.user == uid] for uid in users]).reset_index(drop=True)
    for users in (user_train, user_val, user_test)
)

print(f'length: train {len(data_train)}, validation {len(data_val)}, test {len(data_test)}')

data_train.head(10)

partition: train [4, 1, 8, 12, 11, 7, 10, 5], validation [9], test [3, 2, 6]
length: train 585391, validation 200669, test 62031


Unnamed: 0,user,ts,image_ix,x,y,x_web,y_web,label2,labelP,labelM,delta_t,delta_d,bearing,speed,tod,segment_id,segment_ix,segment_point_count,mode,purpose
0,4,2018-06-15-05,3,1405763.0,7485806.0,1405763.0,7485806.0,2,0.0,5.0,126.0,167.68811,-1.169201,1.330858,0,1,0,1,Light Rail,-100
1,4,2018-06-15-08,3684,1394213.0,7516659.0,1394213.0,7516659.0,1,4.0,0.0,2139.0,6.735206,1.248921,0.003149,1,29,0,2,-100,Education
2,4,2018-06-15-08,3685,1394209.0,7516648.0,1394209.0,7516648.0,1,4.0,0.0,17.0,10.977184,-1.906872,0.645717,1,29,1,2,-100,Education
3,4,2018-06-15-08,3688,1394214.0,7516664.0,1394214.0,7516664.0,1,4.0,0.0,82.0,16.436425,1.248127,0.200444,1,30,0,2,-100,Education
4,4,2018-06-15-08,3689,1394213.0,7516659.0,1394213.0,7516659.0,1,4.0,0.0,41.0,5.115697,-1.864637,0.124773,1,30,1,2,-100,Education
5,4,2018-06-15-08,3692,1394216.0,7516667.0,1394216.0,7516667.0,1,4.0,0.0,103.0,8.163831,1.237253,0.07926,1,31,0,1,-100,Education
6,4,2018-06-15-08,3693,1394215.0,7516665.0,1394215.0,7516665.0,1,4.0,0.0,756.0,2.301488,-1.928052,0.003044,1,32,0,1,-100,Education
7,4,2018-06-15-08,3694,1394213.0,7516659.0,1394213.0,7516659.0,1,4.0,0.0,134.0,6.354567,-1.932308,0.047422,1,33,0,1,-100,Education
8,4,2018-06-15-08,3696,1394216.0,7516667.0,1394216.0,7516667.0,1,4.0,0.0,114.0,8.32573,1.206465,0.073033,1,34,0,1,-100,Education
9,4,2018-06-15-08,3697,1394216.0,7516667.0,1394216.0,7516667.0,1,4.0,0.0,140.0,0.0,0.0,0.0,1,35,0,1,-100,Education


### Define Image Tensor Dataloader

We implement our own tensor dataset so that sequences and images can be looked up quickly. To use additional features in the sequence input, add their column names to the list inside
```
df[['feat1','feat2',...]]
```

In [None]:
class ImageTensorDataset(torch.utils.data.Dataset):
    """Dataset pairing each retained row of `df` with its image and a lagged motion window.

    A row is retained when its `segment_ix` is at least `filter_seq`. Indexing
    the dataset returns the tuple
    `(image, seq, tod_one_hot, label2 - 1, labelM, labelP)` where

    * `image` is `image_data[user][image_ix]` for that row,
    * `seq` holds the `['delta_d', 'bearing']` values of the `filter_seq`
      preceding rows followed by the row itself (`filter_seq + 1` steps,
      oldest first),
    * `tod_one_hot` is row number `tod` of a 5x5 identity matrix.

    The window is built with `np.roll`, which wraps around: a retained row that
    sits fewer than `filter_seq` rows from the top of `df` picks up rows from
    the bottom of `df`.

    To feed more features into `seq`, extend the column list in `__init__`.
    """

    # Default number of look-back steps; bound as the __init__ default below.
    filter_seq = 5

    def __init__(self, df, image_data, filter_seq=filter_seq):
        # Evaluate the retention mask once and reuse it for every column.
        retained = df['segment_ix'] >= filter_seq
        rows = df[retained]

        # One shifted copy of the motion columns per lag, largest lag first, so
        # axis 1 of the stacked array runs from the oldest step to the current one.
        motion = df[['delta_d', 'bearing']].values
        lagged = [np.roll(motion, lag, axis=0) for lag in range(filter_seq, -1, -1)]
        self.seq = np.stack(lagged, axis=1)[retained]

        # Stored as read; __getitem__ subtracts 1 (original note: map 1,2 to 0,1).
        self.label2 = rows['label2'].values
        self.labelM = rows['labelM'].values
        self.labelP = rows['labelP'].values

        self.user_id = rows['user'].values
        self.image_ix = rows['image_ix'].values
        self.image_data = image_data
        self.tod_one_hot = np.eye(5)[rows['tod'].values]

    def __len__(self):
        """Number of retained rows."""
        return len(self.label2)

    def __getitem__(self, key):
        """Return `(image, seq, tod_one_hot, label2 - 1, labelM, labelP)` for retained row `key`."""
        user, image_no = self.user_id[key], self.image_ix[key]
        return (
            self.image_data[user][image_no],
            self.seq[key],
            self.tod_one_hot[key],
            self.label2[key] - 1,
            self.labelM[key],
            self.labelP[key],
        )

# Smoke test: build the dataset on the training partition and inspect one
# shuffled mini-batch of three items.
demo_dataset = ImageTensorDataset(data_train, image_data)
demo_loader = torch.utils.data.DataLoader(demo_dataset, batch_size=3, shuffle=True)

# y1, y2, y3 are the batched (label2 - 1), labelM and labelP entries of each item.
for X_img, X_seq, X_tod, y1, y2, y3 in demo_loader:
    print('X_img :', X_img.shape)
    print(X_img[0, :, :, 0])
    print('X_seq :', X_seq.shape)
    print(X_seq[0, :])
    print('X_tod :', X_tod.shape)
    print(X_tod[0])
    # The line is labelled 'y1:', so report y1's shape (it previously printed y2.shape).
    print('y1:', y1.shape)
    print(y1[0])
    print(y2[0])
    print(y3[0])
    break  # only the first batch is needed for the inspection

X_img : torch.Size([3, 9, 9, 11])
tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.]], dtype=torch.float64)
X_seq : torch.Size([3, 6, 2])
tensor([[0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.]], dtype=torch.float64)
X_tod : torch.Size([3, 5])
tensor([0., 1., 0., 0., 0.], dtype=torch.float64)
y1: torch.Size([3])
tensor(0)
tensor(0., dtype=torch.float64)
tensor(4., dtype=torch.float64)


### define train, val, test set and dataloader

In [None]:
    # Wrap each user partition in an ImageTensorDataset; all three share the same image_data lookup
    # and use the class's default filter_seq.
    train_dataset = ImageTensorDataset(data_train, image_data)
    val_dataset = ImageTensorDataset(data_val, image_data)
    test_dataset = ImageTensorDataset(data_test, image_data)