## Data Cleaning from json files



In [4]:
from classes.PoseDataset import PoseDataset
from models.DNN import DNN
import torch.optim as optim
from sklearn.model_selection import train_test_split
import os
import json
import numpy as np
import pandas as pd
from collections import defaultdict
import torch
from torch.utils.data import DataLoader
import time

pd.options.display.max_rows = 4000




## Training and testing

In [5]:
class AverageMeter(object):
    """Computes and stores the average and current value"""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def accuracy(output, target):
    """Computes the precision@k for the specified values of k"""
    batch_size = target.shape[0]

    _, pred = torch.max(output, dim=-1)

    correct = pred.eq(target).sum() * 1.0

    acc = correct / batch_size

    return acc

def train(epoch, data_loader, model, optimizer, criterion):
    iter_time = AverageMeter()
    losses = AverageMeter()
    acc = AverageMeter()

    for idx, (data, target) in enumerate(data_loader):
        start = time.time()

        if torch.cuda.is_available():
            data = data.cuda()
            target = target.cuda()

        #############################################################################
        # TODO: Complete the body of training loop                                  #
        #       1. forward data batch to the model                                  #
        #       2. Compute batch loss                                               #
        #       3. Compute gradients and update model parameters                    #
        #############################################################################
        model.train()
        out = model(data)
        loss = criterion(out, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        #############################################################################
        #                              END OF YOUR CODE                             #
        #############################################################################

        batch_acc = accuracy(out, target)

        losses.update(loss, out.shape[0])
        acc.update(batch_acc, out.shape[0])

        iter_time.update(time.time() - start)
        if idx % 100 == 0:
            print(('Epoch: [{0}][{1}/{2}]\t'
                   'Time {iter_time.val:.3f} ({iter_time.avg:.3f})\t'
                   'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                   'Prec @1 {top1.val:.4f} ({top1.avg:.4f})\t')
                  .format(epoch, idx, len(data_loader), iter_time=iter_time, loss=losses, top1=acc))
            
def validate(epoch, val_loader, model, criterion):
    iter_time = AverageMeter()
    losses = AverageMeter()
    acc = AverageMeter()

    num_class = 10
    cm = torch.zeros(num_class, num_class)
    # evaluation loop
    for idx, (data, target) in enumerate(val_loader):
        start = time.time()

        if torch.cuda.is_available():
            data = data.cuda()
            target = target.cuda()
        #############################################################################
        # TODO: Complete the body of training loop                                  #
        #       HINT: torch.no_grad()                                               #
        #############################################################################
        model.eval()
        with torch.no_grad():
            out = model(data)
            loss = criterion(out, target)

        #############################################################################
        #                              END OF YOUR CODE                             #
        #############################################################################

        batch_acc = accuracy(out, target)

        # update confusion matrix
        _, preds = torch.max(out, 1)
        # for t, p in zip(target.view(-1), preds.view(-1)):
        #     cm[t.long(), p.long()] += 1

        losses.update(loss, out.shape[0])
        acc.update(batch_acc, out.shape[0])

        iter_time.update(time.time() - start)
        if idx % 100 == 0:
            print(('Epoch: [{0}][{1}/{2}]\t'
                   'Time {iter_time.val:.3f} ({iter_time.avg:.3f})\t')
                  .format(epoch, idx, len(val_loader), iter_time=iter_time, loss=losses, top1=acc))
    # cm = cm / cm.sum(1)
    # per_cls_acc = cm.diag().detach().numpy().tolist()
    # for i, acc_i in enumerate(per_cls_acc):
    #     print("Accuracy of Class {}: {:.4f}".format(i, acc_i))

    print("* Prec @1: {top1.avg:.4f}".format(top1=acc))
    return acc.avg, cm


In [7]:
BATCH_SIZE=32

csv_file_path = "./output/pose_df.csv"
pose_data = pd.read_csv(csv_file_path,index_col=0)
raw_pose_data = pose_data
#[27,33] only have one sample each
pose_data = pose_data[pose_data["Y"] != 27]
pose_data = pose_data[pose_data["Y"] != 33]

#take only the sizes greater than 200
pose_data["size"] = pose_data.groupby('Y').Y.transform('size')
pose_data = pose_data[pose_data['size'] > 260]
pose_data = pose_data.drop(["size"],axis=1)
pose_data = pose_data.reset_index(drop=True)
n_classes = len(pose_data["Y"].unique())
class_remap = {v:i for i,v in enumerate(pose_data["Y"].unique())}
pose_data["Y"] = pose_data["Y"].map(class_remap)
pose_remap = {d['Y']:d["pose"] for d in pose_data[["Y",'pose']].to_dict("records")}
pose_data = pose_data.drop(["pose"],axis=1)



X_train, X_test, y_train, y_test = train_test_split(pose_data.iloc[:,0:-1], pose_data.iloc[:,-1], test_size=0.2, random_state=42,stratify=pose_data.iloc[:,-1])
# 
train_dataset = PoseDataset(x=X_train,y=y_train)
test_dataset = PoseDataset(X_test,y_test)

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE,shuffle=True, num_workers=2)
test_dataloader = DataLoader(test_dataset, batch_size=100,shuffle=False, num_workers=2)

In [47]:
model = DNN(numClasses=n_classes,hidden_dim=256)
if torch.cuda.is_available():
    model = model.cuda()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)


for epoch in range(1000):  # loop over the dataset multiple times

    # train loop
    train(epoch, train_dataloader, model, optimizer, criterion)

    # validation loop
    acc, cm = validate(epoch, test_dataloader, model, criterion)


print('Finished Training')

Epoch: [0][0/77]	Time 0.116 (0.116)	Loss 2.4092 (2.4092)	Prec @1 0.0625 (0.0625)	
Epoch: [0][0/7]	Time 0.002 (0.002)	
* Prec @1: 0.4066
Epoch: [1][0/77]	Time 0.006 (0.006)	Loss 1.9781 (1.9781)	Prec @1 0.3750 (0.3750)	
Epoch: [1][0/7]	Time 0.002 (0.002)	
* Prec @1: 0.4770
Epoch: [2][0/77]	Time 0.004 (0.004)	Loss 1.7875 (1.7875)	Prec @1 0.5312 (0.5312)	
Epoch: [2][0/7]	Time 0.002 (0.002)	
* Prec @1: 0.5377
Epoch: [3][0/77]	Time 0.008 (0.008)	Loss 1.6844 (1.6844)	Prec @1 0.6250 (0.6250)	
Epoch: [3][0/7]	Time 0.002 (0.002)	
* Prec @1: 0.5262
Epoch: [4][0/77]	Time 0.004 (0.004)	Loss 1.7493 (1.7493)	Prec @1 0.4688 (0.4688)	
Epoch: [4][0/7]	Time 0.002 (0.002)	
* Prec @1: 0.5787
Epoch: [5][0/77]	Time 0.004 (0.004)	Loss 1.7963 (1.7963)	Prec @1 0.4375 (0.4375)	
Epoch: [5][0/7]	Time 0.002 (0.002)	
* Prec @1: 0.6098
Epoch: [6][0/77]	Time 0.004 (0.004)	Loss 1.3887 (1.3887)	Prec @1 0.6250 (0.6250)	
Epoch: [6][0/7]	Time 0.002 (0.002)	
* Prec @1: 0.5984
Epoch: [7][0/77]	Time 0.004 (0.004)	Loss 1.0905 

In [None]:
y_train.loc[[0]].values

array([0.])

## Testing


In [None]:
pose_data.groupby(["Y"]).size()

Y
0.0      82
1.0     371
2.0     120
3.0     129
4.0     271
5.0       8
6.0     123
7.0     325
8.0      60
9.0     485
10.0    139
11.0    213
12.0    167
13.0     38
14.0     60
15.0     57
16.0    126
17.0     47
18.0     56
19.0      9
20.0     35
21.0    264
22.0     18
23.0    113
24.0     31
25.0    156
26.0     50
28.0     49
29.0    239
30.0     86
31.0     35
32.0      7
34.0    138
35.0    217
36.0    145
37.0    387
38.0     11
39.0      8
40.0     43
41.0     48
42.0     69
43.0    227
44.0     43
45.0    239
46.0      7
47.0    215
48.0    137
49.0    200
50.0    300
51.0     97
52.0     41
53.0    135
54.0     30
55.0      8
56.0     88
57.0      6
58.0    134
59.0     26
60.0     62
61.0     41
62.0     95
63.0    346
64.0    298
65.0     67
66.0     42
67.0     17
68.0     45
69.0    216
70.0     36
71.0    229
72.0     25
73.0     18
74.0     49
75.0    204
76.0     75
77.0    125
78.0      8
79.0     82
dtype: int64

In [None]:
correct = 0
total = 0
# since we're not training, we don't need to calculate the gradients for our outputs
with torch.no_grad():
    images, labels = X_test,y_test
    # calculate outputs by running images through the network
    outputs = model(images.float())
    # the class with the highest energy is what we choose as prediction
    _, predicted = torch.max(outputs.data, 1)
    total += labels.size(0)
    correct += (predicted == labels).sum().item()

print('Accuracy of the network on the 10000 test images: %d %%' % (
    100 * correct / total))

Accuracy of the network on the 10000 test images: 13 %


In [None]:
outputs

tensor([[ 0.1674, -0.9808, -0.2520,  ...,  2.2150, -0.5100, -3.3449],
        [-0.2813,  0.1347,  1.4681,  ...,  5.3942, -0.7866, -4.8951],
        [ 1.6224,  0.7240,  0.3740,  ...,  2.8399,  0.6893, -5.0767],
        ...,
        [-1.2346, -4.2762, -2.8379,  ..., -4.0123, -1.7167,  1.4468],
        [-0.3394, -1.2395,  0.6135,  ...,  0.0387, -1.3142, -2.5738],
        [-2.1884, -2.6262, -2.8384,  ..., -1.1630, -0.8836,  2.0409]])

In [None]:
from time import sleep
while True:
    sleep(10)
