## Data cleaning from JSON files



In [1]:
import json
import os
import time
from collections import defaultdict

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader

# Let pandas print up to 4000 rows before truncating a displayed frame/series.
pd.set_option("display.max_rows", 4000)

 

In [2]:
class PoseDataset(Dataset):
    """Map-style dataset pairing rows of pose features with class labels.

    Samples are fetched positionally (``.iloc``), so the pandas index of ``x``
    and ``y`` does not have to be contiguous (e.g. after a shuffled split).
    """

    def __init__(self, x,y):
        """
        Args:
            x (pandas.DataFrame): One row of numeric features per sample.
            y (pandas.Series): One class label per sample, aligned
                position-by-position with ``x``.

        Raises:
            ValueError: If ``x`` and ``y`` do not have the same length.
        """
        if len(x) != len(y):
            raise ValueError(
                "x and y must have the same number of rows, got %d and %d"
                % (len(x), len(y))
            )
        self.x  = x
        self.y = y

    def __len__(self):
        """Number of samples in the dataset."""
        return len(self.x)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for ``idx``.

        ``idx`` may be an int, a list of ints, or a tensor holding either.
        Features come back as ``float32`` and labels as ``int64``; a scalar
        index yields a 1-D feature tensor and a 0-d label tensor.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Go through NumPy with an explicit dtype instead of handing pandas
        # objects to the legacy FloatTensor constructor; torch.tensor copies,
        # so the returned tensors never alias the underlying frame.
        features = torch.tensor(np.asarray(self.x.iloc[idx], dtype=np.float32))
        label = torch.tensor(np.asarray(self.y.iloc[idx], dtype=np.int64))

        return features, label

In [None]:
# Scratch check: build a float tensor from the labels stored under index labels 0 and 3.
# NOTE(review): depends on `y_train`, which is only created in the data-loading cell
# further down; `.loc` is label-based, so this presumably raises KeyError if either
# row ended up in the test split - verify before relying on it.
torch.FloatTensor(y_train.loc[[0,3]])

tensor([0., 0.])

In [62]:
BATCH_SIZE = 32
TEST_BATCH_SIZE = 100
LABEL_COL = "Y"
# A class is kept only if it has MORE than this many samples.
MIN_SAMPLES_PER_CLASS = 300
TEST_FRACTION = 0.2
SPLIT_SEED = 42

# NOTE(review): absolute Google-Drive path; adjust when running elsewhere.
csv_file_path = "/content/drive/Othercomputers/My PC/cs7643-project/output/pose_df.csv"
pose_data = pd.read_csv(csv_file_path, index_col=0)

# Classes 27 and 33 only have one sample each, so drop them up front.
pose_data = pose_data[~pose_data[LABEL_COL].isin([27, 33])]

# Keep only the classes with more than MIN_SAMPLES_PER_CLASS samples. The
# per-row class size is used directly as a mask, so no temporary column is
# added to (and later dropped from) the frame.
class_sizes = pose_data.groupby(LABEL_COL)[LABEL_COL].transform("size")
pose_data = pose_data[class_sizes > MIN_SAMPLES_PER_CLASS].reset_index(drop=True)

# Remap the surviving labels to 0..n_classes-1 (in order of first appearance)
# so they can be used directly as class indices by the loss.
class_remap = {label: i for i, label in enumerate(pose_data[LABEL_COL].unique())}
n_classes = len(class_remap)
pose_data[LABEL_COL] = pose_data[LABEL_COL].map(class_remap)

# Select the label by name rather than by position so the split stays correct
# even if "Y" is not the last column of the CSV.
features = pose_data.drop(columns=LABEL_COL)
labels = pose_data[LABEL_COL]

# Stratify so every class keeps the same proportion in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
    stratify=labels,
)

train_dataset = PoseDataset(x=X_train, y=y_train)
test_dataset = PoseDataset(X_test, y_test)

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
test_dataloader = DataLoader(test_dataset, batch_size=TEST_BATCH_SIZE, shuffle=False, num_workers=2)

In [63]:
# Number of classes that survived filtering (one remap entry per remaining label).
len(class_remap)

5

## The DNN

In [56]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [57]:

class DNN(nn.Module):
    """Two-layer fully connected classifier with a sigmoid hidden activation.

    Args:
        numClasses: Width of the output layer (one score per class).
        features: Number of input features per sample.
        hidden_dim: Width of the hidden layer.
    """

    def __init__(self,numClasses = 80,features=75,hidden_dim=128):
        super().__init__()

        # input -> hidden projection
        self.fc1 = nn.Linear(in_features=features, out_features=hidden_dim)
        # hidden -> per-class scores
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=numClasses)

    def forward(self, x):
        """Return the raw (un-normalised) class scores for ``x``."""
        hidden = torch.sigmoid(self.fc1(x))
        return self.fc2(hidden)


## Training and testing

In [58]:
class AverageMeter(object):
    """Keeps the most recent value of a metric plus its weighted running mean."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero every tracked statistic."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as the latest observation, counted ``n`` times."""
        self.val = val
        self.count += n
        self.sum += val * n
        # Weighted mean over everything seen since the last reset.
        self.avg = self.sum / self.count


def accuracy(output, target):
    """Fraction of samples whose highest-scoring class equals the target.

    Args:
        output: Score tensor; the predicted class is the arg-max over the
            last dimension.
        target: Tensor of true class indices; its first dimension is taken
            as the batch size.

    Returns:
        A 0-d floating-point tensor: number of matches divided by batch size.
    """
    predicted = output.max(dim=-1).indices
    # Multiplying by 1.0 turns the integer match count into a float tensor.
    n_correct = (predicted == target).sum() * 1.0
    return n_correct / target.shape[0]

def train(epoch, data_loader, model, optimizer, criterion):
    """Run one training epoch over ``data_loader``.

    For every batch: forward pass, loss computation, back-propagation and an
    optimizer step. Running loss/accuracy/time statistics are printed every
    100 batches. Nothing is returned; ``model`` and ``optimizer`` are updated
    in place.

    Args:
        epoch: Epoch number, used only in the progress message.
        data_loader: Iterable yielding ``(data, target)`` tensor batches.
        model: Network being trained.
        optimizer: Optimizer bound to ``model``'s parameters.
        criterion: Loss callable taking ``(output, target)``.
    """
    iter_time = AverageMeter()
    losses = AverageMeter()
    acc = AverageMeter()

    # Send each batch to the device the model actually lives on. Moving data
    # to CUDA merely because a GPU exists breaks when the model is on the CPU.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Training mode only needs to be set once per epoch, not once per batch.
    model.train()

    for idx, (data, target) in enumerate(data_loader):
        start = time.time()

        data = data.to(device)
        target = target.to(device)

        out = model(data)
        loss = criterion(out, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_acc = accuracy(out, target)

        # Store plain Python floats: accumulating the loss tensor itself would
        # keep every batch's autograd graph alive until the epoch finishes.
        losses.update(loss.item(), out.shape[0])
        acc.update(batch_acc.item(), out.shape[0])

        iter_time.update(time.time() - start)
        if idx % 100 == 0:
            print(('Epoch: [{0}][{1}/{2}]\t'
                   'Time {iter_time.val:.3f} ({iter_time.avg:.3f})\t'
                   'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                   'Prec @1 {top1.val:.4f} ({top1.avg:.4f})\t')
                  .format(epoch, idx, len(data_loader), iter_time=iter_time, loss=losses, top1=acc))
            
def validate(epoch, val_loader, model, criterion, num_class=None):
    """Evaluate ``model`` on ``val_loader`` without updating its weights.

    Args:
        epoch: Epoch number, used only in the progress message.
        val_loader: Iterable yielding ``(data, target)`` tensor batches.
        model: Network to evaluate (switched to eval mode).
        criterion: Loss callable taking ``(output, target)``.
        num_class: Side length of the confusion matrix. When ``None`` it is
            taken from the last dimension of the model output; if given, it
            must be at least that large.

    Returns:
        ``(acc.avg, cm)``: the sample-weighted mean accuracy over all batches
        and a ``num_class x num_class`` CPU tensor of counts where
        ``cm[t, p]`` is the number of samples with true class ``t`` predicted
        as ``p``. If ``val_loader`` yields no batches and ``num_class`` was
        not given, ``cm`` is an empty ``0 x 0`` tensor.
    """
    iter_time = AverageMeter()
    losses = AverageMeter()
    acc = AverageMeter()

    # Send each batch to the device the model actually lives on. Moving data
    # to CUDA merely because a GPU exists breaks when the model is on the CPU.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Allocated lazily when num_class has to be inferred from the first batch.
    cm = None if num_class is None else torch.zeros(num_class, num_class)

    model.eval()
    # evaluation loop
    for idx, (data, target) in enumerate(val_loader):
        start = time.time()

        data = data.to(device)
        target = target.to(device)

        with torch.no_grad():
            out = model(data)
            loss = criterion(out, target)

        batch_acc = accuracy(out, target)

        # update confusion matrix
        _, preds = torch.max(out, 1)
        if cm is None:
            num_class = out.shape[-1]
            cm = torch.zeros(num_class, num_class)
        # Encode each (true, predicted) pair as one flat index and count all
        # pairs of the batch in a single vectorised bincount.
        pair_index = target.view(-1).long().cpu() * num_class + preds.view(-1).long().cpu()
        pair_counts = torch.bincount(pair_index, minlength=num_class * num_class)
        cm += pair_counts.reshape(num_class, num_class).to(cm.dtype)

        losses.update(loss.item(), out.shape[0])
        # Kept as a tensor so the returned average has the same type as before.
        acc.update(batch_acc, out.shape[0])

        iter_time.update(time.time() - start)
        if idx % 100 == 0:
            print(('Epoch: [{0}][{1}/{2}]\t'
                   'Time {iter_time.val:.3f} ({iter_time.avg:.3f})\t')
                  .format(epoch, idx, len(val_loader), iter_time=iter_time, loss=losses, top1=acc))

    if cm is None:
        # No batches were seen and no size was requested.
        cm = torch.zeros(0, 0)

    print("* Prec @1: {top1.avg:.4f}".format(top1=acc))
    return acc.avg, cm


# def adjust_learning_rate(optimizer, epoch, args):
#     epoch += 1
#     if epoch <= args.warmup:
#         lr = args.learning_rate * epoch / args.warmup
#     elif epoch > args.steps[1]:
#         lr = args.learning_rate * 0.01
#     elif epoch > args.steps[0]:
#         lr = args.learning_rate * 0.1
#     else:
#         lr = args.learning_rate
#     for param_group in optimizer.param_groups:
#         param_group['lr'] = lr

In [64]:
import torch.optim as optim

# Training hyper-parameters, named so they can be tuned in one place.
N_EPOCHS = 1000
LEARNING_RATE = 0.0001
MOMENTUM = 0.9
HIDDEN_DIM = 256

# Put the model on the GPU when one is present: train()/validate() move the
# batches there, so a CPU-resident model would hit a device mismatch.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Size the input layer from the data instead of relying on the class default.
model = DNN(numClasses=n_classes, features=X_train.shape[1], hidden_dim=HIDDEN_DIM).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM)


for epoch in range(N_EPOCHS):  # loop over the dataset multiple times

    # train loop
    train(epoch, train_dataloader, model, optimizer, criterion)

    # validation loop
    acc, cm = validate(epoch, test_dataloader, model, criterion)


print('Finished Training')

Epoch: [0][0/48]	Time 0.011 (0.011)	Loss 1.7108 (1.7108)	Prec @1 0.2812 (0.2812)	
Epoch: [0][0/4]	Time 0.002 (0.002)	
* Prec @1: 0.2637
Epoch: [1][0/48]	Time 0.009 (0.009)	Loss 1.5379 (1.5379)	Prec @1 0.3125 (0.3125)	
Epoch: [1][0/4]	Time 0.002 (0.002)	
* Prec @1: 0.3499
Epoch: [2][0/48]	Time 0.004 (0.004)	Loss 1.5292 (1.5292)	Prec @1 0.4375 (0.4375)	
Epoch: [2][0/4]	Time 0.003 (0.003)	
* Prec @1: 0.4282
Epoch: [3][0/48]	Time 0.004 (0.004)	Loss 1.4507 (1.4507)	Prec @1 0.5000 (0.5000)	
Epoch: [3][0/4]	Time 0.002 (0.002)	
* Prec @1: 0.4648
Epoch: [4][0/48]	Time 0.010 (0.010)	Loss 1.4159 (1.4159)	Prec @1 0.4688 (0.4688)	
Epoch: [4][0/4]	Time 0.002 (0.002)	
* Prec @1: 0.5091
Epoch: [5][0/48]	Time 0.004 (0.004)	Loss 1.3022 (1.3022)	Prec @1 0.6562 (0.6562)	
Epoch: [5][0/4]	Time 0.002 (0.002)	
* Prec @1: 0.5117
Epoch: [6][0/48]	Time 0.008 (0.008)	Loss 1.3930 (1.3930)	Prec @1 0.6250 (0.6250)	
Epoch: [6][0/4]	Time 0.002 (0.002)	
* Prec @1: 0.5196
Epoch: [7][0/48]	Time 0.009 (0.009)	Loss 1.3306 

In [None]:
# Scratch check: peek at the label stored under index label 0 of the training split.
# NOTE(review): `.loc` is label-based, so this presumably raises KeyError whenever
# original row 0 was assigned to the test split - verify.
y_train.loc[[0]].values

array([0.])

## Testing


In [None]:
# Number of samples per class label in `pose_data`.
# NOTE(review): the saved output lists dozens of float labels, so it appears to have
# been produced before the class filtering/remapping cell ran - confirm and re-run.
pose_data.groupby(["Y"]).size()

Y
0.0      82
1.0     371
2.0     120
3.0     129
4.0     271
5.0       8
6.0     123
7.0     325
8.0      60
9.0     485
10.0    139
11.0    213
12.0    167
13.0     38
14.0     60
15.0     57
16.0    126
17.0     47
18.0     56
19.0      9
20.0     35
21.0    264
22.0     18
23.0    113
24.0     31
25.0    156
26.0     50
28.0     49
29.0    239
30.0     86
31.0     35
32.0      7
34.0    138
35.0    217
36.0    145
37.0    387
38.0     11
39.0      8
40.0     43
41.0     48
42.0     69
43.0    227
44.0     43
45.0    239
46.0      7
47.0    215
48.0    137
49.0    200
50.0    300
51.0     97
52.0     41
53.0    135
54.0     30
55.0      8
56.0     88
57.0      6
58.0    134
59.0     26
60.0     62
61.0     41
62.0     95
63.0    346
64.0    298
65.0     67
66.0     42
67.0     17
68.0     45
69.0    216
70.0     36
71.0    229
72.0     25
73.0     18
74.0     49
75.0    204
76.0     75
77.0    125
78.0      8
79.0     82
dtype: int64

In [None]:
# Evaluate the trained model on the whole held-out split in one forward pass.
# X_test / y_test are pandas objects, so they are converted to tensors on the
# model's device before being fed to the network.
eval_device = next(model.parameters()).device
images = torch.tensor(X_test.to_numpy(), dtype=torch.float32, device=eval_device)
labels = torch.tensor(y_test.to_numpy(), dtype=torch.long, device=eval_device)

model.eval()
# since we're not training, we don't need to calculate the gradients for our outputs
with torch.no_grad():
    # calculate outputs by running the samples through the network
    outputs = model(images)
    # the class with the highest score is what we choose as prediction
    _, predicted = torch.max(outputs, 1)

total = labels.size(0)
correct = (predicted == labels).sum().item()

# Report the real sample count instead of a hard-coded "10000 test images".
print('Accuracy of the network on the %d test samples: %d %%' % (
    total, 100 * correct / total))

Accuracy of the network on the 10000 test images: 13 %


In [None]:
# Display the raw per-class scores computed for the test samples in the evaluation cell.
outputs

tensor([[ 0.1674, -0.9808, -0.2520,  ...,  2.2150, -0.5100, -3.3449],
        [-0.2813,  0.1347,  1.4681,  ...,  5.3942, -0.7866, -4.8951],
        [ 1.6224,  0.7240,  0.3740,  ...,  2.8399,  0.6893, -5.0767],
        ...,
        [-1.2346, -4.2762, -2.8379,  ..., -4.0123, -1.7167,  1.4468],
        [-0.3394, -1.2395,  0.6135,  ...,  0.0387, -1.3142, -2.5738],
        [-2.1884, -2.6262, -2.8384,  ..., -1.1630, -0.8836,  2.0409]])

In [None]:
# NOTE(review): this loop never terminates, so "Run All" blocks on this cell until the
# kernel is interrupted. Presumably a keep-alive so a hosted session does not idle
# out - confirm, and keep it as the very last cell.
from time import sleep
while True:
    sleep(10)
