# Notkun gervigreindar fyrir greiningu á þrívíddarmyndum

Nathan Holmes-King

In [143]:
import os
import random
import time

import numpy as np
import pandas as pd
import pywikibot
import sklearn as sk
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.model_selection import train_test_split
from stl import mesh
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
torch.set_default_device(device)

## Inngangsorð
Við ætlum að þjálfa gervigreindarlíkan til að greina þrívíddarmyndir. Í notkun eru líkön sem geta það, en þau nota alltaf "bitmap"-myndir. Þetta líkan hér notar "vector"-myndir eins og Envalys.

## Gögn
Þessi gögn eru STL-skrár frá Wikimedia Commons. Það eru fimm flokkar:
- líkamshlutar
- byggingar
- rúmfræði
- geimfarartæki
- styttur

### Sækja gögn

In [138]:
# Category name suffixes; each is appended to 'STL files of ' to form the
# Wikimedia Commons category that is queried.
flokkar = [
    'body parts',
    'buildings',
    'geometric shapes',
    'objects in space',
    'sculptures',
]
# skrar:  category name -> list of file titles collected for that category.
# catnum: category name -> integer class label.
skrar, catnum = {}, {}

In [139]:
# Download up to `max_per_category` STL files per category from Wikimedia
# Commons, record each file title in `skrar` and assign every category an
# integer label in `catnum`.
stl_dir = '/Users/002-nathan/Desktop/Envalys/STLdata/'
max_per_category = 100

commons = pywikibot.Site('commons', 'commons')
for cn, a in enumerate(flokkar):
    print(a)
    cat = pywikibot.Category(commons, 'STL files of ' + a)
    catnum[a] = cn
    n = 0
    for p in cat.members(member_type=['file']):
        if n % 10 == 0:
            print(n)  # progress indicator
        # Drop the first five characters of the page title (presumably the
        # "File:" namespace prefix) to get the bare file name.
        title = p.title()[5:]
        path = stl_dir + a + '_' + title
        # Reuse a copy that is already on disk; otherwise download it.  Only
        # files that are actually available locally are registered, so a
        # failed download cannot break the later mesh-loading step.
        if os.path.isfile(path) or pywikibot.FilePage(p).download(filename=path):
            skrar.setdefault(a, []).append(title)
        else:
            print('Download failed, skipping:', title)
        n += 1
        if n >= max_per_category:
            break

body parts
0
10
20
30
40
50
60
70
80
buildings
0
10
20
geometric shapes
0
10
20
30
40
objects in space
0
10
20
30
40
50
sculptures
0
10
20
30
40
50


### Setja upp gögn fyrir notkun
Við búum til greypingu ("embedding") fyrir punktana.

In [201]:
# From https://github.com/1zb/3DShape2VecSet/blob/master/models_ae.py
class PointEmbed(nn.Module):
    """Sinusoidal embedding of 3D points followed by a linear projection.

    Every coordinate axis is multiplied by the frequencies ``pi * 2**k`` for
    ``k = 0 .. hidden_dim // 6 - 1``.  The sines and cosines of those products
    are concatenated with the raw coordinates and mapped to ``dim`` features
    by a single linear layer.

    Args:
        hidden_dim: Length of the sin/cos feature vector; must be divisible
            by 6.
        dim: Number of output features per point.
    """

    def __init__(self, hidden_dim=48, dim=128):
        super().__init__()

        assert hidden_dim % 6 == 0

        self.embedding_dim = hidden_dim
        per_axis = self.embedding_dim // 6
        freqs = torch.pow(2, torch.arange(per_axis)).float() * np.pi

        # Block-diagonal layout: row `axis` carries the frequencies in its own
        # column slice and zeros everywhere else.
        basis = torch.zeros(3, 3 * per_axis)
        for axis in range(3):
            basis[axis, axis * per_axis:(axis + 1) * per_axis] = freqs
        self.register_buffer('basis', basis)  # 3 x (hidden_dim // 2)

        self.mlp = nn.Linear(self.embedding_dim + 3, dim)

    @staticmethod
    def embed(input, basis):
        """Return sin and cos of ``input`` projected onto ``basis``.

        ``input`` is B x N x D and ``basis`` is D x E; the result is
        B x N x 2E (sines first, then cosines).
        """
        projections = torch.einsum('bnd,de->bne', input, basis)
        return torch.cat((torch.sin(projections), torch.cos(projections)), dim=2)

    def forward(self, input):
        # input: B x N x 3
        features = torch.cat((self.embed(input, self.basis), input), dim=2)
        return self.mlp(features)  # B x N x dim

In [202]:
# Build the raw dataset: for every downloaded STL file, sample `num_points`
# vertices (with replacement) and store them together with the class label.
num_points = 256  # the classifier's first layer is sized for 256 points

X_preproc = []  # one array of shape (1, num_points, 3) per mesh
y_preproc = []  # one single-element label list per mesh
for cat in skrar:
    print(cat)
    byrjun = time.time()
    for fi in skrar[cat]:
        # Load data
        gogn = mesh.Mesh.from_file('/Users/002-nathan/Desktop/Envalys/STLdata/' + cat + '_' + fi)
        vertices = gogn.v0
        if len(vertices) == 0:
            # random.randint(0, -1) would raise; an empty mesh has nothing
            # to sample, so leave it out of the dataset.
            print('Skipping empty mesh:', fi)
            continue
        idx = [random.randint(0, len(vertices) - 1) for _ in range(num_points)]
        # One contiguous array instead of nested lists of rows: same shape
        # once converted with torch.tensor, but far cheaper to convert.
        X_preproc.append(vertices[idx][np.newaxis])
        y_preproc.append([catnum[cat]])
    print(time.time() - byrjun)  # seconds spent on this category
    print('----')

body parts
1.6972789764404297
----
buildings
0.9287159442901611
----
geometric shapes
1.2056632041931152
----
objects in space
exception (False, 'No lines found, impossible to read')
1.7605020999908447
----
sculptures
5.893723964691162
----


## Líkan

In [203]:
class likan(nn.Module):
    """Three-layer fully connected classifier over a flattened point set.

    The input is flattened to ``num_points * embed_dim`` features per sample
    and passed through 256- and 32-unit ReLU layers before the output layer.

    Args:
        num_points: Number of points per sample.
        embed_dim: Number of features per point.
        num_classes: Number of output classes.

    ``forward`` returns log-probabilities of shape (batch, num_classes),
    suitable for ``F.nll_loss``.
    """

    def __init__(self, num_points=256, embed_dim=128, num_classes=5):
        super().__init__()
        self.in_features = num_points * embed_dim
        self.fc1 = nn.Linear(self.in_features, 256)
        self.fc2 = nn.Linear(256, 32)
        self.fc3 = nn.Linear(32, num_classes)

    def forward(self, x):
        # reshape (rather than view) also accepts non-contiguous inputs.
        x = x.reshape(-1, self.in_features)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return F.log_softmax(self.fc3(x), dim=1)

In [239]:
# Hyperparameters for the SGD training loop.
num_epochs = 5        # passes over the training split per random state
learning_rate = 1e-3  # SGD step size
momentum = 1e-3       # SGD momentum coefficient

In [241]:
def _evaluate(model, X, y, num_classes):
    """Return (accuracy, confusion matrix) of `model` on samples X with labels y.

    X and y are parallel sequences: each element of X is one embedded sample
    (a batch of size 1) and each element of y the matching one-element label
    tensor.  Rows of the confusion matrix are true classes, columns are
    predicted classes.
    """
    confusion = np.zeros((num_classes, num_classes), dtype=np.int32)
    correct = 0
    with torch.no_grad():
        for x_n, y_n in zip(X, y):
            y_pred = model(x_n).argmax(dim=1)
            correct += y_pred.eq(y_n).sum().item()
            confusion[y_n.item()][y_pred.item()] += 1
    return correct / len(y), confusion


# Train a fresh classifier for several random train/test splits and collect
# the test accuracy after every epoch.
byrjun = time.time()
acc = {}  # epoch index -> list of test accuracies, one per random state
# The labels do not depend on the random state, so convert them only once.
y_data = [torch.tensor(a).to(device) for a in y_preproc]
for i in range(5):
    print('Random state:', i)
    # Seed torch as well, so that the embedding and model initialisation are
    # reproducible for a given random state (not only the data split).
    torch.manual_seed(i)
    point_embed = PointEmbed()
    # The embedding is never handed to the optimizer, so it is a fixed random
    # feature map.  Computing it without autograd avoids keeping its graph
    # alive, which is what previously forced backward(retain_graph=True).
    with torch.no_grad():
        X_data = [point_embed(torch.tensor(a)).to(device) for a in X_preproc]
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, random_state=i)
    model = likan()
    num_classes = model.fc3.out_features
    optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum)
    for e in range(num_epochs):
        print('Epoch:', e)
        # One optimizer step per sample (batch size 1).
        for x_n, y_n in zip(X_train, y_train):
            optimizer.zero_grad()
            loss = F.nll_loss(model(x_n), y_n)
            if device.type == 'mps':
                # Only valid on the MPS backend; `device` may be the CPU.
                torch.mps.synchronize()
            loss.backward()
            optimizer.step()
        # Only the loss of the last sample of the epoch is checked.
        if torch.isnan(loss).any():
            print('NaN!')
        train_acc, trainmat = _evaluate(model, X_train, y_train, num_classes)
        print('Train accuracy:', train_acc)
        print(trainmat)
        test_acc, testmat = _evaluate(model, X_test, y_test, num_classes)
        print('Test accuracy:', test_acc)
        print(testmat)
        acc.setdefault(e, []).append(test_acc)
        print('Time:', time.time() - byrjun)
        print('----')
    print('----')
for a in acc:
    print('Epoch', a, 'mean test accuracy:', np.mean(acc[a]))

Random state: 0
Epoch: 0
Train accuracy: 0.24479166666666666
[[18  0  0  9 37]
 [ 8  1  0  0 10]
 [17  2  0  0 14]
 [10  1  0  3 18]
 [19  0  0  0 25]]
Test accuracy: 0.234375
[[ 8  1  0  2  7]
 [ 1  0  0  0  4]
 [ 4  1  0  2  5]
 [ 4  0  0  0 15]
 [ 3  0  0  0  7]]
Time: 4.262437105178833
----
Epoch: 1
Train accuracy: 0.3802083333333333
[[30  0  0  3 31]
 [ 0  2  0  0 17]
 [ 3  1  0  3 26]
 [ 3  2  0  1 26]
 [ 3  1  0  0 40]]
Test accuracy: 0.25
[[ 5  0  0  3 10]
 [ 0  1  0  0  4]
 [ 0  0  0  0 12]
 [ 0  1  0  0 18]
 [ 0  0  0  0 10]]
Time: 6.239622116088867
----
Epoch: 2
Train accuracy: 0.3333333333333333
[[32  0  0  0 32]
 [ 2  3  0  0 14]
 [18  1  0  0 14]
 [10  1  0  1 20]
 [15  1  0  0 28]]
Test accuracy: 0.3125
[[10  0  0  1  7]
 [ 0  1  0  0  4]
 [ 6  0  0  0  6]
 [ 2  1  0  0 16]
 [ 1  0  0  0  9]]
Time: 8.172873973846436
----
Epoch: 3
Train accuracy: 0.2864583333333333
[[13  0  0 14 37]
 [ 0  3  0  0 16]
 [ 6  0  0  0 27]
 [ 3  1  0  1 27]
 [ 5  1  0  0 38]]
Test accuracy: 0.

CRITICAL: Exiting due to uncaught exception KeyboardInterrupt: 


Líkönin spá réttum flokki fyrir 24% myndanna. Það er verra en fyrsti hátturinn, en þjálfunin er 100 sinnum fljótari; ég get bætt líkönin miklu meira.

## Lokaorð

Hvernig notum við líkan eins og þetta?
- Til að greina þrívíddarmyndir sem notendur teikna.
- Sem fyrsta skref í stærra líkani sem teiknar sjálft eftir texta sem notendur skrifa.

Vandamál:
- **Ekki næg gögn.** Þess vegna eru svo margar sveiflur í spánum milli líkana. Alvörulíkön nota fleiri en 10.000 myndir til að þjálfast.
- Ekki nógir flokkar. Allir flokkar, sérstaklega "líkamshlutar", eru mjög fjölbreyttir og alls ekki eins fyrir tölvuna.
- Við notum aðeins punkta, ekki línur eða fleti, til að greina.
- Tekur of langan tíma. Ég nota CPU, sem er miklu hægara en GPU, af því að GPU á Mac getur ekki gert `nn.Conv3d()`, og Google Colab er of dýrt.