# Notkun gervigreindar fyrir greiningu á þrívíddarmyndum

Nathan Holmes-King

In [28]:
import numpy as np
import pandas as pd
import pywikibot
import sklearn as sk
from sklearn.model_selection import train_test_split
from stl import mesh
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
device = torch.device("cpu")#"mps" if torch.backends.mps.is_available() else "cpu")
torch.set_default_device(device)

## Inngangsorð
Við ætlum að þjálfa gervigreindarlíkan til að greina þrívíddarmyndir. Í notkun eru líkön sem geta það, en þau nota alltaf "bitmap"-myndir. Þetta líkan hér notar "vector"-myndir eins og Envalys.

## Gögn
Þessi gögn eru STL-skrár frá Wikimedia Commons. Það eru fimm flokkar:
- líkamshlutar
- byggingar
- rúmfræði
- geimfarartæki
- styttur

### Sækja gögn

In [29]:
flokkar = ['body parts', 'buildings', 'geometric shapes', 'objects in space', 'sculptures']
skrar = {}
catnum = {}

In [30]:
commons = pywikibot.Site('commons', 'commons')
cn = 0
for a in flokkar:
    print(a)
    cat = pywikibot.Category(commons, 'STL files of ' + a)
    catnum[a] = cn
    cn += 1
    n = 0
    for p in cat.members(member_type=['file']):
        if n % 10 == 0:
            print(n)
        mynd = pywikibot.FilePage(p)
        try:
            tempf = open('/Users/002-nathan/Desktop/Envalys/STLdata/' + a + '_' + p.title()[5:], 'r')
            tempf.close()
        except FileNotFoundError:
            mynd.download(filename='/Users/002-nathan/Desktop/Envalys/STLdata/' + a + '_' + p.title()[5:])
        try:
            skrar[a].append(p.title()[5:])
        except KeyError:
            skrar[a] = [p.title()[5:]]
        n += 1
        if n >= 100:
            break

body parts
0
10
20
30
40
50
60
70
80
buildings
0
10
20
geometric shapes
0
10
20
30
40
objects in space
0
10
20
30
40
50
sculptures
0
10
20
30
40
50


### Setja upp gögn fyrir notkun
Við deilum myndinni í 2.097.152 (128x128x128) þrívíddardíla eða "voxels", teljum punktana í hverjum díl, og notum töluna til að greina myndina.

In [43]:
X_preproc = []
y_preproc = []
for cat in skrar:
    print(cat)
    byrjun = time.time()
    for fi in skrar[cat]:
        # Load data
        gogn = mesh.Mesh.from_file('/Users/002-nathan/Desktop/Envalys/STLdata/' + cat + '_' + fi)
        # Re-scale to be 
        gogn_x = [a[0] for a in gogn.v0]# + [a[0] for a in gogn.v1] + [a[0] for a in gogn.v2]
        gogn_y = [a[1] for a in gogn.v0]# + [a[1] for a in gogn.v1] + [a[1] for a in gogn.v2]
        gogn_z = [a[2] for a in gogn.v0]# + [a[2] for a in gogn.v1] + [a[2] for a in gogn.v2]
        minx = min(gogn_x)
        miny = min(gogn_y)
        minz = min(gogn_z)
        scale = min(128 / (max(gogn_x) + 1e-3 - minx), 
                    128 / (max(gogn_y) + 1e-3 - miny), 
                    128 / (max(gogn_z) + 1e-3 - minz))
        ny_gogn = np.zeros((1, 128, 128, 128), dtype=np.float32)
        for a in gogn.v0:
            x = int((a[0] - minx) * scale)
            y = int((a[1] - miny) * scale)
            z = int((a[2] - minz) * scale)
            ny_gogn[0][x][y][z] += 1
        X_preproc.append(ny_gogn)
        y_preproc.append([catnum[cat]])
    print(time.time() - byrjun)
    print('----')

body parts
45.23072910308838
----
buildings
25.889988899230957
----
geometric shapes
32.926254987716675
----
objects in space
exception (False, 'No lines found, impossible to read')
46.50938415527344
----
sculptures
118.43725895881653
----


In [83]:
X_data = []
y_data = []
for a in X_preproc:
    X_data.append(torch.from_numpy(a).to(device))
for a in y_preproc:
    y_data.append(torch.tensor(a).to(device))

## Líkan

In [84]:
class likan(nn.Module):
    def __init__(self):
        super(likan, self).__init__()
        self.conv1 = nn.Conv3d(1, 16, 5, padding=2)
        self.conv2 = nn.Conv3d(16, 32, 5, padding=2)
        self.conv3 = nn.Conv3d(32, 64, 5, padding=2)
        self.fc1 = nn.Linear(64*16*16*16, 1024)
        self.fc2 = nn.Linear(1024, 128)
        self.fc3 = nn.Linear(128, 5)  # Change second param
    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = F.max_pool3d(x, 2)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool3d(x, 2)
        x = self.conv3(x)
        x = F.relu(x)
        x = F.max_pool3d(x, 2)
        x = x.view(-1, 64*16*16*16)
        x = F.relu(self.fc1(x))
        x = F.dropout(x)
        x = F.relu(self.fc2(x))
        x = F.dropout(x)
        x = self.fc3(x)
        return F.log_softmax(x, dim=1)

In [85]:
num_epochs = 3
learning_rate = 0.001
momentum = 0.5

In [87]:
byrjun = time.time()
acc = {}
for i in range(5):
    print('Random state:', i)
    X_data = []
    y_data = []
    for a in X_preproc:
        X_data.append(torch.from_numpy(a).to(device))
    for a in y_preproc:
        y_data.append(torch.tensor(a).to(device))
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, random_state=i)
    for e in range(num_epochs):
        print('Epoch:', e)
        model = likan()
        optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum)
        for n in range(len(X_train)):
            optimizer.zero_grad()
            output = model(X_train[n])
            loss = F.nll_loss(output, y_train[n])
            loss.backward()
            optimizer.step()
        trainmat = np.zeros((5, 5), dtype=np.int32)  # Change params
        correct = 0
        with torch.no_grad():
            for n in range(len(X_train)):
                output = model(X_train[n])
                y_pred = output.data.max(1, keepdim=True)[1]
                correct += y_pred.eq(y_train[n]).sum().item()
                trainmat[y_train[n].item()][y_pred.item()] += 1
        print('Train accuracy:', correct / len(y_train))
        print(trainmat)
        testmat = np.zeros((5, 5), dtype=np.int32)  # Change params
        correct = 0
        with torch.no_grad():
            for n in range(len(X_test)):
                output = model(X_test[n])
                y_pred = output.data.max(1, keepdim=True)[1]
                correct += y_pred.eq(y_test[n]).sum().item()
                testmat[y_test[n].item()][y_pred.item()] += 1
        print('Test accuracy:', correct / len(y_test))
        print(testmat)
        try:
            acc[e].append(correct / len(y_test))
        except KeyError:
            acc[e] = [correct / len(y_test)]
        print('Time:', time.time() - byrjun)
        print('----')
    print('----')
for a in acc:
    print('Epoch', a, 'mean test accuracy:', np.mean(acc[a]))

Random state: 0
Epoch: 0
Train accuracy: 0.3717277486910995
[[60  0  3  1  0]
 [15  0  1  2  1]
 [30  0  2  1  0]
 [21  0  2  9  0]
 [41  0  2  0  0]]
Test accuracy: 0.3125
[[18  0  0  0  0]
 [ 5  0  0  0  0]
 [12  0  0  0  0]
 [16  0  0  2  1]
 [ 9  0  1  0  0]]
Time: 783.6924901008606
----
Epoch: 1
Train accuracy: 0.38219895287958117
[[50  0  3  0 11]
 [11  3  2  0  3]
 [26  0  4  0  3]
 [14  4  5  4  5]
 [24  0  7  0 12]]
Test accuracy: 0.34375
[[15  0  1  0  2]
 [ 2  0  1  0  2]
 [ 9  0  2  0  1]
 [10  1  6  1  1]
 [ 5  0  1  0  4]]
Time: 1526.0018949508667
----
Epoch: 2
Train accuracy: 0.24607329842931938
[[ 3 34  0  0 27]
 [ 1 12  0  0  6]
 [ 0 16  1  0 16]
 [ 0 21  1  0 10]
 [ 0 12  0  0 31]]
Test accuracy: 0.078125
[[ 0  9  0  0  9]
 [ 0  0  0  0  5]
 [ 0  8  0  0  4]
 [ 0 11  0  0  8]
 [ 0  5  0  0  5]]
Time: 2230.2684848308563
----
----
Random state: 1
Epoch: 0
Train accuracy: 0.32460732984293195
[[57  0  3  0  0]
 [15  0  4  0  0]
 [32  0  2  0  0]
 [19  0 16  0  2]
 [17  0 

Fyrir fjögur úr fimm líkön eru niðurstöður bestar eftir 2 lotur. Líkönin spá rétta flokkinn fyrir á milli 30% og 42% myndanna (meðaltalið er 35%).

## Lokaorð

Hvernig notum við líkan eins og þetta?
- Til að greina þrívíddarmyndir sem notendur teikna.
- Sem fyrsta skref í stærra líkani sem teiknir sjálft eftir texta sem notendur skrifa.

Vandamál:
- **Ekki nóg gögn.** Þess vegna eru svo margar sveiflur í spánum milli líkana. Alvörulíkön nota fleiri en 10.000 mynda til að þjálfast.
- Ekki nógir flokkar. Allir flokkar, sérstaklega "líkamshlutar", eru mjög fjölbreyttir og alls ekki eins fyrir tölvuna.
- Við notum aðeins punkta, ekki línur eða flatir, til að greina.

En þetta er það besta fyrir gögnin sem ég var með. Og það tók nóg langan tíma til að þjálfa líkanið; þess vegna þurfum við betri tölvu.