In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras

In [2]:
data_path = keras.utils.get_file(
    "news20.tar.gz",
    "http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.tar.gz",
    untar=True,
)

Downloading data from http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.tar.gz


In [3]:
import os
import pathlib

data_dir = pathlib.Path(data_path).parent / "20_newsgroup"
dirnames = os.listdir(data_dir)
print("Number of directories:", len(dirnames))
print("Directory names:", dirnames)

fnames = os.listdir(data_dir / "comp.graphics")
print("Number of files in comp.graphics:", len(fnames))
print("Some example filenames:", fnames[:5])

Number of directories: 20
Directory names: ['comp.windows.x', 'rec.sport.hockey', 'rec.sport.baseball', 'comp.graphics', 'rec.autos', 'talk.politics.misc', 'talk.religion.misc', 'comp.os.ms-windows.misc', 'misc.forsale', 'talk.politics.mideast', 'alt.atheism', 'sci.med', 'sci.electronics', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'sci.crypt', 'sci.space', 'talk.politics.guns', 'soc.religion.christian', 'rec.motorcycles']
Number of files in comp.graphics: 1000
Some example filenames: ['38961', '38561', '38506', '38790', '39623']


In [4]:
print(open(data_dir / "comp.graphics" / "38987").read())

Newsgroups: comp.graphics
Path: cantaloupe.srv.cs.cmu.edu!das-news.harvard.edu!noc.near.net!howland.reston.ans.net!agate!dog.ee.lbl.gov!network.ucsd.edu!usc!rpi!nason110.its.rpi.edu!mabusj
From: mabusj@nason110.its.rpi.edu (Jasen M. Mabus)
Subject: Looking for Brain in CAD
Message-ID: <c285m+p@rpi.edu>
Nntp-Posting-Host: nason110.its.rpi.edu
Reply-To: mabusj@rpi.edu
Organization: Rensselaer Polytechnic Institute, Troy, NY.
Date: Thu, 29 Apr 1993 23:27:20 GMT
Lines: 7

Jasen Mabus
RPI student

	I am looking for a hman brain in any CAD (.dxf,.cad,.iges,.cgm,etc.) or picture (.gif,.jpg,.ras,etc.) format for an animation demonstration. If any has or knows of a location please reply by e-mail to mabusj@rpi.edu.

Thank you in advance,
Jasen Mabus  



In [5]:
samples = []
labels = []
class_names = []
class_index = 0
for dirname in sorted(os.listdir(data_dir)):
    class_names.append(dirname)
    dirpath = data_dir / dirname
    fnames = os.listdir(dirpath)
    print("Processing %s, %d files found" % (dirname, len(fnames)))
    for fname in fnames:
        fpath = dirpath / fname
        f = open(fpath, encoding="latin-1")
        content = f.read()
        lines = content.split("\n")
        lines = lines[10:]
        content = "\n".join(lines)
        samples.append(content)
        labels.append(class_index)
    class_index += 1

print("Classes:", class_names)
print("Number of samples:", len(samples))

Processing alt.atheism, 1000 files found
Processing comp.graphics, 1000 files found
Processing comp.os.ms-windows.misc, 1000 files found
Processing comp.sys.ibm.pc.hardware, 1000 files found
Processing comp.sys.mac.hardware, 1000 files found
Processing comp.windows.x, 1000 files found
Processing misc.forsale, 1000 files found
Processing rec.autos, 1000 files found
Processing rec.motorcycles, 1000 files found
Processing rec.sport.baseball, 1000 files found
Processing rec.sport.hockey, 1000 files found
Processing sci.crypt, 1000 files found
Processing sci.electronics, 1000 files found
Processing sci.med, 1000 files found
Processing sci.space, 1000 files found
Processing soc.religion.christian, 997 files found
Processing talk.politics.guns, 1000 files found
Processing talk.politics.mideast, 1000 files found
Processing talk.politics.misc, 1000 files found
Processing talk.religion.misc, 1000 files found
Classes: ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.ha

In [6]:
# Shuffle the data
seed = 12345
rng = np.random.RandomState(seed)
rng.shuffle(samples)
rng = np.random.RandomState(seed)
rng.shuffle(labels)

# Extract a training & validation split
validation_split = 0.2
num_validation_samples = int(validation_split * len(samples))

train_samples = samples[:-num_validation_samples]
val_samples   = samples[-num_validation_samples:]
train_labels  = labels[:-num_validation_samples]
val_labels    = labels[-num_validation_samples:]

In [7]:
from tensorflow.keras.layers import TextVectorization

vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=200)
text_ds = tf.data.Dataset.from_tensor_slices(train_samples).batch(128)
vectorizer.adapt(text_ds)

In [8]:
vectorizer.get_vocabulary()[:10]

['', '[UNK]', 'the', 'to', 'of', 'a', 'and', 'in', 'is', 'i']

In [9]:
output = vectorizer([["the cat sat on the mat"]])
output.numpy()[0, :6]

array([   2, 3762, 1723,   15,    2, 5624])

In [10]:
voc = vectorizer.get_vocabulary()
word_index = dict(zip(voc, range(len(voc))))

In [11]:
test = ["the", "cat", "sat", "on", "the", "mat"]
[word_index[w] for w in test]

[2, 3762, 1723, 15, 2, 5624]

In [12]:
x_train = vectorizer(np.array([[s] for s in train_samples])).numpy()
x_val = vectorizer(np.array([[s] for s in val_samples])).numpy()

y_train = np.array(train_labels)
y_val = np.array(val_labels)

In [13]:
x_train[0]

array([ 2496,  1609,  2650,  1960,   181,   420,  1307,  1747,   492,
         266,   130,   690,   242,   156,  9739,     1,  1699,   800,
           5,  8000,  9733,  7164,  4604,  8306, 19592,  3240,  1033,
         877,    64,   417,   106,   417,   106,   106,   106,   106,
         135,    32,  2266,   229,   336,  2097,  9105,    79,   245,
         155,   114,   135,   106,    79,   106,   106,   506,  2831,
        6047,   314,  3373,  6072,   114,   265,   203,    64,    79,
         106,   106,   106,   203,  9072,  2713,  6047,  2620,  3793,
        6726,   114,   265,   135,   114,    64,    64,   106,   106,
         135,    32,  2274,   229,  2220,  3455,  3911,   114,   265,
         114,   135,    79,    64,   106,   106,   601,    32,  2013,
        6047,   449,     1,  7568,   114,   265,    79,   155,    64,
         106,   106,   106,   336,    32,  2325,   229,   496,  1476,
        4515,   114,   265,    64,   203,   106,   106,   106,   106,
         106,    32,

In [14]:
# tf.data.Dataset

def create_dataset(x, y):
    dataset = tf.data.Dataset.from_tensor_slices((x, y))
    dataset = dataset.batch(100)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)
    return dataset


train_ds = create_dataset(x_train, y_train)
test_ds = create_dataset(x_val, y_val)

In [15]:
for x, y in train_ds:
    print(x[0])
    print(y[0])
    break

tf.Tensor(
[ 2496  1609  2650  1960   181   420  1307  1747   492   266   130   690
   242   156  9739     1  1699   800     5  8000  9733  7164  4604  8306
 19592  3240  1033   877    64   417   106   417   106   106   106   106
   135    32  2266   229   336  2097  9105    79   245   155   114   135
   106    79   106   106   506  2831  6047   314  3373  6072   114   265
   203    64    79   106   106   106   203  9072  2713  6047  2620  3793
  6726   114   265   135   114    64    64   106   106   135    32  2274
   229  2220  3455  3911   114   265   114   135    79    64   106   106
   601    32  2013  6047   449     1  7568   114   265    79   155    64
   106   106   106   336    32  2325   229   496  1476  4515   114   265
    64   203   106   106   106   106   106    32  2255   464  3147   507
  8072   114   265    64   203   106   106   106   106   106   506  2713
   229   314  1344  6328   417   203   135    79   106   106    79   106
   135    32  2651   229   242  1496 106

In [16]:
# training

from tensorflow.keras import layers

class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

In [17]:
# Two seperate embedding layers, one for tokens, one for token index (positions)

class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

In [23]:
from tensorflow import keras

vocab_size   = 20000
sequence_len = 200

embed_dim = 128  # Embedding size for each token
num_heads = 6    # Number of attention heads
ff_dim = 128     # Hidden layer size in feed forward network inside transformer

inputs = layers.Input(shape=(sequence_len,))
embedding_layer = TokenAndPositionEmbedding(sequence_len, vocab_size, embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(32, activation="relu")(x)
x = layers.Dropout(0.1)(x)
outputs = layers.Dense(20, activation="softmax")(x)

model = keras.Model(inputs=inputs, outputs=outputs)
model.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 200)]             0         
                                                                 
 token_and_position_embeddin  (None, 200, 128)         2585600   
 g_2 (TokenAndPositionEmbedd                                     
 ing)                                                            
                                                                 
 transformer_block_2 (Transf  (None, 200, 128)         429184    
 ormerBlock)                                                     
                                                                 
 global_average_pooling1d_2   (None, 128)              0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dropout_10 (Dropout)        (None, 128)               0   

In [24]:
opt = tf.keras.optimizers.Adam(learning_rate=0.001)

model.compile(optimizer=opt, loss="sparse_categorical_crossentropy", metrics=["accuracy"])
history = model.fit(train_ds, batch_size=32, epochs=10, validation_data=test_ds)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
