In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras

In [2]:
# Fetch the news20 archive through Keras and ask it to unpack the tarball
# (untar=True); `data_path` is whatever location get_file reports back.
news20_url = "http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.tar.gz"
data_path = keras.utils.get_file(fname="news20.tar.gz", origin=news20_url, untar=True)

Downloading data from http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.tar.gz


In [3]:
import os
import pathlib

# The corpus lives in a "20_newsgroup" folder that sits beside `data_path`.
data_dir = pathlib.Path(data_path).parent.joinpath("20_newsgroup")
dirnames = os.listdir(data_dir)
num_dirs = len(dirnames)
print("Number of directories:", num_dirs)
print("Directory names:", dirnames)

# Look inside a single newsgroup folder to see how the posts are stored.
graphics_dir = data_dir.joinpath("comp.graphics")
fnames = os.listdir(graphics_dir)
num_graphics_files = len(fnames)
print("Number of files in comp.graphics:", num_graphics_files)
print("Some example filenames:", fnames[:5])

Number of directories: 20
Directory names: ['rec.sport.baseball', 'sci.electronics', 'sci.crypt', 'talk.religion.misc', 'comp.windows.x', 'soc.religion.christian', 'talk.politics.mideast', 'talk.politics.misc', 'sci.space', 'sci.med', 'rec.sport.hockey', 'comp.graphics', 'alt.atheism', 'talk.politics.guns', 'misc.forsale', 'rec.motorcycles', 'rec.autos', 'comp.os.ms-windows.misc', 'comp.sys.mac.hardware', 'comp.sys.ibm.pc.hardware']
Number of files in comp.graphics: 1000
Some example filenames: ['38320', '38548', '38894', '38732', '37963']


In [4]:
# Show one raw post, headers included. The file is decoded as latin-1 (the
# encoding the corpus-loading loop uses) rather than the platform default, and
# the `with` block closes the handle instead of leaving it open.
with open(data_dir / "comp.graphics" / "38987", encoding="latin-1") as f:
    print(f.read())

Newsgroups: comp.graphics
Path: cantaloupe.srv.cs.cmu.edu!das-news.harvard.edu!noc.near.net!howland.reston.ans.net!agate!dog.ee.lbl.gov!network.ucsd.edu!usc!rpi!nason110.its.rpi.edu!mabusj
From: mabusj@nason110.its.rpi.edu (Jasen M. Mabus)
Subject: Looking for Brain in CAD
Message-ID: <c285m+p@rpi.edu>
Nntp-Posting-Host: nason110.its.rpi.edu
Reply-To: mabusj@rpi.edu
Organization: Rensselaer Polytechnic Institute, Troy, NY.
Date: Thu, 29 Apr 1993 23:27:20 GMT
Lines: 7

Jasen Mabus
RPI student

	I am looking for a hman brain in any CAD (.dxf,.cad,.iges,.cgm,etc.) or picture (.gif,.jpg,.ras,etc.) format for an animation demonstration. If any has or knows of a location please reply by e-mail to mabusj@rpi.edu.

Thank you in advance,
Jasen Mabus  



In [5]:
# Read every post into `samples` and record its class id in `labels`.
# `class_names[i]` is the newsgroup (directory) name for class id `i`.
samples = []
labels = []
class_names = []
class_index = 0
# Directories are visited in sorted order so each newsgroup always receives
# the same class index.
# NOTE(review): the files *within* a directory are taken in os.listdir order,
# which is not sorted; the seeded shuffle applied later is therefore only
# reproducible for an identical directory listing. Consider sorted(fnames).
for dirname in sorted(os.listdir(data_dir)):
    class_names.append(dirname)
    dirpath = data_dir / dirname
    fnames = os.listdir(dirpath)
    print("Processing %s, %d files found" % (dirname, len(fnames)))
    for fname in fnames:
        fpath = dirpath / fname
        # Close each file as soon as it is read: thousands of posts are opened
        # here and the handles were previously never released explicitly.
        with open(fpath, encoding="latin-1") as f:
            content = f.read()
        # Drop the first 10 lines of every post. Presumably this removes the
        # header block (the example post printed earlier has exactly 10 header
        # lines) -- posts with longer headers will keep some header text.
        lines = content.split("\n")
        lines = lines[10:]
        content = "\n".join(lines)
        samples.append(content)
        labels.append(class_index)
    class_index += 1

print("Classes:", class_names)
print("Number of samples:", len(samples))

Processing alt.atheism, 1000 files found
Processing comp.graphics, 1000 files found
Processing comp.os.ms-windows.misc, 1000 files found
Processing comp.sys.ibm.pc.hardware, 1000 files found
Processing comp.sys.mac.hardware, 1000 files found
Processing comp.windows.x, 1000 files found
Processing misc.forsale, 1000 files found
Processing rec.autos, 1000 files found
Processing rec.motorcycles, 1000 files found
Processing rec.sport.baseball, 1000 files found
Processing rec.sport.hockey, 1000 files found
Processing sci.crypt, 1000 files found
Processing sci.electronics, 1000 files found
Processing sci.med, 1000 files found
Processing sci.space, 1000 files found
Processing soc.religion.christian, 997 files found
Processing talk.politics.guns, 1000 files found
Processing talk.politics.mideast, 1000 files found
Processing talk.politics.misc, 1000 files found
Processing talk.religion.misc, 1000 files found
Classes: ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.ha

In [6]:
# Shuffle the data
# Both lists are shuffled in place by a RandomState freshly created from the
# same seed. They have the same length, so each receives the same permutation
# and every sample stays paired with its label.
seed = 1337
rng = np.random.RandomState(seed)
rng.shuffle(samples)
rng = np.random.RandomState(seed)
rng.shuffle(labels)

# Extract a training & validation split
validation_split = 0.2
num_validation_samples = int(validation_split * len(samples))
# Slice from a non-negative boundary instead of using negative indices: with
# zero validation samples, `samples[:-0]` would be empty and `samples[-0:]`
# would be the whole list, silently swapping the two splits.
num_train_samples = max(len(samples) - num_validation_samples, 0)
train_samples = samples[:num_train_samples]
val_samples = samples[num_train_samples:]
train_labels = labels[:num_train_samples]
val_labels = labels[num_train_samples:]

In [7]:
from tensorflow.keras.layers import TextVectorization

# Configure the text vectorizer (20000-token cap, output length 200) and fit
# its vocabulary on the training texts only, fed in batches of 128.
vectorizer = TextVectorization(
    output_sequence_length=200,
    max_tokens=20000,
)
train_text = tf.data.Dataset.from_tensor_slices(train_samples)
text_ds = train_text.batch(128)
vectorizer.adapt(text_ds)

In [8]:
# First five entries of the learned vocabulary, in the vectorizer's own order.
vocab_head = vectorizer.get_vocabulary()[:5]
vocab_head

['', '[UNK]', 'the', 'to', 'of']

In [9]:
# Vectorize one short sentence and display the first six token ids of the
# (single) resulting row.
sentence = "the cat sat on the mat"
output = vectorizer([[sentence]])
output.numpy()[0, :6]

array([   2, 3658, 1716,   15,    2, 6274])

In [10]:
# Reverse lookup: vocabulary token -> its position in the vocabulary list.
voc = vectorizer.get_vocabulary()
word_index = {token: position for position, token in enumerate(voc)}

In [11]:
# Look the same six words up through `word_index`; the ids should agree with
# what the vectorizer produced for the sentence.
test = ["the", "cat", "sat", "on", "the", "mat"]
test_ids = [word_index[token] for token in test]
test_ids

[2, 3658, 1716, 15, 2, 6274]

In [12]:
def _vectorize_texts(texts):
    """Run `texts` through the fitted `vectorizer` and return a NumPy array.

    Each string is wrapped in its own single-element list before being handed
    to the vectorizer, i.e. the input is one string per row.
    """
    column = np.array([[text] for text in texts])
    return vectorizer(column).numpy()


# Token-id matrices for the two splits.
x_train = _vectorize_texts(train_samples)
x_val = _vectorize_texts(val_samples)

# Matching integer class ids.
y_train = np.array(train_labels)
y_val = np.array(val_labels)

In [13]:
# Token ids of the first training document.
first_train_row = x_train[0]
first_train_row

array([  323,     1,   586,     1,   162,  1297,   520,   168,   134,
           1,   176,    75,  2631,     7,    48,     1,     1,     1,
           1,    40,  9955,     1,     1,     1,     1,  8708,  4736,
           1,     1,     1,     1,     1,     1,     1,     1,     1,
           1,     1,     1,     1,     1, 16401, 14316,     1,     1,
           1,     1,   920, 14574,     1,     1,     1,     1,     1,
           1,     1,     1,     1,     1,     1,     1,     1,     1,
           1,     1,     1,     1,     1,     1,     1,     1,     1,
           1,     1,     1,     1,   920, 14574,     1,     1,     1,
        8708,  2454,     1,   596,  2454,     1,     1,  2503,   333,
        5248,     1,     1,  8708,     1,     9,     1,  8708,  2454,
           1,     1,   333,     1,     1,     1,    60,     1,     1,
           1,     1,   956,  4711,   974,     1, 15307,     9,     1,
           1,   596,  2454,     1,  4711,  1685,  8708,     1,     1,
         143,  6474,

In [14]:
# Build batched tf.data.Dataset input pipelines from the vectorized arrays.

def create_dataset(x, y, batch_size=100):
    """Wrap paired arrays in a batched, prefetching ``tf.data.Dataset``.

    Args:
        x: Input features; sliced together with ``y`` so that each dataset
            element is one ``(x[i], y[i])`` pair.
        y: Targets aligned with ``x``.
        batch_size: Number of examples per batch. Defaults to 100, the value
            that was previously hard-coded, so existing two-argument calls
            behave exactly as before.

    Returns:
        A dataset yielding ``(x_batch, y_batch)`` tuples in the original
        element order (no shuffling is applied here).
    """
    dataset = tf.data.Dataset.from_tensor_slices((x, y))
    dataset = dataset.batch(batch_size)
    # Let tf.data choose the prefetch buffer size.
    dataset = dataset.prefetch(tf.data.AUTOTUNE)
    return dataset


# NOTE(review): `test_ds` is built from the validation arrays (x_val / y_val),
# not from a separate held-out test set.
train_ds = create_dataset(x=x_train, y=y_train)
test_ds = create_dataset(x=x_val, y=y_val)

In [15]:
# Sanity check: print the first example (features, then label) of the first
# batch only -- take(1) stops the iteration after a single batch.
for x, y in train_ds.take(1):
    print(x[0])
    print(y[0])

tf.Tensor(
[  323     1   586     1   162  1297   520   168   134     1   176    75
  2631     7    48     1     1     1     1    40  9955     1     1     1
     1  8708  4736     1     1     1     1     1     1     1     1     1
     1     1     1     1     1 16401 14316     1     1     1     1   920
 14574     1     1     1     1     1     1     1     1     1     1     1
     1     1     1     1     1     1     1     1     1     1     1     1
     1     1     1     1   920 14574     1     1     1  8708  2454     1
   596  2454     1     1  2503   333  5248     1     1  8708     1     9
     1  8708  2454     1     1   333     1     1     1    60     1     1
     1     1   956  4711   974     1 15307     9     1     1   596  2454
     1  4711  1685  8708     1     1   143  6474     1   488     1   596
  2454     1  8708  2503     1     1     1     1     9     1     1  1293
     1  7599     1     1  9045   333     1     9     1     1 15823     1
 19760     9     1     1     1  2503   3