In [1]:
import pandas as pd
import numpy as np
df = pd.read_csv("text_data.csv")
df.head()

Unnamed: 0,text,topics
0,showers continued throughout the week in the b...,cocoa
1,the us agriculture department reported the far...,grain wheat corn barley oat sorghum
2,argentine grain board figures show crop regist...,veg-oil linseed lin-oil soy-oil sun-oil soybea...
3,champion products inc said its board of direct...,earn
4,computer terminal systems inc said it has comp...,acq


In [2]:
# process text

from data import vectorize_df, tokenize, vocabulary

max_words = 400

df = df[df["text"].apply(str.split).apply(len) <= max_words]

vectors, vocab_map = vectorize_df(df)
print(vectors.shape)
vectors

(9749, 400)


array([[    0,     0,     0, ..., 11976,  1332, 14985],
       [    0,     0,     0, ...,  9402, 17979, 14985],
       [    0,     0,     0, ..., 10291, 15952, 14985],
       ...,
       [    0,     0,     0, ..., 17704,  7721, 14985],
       [    0,     0,     0, ..., 19659,   404, 14985],
       [    0,     0,     0, ..., 16061, 16759, 14985]], dtype=int32)

In [3]:
# process topics
from data import topics_to_vectors
min_topic_count = 100

topics = topics_to_vectors(df, min_topic_count).values
print(topics.shape)
topics

(9749, 20)


array([[0, 0, 1, ..., 0, 0, 1],
       [0, 0, 1, ..., 0, 1, 1],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [18]:
# Model
import tensorflow as tf
import tensorflow.keras.layers as layers
import tensorflow.keras.models as models

VOCAB_SIZE = len(vocab_map)
INPUT_SIZE = max_words

model = models.Sequential([
    layers.Embedding(input_dim=VOCAB_SIZE, output_dim=256, input_length=INPUT_SIZE),
    layers.Dropout(0.2),
    layers.GRU(256, return_sequences=True),
    layers.Flatten(),
    layers.Dense(topics.shape[1], activation="sigmoid")    
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics='accuracy')
model.summary()



Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 400, 256)          5062656   
                                                                 
 dropout_6 (Dropout)         (None, 400, 256)          0         
                                                                 
 gru_6 (GRU)                 (None, 400, 256)          394752    
                                                                 
 flatten_2 (Flatten)         (None, 102400)            0         
                                                                 
 dense_6 (Dense)             (None, 20)                2048020   
                                                                 
Total params: 7505428 (28.63 MB)
Trainable params: 7505428 (28.63 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [19]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(vectors, topics, test_size=0.2)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)
model.evaluate(x_test, y_test)

(7799, 400)
(1950, 400)
(7799, 20)
(1950, 20)


[3.118577718734741, 0.09641025960445404]

In [22]:
history = model.fit(x_train, y_train, epochs=50, batch_size=32)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50

KeyboardInterrupt: 