In [2]:
import tensorflow as tf
import numpy as np
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '15'
import keras
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers
import re
import string
from keras.metrics import CategoricalAccuracy, F1Score

In [2]:
base_dir = '6-persian-topics'

In [3]:
num = os.listdir(base_dir)

In [4]:
data_features = []
data_targets = []
for n in num:
    class_path = os.path.join(base_dir , n)
    file_path = os.listdir(class_path)
    for p in file_path:
        matn = os.path.join(class_path , p)
        file = open(matn , 'r')
        file_r = file.read()
        if len(file_r) < 20:
            os.remove(matn)
        else:
            data_features.append(matn)
            data_targets.append(num.index(n))
        file.close()

In [5]:
train_f , val_f , train_t , val_t = train_test_split(data_features , data_targets , random_state=42 , test_size= 0.2)


In [6]:
len(data_features)

37253

In [7]:
def custom_standardization(input_data):
  lowercase = tf.strings.lower(input_data)
  stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
  stripped_ye = tf.strings.regex_replace(stripped_html, 'ي', 'ی')
  stripped_ke = tf.strings.regex_replace(stripped_ye, 'ك', 'ک')
  stripped_alef = tf.strings.regex_replace(stripped_ke, 'آ', 'ا')
  stripped_english = tf.strings.regex_replace(stripped_alef, '[a-zA-Z]', ' ')
  return tf.strings.regex_replace(stripped_alef,
                                  '[%s]' % re.escape(string.punctuation),'')

In [8]:
max_features = 10000
sequence_length = 250

vectorize_layer = layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)


2024-05-10 12:06:59.237153: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-05-10 12:06:59.258064: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-05-10 12:06:59.258267: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

In [9]:
AUTOTUNE = tf.data.AUTOTUNE

def load_text(path, lbl):
    lbl = tf.one_hot(lbl, depth=6, dtype='float32')
    text = tf.io.read_file(path)
    return text, lbl

def vectorize_text(text, lbl):
    text = vectorize_layer(text)
    return text, lbl

train_ds = tf.data.Dataset.from_tensor_slices((train_f, train_t))
val_ds = tf.data.Dataset.from_tensor_slices((val_f, val_t))

train_ds = train_ds.map(load_text)
vectorize_layer.adapt(train_ds.map(lambda text, label: text))
train_ds = train_ds.map(vectorize_text)

val_ds = val_ds.map(load_text)
val_ds = val_ds.map(vectorize_text)

train_ds = train_ds.shuffle(1000).batch(128)
train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().batch(128).prefetch(buffer_size=AUTOTUNE)


In [10]:
embedding_dim = 16

In [11]:
model = tf.keras.Sequential([
  layers.Embedding(max_features, embedding_dim),
  layers.GlobalAveragePooling1D(),
  layers.Dense(100),
  layers.Dropout(0.2),
  layers.Dense(6)])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 16)          160000    
                                                                 
 global_average_pooling1d (  (None, 16)                0         
 GlobalAveragePooling1D)                                         
                                                                 
 dense (Dense)               (None, 100)               1700      
                                                                 
 dropout (Dropout)           (None, 100)               0         
                                                                 
 dense_1 (Dense)             (None, 6)                 606       
                                                                 
Total params: 162306 (634.01 KB)
Trainable params: 162306 (634.01 KB)
Non-trainable params: 0 (0.00 Byte)
________________

In [12]:
metrics = metrics = [
    CategoricalAccuracy(name='accuracy'),
    F1Score(name='f1-score')
]


In [13]:
model.compile(loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(0.01),
              metrics=metrics)

In [14]:
train_iterator = train_ds.as_numpy_iterator()

class_counts = {}
class_indices = {}


class_index = 0
for data, labels in train_iterator:
    for label in labels:
        label_tuple = tuple(label)
        if label_tuple not in class_counts:
            class_counts[label_tuple] = 1
            class_indices[label_tuple] = class_index
            class_index += 1
        else:
            class_counts[label_tuple] += 1

total_samples = sum(class_counts.values())


class_weights = {
    class_indices[label]: total_samples / count
    for label, count in class_counts.items()
}



In [15]:
history = model.fit(train_ds, epochs=10,
                    validation_data=val_ds,
                    validation_steps=30,
                    class_weight=class_weights)

Epoch 1/10


2024-05-10 12:07:41.607302: I external/local_tsl/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2024-05-10 12:07:42.197535: I external/local_xla/xla/service/service.cc:168] XLA service 0x7f67b0cea3a0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-05-10 12:07:42.197561: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce MX330, Compute Capability 6.1
2024-05-10 12:07:42.201337: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-05-10 12:07:42.212299: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8904
I0000 00:00:1715357262.271048    5924 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/10

2024-05-10 12:07:59.798828: W tensorflow/core/kernels/data/cache_dataset_ops.cc:858] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


Epoch 3/10

2024-05-10 12:08:00.634762: W tensorflow/core/kernels/data/cache_dataset_ops.cc:858] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


Epoch 4/10

2024-05-10 12:08:01.477522: W tensorflow/core/kernels/data/cache_dataset_ops.cc:858] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


Epoch 5/10

2024-05-10 12:08:02.321517: W tensorflow/core/kernels/data/cache_dataset_ops.cc:858] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


Epoch 6/10

2024-05-10 12:08:03.162359: W tensorflow/core/kernels/data/cache_dataset_ops.cc:858] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


Epoch 7/10

2024-05-10 12:08:04.024716: W tensorflow/core/kernels/data/cache_dataset_ops.cc:858] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


Epoch 8/10

2024-05-10 12:08:04.918740: W tensorflow/core/kernels/data/cache_dataset_ops.cc:858] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


Epoch 9/10

2024-05-10 12:08:05.804500: W tensorflow/core/kernels/data/cache_dataset_ops.cc:858] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


Epoch 10/10

2024-05-10 12:08:06.729117: W tensorflow/core/kernels/data/cache_dataset_ops.cc:858] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.




2024-05-10 12:08:07.596962: W tensorflow/core/kernels/data/cache_dataset_ops.cc:858] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [16]:
result = model.evaluate(val_ds)
print('Evaluation result:', result)

Evaluation result: [0.15711252391338348, 0.9740974307060242, array([0.92470276, 0.7432432 , 0.9912179 , 0.9930527 , 0.7167235 ,
       0.95582026], dtype=float32)]


In [17]:
predictions = model.predict(val_ds)
predicted_classes = tf.math.argmax(predictions, axis=-1)



In [18]:
from sklearn.metrics import classification_report
report = classification_report(val_t, predicted_classes)

print(report)

              precision    recall  f1-score   support

           0       0.96      0.89      0.92       393
           1       0.73      0.75      0.74        73
           2       0.99      0.99      0.99      4339
           3       0.99      0.99      0.99      1224
           4       0.58      0.93      0.72       113
           5       0.96      0.95      0.96      1309

    accuracy                           0.97      7451
   macro avg       0.87      0.92      0.89      7451
weighted avg       0.98      0.97      0.98      7451

