# DATASET [20newsgroups](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_20newsgroups.html#sklearn.datasets.fetch_20newsgroups)

In [11]:
import numpy as np
import tensorflow as tf
from sklearn.datasets import fetch_20newsgroups
from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# --- Configuration ---
# Newsgroups to load. None loads every category; to train on a subset, set a
# list of names, e.g.
#   ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
CATEGORIES = None
# Passed to the tokenizer as num_words and reused as the embedding input_dim,
# so both always agree.
VOCAB_SIZE = 10000

# --- Load data ---
newsgroups_train = fetch_20newsgroups(subset='train', categories=CATEGORIES)
newsgroups_test = fetch_20newsgroups(subset='test', categories=CATEGORIES)

# Number of target classes, derived from the loaded data so the label encoding
# and the output layer follow CATEGORIES automatically.
num_classes = len(newsgroups_train.target_names)

# --- Tokenization ---
# The tokenizer is fitted on the training texts only and then applied to both
# splits.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(newsgroups_train.data)
x_train = tokenizer.texts_to_sequences(newsgroups_train.data)
x_test = tokenizer.texts_to_sequences(newsgroups_test.data)

# --- Pad every sequence to the same length ---
# NOTE(review): max_length is the length of the longest training sequence, so
# a single very long document sets the width of both padded arrays. Consider
# capping it if memory or training time becomes a problem.
max_length = max(len(sequence) for sequence in x_train)
x_train = pad_sequences(x_train, maxlen=max_length)
x_test = pad_sequences(x_test, maxlen=max_length)

# --- One-hot encode the labels ---
# num_classes is passed explicitly so train and test always get the same
# width, even if the highest class index were absent from one split.
y_train = to_categorical(newsgroups_train.target, num_classes=num_classes)
y_test = to_categorical(newsgroups_test.target, num_classes=num_classes)


# --- Build the model ---
inputs = Input(shape=(max_length,))
# The sequence length is already fixed by the Input shape, so the redundant
# (and, in newer Keras, deprecated) input_length argument is not passed.
x = Embedding(input_dim=VOCAB_SIZE, output_dim=100)(inputs)
x = Conv1D(filters=64, kernel_size=5, activation='relu')(x)
x = GlobalMaxPooling1D()(x)
x = Dense(units=32, activation='relu')(x)
# One output neuron per class.
outputs = Dense(units=num_classes, activation='softmax')(x)

model = Model(inputs=inputs, outputs=outputs)

# --- Compile and train ---
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# NOTE(review): the test split doubles as validation data here; use a separate
# hold-out split if validation metrics are ever used for tuning.
# The History object is kept so the training curves can be inspected later.
history = model.fit(x_train, y_train, epochs=5, batch_size=32,
                    validation_data=(x_test, y_test))


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fa0263efd90>

In [None]:
import matplotlib.pyplot as plt

# Predict class probabilities for the first 10 test samples and take the index
# of the most probable class for each of them.
predicted_probabilities = model.predict(x_test[:10])
predicted_categories = predicted_probabilities.argmax(axis=1)

# Print each input document next to its true and predicted category name.
label_names = newsgroups_train.target_names
for i, predicted_label in enumerate(predicted_categories):
    true_label = newsgroups_test.target[i]
    print("Input:", newsgroups_test.data[i])
    print("True category:", label_names[true_label])
    print("Predicted category:", label_names[predicted_label])
    print("-----------------------------")