In [95]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from nltk.corpus import stopwords
import nltk
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
import pymorphy2
from nlpaug.augmenter.word import SynonymAug
from sklearn.decomposition import NMF
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder


In [49]:
train_df = pd.read_csv('dataset/train/train.tsv', sep='\t', header=None, names=['index_id', 'category', 'text'])
val_df = pd.read_csv('dataset/val/val.tsv', sep='\t', header=None, names=['index_id', 'category', 'text'])
test_df = pd.read_csv('dataset/test/test.tsv', sep='\t', header=None, names=['index_id', 'category', 'text'])

In [50]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt to /home/pekpuch/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/pekpuch/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/pekpuch/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/pekpuch/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [51]:
# # убираем мусорные слова

# def remove_stop(text):
#     text = text.lower()
#     text = re.sub(r'\W', ' ', text)
#     words = text.split()
#     filtered_words = [word for word in words if word not in stop_words]
#     return ' '.join(filtered_words)

# train_df['text'] = train_df['text'].apply(remove_stop)

In [52]:
# pymorph

morph = pymorphy2.MorphAnalyzer()

stop_words = set(stopwords.words('russian'))
stop_words.add('это')
stop_words.add('который')

def preprocess_text(text):
    text = text.lower()  # Приведение к нижнему регистру
    text = re.sub(r'\W', ' ', text)  # Удаление всех неалфавитных символов
    words = text.split()  # Разбиение текста на слова
    lemmatized_words = [morph.parse(word)[0].normal_form for word in words]
    filtered_words = [word for word in lemmatized_words if word not in stop_words]
    return ' '.join(filtered_words)

train_df['clean_text'] = train_df['text'].apply(preprocess_text)

In [76]:
#Рассчет TF-IDF
tfidf_results = {}

classes = train_df['category'].unique()  

for cls in classes:
    texts = train_df[train_df['category'] == cls]['clean_text']
    
    # Рассчет TF-IDF
    tfidf = TfidfVectorizer()
    X_tfidf = tfidf.fit_transform(texts).toarray()
    
    # Получаем слова 
    feature_names = tfidf.get_feature_names_out()
    
    tfidf_scores = X_tfidf.sum(axis=0)  # Суммируем TF-IDF 
    tfidf_results[cls] = {feature_names[i]: tfidf_scores[i] for i in range(len(feature_names))}

for cls, scores in tfidf_results.items():
    print(f"\nКласс: {cls}")
    sorted_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    
    for word, score in sorted_scores[:10]:  # Выводим топ-10 слов по TF-IDF
        print(f"{word}: {score}")



Класс: geography
остров: 2.221010044437218
миля: 1.7944958573032486
река: 1.6890969657845956
земля: 1.6277881084895456
являться: 1.617737613366442
км: 1.5206072804089168
океан: 1.4055018165152247
город: 1.1852191150330804
луна: 1.1486271454613397
большой: 1.130992553683145

Класс: science/technology
мочь: 4.388362740039852
состоять: 2.793980528912469
весь: 2.686870679656254
атом: 2.591225422831247
учёный: 2.490669956305439
всё: 2.350185446291186
солнце: 2.31022254241866
человек: 2.260610163342592
интернет: 2.243964335404921
использовать: 2.214642996062189

Класс: entertainment
год: 1.4433356502846948
хороший: 1.426908830436837
шоу: 1.2687678826544586
певец: 1.1306971706349844
также: 1.121456098205432
большинство: 1.1135116554598246
мочь: 1.0500691974241712
человек: 0.9883196645392918
группа: 0.9761922511831229
известный: 0.9680390165557149

Класс: politics
год: 3.698664134077792
свой: 3.00199166529293
правительство: 2.0982456866129695
война: 2.08930765944435
стать: 1.8003144817603478


In [107]:
vectorizer = CountVectorizer()
tfidf = TfidfVectorizer()
labels = open('dataset/data_rus_Cyrl_labels.txt').read().splitlines()
label_encoder = {'geography': 0, 'science/technology': 1, 'entertainment': 2, 'politics': 3, 'health': 4, 'travel': 5, 'sports': 6}

In [56]:
'''
MultinomialNB
Датасет без обработки
'''

model = MultinomialNB()

X_train_bow = vectorizer.fit_transform(train_df['text'])
y_train = train_df['category']

X_test_bow = vectorizer.transform(test_df['text'])
y_test = test_df['category']

model.fit(X_train_bow, y_train)
y_pred = model.predict(X_test_bow)
print(classification_report(y_test, y_pred, target_names=labels, zero_division=0)) 

                    precision    recall  f1-score   support

science/technology       0.75      0.16      0.26        19
            travel       0.80      0.24      0.36        17
          politics       0.75      0.14      0.23        22
            sports       0.68      0.57      0.62        30
            health       0.49      0.86      0.63        51
     entertainment       0.69      0.44      0.54        25
         geography       0.49      0.75      0.59        40

          accuracy                           0.55       204
         macro avg       0.66      0.45      0.46       204
      weighted avg       0.62      0.55      0.51       204



In [114]:
'''
MultinomialNB
pymorphy2
'''
model = MultinomialNB()

X_train_bow = vectorizer.fit_transform(train_df['clean_text'])
y_train = train_df['category']

X_test_bow = vectorizer.transform(test_df['text'])
y_test = test_df['category']

model.fit(X_train_bow, y_train)
y_pred = model.predict(X_test_bow)
print(classification_report(y_test, y_pred, target_names=labels, zero_division=0)) 

                    precision    recall  f1-score   support

science/technology       0.60      0.16      0.25        19
            travel       0.62      0.29      0.40        17
          politics       0.69      0.41      0.51        22
            sports       0.62      0.43      0.51        30
            health       0.42      0.69      0.52        51
     entertainment       0.65      0.44      0.52        25
         geography       0.44      0.62      0.52        40

          accuracy                           0.50       204
         macro avg       0.58      0.44      0.46       204
      weighted avg       0.54      0.50      0.48       204



In [113]:
'''
MultinomialNB
tfidf
'''

model = MultinomialNB()

X_train_tfidf = tfidf.fit_transform(train_df['clean_text'])
y_train = train_df['category']

X_test_tfidf = tfidf.transform(test_df['text'])
y_test = test_df['category']

model.fit(X_train_tfidf, y_train)
y_pred = model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred, target_names=labels, zero_division=0)) 

                    precision    recall  f1-score   support

science/technology       0.00      0.00      0.00        19
            travel       0.00      0.00      0.00        17
          politics       1.00      0.09      0.17        22
            sports       0.67      0.13      0.22        30
            health       0.30      0.84      0.44        51
     entertainment       1.00      0.20      0.33        25
         geography       0.46      0.53      0.49        40

          accuracy                           0.37       204
         macro avg       0.49      0.26      0.24       204
      weighted avg       0.49      0.37      0.30       204



In [60]:
# Аугментация

aug = SynonymAug(aug_src='wordnet')
augmented_texts = [aug.augment(text) for text in train_df['text']]  
train_df_augmented = pd.concat([train_df, pd.DataFrame({'text': augmented_texts, 'category': train_df['category']})]) 
train_df_augmented['text'] = train_df_augmented['text'].apply(' '.join)

train_df_augmented['clean_text'] = train_df_augmented['text'].apply(preprocess_text)

In [116]:
'''
MultinomialNB
Аугментация
'''

model = MultinomialNB()

X_train_augmented = vectorizer.fit_transform(train_df_augmented['text']) 
y_train = train_df_augmented['category']

X_test_tfidf = vectorizer.transform(test_df['text'])
y_test = test_df['category']

model.fit(X_train_augmented, y_train)
y_pred = model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred, target_names=labels, zero_division=0)) 

                    precision    recall  f1-score   support

science/technology       0.75      0.16      0.26        19
            travel       0.80      0.24      0.36        17
          politics       0.75      0.14      0.23        22
            sports       0.65      0.57      0.61        30
            health       0.49      0.84      0.62        51
     entertainment       0.73      0.44      0.55        25
         geography       0.48      0.75      0.59        40

          accuracy                           0.54       204
         macro avg       0.67      0.45      0.46       204
      weighted avg       0.62      0.54      0.51       204



In [82]:
'''
MultinomialNB
PCA
'''

model = MultinomialNB()

nmf = NMF(n_components=100, max_iter=1000)


X_train_tfidf = tfidf.fit_transform(train_df['clean_text'])
X_test_tfidf = tfidf.transform(test_df['text'])
X_train_nmf = nmf.fit_transform(X_train_tfidf) # PCA генерирует отрицательные значения
y_train = train_df['category']

X_test_nmf = nmf.transform(X_test_tfidf)
y_test = test_df['category']

model.fit(X_train_nmf, y_train)
y_pred = model.predict(X_test_nmf)

print(classification_report(y_test, y_pred, target_names=labels, zero_division=0)) 

                    precision    recall  f1-score   support

science/technology       0.00      0.00      0.00        19
            travel       0.00      0.00      0.00        17
          politics       0.00      0.00      0.00        22
            sports       0.00      0.00      0.00        30
            health       0.25      1.00      0.40        51
     entertainment       0.00      0.00      0.00        25
         geography       0.00      0.00      0.00        40

          accuracy                           0.25       204
         macro avg       0.04      0.14      0.06       204
      weighted avg       0.06      0.25      0.10       204



In [68]:
'''
логистическая регрессия
Датасет без обработки
'''

model = LogisticRegression()

X_train_bow = vectorizer.fit_transform(train_df['text'])
y_train = train_df['category']

X_test_bow = vectorizer.transform(test_df['text'])
y_test = test_df['category']

model.fit(X_train_bow, y_train)
y_pred = model.predict(X_test_bow)
print(classification_report(y_test, y_pred, target_names=labels, zero_division=0)) 

                    precision    recall  f1-score   support

science/technology       0.67      0.11      0.18        19
            travel       0.86      0.35      0.50        17
          politics       0.25      0.05      0.08        22
            sports       0.67      0.53      0.59        30
            health       0.45      0.75      0.56        51
     entertainment       0.55      0.48      0.51        25
         geography       0.50      0.75      0.60        40

          accuracy                           0.51       204
         macro avg       0.56      0.43      0.43       204
      weighted avg       0.54      0.51      0.48       204



In [67]:
'''
логистическая регрессия
pymorphy2
'''
model = LogisticRegression()

X_train_bow = vectorizer.fit_transform(train_df['clean_text'])
y_train = train_df['category']

X_test_bow = vectorizer.transform(test_df['text'])
y_test = test_df['category']

model.fit(X_train_bow, y_train)
y_pred = model.predict(X_test_bow)
print(classification_report(y_test, y_pred, target_names=labels, zero_division=0)) 

                    precision    recall  f1-score   support

science/technology       1.00      0.11      0.19        19
            travel       0.67      0.24      0.35        17
          politics       0.67      0.18      0.29        22
            sports       0.86      0.20      0.32        30
            health       0.29      0.90      0.44        51
     entertainment       0.88      0.28      0.42        25
         geography       0.50      0.20      0.29        40

          accuracy                           0.38       204
         macro avg       0.69      0.30      0.33       204
      weighted avg       0.62      0.38      0.34       204



In [86]:
'''
логистическая регрессия
tfidf
'''

model = LogisticRegression()

X_train_tfidf = tfidf.fit_transform(train_df['clean_text'])
y_train = train_df['category']

X_test_tfidf = tfidf.transform(test_df['text'])
y_test = test_df['category']

model.fit(X_train_tfidf, y_train)
y_pred = model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred, target_names=labels, zero_division=0)) 

                    precision    recall  f1-score   support

science/technology       0.00      0.00      0.00        19
            travel       1.00      0.06      0.11        17
          politics       0.75      0.14      0.23        22
            sports       0.86      0.20      0.32        30
            health       0.28      0.90      0.43        51
     entertainment       1.00      0.24      0.39        25
         geography       0.46      0.28      0.34        40

          accuracy                           0.36       204
         macro avg       0.62      0.26      0.26       204
      weighted avg       0.57      0.36      0.30       204



In [66]:
'''
логистическая регрессия
Аугментация
'''

model = LogisticRegression()

X_train_augmented = vectorizer.fit_transform(train_df_augmented['text']) 
y_train = train_df_augmented['category']

X_test_tfidf = vectorizer.transform(test_df['text'])
y_test = test_df['category']

model.fit(X_train_augmented, y_train)
y_pred = model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred, target_names=labels, zero_division=0)) 

                    precision    recall  f1-score   support

science/technology       0.67      0.11      0.18        19
            travel       0.86      0.35      0.50        17
          politics       0.25      0.05      0.08        22
            sports       0.65      0.50      0.57        30
            health       0.47      0.75      0.58        51
     entertainment       0.60      0.48      0.53        25
         geography       0.46      0.78      0.58        40

          accuracy                           0.51       204
         macro avg       0.57      0.43      0.43       204
      weighted avg       0.54      0.51      0.47       204



In [83]:
'''
логистическая регрессия
PCA
'''

model = MultinomialNB()

pca = PCA(n_components=100)

X_train_tfidf = tfidf.fit_transform(train_df['clean_text'])
X_test_tfidf = tfidf.transform(test_df['text'])

X_train_pca = pca.fit_transform(X_train_tfidf.toarray())
y_train = train_df['category']

X_test_pca = pca.transform(X_test_tfidf.toarray())
y_test = test_df['category']

model = LogisticRegression()
model.fit(X_train_pca, y_train)
y_pred = model.predict(X_test_pca)

print(classification_report(y_test, y_pred, target_names=labels, zero_division=0)) 

                    precision    recall  f1-score   support

science/technology       0.00      0.00      0.00        19
            travel       0.00      0.00      0.00        17
          politics       1.00      0.09      0.17        22
            sports       0.86      0.20      0.32        30
            health       0.27      0.86      0.41        51
     entertainment       1.00      0.24      0.39        25
         geography       0.46      0.30      0.36        40

          accuracy                           0.34       204
         macro avg       0.51      0.24      0.24       204
      weighted avg       0.51      0.34      0.29       204



In [87]:
'''
Дерево решений
Датасет без обработки
'''

model = DecisionTreeClassifier()

X_train_bow = vectorizer.fit_transform(train_df['text'])
y_train = train_df['category']

X_test_bow = vectorizer.transform(test_df['text'])
y_test = test_df['category']

model.fit(X_train_bow, y_train)
y_pred = model.predict(X_test_bow)
print(classification_report(y_test, y_pred, target_names=labels, zero_division=0)) 

                    precision    recall  f1-score   support

science/technology       0.20      0.11      0.14        19
            travel       0.25      0.18      0.21        17
          politics       0.22      0.18      0.20        22
            sports       0.69      0.30      0.42        30
            health       0.33      0.49      0.39        51
     entertainment       0.39      0.36      0.38        25
         geography       0.35      0.45      0.39        40

          accuracy                           0.34       204
         macro avg       0.35      0.29      0.30       204
      weighted avg       0.36      0.34      0.33       204



In [88]:
'''
Дерево решений
pymorphy2
'''
model = DecisionTreeClassifier()

X_train_bow = vectorizer.fit_transform(train_df['clean_text'])
y_train = train_df['category']

X_test_bow = vectorizer.transform(test_df['text'])
y_test = test_df['category']

model.fit(X_train_bow, y_train)
y_pred = model.predict(X_test_bow)
print(classification_report(y_test, y_pred, target_names=labels, zero_division=0)) 

                    precision    recall  f1-score   support

science/technology       0.38      0.16      0.22        19
            travel       0.80      0.24      0.36        17
          politics       0.56      0.23      0.32        22
            sports       0.71      0.17      0.27        30
            health       0.28      0.90      0.43        51
     entertainment       0.60      0.12      0.20        25
         geography       0.62      0.12      0.21        40

          accuracy                           0.35       204
         macro avg       0.56      0.28      0.29       204
      weighted avg       0.53      0.35      0.30       204



In [89]:
'''
Дерево решений
tfidf
'''

model = DecisionTreeClassifier()

X_train_tfidf = tfidf.fit_transform(train_df['clean_text'])
y_train = train_df['category']

X_test_tfidf = tfidf.transform(test_df['text'])
y_test = test_df['category']

model.fit(X_train_tfidf, y_train)
y_pred = model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred, target_names=labels, zero_division=0)) 

                    precision    recall  f1-score   support

science/technology       0.50      0.16      0.24        19
            travel       0.67      0.24      0.35        17
          politics       0.71      0.23      0.34        22
            sports       0.50      0.17      0.25        30
            health       0.27      0.82      0.41        51
     entertainment       0.50      0.16      0.24        25
         geography       0.62      0.20      0.30        40

          accuracy                           0.35       204
         macro avg       0.54      0.28      0.31       204
      weighted avg       0.50      0.35      0.32       204



In [90]:
'''
Дерево решений
Аугментация
'''

model = DecisionTreeClassifier()

X_train_augmented = vectorizer.fit_transform(train_df_augmented['text']) 
y_train = train_df_augmented['category']

X_test_tfidf = vectorizer.transform(test_df['text'])
y_test = test_df['category']

model.fit(X_train_augmented, y_train)
y_pred = model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred, target_names=labels, zero_division=0)) 

                    precision    recall  f1-score   support

science/technology       0.30      0.16      0.21        19
            travel       0.26      0.29      0.28        17
          politics       0.13      0.14      0.13        22
            sports       0.52      0.37      0.43        30
            health       0.30      0.35      0.32        51
     entertainment       0.42      0.40      0.41        25
         geography       0.35      0.40      0.37        40

          accuracy                           0.32       204
         macro avg       0.33      0.30      0.31       204
      weighted avg       0.33      0.32      0.32       204



In [111]:
'''
Дерево решений
PCA
'''

model = DecisionTreeClassifier()

pca = PCA(n_components=100)

X_train_tfidf = tfidf.fit_transform(train_df['clean_text'])
X_test_tfidf = tfidf.transform(test_df['text'])

X_train_pca = pca.fit_transform(X_train_tfidf.toarray())
y_train = train_df['category']

X_test_pca = pca.transform(X_test_tfidf.toarray())
y_test = test_df['category']

model = LogisticRegression()
model.fit(X_train_pca, y_train)
y_pred = model.predict(X_test_pca)

print(classification_report(y_test, y_pred, target_names=labels, zero_division=0)) 

                    precision    recall  f1-score   support

science/technology       0.00      0.00      0.00        19
            travel       0.00      0.00      0.00        17
          politics       1.00      0.05      0.09        22
            sports       0.75      0.20      0.32        30
            health       0.26      0.86      0.40        51
     entertainment       1.00      0.16      0.28        25
         geography       0.45      0.23      0.30        40

          accuracy                           0.31       204
         macro avg       0.49      0.21      0.20       204
      weighted avg       0.49      0.31      0.25       204



In [117]:
'''
XGBClassifier
Датасет без обработки
'''

model = XGBClassifier()

X_train_bow = vectorizer.fit_transform(train_df['text'])
y_train = train_df['category'].map(label_encoder)

X_test_bow = vectorizer.transform(test_df['text'])
y_test = test_df['category'].map(label_encoder)

model.fit(X_train_bow, y_train)
y_pred = model.predict(X_test_bow)
print(classification_report(y_test, y_pred, target_names=labels, zero_division=0)) 

                    precision    recall  f1-score   support

science/technology       0.31      0.24      0.27        17
            travel       0.39      0.53      0.45        51
          politics       0.25      0.21      0.23        19
            sports       0.50      0.43      0.46        30
            health       0.20      0.09      0.12        22
     entertainment       0.49      0.60      0.54        40
         geography       0.55      0.44      0.49        25

          accuracy                           0.42       204
         macro avg       0.38      0.36      0.37       204
      weighted avg       0.40      0.42      0.40       204



In [108]:
'''
XGBClassifier
pymorphy2
'''

model = XGBClassifier()

X_train_bow = vectorizer.fit_transform(train_df['clean_text'])
y_train = train_df['category'].map(label_encoder)

X_test_bow = vectorizer.transform(test_df['text'])
y_test = test_df['category'].map(label_encoder)

model.fit(X_train_bow, y_train)
y_pred = model.predict(X_test_bow)
print(classification_report(y_test, y_pred, target_names=labels, zero_division=0)) 

                    precision    recall  f1-score   support

science/technology       1.00      0.18      0.30        17
            travel       0.30      0.82      0.44        51
          politics       0.46      0.32      0.38        19
            sports       0.60      0.20      0.30        30
            health       0.33      0.14      0.19        22
     entertainment       0.53      0.25      0.34        40
         geography       0.50      0.24      0.32        25

          accuracy                           0.37       204
         macro avg       0.53      0.31      0.33       204
      weighted avg       0.49      0.37      0.34       204



In [109]:
'''
XGBClassifier
tfidf
'''

model = XGBClassifier()

X_train_tfidf = tfidf.fit_transform(train_df['clean_text'])
y_train = train_df['category'].map(label_encoder)

X_test_tfidf = tfidf.transform(test_df['text'])
y_test = test_df['category'].map(label_encoder)

model.fit(X_train_tfidf, y_train)
y_pred = model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred, target_names=labels, zero_division=0)) 

                    precision    recall  f1-score   support

science/technology       1.00      0.06      0.11        17
            travel       0.26      0.80      0.39        51
          politics       0.38      0.26      0.31        19
            sports       0.56      0.17      0.26        30
            health       0.38      0.14      0.20        22
     entertainment       0.33      0.07      0.12        40
         geography       0.83      0.20      0.32        25

          accuracy                           0.31       204
         macro avg       0.53      0.24      0.25       204
      weighted avg       0.47      0.31      0.26       204



In [106]:
'''
XGBClassifier
Аугментация
'''

model = XGBClassifier()

X_train_augmented = vectorizer.fit_transform(train_df_augmented['text']) 
y_train = train_df_augmented['category'].map(label_encoder)

X_test_tfidf = vectorizer.transform(test_df['text'])
y_test = test_df['category'].map(label_encoder)

model.fit(X_train_augmented, y_train)
y_pred = model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred, target_names=labels, zero_division=0)) 

                    precision    recall  f1-score   support

science/technology       0.36      0.29      0.32        17
            travel       0.37      0.49      0.42        51
          politics       0.31      0.26      0.29        19
            sports       0.50      0.33      0.40        30
            health       0.14      0.09      0.11        22
     entertainment       0.46      0.60      0.52        40
         geography       0.43      0.36      0.39        25

          accuracy                           0.39       204
         macro avg       0.37      0.35      0.35       204
      weighted avg       0.38      0.39      0.38       204



In [112]:
'''
XGBClassifier
PCA
'''

model = XGBClassifier()

pca = PCA(n_components=100)

X_train_tfidf = tfidf.fit_transform(train_df['clean_text'])
X_test_tfidf = tfidf.transform(test_df['text'])

X_train_pca = pca.fit_transform(X_train_tfidf.toarray())
y_train = train_df['category'].map(label_encoder)

X_test_pca = pca.transform(X_test_tfidf.toarray())
y_test = test_df['category'].map(label_encoder)

model = LogisticRegression()
model.fit(X_train_pca, y_train)
y_pred = model.predict(X_test_pca)

print(classification_report(y_test, y_pred, target_names=labels, zero_division=0)) 

                    precision    recall  f1-score   support

science/technology       0.00      0.00      0.00        17
            travel       0.26      0.82      0.39        51
          politics       0.00      0.00      0.00        19
            sports       0.86      0.20      0.32        30
            health       1.00      0.09      0.17        22
     entertainment       0.41      0.28      0.33        40
         geography       1.00      0.20      0.33        25

          accuracy                           0.32       204
         macro avg       0.50      0.23      0.22       204
      weighted avg       0.50      0.32      0.27       204

