In [3]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In this notebook I try to solve the multi-class classification problem using simpler methods. First I vectorize the words in sentences using TF-IDF method, then I apply multiple models and compare their results.

In [4]:
dataset_file_path = './data/sportoclanky.csv'
assert os.path.exists(dataset_file_path)
df = pd.read_csv(dataset_file_path)

In [5]:
from sklearn.preprocessing import LabelEncoder

labelencoder = LabelEncoder()
df['category_enc'] = labelencoder.fit_transform(df['category'])
no_classes = len(df['category_enc'].unique())


I have found libraries `stop_words` and `simplelemma` that can help preprocess the Czech texts.

In [6]:
from stop_words import get_stop_words

stop_words = get_stop_words('cz')

Now we can gently preprocess the corpora:
1. Convert to lowercase and remove punctuations and characters and then strip;
2. Remove the stop words to exclude them from polluting our future vectors;
3. Lemmatize the words to make sure that similar words are "grouped" into their base form. Another option is to use stemming, it is a cheaper but less intelligent operation. In 

In [7]:
import re, string
import simplemma

def stopword(string):
    non_stop = [i for i in string.split() if i not in stop_words]
    return ' '.join(non_stop)

def preprocess(text):
    text = text.lower() #lowercase text
    text=text.strip()  #get rid of leading/trailing whitespace 
    text=re.compile('<.*?>').sub('', text) #Remove HTML tags/markups
    text = re.compile('[%s]' % re.escape(string.punctuation)).sub(' ', text)  #Replace punctuation with space. Careful since punctuation can sometime be useful
    text = re.sub('\s+', ' ', text)  #Remove extra space and tabs
    text = re.sub(r'\[[0-9]*\]',' ',text) #[0-9] matches any digit (0 to 10000...)
    text=re.sub(r'[^\w\s]', '', str(text).lower().strip())
    text = re.sub(r'\d',' ',text) #matches any digit from 0 to 100000..., \D matches non-digits
    text = re.sub(r'\s+',' ',text) #\s matches any whitespace, \s+ matches multiple whitespace, \S matches non-whitespace 
    
    return text

def lemmatize(text):
    lemmas = [simplemma.lemmatize(t, lang='cs') for t in text]
    return " ".join(lemmas)

def complete_preprocessing(string):
    return lemmatize(stopword(preprocess(string)))

To make it a little simpler I concatenate the `rss_title` and `rss_perex` into one piece of text.

In [8]:
df['text'] = df['rss_title'] + ' ' + df['rss_perex']
df['clean_text'] = df['text'].apply(lambda x: complete_preprocessing(x))

In [1]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df['clean_text'], 
                                                    df['category_enc'], 
                                                    stratify=df['category_enc'], 
                                                    random_state=42,
                                                    test_size=0.2)

X_train, X_val, y_train, y_val = train_test_split(X_train, 
                                                    y_train, 
                                                    stratify=y_train, 
                                                    random_state=42,
                                                    test_size=0.25) # 0.25 x 0.8 = 0.2   

NameError: name 'df' is not defined

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

#TF-IDF
# tfidf_vectorizer = TfidfVectorizer(use_idf=True)
tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=5,
                        ngram_range=(1, 2))
X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_vectors_tfidf = tfidf_vectorizer.transform(X_val)  # not applying fit() the TfidfVectorizer to the test data
# to not allow changing the word-indices and weight based on the test data


In [32]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score

models = [
    RandomForestClassifier(n_estimators=400, max_depth=20, random_state=0),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0, solver='saga', max_iter = 400),
]

# 5 Cross-validation
CV = 5
cv_df = pd.DataFrame(index=range(CV * len(models)))

entries = []
for model in models:
  model_name = model.__class__.__name__
  accuracies = cross_val_score(model, X_train_vectors_tfidf, y_train, scoring='accuracy', cv=CV)
  for fold_idx, accuracy in enumerate(accuracies):
    entries.append((model_name, fold_idx, accuracy))
    
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])



In [33]:
mean_accuracy = cv_df.groupby('model_name').accuracy.mean()
std_accuracy = cv_df.groupby('model_name').accuracy.std()

acc = pd.concat([mean_accuracy, std_accuracy], axis= 1, 
          ignore_index=True)
acc.columns = ['Mean Accuracy', 'Standard deviation']
acc

Unnamed: 0_level_0,Mean Accuracy,Standard deviation
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1
LinearSVC,0.412586,0.000205
LogisticRegression,0.41195,0.000384
MultinomialNB,0.412622,3.3e-05
RandomForestClassifier,0.392325,0.002263


Unfortunately, I could make the algorithms converge due to the time constraints (changing the solver and increasing the number of iterations for the case of Logistic Regression), the model produces only the trivial results by predicting the largest class. 
However, I believe that hyperparameter tuning might produce better results.

In [39]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

vectorizer = CountVectorizer(min_df=1)
X_vectors_count = vectorizer.fit_transform(df['clean_text'])

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X_vectors_count, 
                                                    df['category_enc'], 
                                                    stratify=df['category_enc'], 
                                                    random_state=42,
                                                    test_size=0.2)

X_train, X_val, y_train, y_val = train_test_split(X_vectors_count,
                                                    df['category_enc'], 
                                                    stratify=df['category_enc'], 
                                                    random_state=42,
                                                    test_size=0.25) # 0.25 x 0.8 = 0.2

In [43]:
clf = RandomForestClassifier(n_estimators=400)
clf.fit(X_train, y_train)

In [45]:
from sklearn import metrics  
y_pred = clf.predict(X_val)
print(f"Model's balanced accuracy: {metrics.balanced_accuracy_score(y_val, y_pred)}")
print(f"Model's accuracy: {metrics.accuracy_score(y_val, y_pred)}")


Model's balanced accuracy: 0.04317505188830834
Model's accuracy: 0.4066534795900018


Unfortunately, neither TF-IDF, nor count vectorization give satisfactory results. Further steps might include hyperparater tuning, or better using a classifier created for imbalanced datasets, such as `imblearn` or `catboost`.

Another (quite desperate) attempt to improve results can be translating text to English (e.g. using DeepL) and then applying classic verified methods for working with text.