In [None]:
from collections import Counter
from sentence_transformers import SentenceTransformer, util
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import seaborn as sns
import torch

# 1. Data Preparation

### Loading Data

In [None]:
# Loading data into pandas dataframes
data_lrt = pd.read_csv("Data/data_lrt.csv", index_col=0)
data_15min = pd.read_csv("Data/data_15min.csv", index_col=0)

In [None]:
data_lrt.head()

In [None]:
data_15min.head()

In [None]:
print(f"lrt.lt:{data_lrt.shape[0]}, 15min.lt:{data_15min.shape[0]}")

- Both tables have source, category, date, title, score and last_updated columns.
- There are 133437 entries from lrt.lt data and 72107 entries from 15min.lt

In [None]:
# Joining both dataframes
df = pd.concat([data_lrt, data_15min]).reset_index(drop=True)
df

### Changing data types and converting categories

In [None]:
df.info()

##### No columns have missing values but source, category, date and last_updated columns have incorrect data types

In [None]:
# Casting label-like columns to categoricals and parsing the timestamp columns
for label_col in ("source", "category"):
    df[label_col] = df[label_col].astype("category")
for time_col in ("date", "last_updated"):
    df[time_col] = pd.to_datetime(df[time_col])

In [None]:
df.info()

##### Now all columns have correct data types

In [None]:
# Converting categories
# Both categories below exist only on the lrt.lt website, so they are merged into their closest shared equivalents:
# - "pozicija" has the same structure as "nuomones"
# - "lrt-tyrimai" is similar to "kriminalai" from the 15min.lt website
category_merges = {"pozicija": "nuomones", "lrt-tyrimai": "kriminalai"}

# Replacing on plain strings and re-casting drops the merged categories in one step and keeps the cell
# re-runnable (calling remove_categories a second time would raise because the categories are already gone)
df["category"] = df["category"].astype(object).replace(category_merges).astype("category")

In [None]:
df["category"].unique().to_list(), len(df["category"].unique().to_list())

##### There are 15 categories now

In [None]:
# Only the category label and the title text are needed for this project
df_cat = df[["category", "title"]].copy()
df_cat.head()

### Removing title duplicates and keeping categories of interest

In [None]:
def keep_rare_duplicates(df):
    """Remove duplicated titles while keeping the less popular category.

    Categories are ranked by how many rows they have in `df`. Every title that
    occurs more than once is assigned the rarest category among its rows, and
    then only the first row of each title is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "category" and "title". Categories are expected
        to be non-null. The frame is not modified; a new frame is returned.

    Returns
    -------
    pandas.DataFrame
        One row per title, with a fresh 0..n-1 index.
    """
    # Ranking categories from the rarest (0) to the most common
    rarest_first = list(df["category"].value_counts().sort_values(ascending=True).index)
    cat_num = {cat: num for num, cat in enumerate(rarest_first)}

    # Working on a copy so the caller's dataframe is left untouched
    result = df.copy()
    ranks = result["category"].astype(object).map(cat_num)

    # Rows whose title occurs more than once; positional masks keep this correct for any index
    shared = (result.duplicated("title", keep=False) & result["title"].notna()).to_numpy()
    if shared.any():
        shared_titles = result.loc[shared, "title"].to_numpy()
        # Lowest rank within each title group = rarest category the title was placed in
        best_rank = ranks[shared].groupby(shared_titles).transform("min")
        result.loc[shared, "category"] = [rarest_first[int(rank)] for rank in best_rank]

    return result[~result.duplicated("title")].reset_index(drop=True)  # Fully removing duplicates

In [None]:
def remove_title_duplicates_set_categories(df, cat_to_keep):
    """Only keep categories of interest and remove duplicated titles.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "category" and "title". It is not modified.
    cat_to_keep : list
        Categories to keep; rows from any other category are dropped.

    Returns
    -------
    pandas.DataFrame
        One row per title, restricted to `cat_to_keep`, with a fresh index.
    """
    # Working on a copy: assigning into `df` directly would overwrite the caller's category column
    kept = df.copy()

    # Categories outside `cat_to_keep` become missing values and are dropped with them
    # (the cast is a no-op for a column that is already categorical)
    kept["category"] = kept["category"].astype("category").cat.set_categories(cat_to_keep)
    kept = kept.dropna()

    # Dropping absolute duplicates (same category and same title)
    kept = kept[~kept.duplicated(["category", "title"])].reset_index(drop=True)

    # Titles that remain in several categories keep the category that is less popular in the dataframe
    return keep_rare_duplicates(kept)

In [None]:
# Number of titles for each category
df_cat["category"].value_counts()

In [None]:
# Choosing which categories to keep
categories_to_keep = ["verslas", "sportas", "kultura", "mokslas-ir-it",
                      "nuomones", "eismas", "kriminalai", "sveikata", "muzika"]

##### Choosing 9 most distinguishable categories, shown above, out of 15 because:
- categories 'lietuvoje', 'pasaulyje', 'veidai' and 'gyvenimas' are too abstract
- 'tavo-lrt' category only exists on lrt.lt website
- 'maistas' category has too little data (120 titles)

In [None]:
# Removing duplicates from data
data = remove_title_duplicates_set_categories(df_cat, categories_to_keep)

In [None]:
print(f"Number of duplicated titles:{data[data.duplicated('title')].shape[0]}")

### Finding top 10 similar titles' distribution between categories

In [None]:
# Setting up a pandas dataframe
ex_data = data.copy()
ex_data[categories_to_keep] = 0
ex_data.head()

In [None]:
# Downloading sentence transformer fitted for Lithuanian language
embedder = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

In [None]:
# Getting all titles into the list
corpus = list(ex_data["title"].values)

In [None]:
# Encoding titles
# !TAKES 40 MINUTES! Set 'proceed_en = True' to encode all titles from scratch; the result is saved to
# "encoder_multi.sv".
# NOTE(review): with 'proceed_en = False' this cell defines nothing, so 'corpus_embeddings' then has to come from
# the pickle written by an earlier run with 'proceed_en = True'.
proceed_en = False
if proceed_en:
    corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)

    with open("encoder_multi.sv", "wb") as f: 
        pickle.dump(corpus_embeddings, f)

In [None]:
# Finding top 10 titles and their categories
# !LONG RUNNING! Set 'proceed_top10 = True' if you want to calculate the results from the start. Else, dataframe
# with calculated results will be loaded in - 2. Data Analysis - section
proceed_top10 = False
if proceed_top10:
    # Reusing the embeddings saved by the encoding cell when they were not computed in this session
    try:
        corpus_embeddings
    except NameError:
        # NOTE: pickle can execute arbitrary code on load - only open a file this notebook wrote itself
        with open("encoder_multi.sv", "rb") as f:
            corpus_embeddings = pickle.load(f)

    n_neighbours = 10
    cat_pos = {cat: pos for pos, cat in enumerate(categories_to_keep)}
    # corpus[i] is the title in row i of ex_data, so the position of a hit identifies its row directly
    hit_cat_pos = ex_data["category"].astype(object).map(cat_pos).to_numpy()
    neighbour_counts = np.zeros((len(corpus), len(categories_to_keep)), dtype=int)

    for i in range(len(corpus)):
        # The embedding of title i is already stored at position i - no need to encode the title again
        cos_scores = util.cos_sim(corpus_embeddings[i], corpus_embeddings)[0]

        # Asking for one extra hit because the title is expected to match itself, then dropping it by position
        top_idx = torch.topk(cos_scores, k=min(n_neighbours + 1, len(corpus))).indices.tolist()
        for j in [hit for hit in top_idx if hit != i][:n_neighbours]:
            neighbour_counts[i, hit_cat_pos[j]] += 1

        # Reporting progress occasionally instead of printing one line per title
        if (i + 1) % 1000 == 0:
            print(f"{i + 1}/{len(corpus)} titles processed")

    ex_data[categories_to_keep] = neighbour_counts

    ex_data.to_csv("Data/pur_df.csv")
    pur_df = ex_data.copy()

In [None]:
# Loading dataframe with top 10 titles' distribution between categories included
pur_df = pd.read_csv("Data/pur_df.csv", index_col=0)
pur_df["category"] = pur_df["category"].astype("category")

### Adding two more features

In [None]:
# Adding the number of characters in each title
pur_df["title_length"] = pur_df["title"].map(len)

In [None]:
# Adding the number of whitespace-separated words in each title
pur_df["word_count"] = [len(title.split()) for title in pur_df["title"]]

#### In this section:
- data was loaded
- categories transformed
- duplicates removed
- additional features created

# 2. Data Analysis

In [None]:
# Final dataframe
pur_df.head()

In [None]:
pur_df.info()

In [None]:
plt.style.use("ggplot")
plt.figure(figsize=(11, 6))
pur_df["category"].value_counts().sort_values(ascending=True).plot(kind="barh")
plt.xlabel("Total Number of Titles")
plt.ylabel("Categories")
plt.title("Distribution of Titles Between Categories in the Final Dataset")
plt.show()

##### Final dataset has 99499 titles; about 25 thousand (roughly 1/4) of them are in the 'verslas' category.

##### Other popular categories are 'sportas' and 'kultura' having around 15 000 titles each.

##### Least common categories are 'muzika' and 'sveikata' having around 5 000 titles each.

In [None]:
pur_df[["title_length", "word_count"]].describe()

##### Average length of all titles is 75 characters and there are 10 words on average in them.

In [None]:
# Calculating mean length of a title between categories
s_tl = pur_df.groupby("category")["title_length"].mean().sort_index()
s_tl

In [None]:
# Calculating mean number of words in a title between categories
s_wc = pur_df.groupby("category")["word_count"].mean().sort_index()
s_wc

In [None]:
sns.set_palette(["red", "orange", "yellow", "green", "cyan", "blue", "purple", "pink", "black"])
sns.scatterplot(x=s_wc, y=s_tl, hue=s_wc.index)
plt.legend(loc=4)
plt.xlabel("Average Number of Words in a Title")
plt.ylabel("Average Length of a Title")
plt.title("Average Title Length and Number of Words Between Categories")
plt.show()

##### Linear trend can be seen between average length of a title and number of words in it.

##### On average, the longest titles are in categories 'sveikata' and 'muzika' while the shortest are in the 'nuomones' category.

In [None]:
# Scatter plot of every title: number of words (x) against number of characters (y), coloured by category
plt.figure(figsize=(15, 9))
np.random.seed(420)  # fixed seed so the jitter below is the same on every run
# Word counts are integers, so uniform jitter in [-0.4, 0.4] is added to spread the points horizontally
# NOTE(review): in seaborn 'size' is a semantic grouping variable, not a marker size; if a fixed marker size
# was intended, the matplotlib keyword 's' is probably what was meant - confirm.
sns.scatterplot(x=pur_df["word_count"]+np.random.uniform(low=-0.4, high=0.4, size=pur_df.shape[0]),
                y=pur_df["title_length"], hue=pur_df["category"], alpha=0.2, size=0.1)
plt.xlabel("Number of Words in a Title")
plt.ylabel("Length of a Title")
plt.title("Distribution of Titles by Number of Words and It's Length")
plt.show()

##### Linear trend can be seen again.
##### Majority of titles have between 4 and 19 words and their length is between 25 and 140 characters.

In [None]:
# Calculating top 10 similar titles' distribution between categories for each category
# The count columns are selected by name: slicing by position would also pass the text column 'title' to sum()
# and would depend on how many feature columns had been appended to the dataframe
stat_tab = pur_df.groupby("category", observed=False)[categories_to_keep].sum().reset_index()

# Calculating total number of scores for each category
stat_tab["total"] = stat_tab[categories_to_keep].sum(axis=1)
stat_tab

In [None]:
# Converting scores to percentages within each category (every row divided by its own total)
shares = stat_tab[categories_to_keep].div(stat_tab["total"], axis=0).mul(100).round(1)
stat_tab[categories_to_keep] = shares
stat_tab = stat_tab.set_index("category").drop("total", axis=1)
stat_tab

In [None]:
# Stacked bar chart: one bar per category, split by the categories of the 10 closest titles
plt.figure(figsize=(15, 9))

c_map = sns.color_palette(["red", "orange", "yellow", "green", "cyan", "blue", "purple", "pink", "lime"])
# Bars are ordered by how often the neighbours of a category come from that same category
ordered_cols = sorted(categories_to_keep, key=lambda cat: stat_tab.loc[cat, cat], reverse=True)
# Every category keeps one colour across all bars
cat_color = {cat: pos for pos, cat in enumerate(ordered_cols)}


for cat in ordered_cols:
    row = stat_tab.loc[cat, :].sort_values(ascending=False)
    bottom = 0
    for i, (neighbour_cat, share) in enumerate(row.items()):
        plt.bar(x=cat, height=share, bottom=bottom, label=neighbour_cat, color=c_map[cat_color[neighbour_cat]])
        if i == 0:
            # Labelling the largest segment on the bar being drawn ('cat'), not on the bar named after the segment
            plt.text(s=str(share), x=cat, y=share/2, rotation=90, fontsize=20, ha="center", va="center")
        bottom += share


plt.legend(stat_tab.loc[ordered_cols[0], :].sort_values(ascending=False).index, loc=1)
plt.hlines(y=50, xmin=-0.5, xmax=len(ordered_cols)-0.5, colors=[0.15, 0.2, 0], linestyle="dashed")
plt.text(s="50%", x=-0.7, y =50, ha="center", va="center", fontsize=14)
plt.title("Average Distribution Between Categories of Top 10 Closest Titles for Every Title in a Category Group", fontdict={"size":20})
plt.xlabel("Categories", fontdict={"size":18})
plt.ylabel("Average Distribution, %", fontdict={"size":18})
plt.show()

##### It can be seen that titles from 'sportas' category should be the easiest to distinguish from the others - more than 3/4 of all titles from this category had a title from the same category very similar to itself.
##### 'verslas' category has the second highest score here even though it is noticeably mixed up with 'eismas', 'mokslas-ir-it', 'sveikata' and 'nuomones' categories. The reason for it may be the fact that this category has a huge number of titles in the dataset - around 25 %, so it was less difficult for titles to find a similar one from the same category.
##### 'muzika' category is mixed up with 'kultura' the most, and the latter tends to be similar to the 'nuomones' category.
##### 'eismas' is mixed up with 'kriminalai' - it may be due to the fact that incidents often involve traffic.
##### Titles from 'sveikata' category find similarity with titles from 'mokslas-ir-it' the most.
##### The chart suggests that the hardest task will be to distinguish titles from 'nuomones' category because it mixes up with other categories the most. This may be the truth because titles from this category can be on many topics and the reason for them to be classified as 'nuomones' is that it represents someone's opinion.

# 3. Model

### Finding useful features for the model

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.pipeline import FeatureUnion, Pipeline
import scipy

In [None]:
# The previous fallback repeated the very same import inside a bare 'except', so it could never recover;
# failing with an actionable message instead
try:
    from imblearn.over_sampling import RandomOverSampler
except ImportError as err:
    raise ImportError(
        "imbalanced-learn is required for oversampling; install it (e.g. `pip install imbalanced-learn`) "
        "and re-run this cell"
    ) from err

In [None]:
def split_data(df, X_title_col, X_num_cols, target_col, use_oversampler, test_size=0.3, random_state=420):
    """Split data into train and test datasets.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    X_title_col : str
        Name of the column holding the title text.
    X_num_cols : list
        Names of the numerical feature columns; when empty, the X datasets are Series of titles only.
    target_col : str
        Name of the target column.
    use_oversampler : str
        "RandOv" oversamples the training split with RandomOverSampler; any other value leaves it as is.
    test_size : float, default 0.3
        Share of rows put into the test split.
    random_state : int, default 420
        Seed used for both the split and the oversampler.

    Returns
    -------
    X_train, X_test, y_train, y_test
    """
    if X_num_cols:  # Numerical features should be included in the final X datasets
        features = df[list(X_num_cols) + [X_title_col]]
    else:  # Only 'title' feature should be in the final X datasets
        features = df[X_title_col]

    # Splitting data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(features, df[target_col], test_size=test_size,
                                                        random_state=random_state)

    # Checking if oversampling should be used (applied to the training split only)
    if use_oversampler == "RandOv":
        ovs = RandomOverSampler(random_state=random_state)
        if X_num_cols:
            X_train, y_train = ovs.fit_resample(X_train, y_train)
        else:
            # The sampler needs a 2-D input, so the title Series is wrapped in a one-column frame and unwrapped again
            X_train, y_train = ovs.fit_resample(X_train.to_frame(), y_train)
            X_train = X_train[X_title_col]

    return X_train, X_test, y_train, y_test

##### Splitting data into training and testing feature sets and their target labels, in order for it to be in a suitable format for the model and to have 'unseen' data to test the model with.
##### Oversampling increases the amount of total titles in the dataset for smaller categories so that the share for each of them would be equal.

In [None]:
def vectorize_titles(X_train, X_test, X_title_col, X_num_cols):
    """Turn titles into bag-of-words count vectors.

    The vocabulary is learnt from the training titles and reused for the test titles.
    When `X_num_cols` is empty the X datasets are the titles themselves; otherwise the
    titles are read from column `X_title_col`.
    """
    has_extra_features = bool(X_num_cols)
    train_titles = X_train[X_title_col] if has_extra_features else X_train
    test_titles = X_test[X_title_col] if has_extra_features else X_test

    bag_of_words = CountVectorizer()
    train_counts = bag_of_words.fit_transform(train_titles.values)
    test_counts = bag_of_words.transform(test_titles.values)
    return train_counts, test_counts

##### Processing titles from text to numeric data - vectors, so it could be used in the model.

In [None]:
def scale_features(X_train, X_test):
    """Standardise numerical features with statistics learnt from the training data only."""
    scaler = StandardScaler().fit(X_train)
    return scaler.transform(X_train), scaler.transform(X_test)

##### Scaling numeric features helps the model to better fit the data.

In [None]:
def prepare_data(df, X_title_col, X_scale_cols, X_non_scale_cols, target_col, use_oversampler):
    """Build the train/test feature matrices and numeric labels for the model.

    Titles are vectorized, columns in `X_scale_cols` are standardised, and columns in
    `X_non_scale_cols` are appended unchanged. Target labels are encoded as the
    position of the category in `categories_to_keep`.
    """
    # All non-text features, scaled ones first
    X_num_cols = [*X_scale_cols, *X_non_scale_cols]

    X_train, X_test, y_train, y_test = split_data(df, X_title_col, X_num_cols, target_col, use_oversampler)

    # Starting from the vectorized titles and appending the other feature groups to the right
    train_matrix, test_matrix = vectorize_titles(X_train, X_test, X_title_col, X_num_cols)

    if X_scale_cols:
        scaled_train, scaled_test = scale_features(X_train[X_scale_cols], X_test[X_scale_cols])
        train_matrix = scipy.sparse.hstack([train_matrix, scaled_train])
        test_matrix = scipy.sparse.hstack([test_matrix, scaled_test])

    if X_non_scale_cols:
        train_matrix = scipy.sparse.hstack([train_matrix, X_train[X_non_scale_cols]])
        test_matrix = scipy.sparse.hstack([test_matrix, X_test[X_non_scale_cols]])

    # Category name -> integer label
    label_of = {cat: label for label, cat in enumerate(categories_to_keep)}
    train_labels = np.array(y_train.map(label_of).values)
    test_labels = np.array(y_test.map(label_of).values)

    return train_matrix, test_matrix, train_labels, test_labels

##### Using previous 3 functions to convert the data into suitable format for the model.

In [None]:
def model_and_stats(df, X_title_col="title", X_scale_cols: list = None, X_non_scale_cols: list = None,
                    target_col="category", use_oversampler: str = "None"):
    """Train model and get results.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding the title, the target and any extra feature columns.
    X_title_col : str
        Name of the title column.
    X_scale_cols : list, optional
        Numerical columns that are standardised before training (none by default).
    X_non_scale_cols : list, optional
        Numerical columns that are used without scaling (none by default).
    target_col : str
        Name of the target column.
    use_oversampler : str
        "None" (no oversampling) or "RandOv" (random oversampling of the training data).

    Prints the accuracy on the training and test data and the weighted f1 score on the test data.
    """
    # None instead of a shared mutable list as default value
    X_scale_cols = [] if X_scale_cols is None else X_scale_cols
    X_non_scale_cols = [] if X_non_scale_cols is None else X_non_scale_cols

    # Getting data
    X_train, X_test, y_train, y_test = prepare_data(df, X_title_col, X_scale_cols, X_non_scale_cols, target_col, use_oversampler)

    # Creating and training the model
    lr = LogisticRegression(multi_class="ovr", random_state=420, max_iter=500)
    lr.fit(X_train, y_train)

    # Getting predictions and calculating model performance results
    y_pred = lr.predict(X_test)
    train_acc = lr.score(X_train, y_train)
    test_acc = accuracy_score(y_test, y_pred)
    test_f1 = f1_score(y_test, y_pred, average="weighted")

    print(f"Model's accuracy on training data: {train_acc:.5f}")
    print(f"Model's accuracy on test data: {test_acc:.5f}")
    print(f"Model's f1_score: {test_f1:.5f}")

##### Getting data, creating and training the model, calculating results. These functions were created to help compare the performance of  models trained with different data.

#### To make the process easier to follow, the steps of training the benchmark model are explained one by one below

In [None]:
# Everytime data will be split into 70% training and 30% testing datasets.
# The same (random_state) will be used for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(pur_df["title"], pur_df["category"], test_size=0.3,
                                                    random_state=420)

In [None]:
X_train.size, y_train.size, X_test.size, y_test.size

##### Train set has 69649 titles and test set has 29850 titles, spread across all categories.

In [None]:
# 10% of titles ditribution between categories in the train dataset
y_train.value_counts()/7  # dividing by 7 because train set is 7 parts of the whole dataset

In [None]:
# 10% of titles ditribution between categories in the test dataset
y_test.value_counts()/3  # dividing by 3 because test set is 3 parts of the whole dataset

In [None]:
(y_train.value_counts()/7) / (y_test.value_counts()/3)

##### Both datasets have very similar distribution of titles between categories - scores are near 1.

In [None]:
# Mapping every category name to its position in (categories_to_keep); used to encode categories
cat_to_label = {cat: label for label, cat in enumerate(categories_to_keep)}
cat_to_label

In [None]:
# Encoding categories
y_train = np.array(y_train.map(cat_to_label).values)
y_test = np.array(y_test.map(cat_to_label).values)

In [None]:
y_train

In [None]:
# Creating count vectorizer and converting titles in the train dataset to vectors
vectorizer = CountVectorizer()

X_train_vect = vectorizer.fit_transform(X_train.values)

In [None]:
X_train_vect

In [None]:
vectorizer.get_feature_names_out()

##### The title column was converted to 80012 columns - one for each distinct token (set of characters), for example '000' or 'zyniai', that was found in the training titles.

In [None]:
# Converting titles in the test dataset to vectors
X_test_vect = vectorizer.transform(X_test)

In [None]:
X_test_vect

In [None]:
# Creating model - multiclass logistic regression
# (random_state) will be used for reproducibility and (max_iter) is increased, in order for the model to have enough iterations
# to fit the data
lr = LogisticRegression(multi_class="ovr", random_state=420, max_iter=500)

In [None]:
# Training the model with train dataset
lr.fit(X_train_vect, y_train)

In [None]:
# Using the model to predict the categories for titles in the test dataset
y_pred = lr.predict(X_test_vect)
y_pred

In [None]:
y_test

##### It is visible that first 3 and last 3 titles were categorized correctly but scoring metrics should be used to get the model's performance

In [None]:
# Calculating the accuracy score that model reaches then predicting categories of titles from train dataset
lr.score(X_train_vect, y_train)

In [None]:
# Calculating the accuracy score that model reaches then predicting categories of titles from test dataset
accuracy_score(y_test, y_pred)

In [None]:
# Calculating the f1 score that model reaches then predicting categories of titles from test dataset
f1_score(y_test, y_pred, average="weighted")

##### Benchmark model has 0.87052 accuracy and 0.87011 f1 score - the higher the better

##### Using the functions defined earlier to get the performance results of models

In [None]:
# Benchmark model
model_and_stats(pur_df)

##### It can be seen that results are the same - functions work! Now models will be trained with different features.

In [None]:
# Adding top 10 titles distribution between categories, using RandomOverSampler
model_and_stats(pur_df, X_scale_cols=categories_to_keep, use_oversampler="RandOv")

In [None]:
# Adding title length and word count, using RandomOverSampler
model_and_stats(pur_df, X_scale_cols=["title_length", "word_count"], use_oversampler="RandOv")

In [None]:
# Adding top 10 titles distribution between categories, title length and word count, using RandomOverSampler
# The feature columns are named explicitly instead of being taken by their position in the dataframe
model_and_stats(pur_df, X_scale_cols=categories_to_keep + ["title_length", "word_count"], use_oversampler="RandOv")

##### The best results are reached when using features title, title length, word count and random over sampler:
- 0.87300 accuracy score
- 0.87344 f1 score

### Pipeline and GridSearchCV

In [None]:
# Splitting the data using only the best features
X_train, X_test, y_train, y_test = split_data(pur_df, "title", ["title_length", "word_count"], "category", "RandOv")

In [None]:
def return_title_column(X):
    """Select the 'title' column from the feature table `X`."""
    title_col = "title"
    return X[title_col]

In [None]:
def return_numeric_columns(X):
    """Select the 'title_length' and 'word_count' columns from the feature table `X`."""
    numeric_cols = ["title_length", "word_count"]
    return X[numeric_cols]

In [None]:
# Two functions above will be used in the pipeline to be able to transform text and numerical data differently but they need
# to have fit and fit_transform methods. Function transformer change them for this purpose.
get_text_data = FunctionTransformer(return_title_column, validate=False)
get_numeric_data = FunctionTransformer(return_numeric_columns, validate=False)

In [None]:
# Using feature union to be able to transform text and numerical data differently and then combine them together in the pipeline
process_and_join_features = FeatureUnion(transformer_list = [
                ('numeric_features', Pipeline([
                    ('selector', get_numeric_data),
                    ('scaler', StandardScaler())  # Scaling numerical data ('title_length', 'word_count') with standard scaler
                ])),
                ('text_features', Pipeline([
                    ("selector", get_text_data),
                    ("vectorizer", CountVectorizer())  # Converting text data ('title') to vectors
                ]))])

In [None]:
# Joining all steps - from data transformation to model creation - in one object: pipeline
pipe = Pipeline([
        ('union', process_and_join_features),
        ('lr', LogisticRegression(multi_class="ovr", random_state=420, max_iter=500))
    ])

In [None]:
# Defining parameters that GridSerchCV should try out and find the best ones for the model
# Keys follow the pipeline step names: 'union' -> 'text_features' -> 'vectorizer' for CountVectorizer, 'lr' for the model
# NOTE(review): the string "none" for 'penalty' is only accepted by older scikit-learn releases; newer ones
# expect None instead - confirm against the installed version before running the search.
params = {
    "union__text_features__vectorizer__ngram_range": [(1, 1), (1, 2), (1, 3), (2, 3)],
    "union__text_features__vectorizer__max_df": [0.3, 0.1, 0.01, 0.005],
    "lr__penalty": ["l2", "none"]
}

In [None]:
# Creating GridSearchCV object, using 5-fold cross-validation and accuracy as a score metric
gs = GridSearchCV(estimator=pipe, param_grid=params, scoring="accuracy", cv=5, verbose=3)

In [None]:
# Training GridSeachCV object
# !TAKES A FEW HOURS! Set 'proceed_gs = True' if you want to calculate the results from the start.
proceed_gs = False
if proceed_gs:
    gs.fit(X_train, y_train)

In [None]:
if proceed_gs:
    # An expression nested inside an 'if' block is not echoed by the notebook, so it is printed explicitly
    print(gs.best_params_)

In [None]:
if proceed_gs:
    # An expression nested inside an 'if' block is not echoed by the notebook, so it is printed explicitly
    print(gs.best_score_)

##### Best results are achieved by using:
- CountVectorizer:
    - ngram_range = (1, 2)
    - max_df = 0.3
- LogisticRegression:
    - penalty = 'l2'

### Creating and training a model with the best parameters

In [None]:
process_and_join_features = FeatureUnion(transformer_list = [
                ('numeric_features', Pipeline([
                    ('selector', get_numeric_data),
                    ('scaler', StandardScaler())
                ])),
                ('text_features', Pipeline([
                    ("selector", get_text_data),
                    ("vectorizer", CountVectorizer(ngram_range=(1, 2), max_df=0.3))  # Setting ngram_range=(1, 2) and max_df=0.3
                ]))])

In [None]:
pipe = Pipeline([
        ('union', process_and_join_features),
        ('lr', LogisticRegression(multi_class="ovr", random_state=420, max_iter=500)) # penalty='l2' is default, no need to specify
    ])

In [None]:
# Training the model
pipe.fit(X_train, y_train)

In [None]:
# Predicting categories for titles in the test dataset
y_pred = pipe.predict(X_test)

In [None]:
pipe.score(X_train, y_train)

In [None]:
accuracy_score(y_test, y_pred)

In [None]:
f1_score(y_test, y_pred, average="weighted")

##### Comparing final and benchmark models
- 87.481 % vs. 87.052 %: accuracy score increased by 0.429 percentage points
- 87.487 % vs. 87.011 %: f1 score increased by 0.476 percentage points

In [None]:
# Saving final model locally
# NOTE(review): the pipeline wraps return_title_column and return_numeric_columns in FunctionTransformer and
# pickle stores functions by reference, so the loading application presumably needs the same functions
# importable under the same names - verify in FlaskApp.
with open("../FlaskApp/model.pkl", "wb") as file:
    pickle.dump(pipe, file)

## Final Notes

In [None]:
# Calculating how many titles were assigned to different categories
# Getting all titles from categories in (categories_to_keep) list
t = df_cat[df_cat["category"].isin(categories_to_keep)].copy()

# Collecting, for every title that occurs more than once, the distinct categories it was placed in
cats_per_title = t[t.duplicated("title", keep=False)].groupby("title")["category"].unique()

# Keeping only titles with more than one category and naming each combination, e.g. "eismas+verslas"
t_c_d = ["+".join(sorted(list(v))) for v in cats_per_title.values if len(v) > 1]

In [None]:
# Calculating the frequencies of multi-categories combinations
# Printing the total first; the Counter is left as the last expression so the notebook displays it without a stray None
print(len(t_c_d))
Counter(t_c_d)

##### 3911 titles were assigned to multiple categories, most frequently:
- 'mokslas-ir-it' and 'verslas'
- 'eismas' and 'verslas'
- 'kultura' and 'muzika'

##### 3911 out of 99499, or almost 4 %, of titles in the final dataset were previously classified in multiple categories. This shows that assigning a title to only one category isn't that easy even for the authors.