# Лабораторна робота №3, Кривохата Марія, ІМ-21

In [1]:
import pandas as pd
import numpy as np
import re
import time
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models import FastText
from sklearn.model_selection import GridSearchCV

In [81]:
data_df = pd.read_csv("bbc-news-data2.csv")
data_df.head(10)

Unnamed: 0.1,Unnamed: 0,category,content
0,1497,sport,Bolton boss Sam Allardyce has signed Roma def...
1,2046,tech,"Aid workers trying to house, feed and clothe ..."
2,881,entertainment,Clint Eastwood's Million Dollar Baby beat Mar...
3,1423,sport,Chelsea left-back Wayne Bridge could miss the...
4,1680,sport,"Last Saturday, one newspaper proclaimed that ..."
5,996,politics,"Welsh councils should set their taxes at ""rea..."
6,1432,sport,Barcelona's pursuit of the Spanish title took...
7,591,entertainment,The man who said he got Oscar-nominated movie...
8,408,business,UK-based bank Standard Chartered said it woul...
9,1531,sport,Hearts of Oak set up an all Ghanaian Confeder...


In [82]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [83]:
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    filtered_tokens = [token for token in tokens if token not in stop_words]
    return ' '.join(filtered_tokens)

In [84]:
def preprocess_text(text):
    text = clean_text(text)
    text = remove_stopwords(text)
    return text

data_df['clean_text'] = data_df['content'].apply(preprocess_text)
data_df.head(10)

Unnamed: 0.1,Unnamed: 0,category,content,clean_text
0,1497,sport,Bolton boss Sam Allardyce has signed Roma def...,bolton boss sam allardyce signed roma defender...
1,2046,tech,"Aid workers trying to house, feed and clothe ...",aid workers trying house feed clothe millions ...
2,881,entertainment,Clint Eastwood's Million Dollar Baby beat Mar...,clint eastwoods million dollar baby beat marti...
3,1423,sport,Chelsea left-back Wayne Bridge could miss the...,chelsea leftback wayne bridge could miss rest ...
4,1680,sport,"Last Saturday, one newspaper proclaimed that ...",last saturday one newspaper proclaimed england...
5,996,politics,"Welsh councils should set their taxes at ""rea...",welsh councils set taxes reasonable levels giv...
6,1432,sport,Barcelona's pursuit of the Spanish title took...,barcelonas pursuit spanish title took blow sun...
7,591,entertainment,The man who said he got Oscar-nominated movie...,man said got oscarnominated movie aviator grou...
8,408,business,UK-based bank Standard Chartered said it woul...,ukbased bank standard chartered said would spe...
9,1531,sport,Hearts of Oak set up an all Ghanaian Confeder...,hearts oak set ghanaian confederation cup fina...


In [85]:
print("Розподіл категорій:")
print(data_df['category'].value_counts())

Розподіл категорій:
category
sport            256
business         255
politics         209
tech             200
entertainment    193
Name: count, dtype: int64


In [86]:
# Токенізація для FastText
data_df['tokenized_text'] = data_df['clean_text'].apply(word_tokenize)

In [87]:
# Поділ даних на навчальні та тестові набори
X_train, X_test, y_train, y_test = train_test_split(
    data_df['clean_text'],
    data_df['category'],
    test_size=0.3,
    random_state=47,
    stratify=data_df['category'] # для збереження пропорцій в тестовій та навчальній групі
)

In [88]:
# Створення токенізованих корпусів для FastText
tokenized_train = [word_tokenize(text) for text in X_train]
tokenized_test = [word_tokenize(text) for text in X_test]

results = {}

In [89]:
def evaluate_model(model, X_test, y_test, model_name):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    print(f"Accuracy for {model_name}: {accuracy:.4f}")
    print(classification_report(y_test, y_pred))
    return accuracy, report

### 1. Сумка слів з Логістичною Регресією

In [90]:
start_time = time.time()
bow_lr_pipeline = Pipeline([
    ('vectorizer', CountVectorizer(min_df=2, max_df=0.95)),
    ('classifier', LogisticRegression(max_iter=1000, random_state=47))
])
bow_lr_pipeline.fit(X_train, y_train)
bow_lr_time = time.time() - start_time
bow_lr_accuracy, bow_lr_report = evaluate_model(bow_lr_pipeline, X_test, y_test, "BoW + LR")
results["BoW + LR"] = {"accuracy": bow_lr_accuracy, "time": bow_lr_time}

Accuracy for BoW + LR: 0.9461
               precision    recall  f1-score   support

     business       0.91      0.91      0.91        76
entertainment       0.96      0.95      0.96        58
     politics       0.92      0.94      0.93        63
        sport       0.97      1.00      0.99        77
         tech       0.97      0.93      0.95        60

     accuracy                           0.95       334
    macro avg       0.95      0.95      0.95       334
 weighted avg       0.95      0.95      0.95       334



### 2. Сумка слів з Градієнтним Бустингом

In [91]:
start_time = time.time()
bow_gb_pipeline = Pipeline([
    ('vectorizer', CountVectorizer(min_df=2, max_df=0.95)),
    ('classifier', GradientBoostingClassifier(n_estimators=100, random_state=47))
])
bow_gb_pipeline.fit(X_train, y_train)
bow_gb_time = time.time() - start_time
bow_gb_accuracy, bow_gb_report = evaluate_model(bow_gb_pipeline, X_test, y_test, "BoW + GB")
results["BoW + GB"] = {"accuracy": bow_gb_accuracy, "time": bow_gb_time}

Accuracy for BoW + GB: 0.9311
               precision    recall  f1-score   support

     business       0.89      0.93      0.91        76
entertainment       0.93      0.90      0.91        58
     politics       0.92      0.92      0.92        63
        sport       0.95      0.99      0.97        77
         tech       0.98      0.90      0.94        60

     accuracy                           0.93       334
    macro avg       0.93      0.93      0.93       334
 weighted avg       0.93      0.93      0.93       334



### 3. FastText з Логістичною Регресією

In [92]:
# Створення FastText моделі
ft_model = FastText(
    tokenized_train,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
    sg=1,
    epochs=10
)

# Функція для векторизації документів з використанням FastText
def document_vectorizer(corpus, model, num_features):
    vocabulary = set(model.wv.index_to_key)

    def average_word_vectors(words, model, vocabulary, num_features):
        feature_vector = np.zeros((num_features,), dtype="float64")
        nwords = 0.
        for word in words:
            if word in vocabulary:
                nwords = nwords + 1.
                feature_vector = np.add(feature_vector, model.wv[word])
        if nwords:
            feature_vector = np.divide(feature_vector, nwords)
        return feature_vector

    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                for tokenized_sentence in corpus]
    return np.array(features)

# Отримання векторів для навчального та тестового наборів
ft_num_features = 100
ft_train_features = document_vectorizer(tokenized_train, ft_model, ft_num_features)
ft_test_features = document_vectorizer(tokenized_test, ft_model, ft_num_features)

In [93]:
start_time = time.time()
ft_lr = LogisticRegression(max_iter=1000, random_state=47)
ft_lr.fit(ft_train_features, y_train)
ft_lr_time = time.time() - start_time
ft_lr_accuracy, ft_lr_report = evaluate_model(ft_lr, ft_test_features, y_test, "FastText + LR")
results["FastText + LR"] = {"accuracy": ft_lr_accuracy, "time": ft_lr_time}

Accuracy for FastText + LR: 0.9341
               precision    recall  f1-score   support

     business       0.90      0.91      0.90        76
entertainment       0.93      0.95      0.94        58
     politics       0.91      0.94      0.92        63
        sport       1.00      0.95      0.97        77
         tech       0.93      0.93      0.93        60

     accuracy                           0.93       334
    macro avg       0.93      0.93      0.93       334
 weighted avg       0.94      0.93      0.93       334



### 4. FastText з Градієнтним Бустингом

In [94]:
start_time = time.time()
ft_gb = GradientBoostingClassifier(n_estimators=100, random_state=47)
ft_gb.fit(ft_train_features, y_train)
ft_gb_time = time.time() - start_time
ft_gb_accuracy, ft_gb_report = evaluate_model(ft_gb, ft_test_features, y_test, "FastText + GB")
results["FastText + GB"] = {"accuracy": ft_gb_accuracy, "time": ft_gb_time}

Accuracy for FastText + GB: 0.9371
               precision    recall  f1-score   support

     business       0.92      0.88      0.90        76
entertainment       0.95      0.91      0.93        58
     politics       0.90      0.97      0.93        63
        sport       1.00      0.96      0.98        77
         tech       0.92      0.97      0.94        60

     accuracy                           0.94       334
    macro avg       0.94      0.94      0.94       334
 weighted avg       0.94      0.94      0.94       334



## Покращення із GridSearchCV

### 5. Cумка слів + Логістична Регресія + GridSearchCV

In [95]:
bow_lr_params = {
    'vectorizer__ngram_range': [(1, 1), (1, 2)],
    'vectorizer__max_df': [0.9, 0.95],
    'classifier__C': [0.1, 1.0, 10.0]
}

start_time = time.time()
bow_lr_grid = GridSearchCV(bow_lr_pipeline, bow_lr_params, cv=3, n_jobs=-1, verbose=1)
bow_lr_grid.fit(X_train, y_train)
bow_lr_grid_time = time.time() - start_time

print(f"Найкращі параметри для BoW + LR: {bow_lr_grid.best_params_}")
bow_lr_grid_accuracy, bow_lr_grid_report = evaluate_model(bow_lr_grid, X_test, y_test, "BoW + LR (оптимізована)")
results["BoW + LR (оптимізована)"] = {"accuracy": bow_lr_grid_accuracy, "time": bow_lr_grid_time}

Fitting 3 folds for each of 12 candidates, totalling 36 fits
Найкращі параметри для BoW + LR: {'classifier__C': 10.0, 'vectorizer__max_df': 0.9, 'vectorizer__ngram_range': (1, 2)}
Accuracy for BoW + LR (оптимізована): 0.9521
               precision    recall  f1-score   support

     business       0.91      0.91      0.91        76
entertainment       0.98      0.97      0.97        58
     politics       0.92      0.95      0.94        63
        sport       0.97      1.00      0.99        77
         tech       0.98      0.93      0.96        60

     accuracy                           0.95       334
    macro avg       0.95      0.95      0.95       334
 weighted avg       0.95      0.95      0.95       334



### 6. Cумка слів + Градієнтний Бустинг + GridSearchCV

In [96]:
bow_gb_params = {
    'vectorizer__ngram_range': [(1, 1), (1, 2)],
    'classifier__n_estimators': [50, 100],
    'classifier__learning_rate': [0.05, 0.1, 0.5]
}

start_time = time.time()
bow_gb_grid = GridSearchCV(bow_gb_pipeline, bow_gb_params, cv=3, n_jobs=-1, verbose=1)
bow_gb_grid.fit(X_train, y_train)
bow_gb_grid_time = time.time() - start_time

print(f"Найкращі параметри для BoW + GB: {bow_gb_grid.best_params_}")
bow_gb_grid_accuracy, bow_gb_grid_report = evaluate_model(bow_gb_grid, X_test, y_test, "BoW + GB (оптимізована)")
results["BoW + GB (оптимізована)"] = {"accuracy": bow_gb_grid_accuracy, "time": bow_gb_grid_time}

Fitting 3 folds for each of 12 candidates, totalling 36 fits
Найкращі параметри для BoW + GB: {'classifier__learning_rate': 0.5, 'classifier__n_estimators': 100, 'vectorizer__ngram_range': (1, 1)}
Accuracy for BoW + GB (оптимізована): 0.9371
               precision    recall  f1-score   support

     business       0.89      0.95      0.92        76
entertainment       0.95      0.90      0.92        58
     politics       0.93      0.90      0.92        63
        sport       0.96      0.99      0.97        77
         tech       0.97      0.93      0.95        60

     accuracy                           0.94       334
    macro avg       0.94      0.93      0.94       334
 weighted avg       0.94      0.94      0.94       334



### 7. FastText + Логістична Регресія + GridSearchCV

In [97]:
ft_lr_params = {
    'C': [0.1, 1.0, 10.0],
    'solver': ['liblinear', 'lbfgs']
}

start_time = time.time()
ft_lr_grid = GridSearchCV(LogisticRegression(max_iter=1000, random_state=47),
                          ft_lr_params, cv=3, n_jobs=-1, verbose=1)
ft_lr_grid.fit(ft_train_features, y_train)
ft_lr_grid_time = time.time() - start_time

print(f"Найкращі параметри для FastText + LR: {ft_lr_grid.best_params_}")
ft_lr_grid_accuracy, ft_lr_grid_report = evaluate_model(ft_lr_grid, ft_test_features, y_test, "FastText + LR (оптимізована)")
results["FastText + LR (оптимізована)"] = {"accuracy": ft_lr_grid_accuracy, "time": ft_lr_grid_time}

Fitting 3 folds for each of 6 candidates, totalling 18 fits
Найкращі параметри для FastText + LR: {'C': 10.0, 'solver': 'liblinear'}
Accuracy for FastText + LR (оптимізована): 0.9401
               precision    recall  f1-score   support

     business       0.94      0.89      0.92        76
entertainment       0.93      0.93      0.93        58
     politics       0.90      0.97      0.93        63
        sport       1.00      0.97      0.99        77
         tech       0.92      0.93      0.93        60

     accuracy                           0.94       334
    macro avg       0.94      0.94      0.94       334
 weighted avg       0.94      0.94      0.94       334



### 8. FastText + Градієнтний Бустинг + GridSearchCV

In [98]:
ft_gb_params = {
    'n_estimators': [50, 100],
    'learning_rate': [0.05, 0.1, 0.5],
    'max_depth': [3, 5]
}

start_time = time.time()
ft_gb_grid = GridSearchCV(GradientBoostingClassifier(random_state=47),
                          ft_gb_params, cv=3, n_jobs=-1, verbose=1)
ft_gb_grid.fit(ft_train_features, y_train)
ft_gb_grid_time = time.time() - start_time

print(f"Найкращі параметри для FastText + GB: {ft_gb_grid.best_params_}")
ft_gb_grid_accuracy, ft_gb_grid_report = evaluate_model(ft_gb_grid, ft_test_features, y_test, "FastText + GB (оптимізована)")
results["FastText + GB (оптимізована)"] = {"accuracy": ft_gb_grid_accuracy, "time": ft_gb_grid_time}

Fitting 3 folds for each of 12 candidates, totalling 36 fits
Найкращі параметри для FastText + GB: {'learning_rate': 0.5, 'max_depth': 3, 'n_estimators': 50}
Accuracy for FastText + GB (оптимізована): 0.9401
               precision    recall  f1-score   support

     business       0.91      0.88      0.89        76
entertainment       0.95      0.95      0.95        58
     politics       0.91      0.95      0.93        63
        sport       1.00      0.96      0.98        77
         tech       0.94      0.97      0.95        60

     accuracy                           0.94       334
    macro avg       0.94      0.94      0.94       334
 weighted avg       0.94      0.94      0.94       334



### Порівняння моделей

In [99]:
models = list(results.keys())
accuracies = [results[model]["accuracy"] for model in models]
times = [results[model]["time"] for model in models]

comparison_df = pd.DataFrame({
    'Модель': models,
    'Точність': accuracies,
    'Час навчання (сек)': times
})
print("\nТаблиця порівняння моделей:")
print(comparison_df.sort_values(by='Точність', ascending=False))


Таблиця порівняння моделей:
                         Модель  Точність  Час навчання (сек)
4       BoW + LR (оптимізована)  0.952096            4.902465
0                      BoW + LR  0.946108            0.304563
6  FastText + LR (оптимізована)  0.940120            0.211054
7  FastText + GB (оптимізована)  0.940120           57.230155
3                 FastText + GB  0.937126           18.407771
5       BoW + GB (оптимізована)  0.937126           86.715112
2                 FastText + LR  0.934132            0.028332
1                      BoW + GB  0.931138            8.746239
