In [1]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from string import punctuation
from pymorphy2 import MorphAnalyzer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

Считываем комментарии РПЛ:

In [2]:
rpl_comments = pd.read_csv('rpl_comments.csv', on_bad_lines='skip')
rpl_comments = rpl_comments.sample(frac=1, random_state=42)
check_rpl_comments = rpl_comments[1000:2000]['comment']
rpl_comments = rpl_comments[:1000]

Считываем комментарии АПЛ:

In [3]:
apl_comments = pd.read_csv('apl_comments.csv', on_bad_lines='skip')
apl_comments = apl_comments.sample(frac=1, random_state=42)
apl_comments = apl_comments[:1000]
check_apl_comments = pd.read_csv('check_apl_comments.txt', on_bad_lines='skip')
check_apl_comments = check_apl_comments.sample(frac=1, random_state=42)
check_apl_comments = check_apl_comments[:1000]['comment']

Объединяем в выборку для работы модели:

In [5]:
df = pd.concat([rpl_comments, apl_comments])
comments = df['comment']
labels = df['mark']

Разбиваем на обучающую и тестовую выборки:

In [6]:
x_train, x_test, y_train, y_test = train_test_split(comments, labels, test_size=0.25, random_state=42)
stop_words = set(stopwords.words('russian'))
morph = MorphAnalyzer()
vectorizer = CountVectorizer(token_pattern=r"(?u)\b\w[\w-]*\w\b")

Создаём функцию предобработки комментария:

In [8]:
def process(comment):
    # Токенизация
    tokens = word_tokenize(comment.lower())
    # Удаление стоп-слов и пунктуации
    cleaned_tokens = [token for token in tokens if token not in stop_words and token not in punctuation]
    letter_tokens = []
    # Удаление всех неалфавитных знаков
    for token in cleaned_tokens:
        token = ''.join([c for c in token if c.isalpha()])
        if token:
            letter_tokens.append(token)
    # Лемматизация
    lemmatized = [morph.parse(token)[0].normal_form for token in letter_tokens]
    return ' '.join(lemmatized)

Предобработка комментариев и подготовка для использования их моделью:

In [9]:
x_train = [process(comment) for comment in x_train]
x_test = [process(comment) for comment in x_test]
check_rpl_comments = [process(comment) for comment in check_rpl_comments]
check_apl_comments = [process(comment) for comment in check_apl_comments]
x_train = vectorizer.fit_transform(x_train).toarray()
x_test = vectorizer.transform(x_test).toarray()
check_rpl_comments = vectorizer.transform(check_rpl_comments).toarray()
check_apl_comments = vectorizer.transform(check_apl_comments).toarray()

Обучение модели и проверка её качества:

In [11]:
model = MultinomialNB()
model.fit(x_train, y_train)
pred = model.predict(x_test)
acc = accuracy_score(y_test, pred)
f1 = f1_score(y_test, pred, average="micro")
print(f"Accuracy: {acc}, f1: {f1}")

Accuracy: 0.804, f1: 0.804


Анализ выборки комментариев:

In [13]:
pred_rpl = model.predict(check_rpl_comments)
pred_apl = model.predict(check_apl_comments)
pos_rpl, pos_apl = 0, 0
for p in pred_rpl: pos_rpl += p
for p in pred_apl: pos_apl += p
print(f'Количество негативных комментариев болельщиков РПЛ: {1000-pos_rpl}')
print(f'Количество негативных комментариев болельщиков АПЛ: {1000-pos_apl}')

Количество негативных комментариев болельщиков РПЛ: 39
Количество негативных комментариев болельщиков АПЛ: 91
