In [41]:
import pandas as pd
from datetime import datetime
import re
import os

# Импорт данных 

In [10]:
# Load the source dataset from df.csv in the working directory; later cells
# read its 'Текст' column.
df = pd.read_csv('df.csv')

# Лемматизация

In [17]:
import io

# Read the stop-word list (UTF-8 text, split on newlines into a list).
# The context manager guarantees the file handle is closed after reading.
with io.open(r'stopwords-ru.txt', 'r', encoding='utf8') as f:
    sw = f.read()
sw = sw.split('\n')

In [18]:
import string

def clean_tweet(tweet, stopwords=None):
    """Normalize a raw tweet into a space-separated string of Russian words.

    Steps: cast to ``str``, lower-case, drop hashtags, turn every whitespace
    run into a single space, delete every character that is not a Cyrillic
    letter (а-я, ё) or a space, then remove stop words.

    Args:
        tweet: Raw tweet; any object is accepted and converted with ``str``.
        stopwords: Optional iterable of words to drop. When omitted, the
            module-level ``sw`` list is used.

    Returns:
        The cleaned text as a single string (empty when nothing survives).
    """
    if stopwords is None:
        stopwords = sw
    # A set gives constant-time membership tests instead of a list scan per word.
    stopwords = set(stopwords)

    text = str(tweet).lower()
    # "ё" lies outside the а-я range, so it is listed explicitly; without it a
    # tag such as "#самолёт" is cut at "ё" and leaves "ёт" in the text.
    text = re.sub(r"#[a-zа-яё0-9_]+", "", text)
    # Every kind of whitespace (tab, \r, \n, NBSP, ...) is a word separator.
    # Replacing only "\n" let the next substitution delete the others and
    # glue neighbouring words together.
    text = re.sub(r"\s+", " ", text)
    text = re.sub('[^а-яё ]', '', text, flags=re.IGNORECASE)
    return " ".join(word for word in text.split() if word not in stopwords)

In [19]:
from pymorphy2 import MorphAnalyzer

# Single pymorphy2 analyzer instance, created once and reused by lemmatize().
morph = MorphAnalyzer()

def lemmatize(doc, analyzer=None, stopwords=None):
    """Replace every word of ``doc`` with its normal form and drop stop words.

    Args:
        doc: Whitespace-separated text.
        analyzer: Object exposing ``normal_forms(word)`` that returns a
            non-empty sequence of candidate lemmas. When omitted, the
            module-level ``morph`` is used.
        stopwords: Optional iterable of lemmas to discard. When omitted, the
            module-level ``sw`` list is used.

    Returns:
        The surviving lemmas joined by single spaces (empty when none remain).
    """
    if analyzer is None:
        analyzer = morph
    if stopwords is None:
        stopwords = sw
    # A set gives constant-time membership tests instead of a list scan per lemma.
    stopwords = set(stopwords)

    # str.split() already yields tokens without surrounding whitespace, so no
    # extra strip() is needed. The first candidate is taken as the lemma.
    lemmas = (analyzer.normal_forms(token)[0] for token in doc.split())
    return " ".join(lemma for lemma in lemmas if lemma not in stopwords)

In [20]:
# Build the 'Lemma' column: clean each raw text first, then lemmatize the result.
df['Lemma'] = df['Текст'].apply(clean_tweet).apply(lemmatize)

# Частотный анализ

In [None]:
# Unigrams: how often each lemma occurs across the whole corpus.
corpus_lemmas = ' '.join(df['Lemma']).split()
lemma_counts = pd.Series(corpus_lemmas).value_counts()
top_w = pd.DataFrame(lemma_counts)
top_w.head(50)

In [None]:
# If the frequency analysis surfaces unwanted words that were not part of the
# original stop-word list, they can be removed separately here.
add_sw = ['...']
df['Lemma'] = df['Lemma'].apply(
    lambda lemma_text: ' '.join(
        filter(lambda lemma: lemma not in add_sw, lemma_text.split())
    )
)

In [None]:
# Bigrams
import nltk

# Build bigrams inside each document separately so that the last word of one
# text and the first word of the next are never counted as a pair.
bigrams = (
    pair
    for doc in df['Lemma']
    for pair in nltk.bigrams(doc.split())
)
frequence = nltk.FreqDist(bigrams)

# One row per distinct bigram, most frequent first; each word pair is joined
# into a single string for display.
freq = pd.DataFrame(list(frequence.items()), columns=['word', 'frequency'])
freq = freq.sort_values(by=['frequency'], ascending=False)
freq['word'] = freq['word'].apply(lambda x: ' '.join(x))
freq.head(50)

# Анализ тональности

In [None]:
import torch
from transformers import AutoModelForSequenceClassification
from transformers import BertTokenizerFast
import time
import numpy as np

In [None]:
# The tokenizer and the classifier are loaded from the same pretrained checkpoint.
SENTIMENT_CHECKPOINT = 'blanchefort/rubert-base-cased-sentiment'

tokenizer = BertTokenizerFast.from_pretrained(SENTIMENT_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(
    SENTIMENT_CHECKPOINT,
    return_dict=True,
)

In [None]:
def predict(text):
    """Return the predicted class index for the given text.

    Args:
        text: Input handed to the module-level ``tokenizer``; it is padded and
            truncated to at most 512 tokens.

    Returns:
        A numpy array holding the argmax class index for each row of the
        model's logits.
    """
    inputs = tokenizer(text, max_length=512, padding=True, truncation=True, return_tensors='pt')
    # Inference only: disable autograd so no computation graph is built.
    with torch.no_grad():
        outputs = model(**inputs)
    # Softmax is monotonic, so the argmax of the raw logits picks the same class.
    predicted = torch.argmax(outputs.logits, dim=1).numpy()
    return predicted

In [None]:
# Classify every raw text. predict() returns an array per call, so only its
# first element is stored in the column.
df['sentiment'] = df['Текст'].apply(lambda text: predict(text)[0])

In [None]:
# Imported here because this cell is the notebook's only use of Plotly Express.
import plotly.express as px

# Number of texts per predicted class.
sentiment_counts = df.groupby('sentiment').size().reset_index(name='count')
# Plotly Express applies discrete colours only to non-numeric columns, so the
# integer class ids are converted to strings.
sentiment_counts['sentiment'] = sentiment_counts['sentiment'].astype(str)

# An explicit map keeps each class on its colour even when a class is absent.
fig = px.bar(sentiment_counts,
             x='sentiment', y='count', color='sentiment',
             color_discrete_map={'0': 'blue', '1': 'green', '2': 'red'})
fig.show()

# Тематическое моделирование

In [None]:
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic

In [None]:
# Sentence-embedding model that is passed to BERTopic as `embedding_model`.
embedding_model = SentenceTransformer('intfloat/multilingual-e5-large')

In [None]:
# Topic model configured to use the sentence-embedding model created earlier.
topic_model = BERTopic(
    embedding_model=embedding_model,
    nr_topics='auto',
    n_gram_range=(1, 2),
    calculate_probabilities=False,
    verbose=True,
)

In [None]:
# Documents for topic modelling. 'Lemma' holds strings, so a text that lost
# every word during cleaning is an empty string rather than NaN; both missing
# and blank entries are dropped.
docs = [doc for doc in df['Lemma'].dropna() if doc.strip()]

In [None]:
# Fit the topic model on the documents; fit_transform returns two values,
# kept here as `topics` and `probs`.
topics, probs = topic_model.fit_transform(docs)

In [None]:
# Display the topic summary produced by the fitted model.
topic_model.get_topic_info()

In [None]:
# Render the fitted model's bar-chart visualization of the topics.
topic_model.visualize_barchart()