In [None]:
# Sync the project environment with uv, including the `notebooks` dependency group.
!uv sync --group notebooks

In [None]:
import csv

import pandas as pd

# Load the dataset. The quoting mode is given by its named constant
# (csv.QUOTE_ALL == 1) instead of a bare integer.
df = pd.read_csv('data/data.csv', quoting=csv.QUOTE_ALL)
df.head()

In [None]:
# Structural overview: dimensions, null counts per column, and a sample of rows.
print("DataFrame shape:", df.shape)
missing_per_column = df.isnull().sum()
print("\nMissing values per column:\n", missing_per_column)
print("\nFirst 5 rows:")
print(df.head())

# Summary of the `text` column: distinct values, then character-length statistics.
print("\nBasic statistics for text column:")
print("Number of unique texts:", df['text'].nunique())
# Character length of each text, computed once and reused for all three statistics.
char_lengths = df['text'].str.len()
print("Average text length:", char_lengths.mean())
print("Max text length:", char_lengths.max())
print("Min text length:", char_lengths.min())


In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import nltk
# Imported under an alias so the corpus reader is not shadowed by the
# `stopwords` variable assigned below.
from nltk.corpus import stopwords as nltk_stopwords

# Fetch the stopword corpus that the lookup below reads from.
nltk.download('stopwords')

# Russian stopwords, kept under the name `stopwords` because later cells
# (word cloud, word-frequency count) refer to it by that name.
stopwords = nltk_stopwords.words("russian")

In [None]:
# Merge every non-null text into one string, then normalise it. Note that the
# replace() targets the literal two-character sequence backslash + "n", not an
# actual newline character.
joined_texts = ' '.join(df['text'].dropna().astype(str))
text = joined_texts.replace('\\n', ' ').lower()

# Configure the cloud first, then render it from the merged text.
cloud_builder = WordCloud(
    width=800,
    height=400,
    background_color='white',
    collocations=False,
    stopwords=stopwords,
)
wordcloud = cloud_builder.generate(text)

# Show the rendered cloud as an image without axes.
plt.imshow(wordcloud)
plt.axis('off')
plt.show()


In [None]:
# Non-null texts coerced to str so that both the `.str` accessor and
# `' '.join` always operate on strings.
non_null_texts = df['text'].dropna().astype(str)

# Total tokens: whitespace-split each text and sum the per-text token counts.
total_words = non_null_texts.str.split().str.len().sum()
print("Total number of words:", total_words)

# Vocabulary: lowercase the joined corpus so case variants count as one word.
all_words = ' '.join(non_null_texts).lower().split()
unique_words = set(all_words)
print("Number of unique words:", len(unique_words))

In [None]:
from collections import Counter


# A set gives constant-time membership checks; testing each word against the
# original `stopwords` sequence would rescan it for every token.
stopword_set = set(stopwords)

# Keep purely alphabetic tokens that are not stopwords.
filtered_words = [word for word in all_words if word not in stopword_set and word.isalpha()]
word_counts = Counter(filtered_words)
print("Top 10 most common words (excluding stopwords):")
for word, count in word_counts.most_common(10):
    print(f"{word}: {count}")

In [None]:
# Number of whitespace-separated words in each non-null text. astype(str)
# keeps the `.str` accessor valid for any non-null value in the column.
text_lengths = df['text'].dropna().astype(str).str.split().str.len()
print("Average text length (in words):", text_lengths.mean())
print("Max text length (in words):", text_lengths.max())
print("Min text length (in words):", text_lengths.min())