In [1]:
# Attach Google Drive to the Colab runtime so that files under the mount
# point can be read through ordinary filesystem paths.
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
from pathlib import Path
import pandas as pd

# File name of the salary dataset and the Drive folder it is expected in.
SALARY_LEMMA_NAME = "hh_vacancies_salary_with_lemma.parquet"
DRIVE_DIR = Path("/content/drive/MyDrive/ColabNotebooks/FinalProject/ProcessedDatasets")

salary_path = DRIVE_DIR / SALARY_LEMMA_NAME

if salary_path.exists():
    # Preferred source: the copy stored on the mounted Drive.
    df_salary = pd.read_parquet(salary_path)
else:
    # Fallback: ask the user to upload the file through the Colab widget.
    from google.colab import files
    uploaded = files.upload()

    # NOTE(review): assumes `uploaded` is keyed by the names under which the
    # files were saved in the working directory -- confirm against Colab docs.
    if SALARY_LEMMA_NAME in uploaded:
        upload_name = SALARY_LEMMA_NAME
    elif len(uploaded) == 1:
        # A single upload under another name (e.g. renamed by the user or by
        # Colab) is unambiguous, so read it instead of failing or silently
        # picking up a stale local file with the expected name.
        upload_name = next(iter(uploaded))
    else:
        raise FileNotFoundError(
            f"Expected an uploaded file named {SALARY_LEMMA_NAME!r}, "
            f"got: {sorted(uploaded)}"
        )
    df_salary = pd.read_parquet(upload_name)

print("–†–∞–∑–º–µ—Ä –¥–∞—Ç–∞—Å–µ—Ç–∞:", df_salary.shape)

–†–∞–∑–º–µ—Ä –¥–∞—Ç–∞—Å–µ—Ç–∞: (14131, 31)


In [3]:
# Minimum length (in characters) of the lemmatized description for a vacancy
# to take part in clustering.
MIN_LEN = 200

# Work on a separate frame so that df_salary itself stays untouched; the
# helper column holds the character length of each lemmatized description.
df_cluster = df_salary.assign(
    desc_lemma_len=df_salary["description_lemma"].str.len()
)

# Keep only the rows whose description is long enough.
long_enough = df_cluster["desc_lemma_len"] >= MIN_LEN
df_cluster = df_cluster.loc[long_enough].copy()

length_summary = df_cluster["desc_lemma_len"].describe()
print(length_summary)
print("–†–∞–∑–º–µ—Ä –ø–æ—Å–ª–µ —Ñ–∏–ª—å—Ç—Ä–∞:", df_cluster.shape)

count    14006.000000
mean      1306.677924
std        691.152788
min        200.000000
25%        805.000000
50%       1204.500000
75%       1670.000000
max       8395.000000
Name: desc_lemma_len, dtype: float64
–†–∞–∑–º–µ—Ä –ø–æ—Å–ª–µ —Ñ–∏–ª—å—Ç—Ä–∞: (14006, 32)


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Extra stop words passed to the vectorizer on top of its df-based filtering.
# NOTE(review): the top terms printed for cluster 8 later in this notebook are
# dominated by pronouns and conjunctions, so a general-purpose stop-word list
# does not appear to be applied here -- consider extending this list.
custom_stopwords = [
    "–≥–æ–¥",
    "–∫–æ–º–ø–∞–Ω–∏—è",
    "—Ä–∞–±–æ—Ç–∞",
    "–æ–±—è–∑–∞–Ω–Ω–æ—Å—Ç—å",
    "—Ç—Ä–µ–±–æ–≤–∞–Ω–∏–µ",
    "—É–º–µ–Ω–∏–µ",
    "–Ω–∞–≤—ã–∫",
    "–æ–ø—ã—Ç",
    "–≤—ã—Å–æ–∫–∏–π",
    "–¥–æ—Ö–æ–¥",
    "–∑–∞—Ä–∞–±–æ—Ç–Ω—ã–π",
    "–∑–∞—Ä–ø–ª–∞—Ç–∞",
    "–æ–ø–ª–∞—Ç–∞",
    "–≤–æ–∑–Ω–∞–≥—Ä–∞–∂–¥–µ–Ω–∏–µ",
]

# Vectorizer settings gathered in one place: unigrams and bigrams, terms kept
# only if they occur in at least 20 documents and in at most 60% of them,
# sublinear term-frequency scaling.
tfidf_params = {
    "ngram_range": (1, 2),
    "min_df": 20,
    "max_df": 0.6,
    "sublinear_tf": True,
    "stop_words": custom_stopwords,
}
vectorizer = TfidfVectorizer(**tfidf_params)

# Fit on the lemmatized descriptions and keep the vocabulary as an array so
# that it can be indexed by feature position.
lemma_corpus = df_cluster["description_lemma"]
X = vectorizer.fit_transform(lemma_corpus)
terms = np.array(vectorizer.get_feature_names_out())

print("TF-IDF shape:", X.shape)

TF-IDF shape: (14006, 16058)


In [5]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import numpy as np

# Silhouette is scored on a random subsample of the TF-IDF rows rather than on
# the full matrix.
SAMPLE_SIZE = 3000

np.random.seed(42)
# np.random.choice(..., replace=False) raises ValueError when asked for more
# items than exist, so cap the request at the number of available rows.
n_rows = X.shape[0]
sample_idx = np.random.choice(n_rows, min(SAMPLE_SIZE, n_rows), replace=False)
X_sample = X[sample_idx]

# Candidate numbers of clusters and the silhouette obtained for each of them.
ks = [5, 6, 7, 8, 9, 10]
sil_scores = []

for k in ks:
    km = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = km.fit_predict(X_sample)
    sil = silhouette_score(X_sample, labels)
    sil_scores.append(sil)
    print(f"k={k}: silhouette={sil:.4f}")

# np.argmax returns the first maximum, so ties are resolved towards the
# smaller k.
# NOTE(review): in the recorded run every score is below 0.011 and the best
# one sits at the largest candidate, so the range may be too narrow and the
# choice of k should be treated as indicative only.
best_k = ks[int(np.argmax(sil_scores))]
print("\nBest k by silhouette:", best_k)

k=5: silhouette=0.0061
k=6: silhouette=0.0071
k=7: silhouette=0.0093
k=8: silhouette=0.0096
k=9: silhouette=0.0102
k=10: silhouette=0.0104

Best k by silhouette: 10


In [6]:
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd

# Final clustering of the full TF-IDF matrix with a fixed number of clusters.
k = 10
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
df_cluster["cluster"] = kmeans.fit_predict(X)

# Count cluster members once and reuse the result for both summary tables.
cluster_sizes = df_cluster["cluster"].value_counts().sort_index()

print("–†–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –ø–æ –∫–ª–∞—Å—Ç–µ—Ä–∞–º (n):")
display(cluster_sizes.to_frame("count"))

# For every centroid, feature indices ordered from the largest weight down.
terms = np.array(vectorizer.get_feature_names_out())
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]

# Number of highest-weighted terms reported per cluster.
TOP_N = 12

top_terms_per_cluster = [
    {
        "cluster": i,
        # .get(..., 0) keeps the row even if a label has no members.
        "count": int(cluster_sizes.get(i, 0)),
        "top_terms": ", ".join(terms[order_centroids[i, :TOP_N]]),
    }
    for i in range(k)
]

top_terms_df = pd.DataFrame(top_terms_per_cluster).sort_values("cluster")
print("\n–¢–æ–ø-—Ç–µ—Ä–º—ã –ø–æ –∫–ª–∞—Å—Ç–µ—Ä–∞–º:")
# Lift pandas' default column-width limit for this table only; otherwise the
# joined term list is cut off with "..." and most of the TOP_N terms are lost.
with pd.option_context("display.max_colwidth", None):
    display(top_terms_df)

# Salary statistics per cluster, highest median first.
cluster_salary = (df_cluster
                  .groupby("cluster")["salary_mid"]
                  .agg(count="count", mean="mean", median="median")
                  .sort_values("median", ascending=False))

print("\n–ó–∞—Ä–ø–ª–∞—Ç—ã –ø–æ –∫–ª–∞—Å—Ç–µ—Ä–∞–º (sorted by median):")
display(cluster_salary)

–†–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –ø–æ –∫–ª–∞—Å—Ç–µ—Ä–∞–º (n):


Unnamed: 0_level_0,count
cluster,Unnamed: 1_level_1
0,1506
1,1247
2,2701
3,2094
4,1326
5,1767
6,114
7,838
8,1931
9,482



–¢–æ–ø-—Ç–µ—Ä–º—ã –ø–æ –∫–ª–∞—Å—Ç–µ—Ä–∞–º:


Unnamed: 0,cluster,count,top_terms
0,0,1506,"–ø—Ä–æ–¥–∞–∂–∞, –∫–ª–∏–µ–Ω—Ç, –ø–µ—Ä–µ–≥–æ–≤–æ—Ä—ã, –±–∞–∑–∞, –º–µ–Ω–µ–¥–∂–µ—Ä, –∫..."
1,1,1247,"–æ–±—ä–µ–∫—Ç, —Ç–µ—Ö–Ω–∏—á–µ—Å–∫–∏–π, –¥–æ–∫—É–º–µ–Ω—Ç–∞—Ü–∏—è, —Å—Ç—Ä–æ–∏—Ç–µ–ª—å–Ω—ã..."
2,2,2701,"–¥–µ–Ω—å, –∞–≤—Ç–æ–º–æ–±–∏–ª—å, –º–µ—Å—Ç–æ, –º–µ—Å—è—Ü, –ø–ª–∞—Ç–∞, —Ä–∞–±–æ—á–∏–π..."
3,3,2094,"–¥–æ–∫—É–º–µ–Ω—Ç, –ø–æ–¥–≥–æ—Ç–æ–≤–∫–∞, –æ—Ä–≥–∞–Ω–∏–∑–∞—Ü–∏—è, –≤–µ–¥–µ–Ω–∏–µ, –∑–Ω..."
4,4,1326,"—Å–º–µ–Ω–∞, –≥–æ—Å—Ç—å, –∑–∞–∫–∞–∑, —á–∞—Å, –ø–∏—Ç–∞–Ω–∏–µ, –±–µ—Å–ø–ª–∞—Ç–Ω—ã–π,..."
5,5,1767,"—Ä–∞–∑—Ä–∞–±–æ—Ç–∫–∞, –ø—Ä–æ–µ–∫—Ç, –∫–æ–º–∞–Ω–¥–∞, –ø–æ–Ω–∏–º–∞–Ω–∏–µ, –∑–∞–¥–∞—á–∞..."
6,6,114,"–Ω–µ–ø–æ–ª–Ω—ã–π —Ä–∞–±–æ—á–∏–π, –Ω–µ–ø–æ–ª–Ω—ã–π, –æ—Ç–¥—ã—Ö, –ø—Ä–æ–¥–∞–≤–µ—Ü –∫–∞..."
7,7,838,"–ø–∞—Ü–∏–µ–Ω—Ç, –º–µ–¥–∏—Ü–∏–Ω—Å–∫–∏–π, –∫–ª–∏–Ω–∏–∫–∞, —Å–ø–µ—Ü–∏–∞–ª—å–Ω–æ—Å—Ç—å, ..."
8,8,1931,"–Ω–∞—à, –∫–ª–∏–µ–Ω—Ç, –º—ã, —á—Ç–æ, –∫–æ–º–∞–Ω–¥–∞, –∫–æ—Ç–æ—Ä—ã–π, –µ—Å–ª–∏, ..."
9,9,482,"–±—É—Ö–≥–∞–ª—Ç–µ—Ä—Å–∫–∏–π, —É—á—ë—Ç, –±—É—Ö–≥–∞–ª—Ç–µ—Ä, –Ω–∞–ª–æ–≥–æ–≤—ã–π, –¥–æ–∫..."



–ó–∞—Ä–ø–ª–∞—Ç—ã –ø–æ –∫–ª–∞—Å—Ç–µ—Ä–∞–º (sorted by median):


Unnamed: 0_level_0,count,mean,median
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1247,158290.852322,150000.0
0,1506,160123.140491,147247.5
5,1767,158049.614233,140000.0
9,482,130828.533548,120000.0
4,1326,115450.352123,100000.0
3,2094,109669.195826,100000.0
8,1931,117727.34457,100000.0
7,838,118111.232757,100000.0
2,2701,111811.004235,95700.0
6,114,87245.755921,78300.0


In [7]:
# Lower and upper quartiles of the mid-point salary; they serve as the
# boundaries of the low / mid / high salary groups.
salary_mid = df_cluster["salary_mid"]
q25, q75 = (salary_mid.quantile(level) for level in (0.25, 0.75))

print("Q25:", q25)
print("Q75:", q75)

def salary_group_func(x, low=None, high=None):
    """Classify a salary value as "low", "mid" or "high".

    Parameters
    ----------
    x : number
        Salary value to classify.
    low, high : number, optional
        Group boundaries. When omitted, the module-level ``q25`` and ``q75``
        are used, which keeps the original one-argument call working.

    Returns
    -------
    str or float
        "low" if ``x <= low``, "high" if ``x >= high``, otherwise "mid".
        A missing salary yields NaN instead of being silently counted as
        "mid" (both comparisons are False for NaN).
    """
    if pd.isna(x):
        return float("nan")

    low = q25 if low is None else low
    high = q75 if high is None else high

    # The "low" check comes first, so a value equal to both boundaries
    # is reported as "low".
    if x <= low:
        return "low"
    if x >= high:
        return "high"
    return "mid"

# Label every vacancy with its salary group and report the group sizes.
salary_groups = df_cluster["salary_mid"].map(salary_group_func)
df_cluster["salary_group"] = salary_groups

group_sizes = df_cluster["salary_group"].value_counts()
print(group_sizes)

Q25: 80000.0
Q75: 152250.0
salary_group
mid     6912
low     3559
high    3535
Name: count, dtype: int64


In [8]:
# Share of each salary group inside every cluster (each row sums to 1).
group_share_by_cluster = pd.crosstab(
    index=df_cluster["cluster"],
    columns=df_cluster["salary_group"],
    normalize="index",
)
group_share_by_cluster.round(3)

salary_group,high,low,mid
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.422,0.108,0.471
1,0.456,0.067,0.477
2,0.165,0.34,0.495
3,0.134,0.301,0.565
4,0.176,0.287,0.537
5,0.418,0.178,0.405
6,0.0,0.658,0.342
7,0.198,0.284,0.518
8,0.186,0.356,0.458
9,0.222,0.147,0.631


**Методика**

Для выявления естественной структуры вакансий была проведена кластеризация текстов описаний без использования информации о заработной плате (unsupervised learning).

В качестве текстового представления использовалась TF-IDF-векторизация лемматизированных описаний вакансий (униграммы и биграммы).
Кластеризация выполнена методом KMeans (k = 10, выбран на основе анализа silhouette score). Следует учитывать, что значения silhouette очень малы (0,006–0,010) и максимум достигается на верхней границе проверенного диапазона k = 5…10, поэтому выбор k = 10 носит ориентировочный характер.

Перед кластеризацией были исключены вакансии с длиной лемматизированного текста менее 200 символов (менее 1% наблюдений).

**Интерпретация кластеров**

Анализ топ-термов центроидов позволил выделить следующие типы вакансий:

🔹 Кластер 1 — Строительно-технические позиции

Характерные термины: объект, технический, документация, строительный.
Медианная зарплата: 150 000 руб.
Кластер характеризуется высокой долей вакансий с высоким уровнем оплаты труда.

🔹 Кластер 0 — Коммерческие позиции / продажи

Термины: продажа, клиент, переговоры, менеджер.
Медианная зарплата: 147 000 руб.
Высокая концентрация высокооплачиваемых вакансий.

🔹 Кластер 5 — Проектная деятельность / IT

Термины: разработка, проект, команда, задача.
Медианная зарплата: 140 000 руб.

🔹 Кластер 9 — Бухгалтерия и учёт

Термины: бухгалтерский, учёт, налоговый.
Медианная зарплата: 120 000 руб.

🔹 Кластер 4 — Сфера обслуживания / общепит

Термины: смена, гость, заказ, питание.
Медианная зарплата: 100 000 руб.

🔹 Кластер 3 — Административные функции

Термины: документ, подготовка, организация.
Медианная зарплата: 100 000 руб.

🔹 Кластер 8 — Вакансии с маркетингово-ориентированным стилем описания

Термины: наш, мы, ты, человек, работать.
Медианная зарплата: 100 000 руб.

Данный кластер объединяет вакансии, в описании которых преобладают общие и коммуникативные формулировки, ориентированные на привлечение кандидатов, а не на детальное описание профессиональных обязанностей.


🔹 Кластер 7 — Медицинские позиции

Термины: пациент, медицинский, клиника.
Медианная зарплата: 100 000 руб.

🔹 Кластер 2 — Рабочие позиции / логистика

Термины: день, автомобиль, место, месяц.
Медианная зарплата: 95 700 руб.

🔹 Кластер 6 — Частичная занятость / продавцы

Термины: неполный рабочий, продавец.
Медианная зарплата: 78 300 руб.
Около двух третей вакансий кластера (65,8 %) относятся к группе с низким уровнем оплаты труда, вакансии с высоким уровнем оплаты отсутствуют.

**Связь кластеров с уровнем заработной платы**

Несмотря на то что кластеризация проводилась без использования переменной заработной платы, полученные кластеры демонстрируют различия в распределении уровней оплаты труда.

Кластеры строительных, коммерческих и проектных вакансий характеризуются наибольшими медианными значениями зарплаты и высокой долей high-вакансий.

Напротив, в кластере частичной занятости около двух третей вакансий (65,8 %) относятся к группе low, а high-вакансии отсутствуют.

**Связь с результатами NLP-анализа**

Результаты кластеризации согласуются с ранее проведённым NLP-анализом:

В высокооплачиваемых кластерах доминируют управленческие, проектные и коммерческие термины (управление, продажи, проект, строительство).

В низкооплачиваемых кластерах преобладают операционные и сервисные функции (кассовые операции, сменная работа, обслуживание).

Таким образом, различия, выявленные при анализе отдельных текстовых признаков (TF-IDF и χ²), подтверждаются на уровне естественной структуры вакансий.

Это свидетельствует о согласованности результатов разных методов анализа и устойчивости выявленных закономерностей.

In [9]:
# Show the per-cluster top terms again, this time without pandas' default
# column-width truncation so that the whole `top_terms` string is readable
# instead of being cut off with "...".
with pd.option_context("display.max_colwidth", None):
    display(top_terms_df.head(10))

Unnamed: 0,cluster,count,top_terms
0,0,1506,"–ø—Ä–æ–¥–∞–∂–∞, –∫–ª–∏–µ–Ω—Ç, –ø–µ—Ä–µ–≥–æ–≤–æ—Ä—ã, –±–∞–∑–∞, –º–µ–Ω–µ–¥–∂–µ—Ä, –∫..."
1,1,1247,"–æ–±—ä–µ–∫—Ç, —Ç–µ—Ö–Ω–∏—á–µ—Å–∫–∏–π, –¥–æ–∫—É–º–µ–Ω—Ç–∞—Ü–∏—è, —Å—Ç—Ä–æ–∏—Ç–µ–ª—å–Ω—ã..."
2,2,2701,"–¥–µ–Ω—å, –∞–≤—Ç–æ–º–æ–±–∏–ª—å, –º–µ—Å—Ç–æ, –º–µ—Å—è—Ü, –ø–ª–∞—Ç–∞, —Ä–∞–±–æ—á–∏–π..."
3,3,2094,"–¥–æ–∫—É–º–µ–Ω—Ç, –ø–æ–¥–≥–æ—Ç–æ–≤–∫–∞, –æ—Ä–≥–∞–Ω–∏–∑–∞—Ü–∏—è, –≤–µ–¥–µ–Ω–∏–µ, –∑–Ω..."
4,4,1326,"—Å–º–µ–Ω–∞, –≥–æ—Å—Ç—å, –∑–∞–∫–∞–∑, —á–∞—Å, –ø–∏—Ç–∞–Ω–∏–µ, –±–µ—Å–ø–ª–∞—Ç–Ω—ã–π,..."
5,5,1767,"—Ä–∞–∑—Ä–∞–±–æ—Ç–∫–∞, –ø—Ä–æ–µ–∫—Ç, –∫–æ–º–∞–Ω–¥–∞, –ø–æ–Ω–∏–º–∞–Ω–∏–µ, –∑–∞–¥–∞—á–∞..."
6,6,114,"–Ω–µ–ø–æ–ª–Ω—ã–π —Ä–∞–±–æ—á–∏–π, –Ω–µ–ø–æ–ª–Ω—ã–π, –æ—Ç–¥—ã—Ö, –ø—Ä–æ–¥–∞–≤–µ—Ü –∫–∞..."
7,7,838,"–ø–∞—Ü–∏–µ–Ω—Ç, –º–µ–¥–∏—Ü–∏–Ω—Å–∫–∏–π, –∫–ª–∏–Ω–∏–∫–∞, —Å–ø–µ—Ü–∏–∞–ª—å–Ω–æ—Å—Ç—å, ..."
8,8,1931,"–Ω–∞—à, –∫–ª–∏–µ–Ω—Ç, –º—ã, —á—Ç–æ, –∫–æ–º–∞–Ω–¥–∞, –∫–æ—Ç–æ—Ä—ã–π, –µ—Å–ª–∏, ..."
9,9,482,"–±—É—Ö–≥–∞–ª—Ç–µ—Ä—Å–∫–∏–π, —É—á—ë—Ç, –±—É—Ö–≥–∞–ª—Ç–µ—Ä, –Ω–∞–ª–æ–≥–æ–≤—ã–π, –¥–æ–∫..."
