In [1]:
import pandas as pd
import spacy 
import nltk
nltk.download('punkt')
nlp = spacy.load('ru_core_news_sm')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\misha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
namesTeachers = pd.read_csv('data/namesTeachers.csv')
namesStudents = pd.read_csv('data/namesStudents.csv')
namesCombined = pd.read_csv('data/namesCombined.csv')

In [3]:
namesTeachers.head(5)

Unnamed: 0,Names,Meaning,Vectors
0,Юлия,Учтивая,
1,Эльмира,Принцесса,
2,Светлана,Светлая,
3,Екатерина,Чистая,
4,Виктория,Завоевательница,


In [4]:
namesStudents.head(5)
namesStudents.info

<bound method DataFrame.info of            Names      Meaning  Vectors
0          Мукам      Мелодия      NaN
1           Юлия      Учтивая      NaN
2         Полина    Маленькая      NaN
3       Ангелина   Ангельская      NaN
4     Александра    Защитница      NaN
...          ...          ...      ...
1466   Анастасия  Воскрешение      NaN
1467        Юлия      Учтивая      NaN
1468   Екатерина       Чистая      NaN
1469       Ирина          Мир      NaN
1470       Елена      Зеленая      NaN

[1471 rows x 3 columns]>

In [5]:
namesCombined.head(5)
namesCombined.info

<bound method DataFrame.info of           Names          Meaning  Vectors
0         Юлия           Учтивая      NaN
1       Эльмира        Принцесса      NaN
2      Светлана          Светлая      NaN
3     Екатерина           Чистая      NaN
4      Виктория  Завоевательница      NaN
...         ...              ...      ...
1597  Анастасия      Воскрешение      NaN
1598       Юлия          Учтивая      NaN
1599  Екатерина           Чистая      NaN
1600      Ирина              Мир      NaN
1601      Елена          Зеленая      NaN

[1602 rows x 3 columns]>

In [6]:
def tokenize_data(data):
    # Применение токенизации к столбцу "Meaning"
    tokenized_meanings = []
    for text in data["Meaning"]:
        doc = nlp(text)
        tokens = [token.text for token in doc if not token.is_punct]  # Получение токенов без пунктуации
        tokenized_meanings.append(tokens)

    # Добавление токенизированных данных в DataFrame
    data["Tokenized_Meaning"] = tokenized_meanings
    return data

# Пример использования функции
colName = 'Meaning'
namesTeachers = tokenize_data(namesTeachers)
namesStudents = tokenize_data(namesStudents)
namesCombined = tokenize_data(namesCombined)
namesCombined.head(10)

Unnamed: 0,Names,Meaning,Vectors,Tokenized_Meaning
0,Юлия,Учтивая,,[Учтивая]
1,Эльмира,Принцесса,,[Принцесса]
2,Светлана,Светлая,,[Светлая]
3,Екатерина,Чистая,,[Чистая]
4,Виктория,Завоевательница,,[Завоевательница]
5,Инесса,Чистая,,[Чистая]
6,Галина,Спокойная,,[Спокойная]
7,Ануш,Дыхание утра,,"[Дыхание, утра]"
8,Татьяна,Повелительница,,[Повелительница]
9,Янина,Правительница,,[Правительница]


In [7]:
from gensim.models import Word2Vec
from gensim.models.keyedvectors import KeyedVectors
# Обучение моделей для каждого источника
model_students = Word2Vec(sentences=namesStudents['Tokenized_Meaning'].tolist(), vector_size=100, window=5, min_count=1, workers=4)
model_teachers = Word2Vec(sentences=namesTeachers['Tokenized_Meaning'].tolist(), vector_size=100, window=5, min_count=1, workers=4)
model_combined = Word2Vec(sentences=namesCombined['Tokenized_Meaning'].tolist(), vector_size=100, window=5, min_count=1, workers=4)

model_students.save("word2vec_students.model")
model_teachers.save("word2vec_teachers.model")
model_combined.save("word2vec_combined.model")


In [8]:
from gensim.models import Word2Vec
import numpy as np

def vectorize_meanings(data, model_path):
    vectorized_meanings = []
    # Загрузка ранее сохраненной модели Word2Vec
    model = Word2Vec.load(model_path)
    # Векторизация столбца "Tokenized_Meaning"
    for tokens in data["Tokenized_Meaning"]:
        vectors = [model.wv[token] for token in tokens if token in model.wv]
        mean_vector = np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)
        vectorized_meanings.append(mean_vector)
    # Добавление векторизованных значений обратно в DataFrame
    data["Vectors"] = vectorized_meanings
    return data

# Векторизация данных с использованием функции vectorize_meanings
model_students_path = "word2vec_students.model"
model_teachers_path = "word2vec_teachers.model"  
model_combined_path = "word2vec_combined.model"# Путь к обученной модели Word2Vec
namesStudents = vectorize_meanings(namesStudents, model_students_path)
namesTeachers = vectorize_meanings(namesTeachers, model_teachers_path)
namesCombined = vectorize_meanings(namesCombined, model_combined_path)


In [9]:
namesStudents.head(5)

Unnamed: 0,Names,Meaning,Vectors,Tokenized_Meaning
0,Мукам,Мелодия,"[0.00546654, 0.00061904313, -0.0034840333, -0....",[Мелодия]
1,Юлия,Учтивая,"[0.0076966463, 0.009120642, 0.0011355019, -0.0...",[Учтивая]
2,Полина,Маленькая,"[0.009770293, 0.008165114, 0.0012809718, 0.005...",[Маленькая]
3,Ангелина,Ангельская,"[-0.0049735666, -0.0012833046, 0.0032806373, -...",[Ангельская]
4,Александра,Защитница,"[-0.00950012, 0.009562223, -0.0077707553, -0.0...",[Защитница]


In [10]:
first_vector = namesTeachers['Vectors'].iloc[1]
print(first_vector)


[-1.5110135e-03 -4.0345038e-03 -4.3988526e-03 -4.6293857e-03
 -5.5957139e-03 -5.3018034e-03 -8.0223028e-03  9.5188450e-03
  6.3990653e-03 -3.6066938e-03  2.4784422e-03 -7.6424945e-03
  7.5231018e-03  8.3047356e-03  7.8943017e-04 -6.8329908e-03
 -2.9577280e-03  4.7339690e-03 -2.9393840e-03  3.1764733e-03
  9.4087161e-03  4.3531060e-03 -5.1454795e-03  5.4670931e-03
 -2.8791917e-03 -6.3939407e-03  7.0078014e-03 -9.2539331e-03
 -1.1306572e-03 -1.3853407e-03 -8.4159467e-03 -1.1023998e-03
  5.5972575e-03 -5.2676522e-03 -7.0261359e-03  6.2002684e-03
 -3.4264398e-03 -7.8195557e-03  2.9337048e-04 -4.9802064e-05
  7.5409068e-03  5.6930183e-04  7.8824591e-03 -9.8127816e-03
  4.0332675e-03  6.2181102e-03  1.0294437e-03 -2.8364109e-03
  1.0266149e-03 -8.3912135e-04 -8.1082694e-03 -6.7834724e-03
  1.2001550e-03 -2.0178736e-03  9.5494185e-03 -9.4942690e-04
  6.1186957e-03  2.0461380e-03  8.0996891e-03 -9.2905201e-03
  2.3927295e-03 -1.4512575e-03  2.1549999e-03 -2.1979929e-04
 -6.6406308e-03  9.05045

In [11]:
first_vector = namesCombined['Vectors'].iloc[1]
print(first_vector)


[-8.6070541e-03  1.4312613e-03 -7.8807659e-03  9.2400433e-03
  7.2043585e-03 -2.6581525e-03 -8.9664413e-03  9.2872204e-03
 -1.0426998e-04 -9.1319683e-04 -6.7897975e-03 -4.8669255e-03
 -9.4631743e-03  2.4907386e-03  5.9744096e-03  6.8250690e-03
  9.8563135e-03 -7.9682739e-03  6.4180134e-04  8.2653770e-03
 -9.0919184e-03 -2.1182585e-03  8.5125044e-03 -2.1128582e-03
  7.0351601e-04  3.9072083e-03 -3.3959866e-04  6.4460253e-03
  2.5460171e-03 -7.7768443e-03 -1.6071808e-03 -1.3810945e-03
  7.6249898e-03  4.7120657e-03  4.5867776e-03 -9.5034955e-04
  5.6848908e-03  9.5090810e-03 -5.3795958e-03  2.9081630e-03
 -6.6853403e-03 -7.1139717e-03 -3.2711958e-03 -7.8544701e-03
 -6.7872810e-03 -1.4167703e-03 -9.3766991e-03  1.3725758e-03
  8.2639614e-03 -1.0502088e-03 -1.0835599e-03  6.9091297e-03
  6.7362762e-03  2.2448457e-03 -2.1960770e-03  8.6231399e-03
 -7.2124777e-03  8.4788678e-03  1.3126064e-03  7.1367860e-04
  9.8879375e-03 -2.1394312e-03 -8.1892656e-03 -6.3853706e-03
 -7.5927987e-03  4.04898

In [12]:
first_vector = namesStudents['Tokenized_Meaning'].iloc[1239]
print(first_vector)


['Сошёл', 'с', 'Юпитера']


In [13]:
namesCombined.head(10)

Unnamed: 0,Names,Meaning,Vectors,Tokenized_Meaning
0,Юлия,Учтивая,"[-0.0019442164, -0.0052675214, 0.009447114, -0...",[Учтивая]
1,Эльмира,Принцесса,"[-0.008607054, 0.0014312613, -0.007880766, 0.0...",[Принцесса]
2,Светлана,Светлая,"[0.007088797, -0.00156793, 0.007947499, -0.009...",[Светлая]
3,Екатерина,Чистая,"[-0.0086196875, 0.003665738, 0.0051898835, 0.0...",[Чистая]
4,Виктория,Завоевательница,"[-0.0095785465, 0.008943115, 0.0041650687, 0.0...",[Завоевательница]
5,Инесса,Чистая,"[-0.0086196875, 0.003665738, 0.0051898835, 0.0...",[Чистая]
6,Галина,Спокойная,"[0.004244654, 0.000394243, -0.00090871094, -0....",[Спокойная]
7,Ануш,Дыхание утра,"[-0.005371564, 0.007841383, 0.0040823496, -0.0...","[Дыхание, утра]"
8,Татьяна,Повелительница,"[-0.0025087667, -0.0059026573, 0.007483337, -0...",[Повелительница]
9,Янина,Правительница,"[0.0033520197, -0.00897635, -0.00053438783, 0....",[Правительница]


In [14]:
import numpy as np
def k_means(data, k):
    # Инициализация центроид
    centroids = data[np.random.choice(range(len(data)), size=k)]

    while True:
        # Выделение кластеров
        clusters = [[] for i in range(k)]
        for point in data:
            distances = [np.linalg.norm(point - centroid) for centroid in centroids]
            closest_centroid_idx = np.argmin(distances)
            clusters[closest_centroid_idx].append(point)

        # Обновление центроид
        new_centroids = []
        for cluster in clusters:
            if len(cluster) > 0:
                cluster_mean = np.mean(cluster, axis=0)
                new_centroids.append(cluster_mean)

        # Проверка на сходимость
        if np.allclose(centroids, new_centroids):
            break

        centroids = new_centroids

    return centroids

# Пример использования
students = np.array(namesStudents['Vectors'].tolist(), dtype=float)
teachers = np.array(namesTeachers['Vectors'].tolist(), dtype=float)
combination = np.array(namesCombined['Vectors'].tolist(), dtype=float)
k = 1
centroids_students = k_means(students, k)
centroids_teachers = k_means(teachers, k)
centroids_comb = k_means(combination, k)
print("Центроида у студентов:", centroids_students)
print("Центроида у учителей: ", centroids_teachers)
print("Центроида у всех: ", centroids_comb)



Центроида у студентов: [array([-3.57256627e-04,  7.58760114e-04,  7.53282905e-04,  1.06350140e-03,
        9.87178239e-05, -1.14639357e-03,  1.12097246e-03,  2.10727382e-03,
       -1.40241626e-03, -1.29214997e-03,  3.82834053e-04, -1.44652387e-03,
       -3.41376784e-04,  1.02973940e-03,  1.01141442e-04, -1.25958698e-04,
        1.48190799e-03,  3.14938178e-04, -9.69087997e-04, -2.89476963e-03,
        8.24586211e-04,  2.89617856e-04,  1.85925882e-03, -8.06294053e-04,
       -1.00807949e-04,  6.52753859e-04, -1.32751019e-03,  4.39672375e-04,
       -7.67681176e-04,  4.54223988e-04,  1.02001795e-03, -5.64725244e-04,
        1.09050172e-03, -1.65787585e-03, -5.08439666e-04,  6.42973443e-04,
        7.25710581e-04, -1.25239368e-04, -5.61495195e-04, -7.61284969e-04,
        5.04561401e-04, -9.60627911e-04, -8.45880308e-04,  6.15924569e-04,
        6.20162482e-04, -5.06020216e-04, -9.69128091e-04, -5.93243777e-04,
        8.36332532e-04,  6.25366289e-04,  1.54863664e-04, -9.81642398e-04,
 

In [15]:
def find_closest_vector(centroid, vectors):
    min_distance = float('inf')
    closest_vector = None
    
    for vector in vectors:
        distance = np.linalg.norm(centroid - vector)
        
        if distance < min_distance:
            min_distance = distance
            closest_vector = vector
    
    return closest_vector

vectorS = namesStudents['Vectors']
vectorT = namesTeachers['Vectors']
vectorC = namesCombined['Vectors']
closest_vector_students = find_closest_vector(centroids_students, vectorS)
closest_vector_teachers = find_closest_vector(centroids_teachers, vectorT)
closest_vector_combined = find_closest_vector(centroids_comb, vectorC)
print("Ближайщий вектор у студентов:", closest_vector_students)
print("Ближайщий вектор у учителей:", closest_vector_teachers)
print("Ближайщий вектор у всех:", closest_vector_combined)




Ближайщий вектор у студентов: [ 1.8852683e-03  1.3845739e-03 -1.8771401e-03  3.2288644e-05
 -1.8332461e-03  1.8885025e-03  5.8439653e-04 -2.3180763e-03
 -1.1639193e-03 -4.5960888e-04  3.5643436e-03  3.8690236e-04
 -1.5926991e-03 -3.8353843e-04  1.6674935e-03 -9.6083869e-04
 -8.0266306e-03  4.2468985e-03 -1.4148630e-03 -2.8076179e-03
  1.0812014e-03  2.9134806e-04  2.6720138e-03 -4.1239453e-03
 -7.0065208e-04  3.8017230e-03  2.7091519e-03 -2.4825430e-03
  4.0513952e-03 -3.8173071e-03 -4.7735893e-03  1.1113222e-03
 -1.4934499e-03 -5.9871259e-04  4.0888172e-04  4.9043231e-04
  3.1391315e-05 -9.4218337e-04 -4.3934868e-03 -4.1284211e-04
  5.8218217e-03  4.1264337e-03 -4.2217001e-03  7.2057941e-03
  5.4597278e-04 -1.9657530e-03 -1.1869214e-03 -5.6416419e-04
  5.4984861e-03 -6.3793040e-03 -5.9794734e-04 -5.1321588e-03
  3.9271009e-03 -3.9579207e-03  9.9685777e-04 -6.3020381e-04
 -4.6649599e-03 -5.5697598e-03  2.0449497e-03  3.6952104e-03
  4.3618814e-03  4.2730826e-03 -4.9837673e-04 -6.984164

In [16]:
decodedStudent = closest_vector_students
similar_word = model_students.wv.most_similar(positive=[closest_vector_students], topn=1)
print('Ближайший токен:', similar_word[0][0])

Ближайший токен: с


In [17]:
decodedTeacher = closest_vector_teachers
similarW2 = model_teachers.wv.most_similar(positive=[closest_vector_teachers], topn=1)
print('Ближайший токен:', similarW2[0][0])

Ближайший токен: поток


In [18]:
decodedCombined = closest_vector_combined
similarW1 = model_combined.wv.most_similar(positive=[decodedCombined], topn=1)
print('Ближайший токен:',similarW1[0][0])

Ближайший токен: Бога


##### ФАМИЛИИ

In [19]:
import pandas as pd
import spacy 
import nltk
nltk.download('punkt')
nlp = spacy.load('ru_core_news_sm')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\misha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [20]:
surnameTeachers = pd.read_csv('data/Surnames_teachers.csv')
surnameStudents = pd.read_csv('data/Surnames_students.csv')
surnameStudents.head(5)

Unnamed: 0,surname,Meaning
0,Айдогдыев,благородный
1,Бомбина,жабка
2,Боссерт,медвежья смелость
3,Варченко,варить
4,Виницкая,от города Винница на Украине


In [21]:
import nltk
from nltk.tokenize import word_tokenize

# Предположим, что у вас есть столбец "Meaning" в виде списка строк
meanings = surnameStudents['Meaning']

# Цикл для токенизации каждого элемента в столбце "Meaning"
tokenized_meanings = []
for meaning in meanings:
    tokens = word_tokenize(meaning)
    tokenized_meanings.append(tokens)

surnameStudents['Tokenized_Meaning'] = tokenized_meanings
surnameStudents['Tokenized_Meaning'].head(10)



0                         [благородный]
1                               [жабка]
2                  [медвежья, смелость]
3                              [варить]
4    [от, города, Винница, на, Украине]
5                          [крестьянин]
6             [покровитель, земледелия]
7                          [земледелец]
8                             [здешний]
9                  [добрый, победитель]
Name: Tokenized_Meaning, dtype: object

In [22]:
import nltk
from nltk.tokenize import word_tokenize

# Предположим, что у вас есть столбец "Meaning" в виде списка строк
meanings_teachers = surnameTeachers['meaning']

# Цикл для токенизации каждого элемента в столбце "Meaning"
tokenized_meanings_teachers = []
for meaning in meanings_teachers:
    tokens = word_tokenize(meaning)
    tokenized_meanings_teachers.append(tokens)

surnameTeachers['Tokenized_Meaning'] = tokenized_meanings_teachers
surnameTeachers['Tokenized_Meaning'].head(10)

0    [аппаратчик, азотирования]
1                 [воспитанник]
2                  [счастливый]
3                  [земледелец]
4                      [кривой]
5                        [кожа]
6                  [божий, дар]
7                     [владыка]
8                [богоподобный]
9                       [мороз]
Name: Tokenized_Meaning, dtype: object

In [23]:
surnameTeachers.head(10
)

Unnamed: 0,surname,meaning,Tokenized_Meaning
0,Вербоватая,аппаратчик азотирования,"[аппаратчик, азотирования]"
1,Годованная,воспитанник,[воспитанник]
2,Евтушенко,счастливый,[счастливый]
3,Егорова,земледелец,[земледелец]
4,Кривкова,кривой,[кривой]
5,Кужелева,кожа,[кожа]
6,Матвеева,божий дар,"[божий, дар]"
7,Меликян,владыка,[владыка]
8,Михеева,богоподобный,[богоподобный]
9,Морозова,мороз,[мороз]


##### Векторизация

In [24]:
from gensim.models import Word2Vec
from gensim.models.keyedvectors import KeyedVectors
# Обучение моделей для каждого источника
model_students_surnames = Word2Vec(sentences=surnameStudents['Tokenized_Meaning'].tolist(), vector_size=100, window=5, min_count=1, workers=4)
model_students_surnames.save("word2vec_surnames_students.model")

model_teachers_surname = Word2Vec(sentences=surnameTeachers['Tokenized_Meaning'].to_list(), vector_size=100, window=5, min_count=1, workers=4)
model_teachers_surname.save("word2vec_surnames_teachers.model")

In [25]:
from gensim.models import Word2Vec
import numpy as np

def vectorize_meanings(data, model_path):
    vectorized_meanings = []
    # Загрузка ранее сохраненной модели Word2Vec
    model = Word2Vec.load(model_path)
    # Векторизация столбца "Tokenized_Meaning"
    for tokens in data["Tokenized_Meaning"]:
        vectors = [model.wv[token] for token in tokens if token in model.wv]
        mean_vector = np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)
        vectorized_meanings.append(mean_vector)
    # Добавление векторизованных значений обратно в DataFrame
    data["Vectors"] = vectorized_meanings
    return data

teachers_surname_path = 'word2vec_surnames_teachers.model'
students_surname_path = 'word2vec_surnames_students.model'


surnameStudents = vectorize_meanings(surnameStudents, students_surname_path)
namesTeachers = vectorize_meanings(surnameTeachers, teachers_surname_path)
surnameTeachers.head(5)

Unnamed: 0,surname,meaning,Tokenized_Meaning,Vectors
0,Вербоватая,аппаратчик азотирования,"[аппаратчик, азотирования]","[-0.004757017, 0.0001124572, -0.0012025877, -0..."
1,Годованная,воспитанник,[воспитанник],"[0.0051208558, -0.004418019, -0.009191075, -0...."
2,Евтушенко,счастливый,[счастливый],"[-0.002607931, -0.00791861, -0.008259738, -0.0..."
3,Егорова,земледелец,[земледелец],"[-0.0019442164, -0.0052675214, 0.009447114, -0..."
4,Кривкова,кривой,[кривой],"[-0.008216973, 0.00016809226, 0.0060254214, 0...."


In [37]:
model_teachers.wv.most_similar(positive=["счастливый"])

KeyError: "Key 'счастливый' not present in vocabulary"

In [26]:
import numpy as np
def k_means(data, k):
    # Инициализация центроид
    centroids = data[np.random.choice(range(len(data)), size=k)]

    while True:
        # Выделение кластеров
        clusters = [[] for i in range(k)]
        for point in data:
            distances = [np.linalg.norm(point - centroid) for centroid in centroids]
            closest_centroid_idx = np.argmin(distances)
            clusters[closest_centroid_idx].append(point)

        # Обновление центроид
        new_centroids = []
        for cluster in clusters:
            if len(cluster) > 0:
                cluster_mean = np.mean(cluster, axis=0)
                new_centroids.append(cluster_mean)

        # Проверка на сходимость
        if np.allclose(centroids, new_centroids):
            break

        centroids = new_centroids

    return centroids

# Пример использования
students1 = np.array(surnameStudents['Vectors'].tolist(), dtype=float)
teachers1 = np.array(surnameTeachers['Vectors'].tolist(), dtype=float)
k = 1
centroids_students_sur = k_means(students1, k)
centroids_teachers_sur = k_means(teachers1, k)
print("Центроида у студентов:", centroids_students_sur)
print("Центроида у учителей: ", centroids_teachers_sur)




Центроида у студентов: [array([-4.51964676e-04,  5.86731765e-04,  1.01382923e-04,  7.96587748e-05,
        2.47058234e-04, -5.23447045e-04,  2.90837915e-04,  9.03163417e-04,
       -3.11871573e-04, -1.59639476e-04, -2.15237705e-04, -3.97762795e-04,
       -1.55609020e-04,  1.63260241e-04,  2.68173349e-04, -3.34677220e-04,
        7.45379848e-05, -1.64389725e-04, -2.47225675e-04, -7.02821203e-04,
        1.66835729e-04, -5.19438929e-05,  4.52530163e-04, -1.03546355e-04,
       -3.83633787e-05,  8.70538766e-05, -1.84152194e-04, -3.43702786e-05,
       -2.83674648e-04,  1.58428983e-04,  4.82120402e-04, -2.34172924e-04,
       -6.14855453e-06, -2.80112125e-04, -6.54407890e-05,  3.70544079e-04,
        2.96064135e-04, -1.55389974e-04, -3.28034520e-04, -3.71452673e-04,
        4.21194259e-06, -2.89328574e-04, -2.42306191e-04,  2.88813389e-05,
        2.74195252e-04, -1.07156381e-04, -8.61241304e-05, -4.97721527e-05,
        9.95583331e-05,  2.47177891e-04,  2.66058829e-04, -2.93495414e-04,
 

In [27]:
def find_closest_vector(centroid, vectors):
    min_distance = float('inf')
    closest_vector = None
    
    for vector in vectors:
        distance = np.linalg.norm(centroid - vector)
        
        if distance < min_distance:
            min_distance = distance
            closest_vector = vector
    
    return closest_vector

vectorS1 = surnameStudents['Vectors']
vectorT1 = surnameTeachers['Vectors']
closest_vector_students1 = find_closest_vector(centroids_students_sur, vectorS1)
closest_vector_teachers1 = find_closest_vector(centroids_teachers_sur, vectorT1)

print("Ближайщий вектор у студентов:", closest_vector_students1)
print("Ближайщий вектор у учителей:", closest_vector_teachers1)

Ближайщий вектор у студентов: [ 2.3553551e-03  1.1626915e-03 -3.6295615e-03  1.1442337e-03
  4.6584569e-03  3.6700629e-03  8.8852248e-04 -1.2984941e-03
 -9.2636608e-04 -2.0936343e-03  1.5657144e-03 -1.9684085e-05
 -1.5179721e-03 -2.1724883e-03 -1.9294607e-03  2.0991545e-05
  6.5598986e-03 -1.0522529e-03  5.0850199e-03  7.1977184e-04
  1.8495995e-03 -5.8001222e-04 -1.3809062e-03 -5.1197992e-04
  3.1702599e-05  1.0014039e-03 -1.4178661e-03  1.7382982e-03
 -2.5255668e-03 -5.1470250e-03 -1.6338378e-04  2.6506044e-03
 -1.7472063e-03  1.0035366e-03  1.7927063e-03 -1.7643054e-03
 -2.2943076e-03 -3.2554136e-03  8.8437146e-04 -1.3533075e-03
 -1.9278240e-03  3.0517015e-03 -4.7010574e-03  2.2604910e-03
  1.0272432e-03  1.4358935e-03  5.2711335e-03  7.8808074e-04
 -3.3826584e-03 -1.7814713e-03 -9.8462286e-04 -2.0470731e-03
 -1.2928543e-03  5.4544094e-04  4.2808051e-03  1.9254361e-03
  3.8095280e-03  1.6135117e-04 -4.0099937e-03  4.0322990e-04
 -3.4930739e-03 -3.2376042e-03 -3.5305845e-03 -2.190613

In [28]:
decodedStudentSur = closest_vector_students1
similar_word_sur_s = model_students_surnames.wv.most_similar(positive=[closest_vector_students1], topn=1)
print('Ближайший токен:', similar_word_sur_s[0][0])

Ближайший токен: дуб


In [29]:
decodedTeacherSur = closest_vector_teachers1
similar_word_sur_t = model_teachers_surname.wv.most_similar(positive=[closest_vector_teachers1], topn=1)
print('Ближайший токен:', similar_word_sur_t[0][0])

Ближайший токен: охотник
