In [1]:
import os
import warnings

import pandas as pd
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import pairwise_distances

# Suppress every warning category for the rest of the notebook session.
warnings.filterwarnings(action="ignore")

# Directory the input CSVs are read from, and directory the submission is written to.
RAW_DATA_PATH = '../data/raw/'
SUBMISSIONS_PATH = '../submissions/'

# Create the output directory up front; exist_ok makes re-running this cell harmless.
os.makedirs(name=SUBMISSIONS_PATH, exist_ok=True)

In [2]:
# Load the train and test tables from RAW_DATA_PATH.
# On a missing file the hint is printed and the exception is re-raised, so the
# notebook stops here instead of failing later with a NameError on
# train_df / test_df.
try:
    train_df = pd.read_csv(os.path.join(RAW_DATA_PATH, 'train_set.csv'))
    test_df = pd.read_csv(os.path.join(RAW_DATA_PATH, 'test_set.csv'))
except FileNotFoundError:
    print("Ошибка: Убедитесь, что файлы train_set.csv и test_set.csv находятся в папке ../data/raw/")
    raise
else:
    # Only reached when both files were read successfully.
    print(f"Размер тренировочных данных: {train_df.shape}")
    print(f"Размер тестовых данных: {test_df.shape}")

Размер тренировочных данных: (2064, 2)
Размер тестовых данных: (888, 1)


In [3]:
# Read behavior.csv from the raw-data directory into a DataFrame.
behavior_csv = os.path.join(RAW_DATA_PATH, 'behavior.csv')
behavior_df = pd.read_csv(behavior_csv)

In [4]:
# Sanity check: for the test split, then the train split, print how many
# distinct Pubchem_ID values also occur in behavior_df['Stimulus'].
stimuli = behavior_df['Stimulus']
for split_df in (test_df, train_df):
    common_values = set(split_df['Pubchem_ID']).intersection(stimuli)
    print(len(common_values))

888
2064


In [5]:
# Build the boolean feature matrix from every column except the identifier.
# 'taste_cluster' is dropped too (when present) so that re-running this cell
# does not feed previously assigned labels back in as a feature.
df = behavior_df.drop(columns=['Stimulus', 'taste_cluster'], errors='ignore').astype(bool)

# Pairwise Jaccard distances between the rows of the boolean matrix.
distances = pairwise_distances(df.to_numpy(), metric="jaccard")

# Clustering.
# `distances` is already a distance matrix, so the estimator must be told it is
# precomputed; otherwise each row of distances is treated as a feature vector
# and compared with the default Euclidean metric.
try:
    agg_model = AgglomerativeClustering(n_clusters=8, linkage='average', metric='precomputed')
except TypeError:
    # Older scikit-learn releases (before 1.2) call this parameter `affinity`.
    agg_model = AgglomerativeClustering(n_clusters=8, linkage='average', affinity='precomputed')
labels = agg_model.fit_predict(distances)

# Attach the cluster labels to behavior_df.
behavior_df["taste_cluster"] = labels

In [6]:
# Select the behavior rows whose Stimulus appears in the test set and arrange
# them in the same order as test_df['Pubchem_ID'].
order = test_df['Pubchem_ID']
in_test = behavior_df['Stimulus'].isin(order)
y_new = (
    behavior_df.loc[in_test]
    .set_index('Stimulus')
    .loc[order]
    .reset_index()
)


In [7]:
# Save the result: write the cluster label column (with header, without the
# index) to solution.csv in the submissions directory.
solution_file = os.path.join(SUBMISSIONS_PATH, 'solution.csv')
y = y_new["taste_cluster"]
y.to_csv(solution_file, index=False)