In [20]:
import os
import warnings
import pandas as pd
import numpy as np
import umap

from sklearn.metrics import pairwise_distances
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

warnings.filterwarnings("ignore")

RAW_DATA_PATH = '../data/raw/'
SUBMISSIONS_PATH = '../submissions/'

os.makedirs(SUBMISSIONS_PATH, exist_ok=True)

## Настройка и Загрузка

In [21]:
try:
    train_df = pd.read_csv(os.path.join(RAW_DATA_PATH, 'train_set.csv'))
    test_df = pd.read_csv(os.path.join(RAW_DATA_PATH, 'test_set.csv'))
    print(f"Размер тренировочных данных: {train_df.shape}")
    print(f"Размер тестовых данных: {test_df.shape}")
except FileNotFoundError:
    print("Ошибка: Убедитесь, что файлы train_set.csv и test_set.csv находятся в папке ../data/raw/")

Размер тренировочных данных: (2064, 2)
Размер тестовых данных: (888, 1)


In [22]:
url = "https://raw.githubusercontent.com/pyrfume/pyrfume-data/refs/heads/main/leffingwell/behavior.csv"
save_dir = os.path.join(RAW_DATA_PATH)

os.system(f"wget -O {save_dir}/behavior.csv {url}")

sh: wget: command not found


32512

In [23]:
behavior_df  = pd.read_csv(os.path.join(RAW_DATA_PATH, 'behavior.csv'))

In [24]:
common_values = set(test_df['Pubchem_ID']).intersection(behavior_df['Stimulus'])
print(len(common_values))
common_values = set(train_df['Pubchem_ID']).intersection(behavior_df['Stimulus'])
print(len(common_values))

888
2064


In [25]:
train_df = behavior_df[behavior_df['Stimulus'].isin(train_df['Pubchem_ID'])]
train_df = train_df.drop(columns=['Stimulus'])
test_df = behavior_df[behavior_df['Stimulus'].isin(test_df['Pubchem_ID'])]
test_df = test_df.drop(columns=['Stimulus'])

In [26]:
df = pd.concat([train_df, test_df], ignore_index=True)

In [27]:
X = df.select_dtypes(include=["number"])

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# PCA на 3 компоненты
pca_3 = PCA(n_components=3, random_state=42)
X_pca_3 = pca_3.fit_transform(X_scaled)

kmeans_3 = KMeans(n_clusters=8, random_state=42, n_init=10)
labels_3d = kmeans_3.fit_predict(X_pca_3)

# PCA на 2 компоненты
pca_2 = PCA(n_components=2, random_state=42)
X_pca_2 = pca_2.fit_transform(X_scaled)

kmeans_2 = KMeans(n_clusters=8, random_state=42, n_init=10)
labels_2d = kmeans_2.fit_predict(X_pca_2)

In [28]:
df["taste_cluster"] = labels_3d

# Сохраняем результат
y = df["taste_cluster"].iloc[2064:]
y.to_csv(os.path.join(SUBMISSIONS_PATH, 'submission_pca3.csv'), index=False)

In [29]:
df["taste_cluster"] = labels_2d

# Сохраняем результат
y = df["taste_cluster"].iloc[2064:]
y.to_csv(os.path.join(SUBMISSIONS_PATH, 'submission_pca2.csv'), index=False)

### t-sne

In [30]:
# t-SNE на 3 компоненты
tsne_3 = TSNE(
    n_components=3,
    random_state=42,
    init="random",
    learning_rate="auto",
    perplexity=30,
)
X_tsne_3 = tsne_3.fit_transform(X_scaled)

kmeans_tsne3 = KMeans(n_clusters=8, random_state=42, n_init=10)
labels_tsne_3d = kmeans_tsne3.fit_predict(X_tsne_3)

# t-SNE на 2 компоненты
tsne_2 = TSNE(
    n_components=2,
    random_state=42,
    init="random",
    learning_rate="auto",
    perplexity=30,
)
X_tsne_2 = tsne_2.fit_transform(X_scaled)

kmeans_tsne2 = KMeans(n_clusters=8, random_state=42, n_init=10)
labels_tsne_2d = kmeans_tsne2.fit_predict(X_tsne_2)

In [31]:
# Сохраняем результат
df["taste_cluster"] = labels_tsne_3d
y = df["taste_cluster"].iloc[2064:]
y.to_csv(os.path.join(SUBMISSIONS_PATH, "submission_tsne3.csv"), index=False)

In [32]:
# Сохраняем результат
df["taste_cluster"] = labels_tsne_2d
y = df["taste_cluster"].iloc[2064:]
y.to_csv(os.path.join(SUBMISSIONS_PATH, "submission_tsne2.csv"), index=False)

### umap

In [33]:
# UMAP на 3 компоненты
umap_3 = umap.UMAP(
    n_components=3,
    n_neighbors=30,
    min_dist=0.1,
    metric="euclidean",
    random_state=42,
)
X_umap_3 = umap_3.fit_transform(X_scaled)

kmeans_umap3 = KMeans(n_clusters=8, random_state=42, n_init=10)
labels_umap_3d = kmeans_umap3.fit_predict(X_umap_3)

# UMAP на 2 компоненты
umap_2 = umap.UMAP(
    n_components=2,
    n_neighbors=30,
    min_dist=0.1,
    metric="euclidean",
    random_state=42,
)
X_umap_2 = umap_2.fit_transform(X_scaled)

kmeans_umap2 = KMeans(n_clusters=8, random_state=42, n_init=10)
labels_umap_2d = kmeans_umap2.fit_predict(X_umap_2)

In [34]:
# Сохраняем результат
df["taste_cluster"] = labels_umap_3d
y = df["taste_cluster"].iloc[2064:]
y.to_csv(os.path.join(SUBMISSIONS_PATH, "submission_umap3.csv"), index=False)

In [35]:
# Сохраняем результат
df["taste_cluster"] = labels_umap_2d
y = df["taste_cluster"].iloc[2064:]
y.to_csv(os.path.join(SUBMISSIONS_PATH, "submission_umap2.csv"), index=False)

### jaccard

In [36]:
X = train_df
X_binary = X.astype(bool)
y = test_df
y_binary = y.astype(bool)

df = pd.concat([X_binary, y_binary], ignore_index=True)

In [37]:
distances = pairwise_distances(df.to_numpy(), metric="jaccard")

# Кластеризация
agg_model = AgglomerativeClustering(n_clusters=8, linkage='average', distance_threshold=None)
labels = agg_model.fit_predict(distances)

In [38]:
# Добавим метки кластеров к df
df["taste_cluster"] = labels

# Сохраняем результат
y = df["taste_cluster"].iloc[2064:]
y.to_csv(os.path.join(SUBMISSIONS_PATH, 'submission_j.csv'), index=False)