# Импорт библиотек

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from google.colab import drive

# Обработка датасета

In [None]:
# Mount Google Drive at /content/drive (Colab only); the CSV loaded in this
# notebook is read from beneath this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the album table from the mounted Drive into the shared `data` frame.
data = pd.read_csv("/content/drive/MyDrive/spotify_albums.csv")

In [None]:
# Display the last 15 rows of the loaded table.
data.tail(15)

In [None]:
# Print the column names, non-null counts and dtypes of the table.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   album_name           200 non-null    object 
 1   artist               200 non-null    object 
 2   date                 200 non-null    object 
 3   duration             200 non-null    float64
 4   amount_of_element    200 non-null    int64  
 5   genre                200 non-null    object 
 6   popularity           200 non-null    int64  
 7   amount_of_listeners  200 non-null    int64  
 8   type_of_album        200 non-null    object 
 9   featured_artists     25 non-null     object 
 10  artist_followers     200 non-null    int64  
 11  label                200 non-null    object 
dtypes: float64(1), int64(4), object(7)
memory usage: 18.9+ KB


# Метод на основе похожести

In [None]:

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# Columns whose values are concatenated into one text "document" per album.
feature_columns = [
    'artist', 'date', 'duration', 'amount_of_element', 'genre', 'popularity',
    'amount_of_listeners', 'type_of_album', 'featured_artists',
    'artist_followers', 'label',
]

# Build the documents in a standalone Series instead of writing a new column
# into `data`: later cells derive their feature matrix from `data` by dropping
# a fixed list of columns, so a helper string column would leak into them.
# Missing featured artists become an empty string rather than the literal
# token "nan", which would otherwise be shared by every album without features.
combined_features = (
    data[feature_columns]
    .fillna({'featured_artists': ''})
    .astype(str)
    .agg(' '.join, axis=1)
)

# TF-IDF representation of every album document.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(combined_features)

# Pairwise album-to-album cosine similarity (row i corresponds to row i of `data`).
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

def recommend_albums(album_title, top_n=5):
    """Return the names of the albums most similar to ``album_title``.

    Similarity is read from the module-level ``cosine_sim`` matrix, whose rows
    are expected to follow the row order of the module-level ``data`` frame.

    Args:
        album_title: Exact value of ``data['album_name']`` to look up. If the
            name occurs more than once, the first matching row is used.
        top_n: Maximum number of album names to return (default 5).

    Returns:
        A list of album names ordered from most to least similar, or the
        string "Альбом не найден." when the title is not present.
    """
    is_match = (data['album_name'] == album_title).tolist()
    if True not in is_match:
        return "Альбом не найден."

    # Row *position*, not index label: `cosine_sim` is a plain positional matrix,
    # so this keeps working when `data` has a non-contiguous index.
    position = is_match.index(True)
    scores = cosine_sim[position]

    # Exclude the queried album explicitly instead of assuming it sorts first;
    # with tied scores (e.g. identical feature text) it may not be first.
    candidates = [i for i in range(len(scores)) if i != position]
    # Stable sort: albums with equal scores keep their table order.
    candidates.sort(key=lambda i: scores[i], reverse=True)

    return data['album_name'].iloc[candidates[:top_n]].tolist()


# Demo: look up the albums most similar to 'HOP' and print their names.
recommended = recommend_albums(album_title='HOP')
print("Рекомендуемые альбомы:", recommended)


Рекомендуемые альбомы: ['ROCK-STAR', 'ATE', 'GIANT', 'rosie', 'Unorthodox Jukebox']


# KNN

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import hstack, csr_matrix
from sklearn.preprocessing import StandardScaler

# `df` is an alias of the shared album table (not a copy); rows of `features`
# below follow its row order.
df = data

# Join the categorical text columns into one document per album. The result is
# kept in a standalone Series rather than written back as a new column:
# because `df` aliases `data`, a new column would silently appear in every
# later cell that uses `data`. `astype(str)` keeps the join working even if a
# column no longer holds strings.
text_features = (
    df[['genre', 'artist', 'type_of_album', 'label']]
    .astype(str)
    .agg(' '.join, axis=1)
)

# Vectorise the text documents.
vectorizer = TfidfVectorizer()
text_matrix = vectorizer.fit_transform(text_features)

# Standardise the numeric features (zero mean, unit variance per column).
scaler = StandardScaler()
numeric_features = scaler.fit_transform(
    df[['duration', 'popularity', 'amount_of_element', 'artist_followers', 'amount_of_listeners']]
)

# Stack text and numeric features side by side; CSR allows row slicing.
features = csr_matrix(hstack([text_matrix, numeric_features]))

# Fit the k-NN index. Cosine distance is used; 10 neighbours are requested so
# that 9 remain after the queried album itself is removed.
knn = NearestNeighbors(n_neighbors=10, metric='cosine')
knn.fit(features)

# Recommendation lookup backed by the fitted k-NN index.
def get_recommendations(album_name, knn_model=knn, albums=df, feature_matrix=features):
    """Return the names of the nearest neighbours of ``album_name``.

    Args:
        album_name: Exact value of the ``album_name`` column to look up. If it
            occurs more than once, the first matching row is used.
        knn_model: Fitted neighbours model exposing ``kneighbors``.
        albums: Album table whose row order matches ``feature_matrix``. Like
            ``knn_model`` it is bound when the function is defined, so later
            cells that reuse the names ``df`` / ``features`` for other objects
            do not break this function.
        feature_matrix: Row-sliceable sparse feature matrix the model was
            fitted on.

    Returns:
        A list of album names ordered by increasing distance (the queried
        album excluded), or the string "Альбом не найден" if it is absent.
    """
    # Row positions (not index labels) of matching albums.
    matches = (albums['album_name'] == album_name).to_numpy().nonzero()[0]
    if len(matches) == 0:
        return "Альбом не найден"

    position = int(matches[0])

    # Dense feature vector of the queried album.
    album_features = feature_matrix[position, :].toarray()

    # k nearest neighbours; the distances themselves are not needed.
    _, indices = knn_model.kneighbors(album_features)
    neighbours = indices[0]

    # Remove the queried album by position rather than assuming it is returned
    # first (albums at zero distance from each other can come back in any
    # order), then cap the result at k - 1 names as before.
    keep = [int(i) for i in neighbours if int(i) != position]
    keep = keep[:max(len(neighbours) - 1, 0)]

    return albums['album_name'].iloc[keep].tolist()

# Usage example: nearest neighbours of the album 'HOP'.
recommendations = get_recommendations(album_name='HOP')
print("Рекомендуемые альбомы:", recommendations, sep="\n")

Рекомендуемые альбомы:
['ATE', 'ROCK-STAR', 'rosie', 'APT.', 'Good Luck, Babe!', 'GIANT', 'FERXXOCALIPSIS', "Short n' Sweet", 'BRAT']


# Автоэнкодер

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# === Load data ===
# The raw CSV is read again, so this section does not depend on columns that
# other sections may have added to the shared table.
df = pd.read_csv("/content/drive/MyDrive/spotify_albums.csv")

# === Preprocessing ===
# Remove the name, featured-artist, label and date columns; all remaining
# columns become model inputs.
df = df.drop(columns=["album_name", "featured_artists", "label", "date"])

# Replace every categorical column by integer codes (one encoder per column).
categorical_cols = ["artist", "genre", "type_of_album"]
df[categorical_cols] = df[categorical_cols].apply(
    lambda column: LabelEncoder().fit_transform(column)
)

# Standardise all columns to zero mean and unit variance.
scaler = StandardScaler()
X = scaler.fit_transform(df)

# === Dataset ===
# Mini-batches of 16 rows, reshuffled on every pass.
X_tensor = torch.tensor(X, dtype=torch.float32)
dataset = TensorDataset(X_tensor)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

class Autoencoder(nn.Module):
    """Small fully connected autoencoder for tabular rows.

    The encoder maps ``input_dim`` features through one hidden ReLU layer to an
    ``embed_dim``-dimensional code; the decoder mirrors it back to
    ``input_dim``.

    Args:
        input_dim: Number of input (and reconstructed) features.
        embed_dim: Size of the bottleneck embedding (default 4).
        hidden_dim: Width of the hidden layer in both encoder and decoder
            (default 6, the previously hard-coded value).
    """

    def __init__(self, input_dim, embed_dim=4, hidden_dim=6):
        super().__init__()
        # Layer positions inside the Sequentials are kept (Linear, ReLU, Linear)
        # so parameter names in the state dict stay the same.
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, embed_dim)
        )
        self.decoder = nn.Sequential(
            nn.Linear(embed_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim)
        )

    def forward(self, x):
        """Encode ``x`` and return its reconstruction (same shape as ``x``)."""
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

# Seed torch before the model is built: weight initialisation and the
# DataLoader's shuffling both draw from torch's global generator, so without a
# seed the embeddings (and the recommendations) differ on every run.
torch.manual_seed(42)

model = Autoencoder(X.shape[1], embed_dim=4)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# === Training ===
# Minimise the reconstruction error: the input is its own target.
model.train()
for epoch in range(100):
    for batch in dataloader:
        inputs = batch[0]
        outputs = model(inputs)
        loss = criterion(outputs, inputs)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

# === Embeddings ===
# Encode every row once, without tracking gradients.
model.eval()
with torch.no_grad():
    embeddings = model.encoder(X_tensor).numpy()
import numpy as np



def recommend_by_album_list_advanced(album_names, top_n=5):
    """Recommend albums close to a recency-weighted blend of liked albums.

    Rows of the module-level ``embeddings`` array are expected to follow the
    row order of the module-level ``data`` frame.

    Args:
        album_names: Liked album names ordered from oldest to most recent;
            later entries receive a larger weight. Names that are not present
            in ``data['album_name']`` are skipped.
        top_n: Maximum number of recommendations to return (default 5).

    Returns:
        A Series of recommended album names (index 0..k-1), most similar first.

    Raises:
        ValueError: If none of the given names is present in the data.
    """
    names = data["album_name"].tolist()

    # First row position of every album name (positions, not index labels,
    # because `embeddings` is a plain positional array).
    first_position = {}
    for position, name in enumerate(names):
        first_position.setdefault(name, position)

    # Resolve the liked albums in the order the caller listed them. Selecting
    # with `isin` would return them in table order and the recency weights
    # below would be attached to the wrong albums.
    liked = [first_position[name] for name in album_names if name in first_position]

    if not liked:
        raise ValueError("Ни один из указанных альбомов не найден.")

    # Exponentially increasing weights towards the end of the list,
    # e.g. three albums -> approximately [0.37, 0.61, 1.0] before normalisation.
    weights = np.exp(np.linspace(-1.0, 0.0, num=len(liked)))
    weights /= weights.sum()

    # Weighted mean embedding of the liked albums.
    weighted_embedding = np.average(embeddings[liked], axis=0, weights=weights)

    # Cosine similarity of that blend to every album.
    sims = cosine_similarity([weighted_embedding], embeddings)[0]

    # Never recommend a liked album back (all rows carrying a liked name are
    # excluded). Filtering, instead of overwriting scores with -1, also keeps
    # them out when `top_n` exceeds the number of remaining albums.
    wanted = set(album_names)
    excluded = {position for position, name in enumerate(names) if name in wanted}
    ranked = [int(i) for i in np.argsort(sims)[::-1] if int(i) not in excluded]

    return data["album_name"].iloc[ranked[:top_n]].reset_index(drop=True)

# Example: recommendations for three liked albums, listed oldest to newest.
recent_likes = ["Hurry Up Tomorrow", "Scorpion", "After Hours"]
recommendations = recommend_by_album_list_advanced(album_names=recent_likes)
print(recommendations)


0                   THE TORTURED POETS DEPARTMENT
1                         1989 (Taylor's Version)
2                1989 (Taylor's Version) [Deluxe]
3                                Starboy (Deluxe)
4    THE TORTURED POETS DEPARTMENT: THE ANTHOLOGY
Name: album_name, dtype: object


# TabNet

In [None]:
!pip install pytorch-tabnet

  Attempting uninstall: nvidia-cusolver-cu12
    Found existing installation: nvidia-cusolver-cu12 11.6.3.83
    Uninstalling nvidia-cusolver-cu12-11.6.3.83:
      Successfully uninstalled nvidia-cusolver-cu12-11.6.3.83
Successfully installed nvidia-cublas-cu12-12.4.5.8 nvidia-cuda-cupti-cu12-12.4.127 nvidia-cuda-nvrtc-cu12-12.4.127 nvidia-cuda-runtime-cu12-12.4.127 nvidia-cudnn-cu12-9.1.0.70 nvidia-cufft-cu12-11.2.1.3 nvidia-curand-cu12-10.3.5.147 nvidia-cusolver-cu12-11.6.1.9 nvidia-cusparse-cu12-12.3.1.170 nvidia-nvjitlink-cu12-12.4.127 pytorch-tabnet-4.1.0


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
import torch
from pytorch_tabnet.tab_model import TabNetRegressor
from sklearn.metrics.pairwise import cosine_similarity

# Загрузка данных


# Data preparation
# `.copy()` makes the de-duplicated frame independent, so the column
# assignments below never write into a view of the previous `data` object.
data = data.drop_duplicates(subset=["album_name", "artist"]).copy()
data["genre"] = data["genre"].fillna("Unknown")

# Encode categorical features as integer codes.
cat_features = ["artist", "genre", "type_of_album", "label"]
for col in cat_features:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col].astype(str))

# Scale numeric features to the [0, 1] range.
num_features = ["duration", "amount_of_element", "popularity", "amount_of_listeners", "artist_followers"]
scaler = MinMaxScaler()
data[num_features] = scaler.fit_transform(data[num_features])

# Feature matrix: select the model inputs explicitly. Dropping a fixed list of
# columns instead would pull in any extra column another cell has added to
# `data` (for example helper string columns), which TabNet cannot consume.
features = data[cat_features + num_features]
album_names = data["album_name"].values

# Synthetic target (stand-in for ratings): uniformly random values, seeded for
# reproducibility. NOTE: the regressor is therefore fitted to noise.
np.random.seed(42)
data["target"] = np.random.uniform(0, 1, size=len(data))

# Train / validation split.
X_train, X_test, y_train, y_test = train_test_split(
    features, data["target"], test_size=0.2, random_state=42
)

# Convert to numpy; TabNetRegressor expects 2-D targets.
X_train = X_train.values
y_train = y_train.values.reshape(-1, 1)
X_test = X_test.values
y_test = y_test.values.reshape(-1, 1)

# Build and train TabNet.
tabnet = TabNetRegressor(
    n_d=32,
    n_a=32,
    n_steps=3,
    gamma=1.3,
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    scheduler_params={"step_size": 10, "gamma": 0.9},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    mask_type="sparsemax",
    device_name='cuda' if torch.cuda.is_available() else 'cpu'
)

tabnet.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric=['rmse'],
    max_epochs=30,
    patience=5,
    batch_size=256,
    virtual_batch_size=64,
    drop_last=False,
    augmentations=None
)

# Recommendation lookup based on the TabNet predictions.
def get_tabnet_recommendations(liked_albums, n_recommendations=5):
    """Return albums whose predicted score is closest to that of the liked ones.

    Reads the module-level ``album_names``, ``features``, ``tabnet`` and
    ``data`` objects.

    Args:
        liked_albums: Album names the user liked; unknown names are ignored.
        n_recommendations: Number of rows to return.

    Returns:
        A DataFrame with the columns ``album_name``, ``artist``, ``genre`` and
        ``popularity``; it is empty when none of the liked albums is known.
    """
    result_columns = ["album_name", "artist", "genre", "popularity"]

    # Row position of the first occurrence of every known liked album.
    liked_indices = []
    for album in liked_albums:
        if album in album_names:
            liked_indices.append(np.where(album_names == album)[0][0])

    if len(liked_indices) == 0:
        return pd.DataFrame(columns=result_columns)

    # Model predictions for every album, and their mean over the liked ones.
    all_predictions = tabnet.predict(features.values)
    avg_prediction = np.mean(all_predictions[liked_indices])

    # Similarity = 1 / (1 + |prediction - liked mean|).
    gap_to_mean = np.abs(all_predictions - avg_prediction).flatten()
    similarities = 1 / (1 + gap_to_mean)

    # Walk the albums from most to least similar, skipping the liked ones.
    sorted_indices = np.argsort(similarities)[::-1]
    candidates = []
    for row in sorted_indices:
        if row in liked_indices:
            continue
        candidates.append(row)
    recommended_indices = candidates[:n_recommendations]

    # Look up the display columns of the selected rows.
    recommendations = data.iloc[recommended_indices][result_columns]
    return recommendations

# Usage example: recommendations for three liked albums.
liked_albums = ["Hurry Up Tomorrow", "Scorpion", "After Hours"]
recommendations = get_tabnet_recommendations(liked_albums=liked_albums)
print("Рекомендуемые альбомы:", recommendations, sep="\n")



epoch 0  | loss: 2.81496 | val_0_rmse: 52.3016 |  0:00:00s
epoch 1  | loss: 0.69405 | val_0_rmse: 40.69386|  0:00:00s
epoch 2  | loss: 1.2221  | val_0_rmse: 10.73387|  0:00:00s
epoch 3  | loss: 1.05652 | val_0_rmse: 13.11426|  0:00:00s
epoch 4  | loss: 0.41439 | val_0_rmse: 13.73422|  0:00:00s
epoch 5  | loss: 0.3133  | val_0_rmse: 12.60007|  0:00:00s
epoch 6  | loss: 0.48972 | val_0_rmse: 11.35278|  0:00:00s
epoch 7  | loss: 0.34577 | val_0_rmse: 11.00366|  0:00:01s

Early stopping occurred at epoch 7 with best_epoch = 2 and best_val_0_rmse = 10.73387
Рекомендуемые альбомы:
                  album_name  artist  genre  popularity
11                  Her Loss      37     10    1.000000
27     Unapologetic (Deluxe)      56      0    0.882353
125  Funk Wav Bounces Vol. 2      21     16    0.764706
172                      HOP      17      5    0.764706
148                    DRIVE      25     16    0.705882




# Метрики

3. Cosine Similarity (Косинусное сходство)

Оценивает, насколько схожи рекомендуемые альбомы с целевым альбомом.

Чем выше значение (ближе к 1), тем лучше рекомендация.

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between all albums in TF-IDF space; with a single
# argument the matrix is compared against itself.
similarity_scores = cosine_similarity(tfidf_matrix)

4. Euclidean Distance (Евклидово расстояние)

Оценивает "расстояние" между альбомами в векторном пространстве.

Чем меньше расстояние, тем более схожи альбомы.

In [None]:
from sklearn.metrics.pairwise import euclidean_distances
# Pairwise Euclidean distance between all albums in TF-IDF space; with a single
# argument the matrix is compared against itself.
distances = euclidean_distances(tfidf_matrix)

5. Diversity

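Измеряет, насколько разнообразны рекомендации: долю понравившихся альбомов, жанр которых отличается от жанра рекомендованного альбома (0 — все жанры совпадают, 1 — все различаются).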

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import OneHotEncoder

def diversity_score(dataset, user_likes, recommended_album, feature="genre"):
    """Share of liked albums whose ``feature`` differs from the recommendation's.

    Cosine similarity between one-hot encoded categories is 1 for equal values
    and 0 otherwise, so the score is computed by direct comparison. This avoids
    ``OneHotEncoder(sparse=False)``, an argument that current scikit-learn
    releases no longer accept.

    Args:
        dataset: Album table. Albums are looked up by index label first; if a
            key is not an index label and the table has an ``album_name``
            column, the key is matched against that column instead.
        user_likes: Keys of the liked albums (at least one).
        recommended_album: Key of the recommended album.
        feature: Column to compare (default ``"genre"``).

    Returns:
        A float in [0, 1]; higher means the recommendation is more diverse.

    Raises:
        KeyError: If an album cannot be found.
        ValueError: If ``user_likes`` is empty.
    """
    def _feature_of(album):
        # Value of `feature` for one album (first row if several match).
        if album in dataset.index:
            values = dataset.loc[[album], feature]
        elif "album_name" in dataset.columns:
            values = dataset.loc[dataset["album_name"] == album, feature]
        else:
            raise KeyError(album)
        if len(values) == 0:
            raise KeyError(album)
        return values.iloc[0]

    liked_values = [_feature_of(album) for album in user_likes]
    if not liked_values:
        raise ValueError("user_likes must contain at least one album.")

    recommended_value = _feature_of(recommended_album)

    # Average one-hot cosine similarity == fraction of liked albums that share
    # the recommended album's value.
    same = sum(1 for value in liked_values if value == recommended_value)
    return 1 - same / len(liked_values)  # higher means more diverse

# Example: the shared album table indexed by album name, with albums that the
# earlier recommendation outputs show to be present in it. (`dataset` is not an
# album table in this notebook — that name holds the autoencoder's TensorDataset.)
diversity = diversity_score(data.set_index("album_name"), ["HOP", "ATE"], "ROCK-STAR")
print(f"Diversity: {diversity:.2f}")