# Imports

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import seaborn as sns
import google.colab
import numpy as np
from sklearn.feature_selection import mutual_info_regression
import scipy.stats as stats
from sklearn.feature_selection import f_classif

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
artists_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/project/Music/listeners.csv")

In [4]:
artists_df.head()

Unnamed: 0,Artist,Listeners,Daily Trend,Peak,PkListeners
0,The Weeknd,107592328,-138880,1,113034886
1,Taylor Swift,101003302,889,2,101003302
2,Ed Sheeran,76475126,-68137,2,87934910
3,Dua Lipa,76421916,-71356,4,77778397
4,Bad Bunny,76162057,-199052,3,83950570


In [5]:
#artists_df.groupby('Artist').sum().head(60)

In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import cdist


def load_and_clean_data(file_path):
    """Load the listeners CSV and return a cleaned, feature-ready DataFrame.

    Processing steps:
      1. Artist names are trimmed and lowercased so they match user queries,
         which are stripped and lowercased before lookup.
      2. 'Listeners', 'PkListeners' and 'Daily Trend' are parsed as numbers
         after removing ',' thousands separators; unparseable values become NaN.
      3. Rows with a missing artist name or a missing numeric value are dropped
         and the index is reset to 0..n-1.
      4. 'listeners_log' and 'pklisteners_log' (log1p transforms) are appended.
      5. 'Daily Trend' is renamed to 'daily_trend'.

    Args:
        file_path: Path (or any source accepted by ``pd.read_csv``) of the CSV.

    Returns:
        A new DataFrame containing the original columns plus the two log columns.
    """
    numeric_cols = ['Listeners', 'PkListeners', 'Daily Trend']

    df = pd.read_csv(file_path)

    # Trim + lowercase so " Drake " in the file still matches the query "drake".
    df['Artist'] = df['Artist'].str.strip().str.lower()

    # Values such as "107,592,328" arrive as strings; strip separators first.
    for col in numeric_cols:
        df[col] = pd.to_numeric(
            df[col].astype(str).str.replace(',', '', regex=False),
            errors='coerce',
        )

    # Drop unusable rows, then reset the index so labels and positions agree.
    df = df.dropna(subset=['Artist'] + numeric_cols).reset_index(drop=True)

    # assign() builds a new frame instead of writing into the dropna() result,
    # which avoids SettingWithCopyWarning.
    df = df.assign(
        listeners_log=np.log1p(df['Listeners']),
        pklisteners_log=np.log1p(df['PkListeners']),
    )

    return df.rename(columns={'Daily Trend': 'daily_trend'})


def find_similar_artists(df, artist_name, top_n=10):
    """Rank artists in ``df`` by closeness to ``artist_name``.

    Closeness is measured as Euclidean distance between standardized
    ('listeners_log', 'daily_trend', 'pklisteners_log') vectors and mapped to
    a similarity score ``1 / (1 + distance)``.

    Args:
        df: DataFrame with an 'Artist' column and the three feature columns.
        artist_name: Artist to look up; lowercased before matching.
        top_n: Maximum number of rows to return.

    Returns:
        DataFrame with columns ['Artist', 'similarity'], most similar first and
        with the query artist removed, or None (after printing a message) when
        the artist is not present in ``df``.
    """
    # Put all three features on a comparable scale before measuring distance.
    standardized = StandardScaler().fit_transform(
        df[['listeners_log', 'daily_trend', 'pklisteners_log']]
    )

    # Name -> row position; if a name occurs more than once the last one wins.
    row_of = dict(zip(df['Artist'], range(len(df))))

    artist_name = artist_name.lower()
    if artist_name not in row_of:
        print(f"Artist '{artist_name}' not found in dataset.")
        return None

    # Distance from the query row to every row (including itself).
    anchor = standardized[row_of[artist_name]].reshape(1, -1)
    gaps = cdist(anchor, standardized, metric='euclidean')[0]

    scored = df.copy()
    scored['similarity'] = 1 / (1 + gaps)

    # Remove the query artist, then order from most to least similar.
    ranked = scored.loc[scored['Artist'] != artist_name].sort_values(
        by='similarity', ascending=False
    )
    return ranked[['Artist', 'similarity']].head(top_n)

# Example usage: interactive lookup of artists similar to a single artist.
# The path below lives under /content/drive, the Google Drive mount point used earlier.
file_path = "/content/drive/MyDrive/Colab Notebooks/project/Music/listeners.csv"  # Your dataset path
# df_clean is also consumed by the multi-artist recommendation cell further down.
df_clean = load_and_clean_data(file_path)
# Only whitespace is trimmed here; find_similar_artists lowercases the name itself.
query_artist = input("Enter artist name: ").strip()
top_similar = find_similar_artists(df_clean, query_artist, top_n=10)

# find_similar_artists returns None (after printing a message) for an unknown artist.
if top_similar is not None:
    print("\nTop similar artists:")
    print(top_similar.to_string(index=False))


Enter artist name: a
Artist 'a' not found in dataset.


In [9]:
def recommend_from_favorites(df, favorite_artists, top_n_per_artist=10):
    """Combine per-artist similarity lists into one weighted recommendation table.

    For each favorite, the ``top_n_per_artist`` most similar artists are
    fetched with ``find_similar_artists``. Candidates are aggregated across
    favorites and scored as ``sum_similarity * count``, so artists suggested by
    several favorites rank higher. Favorites found in ``df`` are placed on top
    with a score of ``max_score + 1`` and NaN for the aggregate columns.

    Favorite names are trimmed and lowercased, and blanks and duplicates are
    removed, so the same normalization applies to the lookup, the exclusion
    filter and the favorite rows.

    Args:
        df: Cleaned catalogue with a lowercase 'Artist' column and the feature
            columns required by ``find_similar_artists``.
        favorite_artists: Iterable of artist name strings.
        top_n_per_artist: Number of neighbours requested per favorite.

    Returns:
        DataFrame with columns ['Artist', 'count', 'sum_similarity',
        'avg_similarity', 'weighted_score'] sorted by 'weighted_score'
        descending, or None (after printing a message) when no favorite was
        found in ``df``.
    """
    # Normalize once: "Drake", " drake " and "drake" are the same favorite.
    favorites = []
    for raw_name in favorite_artists:
        name = raw_name.strip().lower()
        if name and name not in favorites:
            favorites.append(name)

    all_recommendations = []
    for artist in favorites:
        print(f"\nProcessing artist: {artist}")
        similar = find_similar_artists(df, artist, top_n=top_n_per_artist)
        if similar is not None:
            all_recommendations.append(similar)
        else:
            print(f"Skipping {artist} (not found).")

    if not all_recommendations:
        print("No recommendations generated.")
        return None

    # Stack every per-favorite list, then aggregate per candidate artist.
    combined = pd.concat(all_recommendations, ignore_index=True)
    agg_df = combined.groupby('Artist').agg(
        count=('similarity', 'count'),
        sum_similarity=('similarity', 'sum'),
        avg_similarity=('similarity', 'mean')
    ).reset_index()

    # Favorites are re-added below with a boosted score, so remove them here.
    # .copy() makes the filtered frame independent before a column is added.
    agg_df = agg_df[~agg_df['Artist'].isin(favorites)].copy()

    # Reward candidates that are both close and suggested by many favorites.
    agg_df['weighted_score'] = agg_df['sum_similarity'] * agg_df['count']

    max_score = agg_df['weighted_score'].max() if not agg_df.empty else 0

    # Favorites present in the catalogue are pinned above every candidate.
    known_artists = set(df['Artist'])
    fav_artist_entries = [
        {
            'Artist': artist,
            'count': np.nan,
            'sum_similarity': np.nan,
            'avg_similarity': np.nan,
            'weighted_score': max_score + 1,  # boost above all others
        }
        for artist in favorites
        if artist in known_artists
    ]

    if fav_artist_entries:
        fav_df = pd.DataFrame(fav_artist_entries)
        agg_df = pd.concat([fav_df, agg_df], ignore_index=True)

    # Stable sort keeps tied rows (e.g. the favorites) in a deterministic order.
    return agg_df.sort_values(by='weighted_score', ascending=False, kind='mergesort')


In [10]:
# User input: a single comma-separated line of favorite artists.
artist_input = input("Enter your favorite artists (comma-separated): ").strip()
# Trim and lowercase each name; load_and_clean_data lowercases the catalogue names too.
artist_list = [a.strip().lower() for a in artist_input.split(',')]

# Get recommendations (up to 20 neighbours are requested per favorite).
weighted_recs = recommend_from_favorites(df_clean, artist_list, top_n_per_artist=20)

# recommend_from_favorites returns None when none of the favorites was found.
if weighted_recs is not None:
    print("\nWeighted recommended artists:")
    print(weighted_recs.to_string(index=False))


Enter your favorite artists (comma-separated): a

Processing artist: a
Artist 'a' not found in dataset.
Skipping a (not found).
No recommendations generated.


In [11]:
df2 = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/project/Music/top10k.csv")

In [12]:
df2.head()

Unnamed: 0.1,Unnamed: 0,index,artist,gender,age,type,country,city_1,district_1,city_2,district_2,city_3,district_3
0,0,0,Drake,male,33,person,CA,,,Toronto,,,
1,1,1,Post Malone,male,25,person,US,,,Syracuse,,,
2,2,2,Ed Sheeran,male,29,person,GB,,,Halifax,,,
3,3,3,J Balvin,male,35,person,CO,,,Medellín,,,
4,4,4,Bad Bunny,male,26,person,PR,,,San Juan,,,


In [22]:
#!pip install transformers
#!pip install torch
#!pip install gensim

In [23]:
# Step 1: Import necessary libraries
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Step 2: Load BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Step 3: Function to get BERT embeddings
def get_bert_embeddings(text):
    """Embed ``text`` as the mean of its BERT token vectors.

    Uses the module-level ``tokenizer`` and ``model``. Pooling is weighted by
    the attention mask so that padding positions (present whenever several
    texts of different length are tokenized together) do not dilute the mean.
    For a single string, where nothing is padded, the result is the plain mean
    over all tokens.

    Args:
        text: A string (or a list of strings) to embed; input longer than 512
            tokens is truncated.

    Returns:
        numpy array of pooled embeddings with singleton dimensions squeezed
        out (a 1-D vector for a single text).
    """
    # Tokenize the input text
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)

    # Inference only: no gradients needed
    with torch.no_grad():
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    # 1 for real tokens, 0 for padding; broadcast over the hidden dimension.
    mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)

    # Masked mean pooling; clamp guards against division by zero.
    token_counts = mask.sum(dim=1).clamp(min=1)
    embeddings = (hidden * mask).sum(dim=1) / token_counts

    return embeddings.squeeze().numpy()  # Convert tensor to numpy array

# Step 4: Example artist bios (this can be expanded with actual artist bios or data)
artist_bios = {
    'Elton John': "Elton John is an English singer, pianist, and composer. His music combines elements of pop, rock, and classical music.",
    'Coldplay': "Coldplay is a British rock band known for their melodic music, emotional lyrics, and successful global hits.",
    'Arctic Monkeys': "Arctic Monkeys is an English rock band formed in Sheffield. They are known for their fast-paced, indie rock music.",
    'The Beatles': "The Beatles were an English rock band formed in Liverpool. They are regarded as one of the most influential music groups in history.",
    'Adele': "Adele is an English singer-songwriter known for her soulful ballads and powerful vocal performances."
}

# Step 5: Get embeddings for each artist's bio
artist_embeddings = {}
for artist, bio in artist_bios.items():
    artist_embeddings[artist] = get_bert_embeddings(bio)

# Step 6: Calculate cosine similarity between artists
def find_similar_artists(target_artist, artist_embeddings):
    """Rank every artist by cosine similarity of its embedding to the target's.

    NOTE(review): this reuses the name of the earlier
    ``find_similar_artists(df, artist_name, top_n)`` and replaces it once this
    cell runs, so ``recommend_from_favorites`` will then call this version with
    incompatible arguments. Consider giving one of the two a distinct name.

    Args:
        target_artist: Key in ``artist_embeddings`` to compare against
            (KeyError if absent).
        artist_embeddings: Mapping of artist name -> embedding vector.

    Returns:
        List of ``(similarity, artist)`` tuples sorted from highest to lowest
        similarity; the target itself is included.
    """
    anchor = artist_embeddings[target_artist]

    # Keep names and vectors aligned by building both from the same key order.
    names = list(artist_embeddings)
    matrix = np.array([artist_embeddings[name] for name in names])

    # One row of similarities: the target against every artist.
    scores = cosine_similarity([anchor], matrix)[0]

    ranking = list(zip(scores, names))
    ranking.sort(reverse=True)
    return ranking

# Step 7: Find similar artists to 'Elton John'
# target_artist must be one of the keys of artist_bios / artist_embeddings.
target_artist = 'Elton John'
similar_artists = find_similar_artists(target_artist, artist_embeddings)

# Step 8: Print the most similar artists
# Each entry is a (similarity, artist) tuple, highest similarity first;
# the target itself is included in the list.
print(f"Most similar artists to {target_artist}:")
for similarity, artist in similar_artists:
    print(f"{artist}: {similarity:.4f}")

Most similar artists to Elton John:
Elton John: 1.0000
Adele: 0.7909
Coldplay: 0.7748
The Beatles: 0.7560
Arctic Monkeys: 0.7309


In [24]:
!pip install transformers torch scikit-learn spotipy

Collecting spotipy
  Downloading spotipy-2.25.1-py3-none-any.whl.metadata (5.1 kB)
Collecting redis>=3.5.3 (from spotipy)
  Downloading redis-6.0.0-py3-none-any.whl.metadata (10 kB)
Downloading spotipy-2.25.1-py3-none-any.whl (31 kB)
Downloading redis-6.0.0-py3-none-any.whl (268 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.9/268.9 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: redis, spotipy
Successfully installed redis-6.0.0 spotipy-2.25.1


In [25]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

# Set up Spotify API authentication.
# Credentials are read from the environment so real secrets never have to be
# typed into (and saved with) the notebook; the placeholders remain as fallback.
import os

sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(
    client_id=os.environ.get("SPOTIPY_CLIENT_ID", "YOUR_CLIENT_ID"),
    client_secret=os.environ.get("SPOTIPY_CLIENT_SECRET", "YOUR_CLIENT_SECRET")))

In [29]:
!pip install requests



In [32]:
import requests

def search_artist(artist_name):
    """Search for an artist by name and return their Deezer artist ID.

    The name is passed through ``params`` so that ``requests`` URL-encodes it;
    names containing '&', '#', '+' etc. (e.g. "Polo & Pan") would otherwise be
    split into separate query parameters.

    Args:
        artist_name: Free-text artist name to search for.

    Returns:
        ``(artist_id, artist_name)`` of the first search hit, or
        ``(None, None)`` when the response contains no results (including a
        payload that has no 'data' key).

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(
        "https://api.deezer.com/search/artist",
        params={"q": artist_name},
        timeout=10,  # do not hang the notebook on a stalled connection
    )
    response.raise_for_status()

    matches = response.json().get('data') or []
    if not matches:
        return None, None

    best = matches[0]
    return best['id'], best['name']

def get_similar_artists(artist_id):
    """Given a Deezer artist ID, return a list of similar artists.

    Args:
        artist_id: Deezer artist identifier.

    Returns:
        List of related artist names; empty when the response carries no
        'data' entries.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    url = f"https://api.deezer.com/artist/{artist_id}/related"
    response = requests.get(url, timeout=10)  # bounded wait instead of hanging
    response.raise_for_status()

    # A payload without 'data' is treated as "no related artists".
    related = response.json().get('data') or []
    return [artist['name'] for artist in related]

# === MAIN ===
# Look up one artist on Deezer and list the artists Deezer reports as related.
input_artist = "Abba"  # You can change this to any artist you like
# artist_name is the name as returned by the search, which may differ in
# spelling/case from input_artist.
artist_id, artist_name = search_artist(input_artist)

# search_artist returns (None, None) when the search has no results.
if artist_id:
    print(f"Found artist: {artist_name} (ID: {artist_id})")
    similar_artists = get_similar_artists(artist_id)
    print(f"\nSimilar artists to {artist_name}:")
    for artist in similar_artists:
        print(f"- {artist}")
else:
    print("Artist not found.")

Found artist: ABBA (ID: 180)

Similar artists to ABBA:
- Elton John
- Billy Joel
- Rod Stewart
- Michael Jackson
- Blondie
- Madonna
- Queen
- Cyndi Lauper
- Stevie Wonder
- Eurythmics
- Prince
- Bee Gees
- Neil Diamond
- Bonnie Tyler
- Electric Light Orchestra
- Phil Collins
- Joe Cocker
- Tina Turner
- Toto
- George Michael


In [34]:
import requests

def search_song(song_name):
    """Search for a song by title and return the first result's artist ID and name.

    The title is sent via ``params`` so it is URL-encoded; titles containing
    '&', '?', '#' and similar characters would otherwise corrupt the query
    string.

    Args:
        song_name: Free-text song title to search for.

    Returns:
        ``(artist_id, artist_name)`` of the first matching track's artist, or
        ``(None, None)`` when the response contains no results (including a
        payload that has no 'data' key). Prints the matched track when found.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(
        "https://api.deezer.com/search",
        params={"q": song_name},
        timeout=10,  # do not hang the notebook on a stalled connection
    )
    response.raise_for_status()

    tracks = response.json().get('data') or []
    if not tracks:
        return None, None

    track = tracks[0]
    artist = track['artist']
    print(f"Found song: '{track['title']}' by {artist['name']}")
    return artist['id'], artist['name']

def get_similar_artists(artist_id):
    """Given a Deezer artist ID, return a list of similar artists.

    Args:
        artist_id: Deezer artist identifier.

    Returns:
        Names of the related artists in the order returned by the API; an
        empty list when the payload has no 'data' entries.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(
        f"https://api.deezer.com/artist/{artist_id}/related",
        timeout=10,  # bounded wait instead of hanging the cell
    )
    response.raise_for_status()

    payload = response.json()
    # Missing 'data' is treated as "no related artists".
    return [entry['name'] for entry in payload.get('data') or []]

# === MAIN ===
# Resolve a song title to its artist, then list artists related to that artist.
song_input = "Do i wanna know"  # Try songs like "Bohemian Rhapsody", "Let It Be", etc.
# Only the first search hit is used; search_song prints which track it matched.
artist_id, artist_name = search_song(song_input)

# search_song returns (None, None) when the search has no results.
if artist_id:
    similar_artists = get_similar_artists(artist_id)
    print(f"\nArtists similar to {artist_name}:")
    for artist in similar_artists:
        print(f"- {artist}")
else:
    print("Song not found.")

Found song: 'Do I Wanna Know?' by Arctic Monkeys

Artists similar to Arctic Monkeys:
- The Last Shadow Puppets
- The Strokes
- Cage The Elephant
- The Kooks
- The Vaccines
- The Fratellis
- Modest Mouse
- Alex Turner
- Radiohead
- Franz Ferdinand
- Kings of Leon
- Interpol
- The Wombats
- Blur
- Bloc Party
- Tame Impala
- MGMT
- Kasabian
- The Killers
- The Drums


In [35]:
import requests
from collections import Counter

def get_artist_id(artist_name):
    """Search Deezer for an artist and return their ID.

    The name is sent via ``params`` so it is URL-encoded. With the raw name
    embedded in the URL, a favorite such as "Polo & Pan" is cut at the '&' and
    only part of it is searched.

    Args:
        artist_name: Free-text artist name to search for.

    Returns:
        ID of the first search hit, or None when the response contains no
        results (including a payload that has no 'data' key).

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(
        "https://api.deezer.com/search/artist",
        params={"q": artist_name},
        timeout=10,  # do not hang the notebook on a stalled connection
    )
    response.raise_for_status()

    matches = response.json().get('data') or []
    return matches[0]['id'] if matches else None

def get_similar_artists(artist_id):
    """Fetch similar artists for a given artist ID.

    Args:
        artist_id: Deezer artist identifier.

    Returns:
        List of related artist names; empty when the payload has no 'data'
        entries.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    url = f"https://api.deezer.com/artist/{artist_id}/related"
    response = requests.get(url, timeout=10)  # bounded wait instead of hanging
    response.raise_for_status()

    # Missing 'data' is treated as "no related artists".
    return [artist['name'] for artist in response.json().get('data') or []]

def recommend_artists(favorite_artists):
    """Combine similar artist suggestions from a list of favorite artists.

    Every favorite is resolved with ``get_artist_id`` and its related artists
    are fetched with ``get_similar_artists``. A suggestion's score is the
    number of favorites that led to it.

    Favorites are removed from the result case-insensitively, because the
    names returned by the service may be spelled differently from the user's
    input (e.g. "Abba" vs "ABBA").

    Args:
        favorite_artists: Iterable of artist name strings.

    Returns:
        List of ``(artist, score)`` tuples sorted by score, highest first;
        ties keep the order in which the artists were first suggested.
    """
    all_similar = []

    for name in favorite_artists:
        artist_id = get_artist_id(name)
        if artist_id:
            print(f"Found artist: {name} (ID: {artist_id}) — fetching similar artists...")
            all_similar.extend(get_similar_artists(artist_id))
        else:
            print(f"Could not find artist: {name}")

    # casefold() gives a case-insensitive comparison key.
    already_liked = {name.casefold() for name in favorite_artists}

    # most_common() orders by count descending and is stable for ties.
    return [
        (artist, count)
        for artist, count in Counter(all_similar).most_common()
        if artist.casefold() not in already_liked
    ]

# === Example Usage ===
# Names are passed to the Deezer search as written here.
favorite_artists = [
    "The Beatles",
    "Coldplay",
    "Arctic Monkeys",
    "Elton John",
    "Polo & Pan"
]

# List of (artist, score) tuples, already sorted by score (highest first).
recommendations = recommend_artists(favorite_artists)

print("\nTop recommended artists for you:")
for artist, score in recommendations:
    print(f"- {artist} (score: {score})")

Found artist: The Beatles (ID: 1) — fetching similar artists...
Found artist: Coldplay (ID: 892) — fetching similar artists...
Found artist: Arctic Monkeys (ID: 1182) — fetching similar artists...
Found artist: Elton John (ID: 413) — fetching similar artists...
Found artist: Polo & Pan (ID: 5400149) — fetching similar artists...

Top recommended artists for you:
- Paul McCartney (score: 2)
- Queen (score: 2)
- The Rolling Stones (score: 1)
- John Lennon (score: 1)
- The Doors (score: 1)
- Bob Dylan (score: 1)
- David Bowie (score: 1)
- The Who (score: 1)
- The Beach Boys (score: 1)
- The Kinks (score: 1)
- The Animals (score: 1)
- George Harrison (score: 1)
- Pink Floyd (score: 1)
- Fleetwood Mac (score: 1)
- The Velvet Underground (score: 1)
- The Mamas & The Papas (score: 1)
- Santana (score: 1)
- Simon & Garfunkel (score: 1)
- Creedence Clearwater Revival (score: 1)
- Jimi Hendrix (score: 1)
- Maroon 5 (score: 1)
- Ed Sheeran (score: 1)
- Keane (score: 1)
- OneRepublic (score: 1)
- 

In [36]:
import requests
from collections import Counter
import json

def get_artist_id(artist_name):
    """Return the Deezer ID of the first artist matching ``artist_name``.

    The name travels in ``params`` so that ``requests`` URL-encodes it; placed
    directly in the URL, characters such as '&' (as in "Polo & Pan") would
    truncate the search term.

    Args:
        artist_name: Free-text artist name to search for.

    Returns:
        The first hit's ID, or None when there are no results (including a
        payload that has no 'data' key).

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(
        "https://api.deezer.com/search/artist",
        params={"q": artist_name},
        timeout=10,  # bounded wait instead of hanging the cell
    )
    response.raise_for_status()

    hits = response.json().get('data') or []
    if hits:
        return hits[0]['id']
    return None

def get_similar_artists(artist_id):
    """Return the names of the artists Deezer lists as related to ``artist_id``.

    Args:
        artist_id: Deezer artist identifier.

    Returns:
        List of artist names; empty when the payload has no 'data' entries.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    url = f"https://api.deezer.com/artist/{artist_id}/related"
    response = requests.get(url, timeout=10)  # bounded wait instead of hanging
    response.raise_for_status()

    # Missing 'data' is treated as "no related artists".
    entries = response.json().get('data') or []
    return [entry['name'] for entry in entries]

def recommend_artists(favorite_artists, save_to_file=False):
    """Count how often each artist is suggested across the user's favorites.

    Every favorite is resolved with ``get_artist_id`` and its related artists
    are fetched with ``get_similar_artists``. Favorites are removed from the
    result case-insensitively, because the names returned by the service may
    be spelled differently from the user's input (e.g. "Abba" vs "ABBA").

    Args:
        favorite_artists: Iterable of artist name strings.
        save_to_file: When True, also write the result as JSON to
            "recommended_artists.json" in the current working directory.

    Returns:
        Dict mapping suggested artist -> number of favorites that suggested
        it, in first-suggested order (not sorted by score).
    """
    all_similar = []

    for name in favorite_artists:
        artist_id = get_artist_id(name)
        if artist_id:
            print(f"Found artist: {name} — fetching similar artists...")
            all_similar.extend(get_similar_artists(artist_id))
        else:
            print(f"Could not find artist: {name}")

    # casefold() gives a case-insensitive comparison key.
    already_liked = {name.casefold() for name in favorite_artists}

    # Filter out any artists already in favorites
    recommended = {
        artist: count for artist, count in Counter(all_similar).items()
        if artist.casefold() not in already_liked
    }

    if save_to_file:
        # Explicit encoding keeps the file identical across platforms.
        with open("recommended_artists.json", "w", encoding="utf-8") as f:
            json.dump(recommended, f, indent=2)

    return recommended

# === Example Usage ===
# Names are passed to the Deezer search as written here.
favorite_artists = [
    "The Beatles",
    "Coldplay",
    "Arctic Monkeys",
    "Elton John",
    "Polo & Pan"
]

# save_to_file=True also writes the result to recommended_artists.json.
recommendations = recommend_artists(favorite_artists, save_to_file=True)

# The returned dict is unsorted, so order by score (highest first) for display.
print("\nTop recommended artists with scores:")
for artist, score in sorted(recommendations.items(), key=lambda x: x[1], reverse=True):
    print(f"- {artist}: {score}")

Found artist: The Beatles — fetching similar artists...
Found artist: Coldplay — fetching similar artists...
Found artist: Arctic Monkeys — fetching similar artists...
Found artist: Elton John — fetching similar artists...
Found artist: Polo & Pan — fetching similar artists...

Top recommended artists with scores:
- Paul McCartney: 2
- Queen: 2
- The Rolling Stones: 1
- John Lennon: 1
- The Doors: 1
- Bob Dylan: 1
- David Bowie: 1
- The Who: 1
- The Beach Boys: 1
- The Kinks: 1
- The Animals: 1
- George Harrison: 1
- Pink Floyd: 1
- Fleetwood Mac: 1
- The Velvet Underground: 1
- The Mamas & The Papas: 1
- Santana: 1
- Simon & Garfunkel: 1
- Creedence Clearwater Revival: 1
- Jimi Hendrix: 1
- Maroon 5: 1
- Ed Sheeran: 1
- Keane: 1
- OneRepublic: 1
- Snow Patrol: 1
- Imagine Dragons: 1
- Sia: 1
- George Ezra: 1
- The Fray: 1
- Florence + The Machine: 1
- Ellie Goulding: 1
- The Script: 1
- Charlie Puth: 1
- P!nk: 1
- Sam Smith: 1
- Bastille: 1
- Birdy: 1
- Robbie Williams: 1
- Miley Cyru

In [38]:
recommendations

{'The Rolling Stones': 1,
 'John Lennon': 1,
 'Paul McCartney': 2,
 'The Doors': 1,
 'Bob Dylan': 1,
 'David Bowie': 1,
 'Queen': 2,
 'The Who': 1,
 'The Beach Boys': 1,
 'The Kinks': 1,
 'The Animals': 1,
 'George Harrison': 1,
 'Pink Floyd': 1,
 'Fleetwood Mac': 1,
 'The Velvet Underground': 1,
 'The Mamas & The Papas': 1,
 'Santana': 1,
 'Simon & Garfunkel': 1,
 'Creedence Clearwater Revival': 1,
 'Jimi Hendrix': 1,
 'Maroon 5': 1,
 'Ed Sheeran': 1,
 'Keane': 1,
 'OneRepublic': 1,
 'Snow Patrol': 1,
 'Imagine Dragons': 1,
 'Sia': 1,
 'George Ezra': 1,
 'The Fray': 1,
 'Florence + The Machine': 1,
 'Ellie Goulding': 1,
 'The Script': 1,
 'Charlie Puth': 1,
 'P!nk': 1,
 'Sam Smith': 1,
 'Bastille': 1,
 'Birdy': 1,
 'Robbie Williams': 1,
 'Miley Cyrus': 1,
 'Lady Gaga': 1,
 'The Last Shadow Puppets': 1,
 'The Strokes': 1,
 'Cage The Elephant': 1,
 'The Kooks': 1,
 'The Vaccines': 1,
 'The Fratellis': 1,
 'Modest Mouse': 1,
 'Alex Turner': 1,
 'Radiohead': 1,
 'Franz Ferdinand': 1,
 'Ki