# Data Preprocessing

In [None]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw listening log (tab-separated).
# header=None: the next cell assigns the column names itself, so the file is
# treated as headerless; without this the first record would be consumed as
# column labels and then lost when the names are overwritten.
# on_bad_lines='skip': replacement for error_bad_lines=False, which was
# deprecated in pandas 1.3 and removed in pandas 2.0. Malformed rows are
# skipped instead of raising.
df_tsv = pd.read_csv(
    'userid-timestamp-artid-artname-traid-traname.tsv',
    delimiter='\t',
    header=None,
    on_bad_lines='skip',
)

# Preview the first rows instead of rendering the whole frame
df_tsv.head()

In [None]:
# Label the six columns, following the field order spelled out in the file name
col_name = [
    'user_id',
    'timestamp',
    'artist_id',
    'artist_name',
    'track_id',
    'track_name',
]
df_tsv.columns = col_name

# Display the relabelled frame
df_tsv

In [None]:
# Dataset dimensions as a (rows, columns) tuple
df_tsv.shape

In [None]:
# Print a concise summary of the frame, including each column's dtype
df_tsv.info()

In [None]:
# Visualize missing/null values with the missingno package
import missingno as msno

# Bar chart of how complete each column is (msno.bar, not the msno.matrix view)
msno.bar(df_tsv, figsize=(8,6))

In [None]:
# Number of missing entries in each column
df_tsv.isna().sum()

In [None]:
# Share of missing entries per column, expressed in percent
miss_percent = df_tsv.isna().mean().mul(100)
miss_percent

In [None]:
# Drop the identifier and timestamp columns.
# The labels are passed directly through columns= rather than handing drop()
# a DataFrame slice: that makes the intent explicit and avoids materializing
# a throwaway copy of the three columns just to read their names.
unnec_cols = ['artist_id', 'track_id', 'timestamp']
df_tsv = df_tsv.drop(columns=unnec_cols)

# Data Visualization

In [None]:
# The ten artist names that occur most often in the listening log
top_10_arts = df_tsv['artist_name'].value_counts().nlargest(10)

# Draw on an explicit Axes so every label call targets this figure
fig, ax = plt.subplots(figsize=(8, 8))
sns.barplot(x=top_10_arts.index, y=top_10_arts.values, palette='viridis', ax=ax)
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')  # slant the artist names
ax.set_xlabel('Artist Name')
ax.set_ylabel('Number of Interactions')
ax.set_title('Top 10 Most Popular Artists', fontsize=14)
fig.tight_layout()
plt.show()

In [None]:
# The ten track names that occur most often in the listening log
top_10_songs = df_tsv['track_name'].value_counts().nlargest(10)

# Draw on an explicit Axes so every label call targets this figure
fig, ax = plt.subplots(figsize=(8, 8))
sns.barplot(x=top_10_songs.index, y=top_10_songs.values, palette='viridis', ax=ax)
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')  # slant the track names
ax.set_xlabel('Popular Songs')
ax.set_ylabel('Number of Times Listened To')
ax.set_title('Top 10 Most Popular Songs', fontsize=14)
fig.tight_layout()
plt.show()

In [None]:
# TODO: add further visualizations here

In [None]:
# TODO: add another visualization here

In [None]:
# TODO: add another visualization here

# Data Transformation

In [None]:
# Give every log row a unit weight so that listens can be summed
df_tsv['listens'] = 1

# User x track matrix: each cell is the summed 'listens' for that pair,
# with 0 filled in where a user/track combination never occurs
user_interaction_matrix = df_tsv.pivot_table(
    index='user_id',
    columns='track_name',
    values='listens',
    aggfunc='sum',
    fill_value=0,
)

# Model Creation

In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Fit K-means for every candidate cluster count and record the inertia.
# NOTE: `kmeans` is rebound on each iteration, so once the loop finishes it
# holds the last model fitted (n_clusters=10). music_recommendation() reads
# that global, so the loop form and the variable name are kept on purpose.
cluster_counts = range(1, 11)
inertia = []
for num_clusters in cluster_counts:
    kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(user_interaction_matrix)
    inertia.append(kmeans.inertia_)

# Elbow plot: inertia against the number of clusters
fig, ax = plt.subplots()
ax.plot(cluster_counts, inertia, marker='o')
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('Inertia')
ax.set_title('Elbow Method for Optimal Cluster Number')
plt.show()

# Build Recommender System

In [None]:
def music_recommendation(user_id, num_recommendations):
    """Recommend tracks that are popular in the user's cluster but new to the user.

    Relies on two notebook-level globals: ``user_interaction_matrix`` (rows are
    users, columns are tracks, values are listen counts) and ``kmeans`` (the
    fitted clustering model whose ``labels_`` line up with the matrix rows).

    Parameters
    ----------
    user_id : hashable
        A label present in ``user_interaction_matrix.index``.
    num_recommendations : int
        Maximum number of tracks to return.

    Returns
    -------
    list
        Track labels ordered by descending total listens within the user's
        cluster. The list may hold fewer than ``num_recommendations`` entries
        (or be empty) when the cluster has not listened to enough tracks the
        user is missing.

    Raises
    ------
    KeyError
        If ``user_id`` is not a row of ``user_interaction_matrix``.
    """
    if user_id not in user_interaction_matrix.index:
        raise KeyError(f"user_id {user_id!r} not found in user_interaction_matrix")

    # Keep the row as a one-row DataFrame so the column labels travel with it;
    # a bare ndarray loses the feature names the model was fitted with.
    user_row = user_interaction_matrix.loc[[user_id]]
    user_cluster = kmeans.predict(user_row)[0]

    # Every user that was assigned to the same cluster
    cluster_users = user_interaction_matrix.index[kmeans.labels_ == user_cluster]

    # Only tracks the user has not listened to yet are candidates
    not_heard = user_row.iloc[0] == 0
    cluster_listens = user_interaction_matrix.loc[cluster_users, not_heard].sum(axis=0)

    # A track nobody in the cluster has played carries no signal; without this
    # filter such tracks would pad the result in arbitrary order.
    cluster_listens = cluster_listens[cluster_listens > 0]

    top_recommendations = cluster_listens.sort_values(ascending=False).head(num_recommendations)
    return top_recommendations.index.tolist()

# DUMMY CODE:
#user_id = 'User123'  # Replace with an actual user ID from the dataset
#num_recommendations = 10
#recommended_songs = music_recommendation(user_id, num_recommendations)
#print(f"Recommended songs for {user_id}:")
#print(recommended_songs)


# Evaluation

In [None]:
from sklearn.model_selection import train_test_split

# Number of tracks requested per user. It has to be defined here: the only
# other assignment in the notebook sits in commented-out example code, so on a
# fresh kernel the loop below would otherwise fail with a NameError.
num_recommendations = 10

# Split data into training and test sets
train_data, test_data = train_test_split(df_tsv, test_size=0.2, random_state=42)

# NOTE(review): user_interaction_matrix and kmeans are built from the full
# df_tsv rather than from train_data, so the test rows have already been seen
# by the model. Rebuild both from train_data for a genuine hold-out evaluation.

# Recommended songs for each user that appears in the test set
recommended_songs_dict = {}
for user_id in test_data['user_id'].unique():
    # Users without a row in the interaction matrix cannot be scored; skip them
    # instead of aborting the whole loop on a KeyError.
    if user_id not in user_interaction_matrix.index:
        continue
    recommended_songs = music_recommendation(user_id, num_recommendations)
    recommended_songs_dict[user_id] = recommended_songs

# SOME DUMMY CODE!!!
# Compare recommendations to actual music listened to by users in the test set
# ... perform evaluation and metrics calculation ...

# Example: Compare the recommended songs to the actual songs in the test set for a specific user
#user_id_to_evaluate = 'User456'
#actual_songs_listened = test_data[test_data['user_id'] == user_id_to_evaluate]['song_or_artist'].tolist()
#print(f"Actual songs listened by {user_id_to_evaluate}:")
#print(actual_songs_listened)
#print(f"Recommended songs for {user_id_to_evaluate}:")
#print(recommended_songs_dict[user_id_to_evaluate])