In [107]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nikhilraghavbhatt/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nikhilraghavbhatt/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [108]:
df = pd.read_csv("/Users/nikhilraghavbhatt/Desktop/4300/test_df.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,title,tag,artist,year,views,lyrics
0,0,Killa Cam,rap,Cam'ron,2004,173166,"Killa Cam, Killa Cam, Cam Killa Cam, Killa Ca..."
1,1,Can I Live,rap,JAY-Z,1996,468624,"Yeah, hah, yeah, Roc-A-Fella We invite you ..."
2,2,Forgive Me Father,rap,Fabolous,2003,4743,Maybe cause I'm eatin And these bastards fiend...
3,3,Down and Out,rap,Cam'ron,2004,144404,"Ugh, Killa! Baby! Kanye, this that 1970s He..."
4,4,Fly In,rap,Lil Wayne,2005,78271,"So they ask me ""Young boy What you gon' do th..."


In [109]:
# TRANSFORMING THE DATA TO GROUP BY ARTIST

# all the titles and lyrics for a given artist are concatenated into a single string 
artist_df = df.groupby('artist').agg({
    'title': ' '.join,  
    'views': 'mean',           
    'lyrics': ' '.join,    
    'tag': lambda x: list(x.unique())
}).reset_index()

artist_df.rename(columns={'title': 'song_titles', 'views': 'average_views', 'lyrics': 'concatenated_lyrics', 'tag': 'tags'}, inplace=True)

print(artist_df.iloc[0])

artist                                                              2Pac
song_titles            To Live and Die in L.A. Changes Hail Mary Cali...
average_views                                              657845.892857
concatenated_lyrics     “Street Science, you’re on the air. What do y...
tags                                                               [rap]
Name: 0, dtype: object


In [111]:
#PREPROCESSING LYRIC DATA

#lower case
artist_df['concatenated_lyrics'] = artist_df['concatenated_lyrics'].apply(lambda x: x.lower())

#removing stop words
stop_words = set(stopwords.words('english'))
artist_df['concatenated_lyrics'] = artist_df['concatenated_lyrics'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

#removing punctuation and special characters
artist_df['concatenated_lyrics'] = artist_df['concatenated_lyrics'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))

#removing numbers
artist_df['concatenated_lyrics'] = artist_df['concatenated_lyrics'].apply(lambda x: ''.join([i for i in x if not i.isdigit()]))

#converting words to lemmatized/base form
lemmatizer = WordNetLemmatizer()
artist_df['concatenated_lyrics'] = artist_df['concatenated_lyrics'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))

#check
print(artist_df['concatenated_lyrics'].head())

0    “street science you’re air feel hear record li...
1    new york city rockin cent gotta love wanna chi...
2    az ha ha ha new drink bailey henny aint got he...
3    intro jayz aaliyah sup baby girl ohhh thought ...
4    yo cold tryin give murder rap aint even like y...
Name: concatenated_lyrics, dtype: object


In [112]:
# ARTIST SIMILARITY USING ONLY LYRICAL SIMILARITY

# finding tfidf scores
tfidf_vectorizer = TfidfVectorizer(max_features=10000, stop_words='english', ngram_range=(1,2))
tfidf_matrix = tfidf_vectorizer.fit_transform(artist_df['concatenated_lyrics'])

# finding cosine similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
cosine_sim_df = pd.DataFrame(cosine_sim, index=artist_df['artist'], columns=artist_df['artist'])

# get similarity scores for Eminem
similar_artists = cosine_sim_df['Eminem'].sort_values(ascending=False)
print(similar_artists[1:])


artist
Lil Wayne          0.529491
JAY-Z              0.522020
Drake              0.493340
Kanye West         0.460687
Joe Budden         0.460440
                     ...   
Jeru the Damaja    0.031196
Bomfunk MC's       0.027317
Trillville         0.023646
The Clash          0.021447
Bizzy Bone         0.018714
Name: Eminem, Length: 261, dtype: float64


In [114]:
# CHECKING TOP 10 SIGNIFICANT LYRICS FOR A GIVEN ARTIST

feature_names = tfidf_vectorizer.get_feature_names()
artist_index = artist_df.index[artist_df['artist'] == "Eminem"][0]
artist_vector = tfidf_matrix[artist_index].toarray().flatten()
top_words = pd.Series(artist_vector, index=feature_names).sort_values(ascending=False)

print(top_words)

im                         0.360740
like                       0.198072
shady                      0.177218
cause                      0.145833
superman                   0.135389
                             ...   
mr carter                  0.000000
drop allallallallallall    0.000000
moët                       0.000000
moving world               0.000000
aaliyah                    0.000000
Length: 10000, dtype: float64


In [115]:
#PREPROCESSING SONG TITLE DATA

#lower case
artist_df['song_titles'] = artist_df['song_titles'].apply(lambda x: x.lower())

#removing stop words
stop_words = set(stopwords.words('english'))
artist_df['song_titles'] = artist_df['song_titles'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

#removing punctuation and special characters
artist_df['song_titles'] = artist_df['song_titles'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))

#removing numbers
artist_df['song_titles'] = artist_df['song_titles'].apply(lambda x: ''.join([i for i in x if not i.isdigit()]))

#converting words to lemmatized/base form
lemmatizer = WordNetLemmatizer()
artist_df['song_titles'] = artist_df['song_titles'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))

#check
print(artist_df['song_titles'].head())

0    live die la change hail mary california love g...
1    question da club rob patiently waiting ghetto ...
2          love love sunshine time hey az never change
3                                           miss remix
4                                           murder rap
Name: song_titles, dtype: object


In [89]:
# FINDING ARTIST SIMILARITY USING ONLY SONG TITLES

#finding tfidf scores
tfidf_vectorizer_title = TfidfVectorizer(max_features=10000, stop_words='english', ngram_range=(1,2))
tfidf_matrix_title = tfidf_vectorizer_title.fit_transform(artist_df['song_titles'])

#finding cosine similarity
cosine_sim_title = cosine_similarity(tfidf_matrix_title, tfidf_matrix_title)
cosine_sim_df_title = pd.DataFrame(cosine_sim_title, index=artist_df['artist'], columns=artist_df['artist'])

# get similarity scores for Eminem
similar_artists_title = cosine_sim_df_title['Eminem'].sort_values(ascending=False)
print(similar_artists_title[1:])  # start from index 1 to skip 'Cam\'ron'

artist
Neil Young                              0.120735
CeeLo Green                             0.113629
Gucci Mane                              0.078162
Common                                  0.068720
Spice 1                                 0.061634
                                          ...   
Glasses Malone                          0.000000
Goodie Mob                              0.000000
Grandmaster Flash & The Furious Five    0.000000
Gudda Gudda                             0.000000
eLZhi                                   0.000000
Name: Eminem, Length: 261, dtype: float64


In [116]:
#NOW WE MAKE THE COMPOSITE VECTOR BY COMBINING THE FEATURES WE HAVE

#NORMALIZING AVERAGE VIEW COUNTS - feature in composite vector

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
view_counts_normalized = scaler.fit_transform(artist_df[['average_views']])


In [117]:
#VECTORIZATION OF TAGS (GENRES) - feature in composite vector

from sklearn.preprocessing import MultiLabelBinarizer

tag_binarizer = MultiLabelBinarizer()
tags_transformed = tag_binarizer.fit_transform(artist_df['tags'])

In [118]:
# ARTIST SIMILARITY WITH COMPOSITE VECTOR

# defining weights for features
weight_titles = 0.3 #since song titles did not give accurate results, weighted less
weight_lyrics = 1.5 #weighted more since it was more accurate
weight_views = 1.0 
weight_tags = 1.0

# applying weights to features
weighted_titles = tfidf_matrix_title.toarray() * weight_titles
weighted_lyrics = tfidf_matrix.toarray() * weight_lyrics
weighted_views = view_counts_normalized * weight_views
weighted_tags = tags_transformed * weight_tags

#MAKING THE COMPOSITE VECTOR

features_combined_weighted = np.hstack((weighted_titles, weighted_lyrics, weighted_views, weighted_tags))

#COSINE SIMILARITY FOR COMPOSITE VECTOR

cosine_sim_weighted = cosine_similarity(features_combined_weighted, features_combined_weighted)
cosine_sim_df_weighted = pd.DataFrame(cosine_sim_weighted, index=artist_df['artist'], columns=artist_df['artist'])


In [119]:
# GET SIMILAR ARTISTS TO A SPECIFIED ARTIST WITH SCORES USING COMPOSITE VECTOR

# checking Jay-Z
similar_artists_composite = cosine_sim_df_weighted['JAY-Z'].sort_values(ascending=False)
print(similar_artists_composite[1:])  # start from index 1 to skip Eminem himself


artist
Lil Wayne          0.673252
Eminem             0.652094
Joe Budden         0.646191
Nas                0.643094
2Pac               0.634920
                     ...   
Vampire Weekend    0.031139
Arcade Fire        0.025727
Tom Tom Club       0.023277
Lady Gaga          0.023122
The Clash          0.010239
Name: JAY-Z, Length: 261, dtype: float64


In [120]:
# ROCCHIO FEEDBACK FOR COMPOSITE VECTOR

query_artist = 'JAY-Z'

#feedback - relevant and irrelevant artists
relevant_artists = ['Lil Wayne', 'Drake']
irrelevant_artists = ['Joe Budden', 'Lady Gaga']

query_index = artist_df.index[artist_df['artist'] == query_artist].tolist()[0]
relevant_indices = artist_df.index[artist_df['artist'].isin(relevant_artists)].tolist()
irrelevant_indices = artist_df.index[artist_df['artist'].isin(irrelevant_artists)].tolist()

def mean_vector(indices, feature_matrix):
    return np.mean(feature_matrix[indices, :], axis=0)

# finding relevant and irrelevant vectors
original_query_vector = features_combined_weighted[query_index, :]
mean_relevant = mean_vector(relevant_indices, features_combined_weighted)
mean_irrelevant = mean_vector(irrelevant_indices, features_combined_weighted)

# rocchio parameters
alpha, beta, gamma = 1.0, 0.75, 0.25

# update query vector after performing rocchio
updated_query_vector = (alpha * original_query_vector +
                        beta * mean_relevant -
                        gamma * mean_irrelevant)

# updated recommendations
updated_similarities = cosine_similarity(updated_query_vector.reshape(1, -1), features_combined_weighted)
updated_similarities_df = pd.DataFrame(updated_similarities, columns=artist_df['artist'], index=['Similarity']).T
updated_recommendations = updated_similarities_df.sort_values(by='Similarity', ascending=False)



In [121]:
print("Updated Recommendations after Rocchio Feedback:")
print(updated_recommendations[1:])

Updated Recommendations after Rocchio Feedback:
                 Similarity
artist                     
Lil Wayne          0.812529
Drake              0.735319
Eminem             0.711903
2Pac               0.672750
Nas                0.669944
...                     ...
Robin Thicke       0.062740
Brandon Flowers    0.051135
Arcade Fire        0.029079
Lady Gaga          0.015760
The Clash          0.009760

[261 rows x 1 columns]
