In [311]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import string
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import gensim
from gensim import corpora, models
from gensim.models.coherencemodel import CoherenceModel
from gensim.matutils import Sparse2Corpus

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nikhilraghavbhatt/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nikhilraghavbhatt/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [312]:
df = pd.read_csv("/Users/nikhilraghavbhatt/Desktop/4300/test_df.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,title,tag,artist,year,views,lyrics
0,0,Killa Cam,rap,Cam'ron,2004,173166,"Killa Cam, Killa Cam, Cam Killa Cam, Killa Ca..."
1,1,Can I Live,rap,JAY-Z,1996,468624,"Yeah, hah, yeah, Roc-A-Fella We invite you ..."
2,2,Forgive Me Father,rap,Fabolous,2003,4743,Maybe cause I'm eatin And these bastards fiend...
3,3,Down and Out,rap,Cam'ron,2004,144404,"Ugh, Killa! Baby! Kanye, this that 1970s He..."
4,4,Fly In,rap,Lil Wayne,2005,78271,"So they ask me ""Young boy What you gon' do th..."


In [370]:
# TRANSFORMING THE DATA TO GROUP BY ARTIST

# all the titles and lyrics for a given artist are concatenated into a single string 
artist_df = df.groupby('artist').agg({
    'title': ' '.join,  
    'views': 'mean',           
    'lyrics': ' '.join,    
    'tag': lambda x: list(x.unique())
}).reset_index()

artist_df.rename(columns={'title': 'song_titles', 'views': 'average_views', 'lyrics': 'concatenated_lyrics', 'tag': 'tags'}, inplace=True)

print(artist_df.head())

          artist                                        song_titles  \
0           2Pac  To Live and Die in L.A. Changes Hail Mary Cali...   
1        50 Cent  21 Questions In da Club How to Rob Patiently W...   
2             AZ     Love Is Love Sunshine Time Hey AZ Never Change   
3        Aaliyah                                     Miss You Remix   
4  Above the Law                                         Murder Rap   

   average_views                                concatenated_lyrics   tags  
0  657845.892857   “Street Science, you’re on the air. What do y...  [rap]  
1  599705.181818   New York City You are now rockin' With 50 Cen...  [rap]  
2    7434.600000   AZ Ha, ha, ha, new drink, Baileys and Henny Y...  [rap]  
3   47079.000000  [Intro: Jay-Z] + (Aaliyah) Sup Baby Girl (Ohhh...   [rb]  
4  106282.000000   "Yo, Cold 187. They tryin' to give you a murd...  [rap]  


In [371]:
#PREPROCESSING LYRIC DATA

#lower case
artist_df['concatenated_lyrics'] = artist_df['concatenated_lyrics'].apply(lambda x: x.lower())

#removing stop words
stop_words = set(stopwords.words('english'))
artist_df['concatenated_lyrics'] = artist_df['concatenated_lyrics'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

#removing punctuation and special characters
artist_df['concatenated_lyrics'] = artist_df['concatenated_lyrics'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))

#removing numbers
artist_df['concatenated_lyrics'] = artist_df['concatenated_lyrics'].apply(lambda x: ''.join([i for i in x if not i.isdigit()]))

#converting words to lemmatized/base form
lemmatizer = WordNetLemmatizer()
artist_df['concatenated_lyrics'] = artist_df['concatenated_lyrics'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))

#check
print(artist_df['concatenated_lyrics'].head())

0    “street science you’re air feel hear record li...
1    new york city rockin cent gotta love wanna chi...
2    az ha ha ha new drink bailey henny aint got he...
3    intro jayz aaliyah sup baby girl ohhh thought ...
4    yo cold tryin give murder rap aint even like t...
Name: concatenated_lyrics, dtype: object


In [372]:
# ARTIST SIMILARITY USING ONLY LYRICAL SIMILARITY

# finding tfidf scores
tfidf_vectorizer = TfidfVectorizer(max_features=10000, stop_words='english', ngram_range=(1,2))
tfidf_matrix_lyrics = tfidf_vectorizer.fit_transform(artist_df['concatenated_lyrics'])

# finding cosine similarity
cosine_sim = cosine_similarity(tfidf_matrix_lyrics, tfidf_matrix_lyrics)
cosine_sim_df = pd.DataFrame(cosine_sim, index=artist_df['artist'], columns=artist_df['artist'])

# get similarity scores for Eminem
similar_artists = cosine_sim_df['Eminem'].sort_values(ascending=False)
print(similar_artists[1:])


artist
Lil Wayne          0.529341
JAY-Z              0.522536
Drake              0.493903
Kanye West         0.460276
Joe Budden         0.459820
                     ...   
Jeru the Damaja    0.031226
Bomfunk MC's       0.027302
Trillville         0.023635
The Clash          0.021472
Bizzy Bone         0.018276
Name: Eminem, Length: 261, dtype: float64


In [373]:
# CHECKING TOP 10 SIGNIFICANT LYRICS FOR A GIVEN ARTIST

feature_names = tfidf_vectorizer.get_feature_names()
artist_index = artist_df.index[artist_df['artist'] == "Eminem"][0]
artist_vector = tfidf_matrix[artist_index].toarray().flatten()
top_words = pd.Series(artist_vector, index=feature_names).sort_values(ascending=False)

print(top_words)

im            0.360540
like          0.197962
shady         0.177120
cause         0.145752
superman      0.135314
                ...   
invest        0.000000
intuition     0.000000
introduced    0.000000
interview     0.000000
like day      0.000000
Length: 10000, dtype: float64


In [374]:
#PREPROCESSING SONG TITLE DATA

#lower case
artist_df['song_titles'] = artist_df['song_titles'].apply(lambda x: x.lower())

#removing stop words
stop_words = set(stopwords.words('english'))
artist_df['song_titles'] = artist_df['song_titles'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

#removing punctuation and special characters
artist_df['song_titles'] = artist_df['song_titles'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))

#removing numbers
artist_df['song_titles'] = artist_df['song_titles'].apply(lambda x: ''.join([i for i in x if not i.isdigit()]))

#converting words to lemmatized/base form
lemmatizer = WordNetLemmatizer()
artist_df['song_titles'] = artist_df['song_titles'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))

#check
print(artist_df['song_titles'].head())

0    live die la change hail mary california love g...
1    question da club rob patiently waiting ghetto ...
2          love love sunshine time hey az never change
3                                           miss remix
4                                           murder rap
Name: song_titles, dtype: object


In [318]:
print(artist_df.iloc[1])

artist_df.to_csv("/Users/nikhilraghavbhatt/Desktop/4300/new_data.csv", index = False)

artist                                                           50 Cent
song_titles            question da club rob patiently waiting ghetto ...
average_views                                              599705.181818
concatenated_lyrics    new york city rockin cent gotta love wanna chi...
tags                                                               [rap]
Name: 1, dtype: object


In [375]:
# FINDING ARTIST SIMILARITY USING ONLY SONG TITLES

#finding tfidf scores
tfidf_vectorizer_title = TfidfVectorizer(max_features=10000, stop_words='english', ngram_range=(1,2))
tfidf_matrix_title = tfidf_vectorizer_title.fit_transform(artist_df['song_titles'])

#finding cosine similarity
cosine_sim_title = cosine_similarity(tfidf_matrix_title, tfidf_matrix_title)
cosine_sim_df_title = pd.DataFrame(cosine_sim_title, index=artist_df['artist'], columns=artist_df['artist'])

# get similarity scores for Eminem
similar_artists_title = cosine_sim_df_title['Eminem'].sort_values(ascending=False)
print(similar_artists_title[1:])  # start from index 1 to skip 'Cam\'ron'

artist
Neil Young                              0.120735
CeeLo Green                             0.113629
Gucci Mane                              0.078162
Common                                  0.068720
Spice 1                                 0.061634
                                          ...   
Glasses Malone                          0.000000
Goodie Mob                              0.000000
Grandmaster Flash & The Furious Five    0.000000
Gudda Gudda                             0.000000
eLZhi                                   0.000000
Name: Eminem, Length: 261, dtype: float64


In [376]:
#NOW WE MAKE THE COMPOSITE VECTOR BY COMBINING THE FEATURES WE HAVE

#NORMALIZING AVERAGE VIEW COUNTS - feature in composite vector

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
view_counts_normalized = scaler.fit_transform(artist_df[['average_views']])


In [377]:
#VECTORIZATION OF TAGS (GENRES) - feature in composite vector

from sklearn.preprocessing import MultiLabelBinarizer

tag_binarizer = MultiLabelBinarizer()
tags_transformed = tag_binarizer.fit_transform(artist_df['tags'])

In [378]:
# ARTIST SIMILARITY WITH COMPOSITE VECTOR

# defining weights for features
weight_titles = 0.3 #since song titles did not give accurate results, weighted less
weight_lyrics = 1.5 #weighted more since it was more accurate
weight_views = 1.0 
weight_tags = 1.0

# applying weights to features
weighted_titles = tfidf_matrix_title.toarray() * weight_titles
weighted_lyrics = tfidf_matrix_lyrics.toarray() * weight_lyrics
weighted_views = view_counts_normalized * weight_views
weighted_tags = tags_transformed * weight_tags

#MAKING THE COMPOSITE VECTOR

features_combined_weighted = np.hstack((weighted_titles, weighted_lyrics, weighted_views, weighted_tags))

#COSINE SIMILARITY FOR COMPOSITE VECTOR

cosine_sim_weighted = cosine_similarity(features_combined_weighted, features_combined_weighted)
cosine_sim_df_weighted = pd.DataFrame(cosine_sim_weighted, index=artist_df['artist'], columns=artist_df['artist'])


In [379]:
# GET SIMILAR ARTISTS TO A SPECIFIED ARTIST WITH SCORES USING COMPOSITE VECTOR

# checking Jay-Z
similar_artists_composite = cosine_sim_df_weighted['JAY-Z'].sort_values(ascending=False)
print(similar_artists_composite[1:])  # start from index 1 to skip Eminem himself


artist
Lil Wayne          0.673522
Eminem             0.652441
Joe Budden         0.645929
Nas                0.642576
2Pac               0.634761
                     ...   
Vampire Weekend    0.034378
Arcade Fire        0.025737
Tom Tom Club       0.023273
Lady Gaga          0.023129
The Clash          0.010240
Name: JAY-Z, Length: 261, dtype: float64


In [380]:
# ROCCHIO FEEDBACK FOR COMPOSITE VECTOR

query_artist = 'JAY-Z'

#feedback - relevant and irrelevant artists
relevant_artists = ['Lil Wayne', 'Drake']
irrelevant_artists = ['Joe Budden', 'Lady Gaga']

query_index = artist_df.index[artist_df['artist'] == query_artist].tolist()[0]
relevant_indices = artist_df.index[artist_df['artist'].isin(relevant_artists)].tolist()
irrelevant_indices = artist_df.index[artist_df['artist'].isin(irrelevant_artists)].tolist()

def mean_vector(indices, feature_matrix):
    return np.mean(feature_matrix[indices, :], axis=0)

# finding relevant and irrelevant vectors
original_query_vector = features_combined_weighted[query_index, :]
mean_relevant = mean_vector(relevant_indices, features_combined_weighted)
mean_irrelevant = mean_vector(irrelevant_indices, features_combined_weighted)

# rocchio parameters
alpha, beta, gamma = 1.0, 0.75, 0.25

# update query vector after performing rocchio
updated_query_vector = (alpha * original_query_vector +
                        beta * mean_relevant -
                        gamma * mean_irrelevant)

# updated recommendations
updated_similarities = cosine_similarity(updated_query_vector.reshape(1, -1), features_combined_weighted)
updated_similarities_df = pd.DataFrame(updated_similarities, columns=artist_df['artist'], index=['Similarity']).T
updated_recommendations = updated_similarities_df.sort_values(by='Similarity', ascending=False)



In [381]:
print("Updated Recommendations after Rocchio Feedback:")
print(updated_recommendations[1:])

Updated Recommendations after Rocchio Feedback:
                 Similarity
artist                     
Lil Wayne          0.812612
Drake              0.735739
Eminem             0.712075
2Pac               0.672533
Nas                0.669347
...                     ...
Robin Thicke       0.062748
Brandon Flowers    0.051154
Arcade Fire        0.029089
Lady Gaga          0.015768
The Clash          0.009771

[261 rows x 1 columns]


In [382]:
#REVIEW AND SCORE DATA CLEANING AND MERGING WITH MAIN ARTIST DATAFRAME

#combining review and score 
reviews = pd.read_csv("/Users/nikhilraghavbhatt/Desktop/4300/artist_reviews_1.csv")

reviews_score = pd.read_csv("/Users/nikhilraghavbhatt/Desktop/4300/reviews_score.csv")

score = reviews_score["average_scores"]

reviews = pd.concat([reviews, score.rename("average_score")], axis=1)

reviews = reviews.rename(columns={"Artist": "artist"})


#PREPROCESSING LYRIC DATA

reviews['Reviews'] = reviews['Reviews'].fillna(value='')

#lower case
reviews['Reviews'] = reviews['Reviews'].apply(lambda x: str(x).lower())

#removing stop words
stop_words = set(stopwords.words('english')) 
reviews['Reviews'] = reviews['Reviews'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

#removing punctuation and special characters
reviews['Reviews'] = reviews['Reviews'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))

#removing numbers
reviews['Reviews'] = reviews['Reviews'].apply(lambda x: ''.join([i for i in x if not i.isdigit()]))

#converting words to lemmatized/base form
lemmatizer = WordNetLemmatizer()
reviews['Reviews'] = reviews['Reviews'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))

#check
print(reviews.head())

       artist                                            Reviews  \
0        2Pac  about mile north manhattan sits clinton correc...   
1     50 Cent  “i even saw keeping realwent style it’s like d...   
2          AZ                                                      
3     Aaliyah  whether believe afterlife not it’s easy enough...   
4  Aesop Rock  aesop rock open tenth album parody silicon val...   

   average_score  
0       6.750000  
1       6.315625  
2       0.000000  
3       9.300000  
4       8.240723  


In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
sentiment_obj = SentimentIntensityAnalyzer()
sentiment_df = pd.DataFrame(columns = ["neg", "neu", "pos", "compound"])

for x in reviews['Reviews']:
    sentiment_dict = sentiment_obj.polarity_scores(str(x))
    sentiment_df = sentiment_df.append(sentiment_dict, ignore_index = True)

sentiment_df

In [384]:
reviews = pd.concat([reviews, sentiment_df], axis=1)
reviews

#this reviews dataframe has the text of the Pitchfork review and also the average album scores for each artist

Unnamed: 0,artist,Reviews,average_score,neg,neu,pos,compound
0,2Pac,about mile north manhattan sits clinton correc...,6.750000,0.137,0.741,0.122,-0.9947
1,50 Cent,“i even saw keeping realwent style it’s like d...,6.315625,0.134,0.681,0.185,0.9995
2,AZ,,0.000000,0.000,1.000,0.000,0.0000
3,Aaliyah,whether believe afterlife not it’s easy enough...,9.300000,0.068,0.756,0.176,0.9982
4,Aesop Rock,aesop rock open tenth album parody silicon val...,8.240723,0.112,0.731,0.157,0.9998
...,...,...,...,...,...,...,...
201,Yo Gotti,yo gotti rapping reliably selling drug memphis...,6.106250,0.090,0.754,0.156,0.9991
202,Young Money,after recording staggering amount verse hook h...,7.400000,0.116,0.728,0.156,0.9874
203,YoungBloodZ,,0.000000,0.000,1.000,0.000,0.0000
204,dead prez,to woke stay woke gap adjective verb contains ...,5.150000,0.153,0.698,0.149,-0.9849


In [385]:
#artist_df = pd.merge(artist_df, reviews, on='artist', how='left')
artist_df
#combining the consolidated artist Pitchfork review data with the existing artist_df 

Unnamed: 0,artist,song_titles,average_views,concatenated_lyrics,tags,Reviews,average_score,neg,neu,pos,compound
0,2Pac,live die la change hail mary california love g...,657845.892857,“street science you’re air feel hear record li...,[rap],about mile north manhattan sits clinton correc...,6.750000,0.137,0.741,0.122,-0.9947
1,50 Cent,question da club rob patiently waiting ghetto ...,599705.181818,new york city rockin cent gotta love wanna chi...,[rap],“i even saw keeping realwent style it’s like d...,6.315625,0.134,0.681,0.185,0.9995
2,AZ,love love sunshine time hey az never change,7434.600000,az ha ha ha new drink bailey henny aint got he...,[rap],,0.000000,0.000,1.000,0.000,0.0000
3,Aaliyah,miss remix,47079.000000,intro jayz aaliyah sup baby girl ohhh thought ...,[rb],whether believe afterlife not it’s easy enough...,9.300000,0.068,0.756,0.176,0.9982
4,Above the Law,murder rap,106282.000000,yo cold tryin give murder rap aint even like t...,[rap],,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
257,Yo Gotti,star remix,113577.000000,chea yeah ay know im lookin fivestar bitch tal...,[rap],yo gotti rapping reliably selling drug memphis...,6.106250,0.090,0.754,0.156,0.9991
258,Young Money,steady mobbin’ bedrock roger,627421.333333,man fuck nigga ima spare everything nigga flip...,[rap],after recording staggering amount verse hook h...,7.400000,0.116,0.728,0.156,0.9874
259,YoungBloodZ,damn,52130.000000,calling come back street sean p aka a sharp cr...,[rap],,0.000000,0.000,1.000,0.000,0.0000
260,dead prez,mind sex hell yeah remix,47623.500000,time mind sex aint got take clothes yet burn i...,[rap],to woke stay woke gap adjective verb contains ...,5.150000,0.153,0.698,0.149,-0.9849


In [386]:
# ARTIST_DF NULL VALUES HANDLED GRACEFULLY

artist_df['Reviews'] = artist_df['Reviews'].fillna(value='')
artist_df['average_score'] = artist_df['average_score'].fillna(value=0.0)
artist_df['pos'] = artist_df['pos'].fillna(value=0.0)
artist_df['neu'] = artist_df['neu'].fillna(value=1.0)
artist_df['neg'] = artist_df['neg'].fillna(value=0.0)
artist_df['compound'] = artist_df['compound'].fillna(value=0.0)
artist_df

Unnamed: 0,artist,song_titles,average_views,concatenated_lyrics,tags,Reviews,average_score,neg,neu,pos,compound
0,2Pac,live die la change hail mary california love g...,657845.892857,“street science you’re air feel hear record li...,[rap],about mile north manhattan sits clinton correc...,6.750000,0.137,0.741,0.122,-0.9947
1,50 Cent,question da club rob patiently waiting ghetto ...,599705.181818,new york city rockin cent gotta love wanna chi...,[rap],“i even saw keeping realwent style it’s like d...,6.315625,0.134,0.681,0.185,0.9995
2,AZ,love love sunshine time hey az never change,7434.600000,az ha ha ha new drink bailey henny aint got he...,[rap],,0.000000,0.000,1.000,0.000,0.0000
3,Aaliyah,miss remix,47079.000000,intro jayz aaliyah sup baby girl ohhh thought ...,[rb],whether believe afterlife not it’s easy enough...,9.300000,0.068,0.756,0.176,0.9982
4,Above the Law,murder rap,106282.000000,yo cold tryin give murder rap aint even like t...,[rap],,0.000000,0.000,1.000,0.000,0.0000
...,...,...,...,...,...,...,...,...,...,...,...
257,Yo Gotti,star remix,113577.000000,chea yeah ay know im lookin fivestar bitch tal...,[rap],yo gotti rapping reliably selling drug memphis...,6.106250,0.090,0.754,0.156,0.9991
258,Young Money,steady mobbin’ bedrock roger,627421.333333,man fuck nigga ima spare everything nigga flip...,[rap],after recording staggering amount verse hook h...,7.400000,0.116,0.728,0.156,0.9874
259,YoungBloodZ,damn,52130.000000,calling come back street sean p aka a sharp cr...,[rap],,0.000000,0.000,1.000,0.000,0.0000
260,dead prez,mind sex hell yeah remix,47623.500000,time mind sex aint got take clothes yet burn i...,[rap],to woke stay woke gap adjective verb contains ...,5.150000,0.153,0.698,0.149,-0.9849


In [450]:
# FINDING ARTIST SIMILARITY BASED ONLY ON REVIEW TEXT SIMILARITY

extended_stopwords = set(stopwords.words('english'))
additional_stopwords = {
    'album', 'albums', 'song', 'songs', 'music', 'sound', 'track', 'tracks', 'record', 'records', 'single', 'singles',
    'artist', 'artists', 'band', 'bands', 'release', 'releases', 'released', 'make', 'makes', 'made', 'say', 'says',
    'put', 'puts', 'get', 'gets', 'got', 'go', 'goes', 'going', 'seem', 'seems', 'seemed', 'include', 'includes',
    'included', 'featuring', 'feature', 'features', 'featured', 'feel', 'feels', 'felt', 'keep', 'keeps', 'kept',
    'great', 'good', 'big', 'large', 'small', 'new', 'old', 'young', 'real', 'better', 'best', 'bad', 'worst',
    'major', 'minor', 'own', 'same', 'different', 'high', 'low', 'long', 'short', 'first', 'last', 'next', 'previous',
    'early', 'late', 'modern', 'year', 'years', 'time', 'times', 'day', 'days', 'week', 'weeks',
    'month', 'months', 'like', 'just', 'also', 'well', 'still', 'back', 'even', 'way', 'much', 'ever', 'never',
    'every', 'around', 'another', 'many', 'few', 'lots', 'lot', 'part', 'one', 'two', 'three', 'four', 'five',
    'six', 'seven', 'eight', 'nine', 'ten', 'several', 'various', 'whether', 'however', 'though', 'although',
    'fashioned', 'im'
}


extended_stopwords.update(additional_stopwords)

tfidf_vectorizer_reviews = TfidfVectorizer(max_features=7500, stop_words=stop_words, ngram_range=(1,2))
tfidf_matrix_reviews = tfidf_vectorizer_reviews.fit_transform(artist_df['Reviews'])

cosine_sim_reviews = cosine_similarity(tfidf_matrix_reviews, tfidf_matrix_reviews)
cosine_sim_df_reviews = pd.DataFrame(cosine_sim_reviews, index=artist_df['artist'], columns=artist_df['artist'])

# get similarity scores 
search_artist = 'Eminem'
similar_artists = cosine_sim_df_reviews[search_artist].sort_values(ascending=False)
print(similar_artists[1:])

artist
JAY-Z & T.I.            0.291308
JAY-Z                   0.291308
50 Cent                 0.282248
Kanye West              0.276717
The Notorious B.I.G.    0.262849
                          ...   
Klashnekoff             0.000000
King Chip               0.000000
Khia                    0.000000
Canibus                 0.000000
Meth  Ghost  Rae        0.000000
Name: Eminem, Length: 261, dtype: float64


In [None]:
"""

def get_top_words(artist_name, tfidf_matrix, tfidf_vectorizer, top_n=10):
    artist_idx = artist_df[artist_df['artist'] == artist_name].index[0]  # Get the index of the artist
    feature_names = tfidf_vectorizer.get_feature_names()
    tfidf_scores = tfidf_matrix[artist_idx].toarray().flatten()  # Convert to 1D array
    top_indices = tfidf_scores.argsort()[-top_n:][::-1]  # Indices of top-n scores
    top_words = [(feature_names[i], tfidf_scores[i]) for i in top_indices]
    return top_words

# top 10 words for searched artist
top_words_artist = get_top_words(search_artist, tfidf_matrix_reviews, tfidf_vectorizer_reviews)

#number of similar artists to examine
num_artists = 5  # You can adjust this number

# Get top words for similar artists and compare with N.W.A's top words
nwa_words_set = set([word for word, _ in top_words_artist])
for artist, score in similar_artists[1:num_artists+1].iteritems():  # Skip the first one since it's N.W.A itself
    top_words_other = get_top_words(artist, tfidf_matrix_reviews, tfidf_vectorizer_reviews)
    other_words_set = set([word for word, _ in top_words_other])
    common_words = nwa_words_set.intersection(other_words_set)
    print(f"Common words between {search_artist} and {artist}: {common_words} (Similarity Score: {score:.4f})")

    """

In [393]:
#INCLUDING REVIEW DATA IN COMPOSITE VECTOR

#NORMALIZING AVERAGE PITCHFORK ALBUM REVIEW SCORES - feature in composite vector

review_scores_normalized = scaler.fit_transform(artist_df[['average_score']])
review_scores_normalized

array([[0.69587629],
       [0.65109536],
       [0.        ],
       [0.95876289],
       [0.        ],
       [0.        ],
       [0.84955904],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.67525773],
       [0.        ],
       [0.27319588],
       [0.6688094 ],
       [0.43298969],
       [0.60824742],
       [0.78092784],
       [0.77609536],
       [0.62113402],
       [0.66752577],
       [0.        ],
       [0.740476  ],
       [0.        ],
       [0.        ],
       [0.64690722],
       [0.        ],
       [0.7757732 ],
       [0.7757732 ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.63917526],
       [0.85953608],
       [0.65012493],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.57731959],
       [0.79381443],
       [0.71520619],
       [0.        ],
       [0.46262887],
       [0.62886598],
       [0.78068621],
       [0.        ],
       [0.   

In [408]:
# ARTIST SIMILARITY WITH COMPOSITE VECTOR - INCLUDING REVIEW DATA

# defining weights for features - can change this
weight_titles = 0.3 #since song titles did not give accurate results, weighted less
weight_lyrics = 1.5 #weighted more since it was more accurate
weight_views = 1.0 
weight_tags = 1.0
weight_score = 0.5
weight_reviews = 1.4 #captures intricate similarities between artists, so weighted higher

# applying weights to features
weighted_titles = tfidf_matrix_title.toarray() * weight_titles
weighted_lyrics = tfidf_matrix_lyrics.toarray() * weight_lyrics
weighted_views = view_counts_normalized * weight_views
weighted_tags = tags_transformed * weight_tags
weighted_score = review_scores_normalized * weight_score
weighted_reviews = tfidf_matrix_reviews.toarray() * weight_reviews

#MAKING THE COMPOSITE VECTOR WITH REVIEWS

features_combined_weighted_review = np.hstack((weighted_titles, weighted_lyrics, weighted_views, weighted_tags, weighted_score, weighted_reviews))

#COSINE SIMILARITY FOR COMPOSITE VECTOR

cosine_sim_weighted_review = cosine_similarity(features_combined_weighted_review, features_combined_weighted_review)
cosine_sim_df_weighted_review = pd.DataFrame(cosine_sim_weighted_review, index=artist_df['artist'], columns=artist_df['artist'])

In [409]:
# FINAL COSINE CHECK: GET SIMILAR ARTISTS TO A SPECIFIED ARTIST WITH REVIEW DATA

# checking Jay-Z
similar_artists_composite_review = cosine_sim_df_weighted_review['JAY-Z'].sort_values(ascending=False)
print(similar_artists_composite_review[1:])


artist
JAY-Z & T.I.            0.596374
Lil Wayne               0.551725
50 Cent                 0.549817
Eminem                  0.536221
The Notorious B.I.G.    0.529956
                          ...   
Miguel                  0.062562
Travie McCoy            0.060364
Robin Thicke            0.043381
Black Eyed Peas         0.039736
Tom Tom Club            0.018204
Name: JAY-Z, Length: 261, dtype: float64
