**Project Summary**

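This notebook builds a content-based song recommender. It merges the song metadata with the extra song information, inspects and removes missing composer values, vectorizes the composer field with TF-IDF, and computes a pairwise similarity matrix that is used to recommend songs similar to a given input title.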

# **Import and read data**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import os
import glob
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from zipfile import ZipFile

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Notebook-wide configuration: silence library warnings in cell output.
warnings.filterwarnings('ignore')

# Display every column and every row of a frame, and render floats with
# seven decimal places.
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.7f}'.format)
pd.set_option('display.max_rows', None)

In [None]:
# Load the song metadata and the extra song information (title, ISRC).
# The text columns hold UTF-8 data (CJK artist/composer/title strings);
# decoding them as Latin-1 produced mojibake in every displayed frame and in
# the composer text that the TF-IDF features are later built from.
# encoding_errors='replace' keeps the load from aborting on a stray invalid
# byte instead of silently mis-decoding the whole file.
songs = pd.read_csv('/content/songs.csv', encoding='utf-8', encoding_errors='replace')
info = pd.read_csv('/content/song_extra_info.csv', encoding='utf-8', encoding_errors='replace')

In [None]:
songs.head()

Unnamed: 0,song_id,song_length,genre_ids,artist_name,composer,lyricist,language
0,CXoTN1eb7AI+DntdU1vbcwGRV4SCIDxZu+YD8JP8r4E=,247640,465,å¼µä¿¡å² (Jeff Chang),è£è²,ä½åå¼,3.0
1,o0kFgae9QtnYgRkVPqLJwa05zIhRlUjfF7O1tDw0ZDU=,197328,444,BLACKPINK,TEDDY| FUTURE BOUNCE| Bekuh BOOM,TEDDY,31.0
2,DwVvVurfpuz+XPuFvucclVQEyPqcpUkHR0ne1RQzPs0=,231781,465,SUPER JUNIOR,,,31.0
3,dKMBWoZyScdxSkihKG+Vf47nc18N9q4m58+b4e7dSSE=,273554,465,S.H.E,æ¹¯å°åº·,å¾ä¸ç,3.0
4,W3bqWd3T+VeHFzHAUfARgW9AvVRaF4N5Yzm4Mr6Eo/o=,140329,726,è²´æç²¾é¸,Traditional,Traditional,52.0


In [None]:
info.head()

Unnamed: 0,song_id,name,isrc
0,LP7pLJoJFBvyuUwvu+oLzjT+bI+UeBPURCecJsX1jjs=,æå,TWUM71200043
1,ClazTFnk6r0Bnuie44bocdNMM3rdlrq0bCGAsGUWcHE=,Let Me Love You,QMZSY1600015
2,u2ja/bZE3zhCGxvbbOB3zOoUjx27u40cf5g09UXMoKQ=,åè«æ,TWA530887303
3,92Fqsy0+p6+RHe2EoLKjHahORHR1Kq1TBJoClW9v+Ts=,Classic,USSM11301446
4,0QFmz/+rJy1Q56C1DuYqT9hKKqi5TUqx0sN0IwvoHrw=,ææç¾ç¶²,TWA471306001


In [None]:
songs.shape

(84497, 7)

In [None]:
info.shape

(109238, 3)

In [None]:
print(songs.columns)
print('===============================================')
print(info.columns)

Index(['song_id', 'song_length', 'genre_ids', 'artist_name', 'composer',
       'lyricist', 'language'],
      dtype='object')
Index(['song_id', 'name', 'isrc'], dtype='object')


In [None]:
# Inner-join the extra info (name, isrc) with the song metadata on song_id;
# only songs present in both tables survive the join.
df = pd.merge(info, songs, how='inner', on='song_id')

In [None]:
print(df.columns)

Index(['song_id', 'name', 'isrc', 'song_length', 'genre_ids', 'artist_name',
       'composer', 'lyricist', 'language'],
      dtype='object')


In [None]:
df_composer = df.copy()

In [None]:
# Keep only the identifier, the title and the composer text; every other
# column is dropped.
keep_cols = ['song_id', 'name', 'composer']
df_composer = df_composer.drop(columns=df_composer.columns.difference(keep_cols))

In [None]:
df_composer.shape

(15428, 3)

In [None]:
df_composer.dtypes

song_id     object
name        object
composer    object
dtype: object

In [None]:
df_composer.head(4)

Unnamed: 0,song_id,name,composer
0,hKs8gwK7qKNeXTvzZ3U8y8aDiBsE0y7uksji0TPbnrQ=,åæªæ¢,é³çå®
1,BwDDym6Tp7EnGudWBGJyjONZeX0/4ndu5PETiPWzav8=,é¢å¿å (Centrifugal Force),é»å»ºçº
2,L4OV+X+SVJgMYwreCNa4LJRHhkRTR+TT/C+n1h8lbZ8=,å¨æ¨å¤©ç©º,Huo Xing Dian Tai
3,Jpm8pJ/TPm4bRH29ON0Ok56M1dhmlJJ6fAjaM8YW42E=,Baby Don't Cry,


In [None]:
print('Number of missing values across columns-\n', df_composer.isnull().sum())

Number of missing values across columns-
 song_id        0
name           0
composer    6330
dtype: int64


In [None]:
# Remove every row that has a missing value in any of the kept columns
# (per the count above, only `composer` has any).
df_composer = df_composer.dropna(axis=0, how='any')

In [None]:
df_composer.isnull().sum()

song_id     0
name        0
composer    0
dtype: int64

In [None]:
# Draw a fixed-size, reproducible random subset of the cleaned rows.
SAMPLE_SIZE = 9000   # number of songs kept for the similarity matrix
SAMPLE_SEED = 98     # fixed seed so the same rows are drawn on every run
df_sampled = df_composer.sample(n=SAMPLE_SIZE, random_state=SAMPLE_SEED)

In [None]:
df_sampled.head()

Unnamed: 0,song_id,name,composer
6845,2tBSWMPh2oaG1Qff7thVTilFKD5EPP5pSY8dQFLDiIc=,æ¥åºåå½©,ç«¹éä¹
9974,gkv7VOXdotRToNt+QAMwbqOr8+lDFMWgyNryNK9POg0=,Sushi Battle,Takashi Ohmama
8914,aMtWPFvXLHHmlQDBUCrOrKpC8moGamOTstjuU57GekA=,BANG! [EDX's Ibiza Sunrise Remix],Sandy Rivera| April Morgan
11202,iwmm/+JY9MJWkI1/vKjtt6zSjQKZmTpelyQ2bAOV0zA=,Happy Endings,Nick Wheeler| Tyson Ritter
5729,xrAcPoNtGNFqt53num+IXAudnjJM5g9wGqLKuNKS+5s=,å§æäºº,å»ä¸å


In [None]:
df_sampled.shape

(9000, 3)

# **Content based recommendation system**

In [None]:
# Turn the composer text of each sampled song into TF-IDF features built from
# word unigrams and bigrams, with English stop words removed.
# min_df must be an int >= 1 or a float in [0.0, 1.0]; the integer 0 used
# before is rejected by scikit-learn's parameter validation. min_df=1 keeps
# every term that occurs in at least one document, so the vocabulary is the
# same as intended.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(df_sampled['composer'])

In [None]:
tfidf_matrix.shape

(9000, 22921)

In [None]:
# Pairwise dot products between all TF-IDF rows (the second argument defaults
# to the first). NOTE(review): a dot product equals cosine similarity only
# when the rows are L2-normalised, which relies on the vectorizer's default
# `norm` setting - confirm it is not overridden.
cosine_sim = linear_kernel(tfidf_matrix)

In [None]:
cosine_sim.shape

(9000, 9000)

In [None]:
df_sampled.head()

Unnamed: 0,song_id,name,composer
6845,2tBSWMPh2oaG1Qff7thVTilFKD5EPP5pSY8dQFLDiIc=,æ¥åºåå½©,ç«¹éä¹
9974,gkv7VOXdotRToNt+QAMwbqOr8+lDFMWgyNryNK9POg0=,Sushi Battle,Takashi Ohmama
8914,aMtWPFvXLHHmlQDBUCrOrKpC8moGamOTstjuU57GekA=,BANG! [EDX's Ibiza Sunrise Remix],Sandy Rivera| April Morgan
11202,iwmm/+JY9MJWkI1/vKjtt6zSjQKZmTpelyQ2bAOV0zA=,Happy Endings,Nick Wheeler| Tyson Ritter
5729,xrAcPoNtGNFqt53num+IXAudnjJM5g9wGqLKuNKS+5s=,å§æäºº,å»ä¸å


In [None]:
df_sampled = df_sampled.reset_index()

In [None]:
df_sampled.head()

Unnamed: 0,index,song_id,name,composer
0,6845,2tBSWMPh2oaG1Qff7thVTilFKD5EPP5pSY8dQFLDiIc=,æ¥åºåå½©,ç«¹éä¹
1,9974,gkv7VOXdotRToNt+QAMwbqOr8+lDFMWgyNryNK9POg0=,Sushi Battle,Takashi Ohmama
2,8914,aMtWPFvXLHHmlQDBUCrOrKpC8moGamOTstjuU57GekA=,BANG! [EDX's Ibiza Sunrise Remix],Sandy Rivera| April Morgan
3,11202,iwmm/+JY9MJWkI1/vKjtt6zSjQKZmTpelyQ2bAOV0zA=,Happy Endings,Nick Wheeler| Tyson Ritter
4,5729,xrAcPoNtGNFqt53num+IXAudnjJM5g9wGqLKuNKS+5s=,å§æäºº,å»ä¸å


In [None]:
# Song titles in row order; this is what recommendations are returned from.
titles = df_sampled['name']
# Reverse lookup: song title -> row label of df_sampled (0..n-1 once the
# index has been reset), used to address rows of the similarity matrix.
indices = pd.Series(df_sampled.index, index=titles)

In [None]:
indices.head()

name
æ¥åºå
å½©                         0
Sushi Battle                         1
BANG! [EDX's Ibiza Sunrise Remix]    2
Happy Endings                        3
å§æ
äºº                            4
dtype: int64

In [None]:
def get_recommendations(Name, top_n=30, sim_matrix=None, name_index=None, song_titles=None):
    """Return the titles of the songs most similar to the song called ``Name``.

    Parameters
    ----------
    Name : str
        Title of the query song; must be a label of the title lookup.
    top_n : int, default 30
        Maximum number of recommendations to return.
    sim_matrix : 2-D array, optional
        Pairwise similarity matrix. Defaults to the notebook-level
        ``cosine_sim``.
    name_index : pandas.Series, optional
        Maps a title to its row position. Defaults to the notebook-level
        ``indices``.
    song_titles : pandas.Series, optional
        Titles in row order. Defaults to the notebook-level ``titles``.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` titles ordered by decreasing similarity; the query
        row itself is never included.

    Raises
    ------
    KeyError
        If ``Name`` is not present in the title lookup.
    """
    # Fall back to the objects built in the earlier notebook cells.
    sim_matrix = cosine_sim if sim_matrix is None else sim_matrix
    name_index = indices if name_index is None else name_index
    song_titles = titles if song_titles is None else song_titles

    if Name not in name_index.index:
        raise KeyError(f"Song {Name!r} was not found in the title lookup")

    idx = name_index[Name]
    # Several songs can share one title, in which case the lookup returns a
    # Series of positions instead of a scalar; use the first match.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    scores = np.asarray(sim_matrix[idx]).ravel()
    # Stable descending sort: ties keep their original row order.
    order = np.argsort(-scores, kind='stable')
    # Drop the query row explicitly. Slicing off the first element is wrong
    # whenever another song ties with (or, through rounding, exceeds) the
    # query's self-similarity and therefore sorts ahead of it.
    order = order[order != idx][:top_n]
    return song_titles.iloc[order]

In [None]:
get_recommendations('Another Day In Paradise').head(5)

4672    No Way Out (Theme From Brother Bear)
5857                          One More Night
8316                 Another Day In Paradise
4680                               éè¨å¾
1544                              High Atlas
Name: name, dtype: object

In [None]:
get_recommendations('Picasso Baby').head(10)

127            Gimme What I Don't Know (I Want)
403                       Let the Groove Get In
6563    Carry Out (Featuring Justin Timberlake)
7498                            Not a Bad Thing
3212                                      Rehab
3571           Borderline (An Ode to Self Care)
3877                                   Spectrum
3385                                Movin' Bass
140                                  Over There
444                                     Opening
Name: name, dtype: object