## Music Recommendation System (Data Processing and Analysis)

### Framing the Problem

This project aims to build a music recommendation system that suggests music to users based on their taste, inferred by analysing their previously heard music and playlists. Two approaches are used: 'User-to-User Recommendation' and 'Item-to-Item Recommendation'. The Birch, MiniBatchKMeans and KMeans algorithms are used, along with the 'Surprise' module, to compute the similarity between the recommendations and the user's existing playlist for evaluation.

### Obtaining Data

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import os
import missingno as ms
%matplotlib inline

In [9]:
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 2000)

In [13]:
curr_dir = os.path.join(os.getcwd(),"datasets","raw","fma_metadata")
print(curr_dir)

/home/gokulkurup/root/MusicRecomendationSystem/datasets/raw


In [14]:
echonest = pd.read_csv(os.path.join(curr_dir,'echonest.csv'))
features = pd.read_csv(os.path.join(curr_dir,'features.csv'))
genres = pd.read_csv(os.path.join(curr_dir,'genres.csv'))
tracks = pd.read_csv(os.path.join(curr_dir,'tracks.csv'))

FileNotFoundError: [Errno 2] No such file or directory: '/home/gokulkurup/root/MusicRecomendationSystem/datasets/raw/echonest.csv'

### Working with 'Echonest' dataset

#### Analysing Data

In [2]:
echonest.info()

NameError: name 'echonest' is not defined

In [None]:
features.info()

In [None]:
genres.info()

In [None]:
tracks.info()

In [None]:
echonest.head(10)

#### Feature Engineering

In [None]:
ms.matrix(echonest)

In [None]:
echonest.drop(['echonest.8', 'echonest.9', 'echonest.15', 'echonest.16', 'echonest.17', 'echonest.18', 'echonest.19'], axis=1, inplace=True)

In [None]:
echonest.tail(15)

In [None]:
ms.matrix(echonest.iloc[:, 0:15])

In [None]:
echonest.drop(['echonest.10', 'echonest.11', 'echonest.12'], axis=1, inplace=True)

In [None]:
ms.matrix(echonest)

In [None]:
echonest.head(10)

In [None]:
echonest.drop(0, axis=0, inplace=True)

In [None]:
echonest.iloc[0, 0]

In [None]:
echonest.iloc[1, 0]

In [None]:
echonest.iloc[0, 0] = echonest.iloc[1, 0]

In [None]:
echonest.head()

In [None]:
echonest.drop(2, axis=0, inplace=True)

In [None]:
echonest.columns = echonest.iloc[0]

In [None]:
echonest.head()

In [None]:
echonest.drop(1, axis=0, inplace=True)

In [None]:
echonest.head()

In [None]:
echonest.reset_index(inplace=True)

In [None]:
echonest.drop('index', inplace=True, axis=1)

In [None]:
echonest.head()

In [None]:
type(echonest['acousticness'][0])

In [None]:
def convert_to_float(df, columns):
    """Cast the given columns of ``df`` to float, modifying ``df`` directly.

    Args:
        df: DataFrame whose columns are converted.
        columns: Iterable of column labels to cast with ``astype('float')``.

    Returns:
        The same ``df`` object that was passed in.
    """
    for column_name in columns:
        as_float = df[column_name].astype('float')
        df[column_name] = as_float
    return df

In [None]:
echonest = convert_to_float(echonest, set(echonest.columns) - set(['track_id', 'artist_name', 'release']))

In [None]:
echonest.head()

In [None]:
echonest.info()

### Working with 'Features' dataset

#### Analysing Data

In [None]:
features.info()

In [None]:
features.head(10)

In [None]:
ms.matrix(features.iloc[:, 21:40])

#### Feature Engineering

In [None]:
features.iloc[0,0] = features.iloc[2, 0]

In [None]:
features.head(3)

In [None]:
features.drop(2, inplace=True)

In [None]:
len(features.columns)

In [None]:
len(features.iloc[0])

In [None]:
def combine_two_rows(df):
    """Build new column names by joining each header with the value in row 0.

    Each returned name is ``"<current column name> <value at row 0>"``. The
    number of columns is taken from ``df`` itself, so the function works for
    frames of any width. ``df`` is not modified.

    Args:
        df: DataFrame whose column labels and first-row values are strings.

    Returns:
        A list of combined names, one per column, in column order.

    Raises:
        IndexError: If ``df`` has columns but no rows.
        TypeError: If a label or first-row value is not a string.
    """
    return [
        name + " " + df.iloc[0, position]
        for position, name in enumerate(df.columns)
    ]

In [None]:
features.columns = combine_two_rows(features)

In [None]:
features.drop([0, 1], inplace=True)

In [None]:
features.reset_index(inplace=True)

In [None]:
features.drop('index', axis=1, inplace=True)

In [None]:
features.head()

In [None]:
features = features.astype(dtype='float')
features['feature track_id'] = features['feature track_id'].astype('int')

In [None]:
ms.matrix(features)

In [None]:
features.head(3)

### Working with 'Tracks' dataset

#### Analysing Data

In [None]:
tracks.info()

In [None]:
tracks.head()

In [None]:
tracks.iloc[0,0] = tracks.iloc[1, 0]

In [None]:
tracks.drop(1, axis=0, inplace=True)

In [None]:
tracks.head()

#### Feature Engineering

In [None]:
len(tracks.columns)

In [None]:
def combine_one_row(df):
    """Build new column names from the current headers and the values in row 0.

    The first column takes the row-0 value on its own; every other column
    becomes ``"<current column name> <value at row 0>"``. The number of
    columns is taken from ``df`` itself, not a fixed count. ``df`` is not
    modified.

    Args:
        df: DataFrame whose column labels and first-row values are strings.

    Returns:
        A list of new column names, one per column, in column order.

    Raises:
        IndexError: If ``df`` has columns but no rows.
        TypeError: If a combined label or first-row value is not a string.
    """
    renamed = []
    for position, name in enumerate(df.columns):
        cell = df.iloc[0, position]
        # The first column's header is replaced outright by the row value.
        renamed.append(cell if position == 0 else name + " " + cell)
    return renamed

In [None]:
tracks.columns = combine_one_row(tracks)

In [None]:
tracks.drop(0, inplace=True)

In [None]:
tracks.reset_index(inplace=True)

In [None]:
tracks.drop(['index'], axis=1, inplace=True)

In [None]:
ms.matrix(tracks.iloc[0: 10])

In [None]:
tracks.head()

In [None]:
tracks['track.7 genre_top'].value_counts()

In [None]:
track_title = pd.DataFrame(tracks['track.19 title'])

In [None]:
track_title['track_id'] = tracks['track_id']

In [None]:
track_title.head()

In [None]:
track_title.tail()

In [None]:
track_title.shape

In [None]:
tracks.drop(['album comments','album.1 date_created', 
             'album.2 date_released', 'album.11 tracks', 
             'album.9 tags', 'album.8 producer', 'album.3 engineer', 'album.6 information',
             'artist active_year_begin', 'artist.1 active_year_end', 'artist.2 associated_labels',
             'artist.3 bio','artist.4 comments','artist.5 date_created', 'artist.7 id',
             'artist.8 latitude','artist.9 location','artist.10 longitude', 'artist.11 members',
             'artist.13 related_projects', 'artist.14 tags','artist.15 website','artist.16 wikipedia_page',
             'set.1 subset', 'track.1 comments', 'track.2 composer', 'track.3 date_created', 'track.4 date_recorded',
             'track.10 information', 'track.13 license', 'track.15 lyricist', 'track.17 publisher', 'track.18 tags',
             'track.19 title'], axis=1, inplace=True)

In [None]:
tracks.info()

In [None]:
ms.matrix(tracks)

In [None]:
tracks['album.12 type'].value_counts()

In [None]:
tracks['album.10 title'].value_counts()

In [None]:
# Forward-fill missing album titles. The result is assigned back to the column:
# an in-place fillna on a selected column can update a temporary copy, and the
# `method=` argument of fillna is deprecated in favour of ffill().
tracks['album.10 title'] = tracks['album.10 title'].ffill()

In [None]:
tracks.drop(['track.12 language_code', 'album.12 type'], axis=1, inplace=True)

In [None]:
tracks.drop('track.9 genres_all', axis=1, inplace=True)

In [None]:
ms.matrix(tracks)

In [None]:
tracks['track.8 genres'].unique()

In [None]:
genres.info()

In [None]:
type(tracks['track.7 genre_top'].iloc[27])

In [None]:
def getList(cd):
    """Split a bracketed, comma-separated string into its raw pieces.

    The first and last characters of ``cd`` are discarded and the rest is
    split on ``','``. Pieces are not stripped, so ``"[1, 2]"`` gives
    ``['1', ' 2']``, and ``"[]"`` gives ``['']`` (one empty piece).

    Args:
        cd: String such as ``"[21, 10]"``.

    Returns:
        List of substrings between the commas.
    """
    without_brackets = cd[1:-1]
    pieces = without_brackets.split(',')
    return pieces

In [None]:
# Give every track without a top-level genre a '|'-terminated label built from
# genre titles. Missing entries presumably show up as float NaN, hence the
# float check.
# NOTE(review): the label is built from the first `count` rows of
# genres['title'] (j is just 0..count-1), not from the ids parsed out of
# 'track.8 genres' -- this looks unintended; confirm against the genres schema.
for i in range(len(tracks)):
    if isinstance(tracks.loc[i, 'track.7 genre_top'], float):
        genre_list = getList(str(tracks.loc[i, 'track.8 genres']))
        count = len(genre_list)
        title = ""
        for j in range(0, count):
            title = title + str(genres['title'][j]) + str('|')
        # One .loc assignment: chained indexing (df[col][i] = ...) may write
        # to a temporary copy and leave `tracks` unchanged.
        tracks.loc[i, 'track.7 genre_top'] = title

### Working with 'Genre' dataset

#### Analysing Data

In [None]:
genres.info()

In [None]:
ms.matrix(genres)

In [None]:
genres.head()

#### Feature Engineering

Nothing to engineer!

### Combining all datasets into a single entity

#### Analysing Data

In [None]:
echonest.info()

In [None]:
tracks.info()

In [None]:
tracks.head()

In [None]:
echonest.head()

In [None]:
genres.info()

In [None]:
features.info()

#### Feature Engineering

In [None]:
features.columns = ['track_id'] + list(features.columns[1:])

In [None]:
features.head()

In [None]:
type(echonest['track_id'].iloc[0])

In [None]:
echonest['track_id'] = echonest['track_id'].astype('int')
tracks['track_id'] = tracks['track_id'].astype('int')

In [None]:
features.sort_values(by='track_id', inplace=True)
tracks.sort_values(by='track_id', inplace=True)
echonest.sort_values(by='track_id', inplace=True)

In [None]:
features.head()

In [None]:
tracks.head()

In [None]:
# Sanity check: compare track_id in `features` and `tracks` for every row
# label, printing each pair that differs. The row count comes from `features`
# instead of a hard-coded constant.
count = 0
for i in range(len(features)):
    if features['track_id'][i] == tracks['track_id'][i]:
        count += 1
    else:
        print(features['track_id'][i], tracks['track_id'][i])

In [None]:
final = pd.concat([features, tracks.drop('track_id', axis=1)], axis=1)

In [None]:
final.shape

In [None]:
final.head()

In [None]:
echonest.tail(3)

In [None]:
echonest.drop(['artist_name', 'release'], axis=1, inplace=True)

In [None]:
tracks.tail(3)

In [None]:
features.head(1)

In [None]:
final = echonest.merge(final, on='track_id')

In [None]:
final.shape

In [None]:
ms.matrix(final)

### Analysing Data

In [None]:
final.head()

In [None]:
final.shape

In [None]:
final.info()

In [None]:
final.drop('track.8 genres', axis=1, inplace=True)

In [None]:
final.shape

In [None]:
final.head()

In [None]:
final['track.7 genre_top'].value_counts()

### Feature Engineering

In [None]:
def format_strings(x):
    """Normalise a genre label by removing hyphens or converting slashes.

    If ``x`` contains ``'-'``, every hyphen is removed and the result is
    returned as is (slashes are left untouched in that case). Otherwise every
    ``'/'`` is replaced by ``'|'``. Labels with neither character are
    returned unchanged.

    Args:
        x: Genre label string.

    Returns:
        The normalised label.
    """
    if '-' in x:
        return x.replace('-', '')
    # Membership test, not str.find(): find() returns -1 (truthy) when the
    # character is absent and 0 (falsy) when it is the first character.
    if '/' in x:
        return x.replace('/', '|')
    return x

In [None]:
def modifyString(serie, val):
    """Rewrite 'Old-Time / Historic' entries among the first ``val`` keys.

    Entries are looked up as ``serie[0]`` .. ``serie[val - 1]`` and replaced
    in place when they equal the old label.

    Args:
        serie: Indexable, mutable container of labels (e.g. a pandas Series).
        val: Number of leading integer keys to inspect.

    Returns:
        The same ``serie`` object, after modification.
    """
    old_label = 'Old-Time / Historic'
    new_label = 'OldTime|Historic'
    position = 0
    while position < val:
        if serie[position] == old_label:
            serie[position] = new_label
        position += 1
    return serie

In [None]:
final['track.7 genre_top'] = modifyString(final['track.7 genre_top'], 13129)

In [None]:
final['track.7 genre_top'] = final['track.7 genre_top'].apply(format_strings)

In [None]:
final['track.7 genre_top'].value_counts()

In [None]:
final.head()

In [None]:
metadata = pd.DataFrame()

In [None]:
metadata['track_id'] = final['track_id']

In [None]:
metadata.shape

In [None]:
track_title.shape

In [None]:
track_title = track_title.set_index('track_id')

In [None]:
track_title.head()

In [None]:
track_title.index = [int(i) for i in track_title.index]

In [None]:
track_title.head()

In [None]:
metadata.head()

In [None]:
metadata['album_title'] = final['album.10 title']

In [None]:
metadata['artist_name'] = final['artist.12 name']

In [None]:
metadata['genre'] = final['track.7 genre_top']

In [None]:
metadata = metadata.set_index('track_id')

In [None]:
metadata.tail()

In [None]:
metadata.head()

In [None]:
metadata['track_title'] = track_title.loc[metadata.index]['track.19 title']

In [None]:
metadata.tail()

In [None]:
metadata.head()

In [None]:
len(metadata[metadata['genre'].isnull()])

In [None]:
final.drop('album.10 title', axis=1, inplace=True)

In [None]:
final.head()

In [None]:
final.info()

In [None]:
final.drop('artist.12 name', axis=1, inplace=True)

In [None]:
final.info()

In [None]:
final.head()

In [None]:
# Restore point # Removed Label Encoding
# Take a real copy: a plain `k = final` only aliases the same DataFrame, so
# the in-place drops that follow would change the "restore point" too.
k = final.copy()

In [None]:
final.head()

In [None]:
final.drop('set split', axis=1, inplace=True)

In [None]:
final.info()

In [None]:
final.info()

In [None]:
genres['title'].count()

In [None]:
# Zero-filled indicator table: one column per distinct genre title and one row
# per row of `final`. Both sizes are derived from the data, not hard-coded.
genre_titles = list(genres['title'].unique())
genre_dummy = pd.DataFrame(data=np.zeros((len(final), len(genre_titles))), columns=genre_titles)

In [None]:
genre_dummy.head()

In [None]:
genre_list = pd.Series(data= genre_dummy.columns)

In [None]:
genre_list = modifyString(genre_list, 163)

In [None]:
genre_list = genre_list.apply(format_strings)

In [None]:
genre_dummy.columns= genre_list

In [None]:
# columns converted successfully

In [None]:
genre_list = list(genre_list)

In [None]:
final

In [None]:
# Set the indicator cells of `genre_dummy` from each row's genre label in
# `final`. The row count 13129 is hard-coded.
# NOTE(review): `final[...][i]` is a label lookup while `genre_dummy.iloc[i, ...]`
# is positional -- this assumes `final` has a default 0..n-1 index; confirm.
for i in range(0, 13129):
    if '|' in final['track.7 genre_top'][i]:
        # Multi-genre label: flag every '|'-separated part that is a known
        # column; parts not present in genre_list are skipped.
        divided_list = str(final['track.7 genre_top'][i]).split('|')
        count = len(divided_list)
        for j in range(0, count):
            if divided_list[j] in genre_list:
                location = genre_list.index(divided_list[j])
                genre_dummy.iloc[i, location] = 1
    else:
        # Single-genre label: no membership guard here, so list.index raises
        # ValueError when the label is not in genre_list.
        location = genre_list.index(final['track.7 genre_top'][i])
        genre_dummy.iloc[i, location] = 1

In [None]:
genre_list.index(final['track.7 genre_top'][0])

In [None]:
final.drop(['track.7 genre_top'], axis= 1, inplace= True)

In [None]:
final = pd.concat([final, genre_dummy], axis= 1)

In [None]:
final.head()

### Writing final data to .csv files

In [None]:
import os

# Write both result tables under datasets/final. exist_ok=True creates the
# directory only when it is missing, without a separate isdir check.
output_dir = os.path.join('datasets', 'final')
os.makedirs(output_dir, exist_ok=True)

metadata.to_csv(os.path.join(output_dir, 'metadata.csv'))
final.to_csv(os.path.join(output_dir, 'final.csv'))