#  Project Name: Music Recommendation System


###   1.Introduction
###   2.Libraries
###   3.File Path
###   4.Preprocessing
###   5.EDA
###   6.Modeling



# 1) Introduction

**THE AIM OF THE PROJECT:**

Is to predict the chances of a user listening to a song repetitively after the first observable listening event within a time window was triggered. If there are recurring listening event(s) triggered within a month after the user’s very first observable listening event, its target is marked 1, and 0 otherwise

# 2) Library

In [2]:
import pandas as pd
from pandas_profiling import ProfileReport
import numpy as np

from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report

import matplotlib.pyplot as plt
import seaborn as sns

import warnings; 
warnings.filterwarnings("always")
warnings.filterwarnings("ignore")

# 3) File Path

In [3]:
train = pd.read_csv('train.csv', nrows = 100000)
songs = pd.read_csv('songs.csv')
members = pd.read_csv('members.csv')
song_extra_info = pd.read_csv('song_extra_info.csv')
test = pd.read_csv('test.csv')

#sample_submission = pd.read_csv('sample_submission.csv')
#merging 
train_s = pd.merge(train, songs,on='song_id',how='left')
train_extra = pd.merge(train_s,song_extra_info, on='song_id', how= 'left')

merged_df = pd.merge(train_extra, members, on='msno', how='left')

del train,songs, members, song_extra_info, train_s,train_extra

# 4) Preprocessing

In [4]:
merged_df

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,target,song_length,genre_ids,artist_name,composer,lyricist,language,name,isrc,city,bd,gender,registered_via,registration_init_time,expiration_date
0,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,explore,Explore,online-playlist,1,206471.0,359,Bastille,Dan Smith| Mark Crew,,52.0,Good Grief,GBUM71602854,1,0,,7,20120102,20171005
1,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=,my library,Local playlist more,local-playlist,1,284584.0,1259,Various Artists,,,52.0,Lords of Cardboard,US3C69910183,13,24,female,9,20110525,20170911
2,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=,my library,Local playlist more,local-playlist,1,225396.0,1259,Nas,N. Jones、W. Adams、J. Lordan、D. Ingle,,52.0,Hip Hop Is Dead(Album Version (Edited)),USUM70618761,13,24,female,9,20110525,20170911
3,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=,my library,Local playlist more,local-playlist,1,255512.0,1019,Soundway,Kwadwo Donkoh,,-1.0,Disco Africa,GBUQH1000063,13,24,female,9,20110525,20170911
4,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=,explore,Explore,online-playlist,1,187802.0,1011,Brett Young,Brett Young| Kelly Archer| Justin Ebach,,52.0,Sleep Without You,QM3E21606003,1,0,,7,20120102,20171005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,EedNarOVNUU5ppYW/ho+KV8V29zuf7m3+Df5qnQ9FT4=,c7bvJHblBfQGFJG+DcltwclubmlYvIRTT964Lu4WWXI=,my library,Local playlist more,local-playlist,1,236982.0,465,2NE1,,,31.0,I Love You,TWA531200396,14,0,,9,20150321,20170923
99996,xyxz9lf8ipoxELeDLlazmHVZHE1ZYqE6iwfJWkH3Ojc=,MZTwICqr12ijCeBaWeKVJZ67Bz7NI0WA4bqB1IwR4jI=,my library,Local playlist more,local-library,0,228241.0,465,One Direction,Wayne Hector| John Ryan| Julian Bunetta| Ed Dr...,Wayne Hector| John Ryan| Julian Bunetta| Ed Dr...,52.0,Steal My Girl,GBHMU1400159,1,0,,7,20120117,20171004
99997,vDi/nHqBu7wb+DtI2Ix4TupWQatUEFR41mDC0c8Voh8=,xK0clTH8TvjG6sKmJXZuMLzhikhlEsolLtx2AHBpmbQ=,my library,Local playlist more,local-library,1,196812.0,465,Michael Giacchino,Mikkel Eriksen|Sia Furler|Tor Erik Hermansen,,52.0,Try Everything,USWD11575402,13,0,,9,20110322,20180403
99998,xyxz9lf8ipoxELeDLlazmHVZHE1ZYqE6iwfJWkH3Ojc=,SoPkcfpiBjDpZSNPN3HdbPI8WAkuenayPXyBRK91T2s=,my library,Local playlist more,local-library,0,215775.0,465,Katy Perry,Katy Perry| Max Martin| Jordan Houston| Lukasz...,,52.0,Dark Horse,USUM71311296,1,0,,7,20120117,20171004


In [5]:
merged_df['genre_ids'] = merged_df['genre_ids'].fillna('unknown').str.split('|').apply(lambda x: ','.join(map(str, x)))
merged_df['artist_name'] = merged_df['artist_name'].map(lambda x: x.lower() if isinstance(x, str) else 'unknown')
merged_df['composer'] = merged_df['composer'].map(lambda x: x.lower() if isinstance(x, str) else 'unknown')
merged_df['lyricist'] = merged_df['lyricist'].map(lambda x: x.lower() if isinstance(x, str) else 'unknown')
merged_df['song_length'] = merged_df['song_length'].fillna(0).astype(int)
merged_df['language'] = merged_df['language'].fillna(-1).astype(int)

In [6]:
for i in merged_df.select_dtypes(include=['object']).columns:
    merged_df[i][merged_df[i].isnull()] = 'unknown'
merged_df = merged_df.fillna(value=0)

In [7]:
merged_df.isnull().sum()

msno                      0
song_id                   0
source_system_tab         0
source_screen_name        0
source_type               0
target                    0
song_length               0
genre_ids                 0
artist_name               0
composer                  0
lyricist                  0
language                  0
name                      0
isrc                      0
city                      0
bd                        0
gender                    0
registered_via            0
registration_init_time    0
expiration_date           0
dtype: int64

In [8]:
#profile = ProfileReport(merged_df, title="Cars Listing Report")
#profile

In [9]:
merged_df['registration_init_time'] = pd.to_datetime(merged_df['registration_init_time'])
merged_df['registration_init_timeByMonth'] = merged_df['registration_init_time'].dt.month
merged_df['registration_init_timeByYear'] = merged_df['registration_init_time'].dt.year

merged_df['expiration_date'] = pd.to_datetime(merged_df['expiration_date'])
merged_df['expiration_dateByMonth'] = merged_df['expiration_date'].dt.month
merged_df['expiration_dateByYear'] = merged_df['expiration_date'].dt.year

# Encoding 


In [10]:
merged_df.dropna(axis=0, subset=['target'], inplace=True)
y = merged_df['target']
X = merged_df.drop(['target','registration_init_time','expiration_date'],axis=1)

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.25, random_state=4)

for colname in X.select_dtypes("object"):
    X_train[colname], _ = X_train[colname].factorize()
    X_valid[colname], _ = X_valid[colname].factorize()


In [11]:
corr_matrix = merged_df.corr()
corr_values = corr_matrix['target']
print(corr_values)


target                           1.000000
song_length                     -0.000529
language                        -0.015208
city                             0.010351
bd                              -0.008666
registered_via                  -0.007355
registration_init_timeByMonth         NaN
registration_init_timeByYear          NaN
expiration_dateByMonth                NaN
expiration_dateByYear                 NaN
Name: target, dtype: float64


In [12]:
merged_df

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,target,song_length,genre_ids,artist_name,composer,...,city,bd,gender,registered_via,registration_init_time,expiration_date,registration_init_timeByMonth,registration_init_timeByYear,expiration_dateByMonth,expiration_dateByYear
0,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,explore,Explore,online-playlist,1,206471,359,bastille,dan smith| mark crew,...,1,0,unknown,7,1970-01-01 00:00:00.020120102,1970-01-01 00:00:00.020171005,1,1970,1,1970
1,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=,my library,Local playlist more,local-playlist,1,284584,1259,various artists,unknown,...,13,24,female,9,1970-01-01 00:00:00.020110525,1970-01-01 00:00:00.020170911,1,1970,1,1970
2,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=,my library,Local playlist more,local-playlist,1,225396,1259,nas,n. jones、w. adams、j. lordan、d. ingle,...,13,24,female,9,1970-01-01 00:00:00.020110525,1970-01-01 00:00:00.020170911,1,1970,1,1970
3,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=,my library,Local playlist more,local-playlist,1,255512,1019,soundway,kwadwo donkoh,...,13,24,female,9,1970-01-01 00:00:00.020110525,1970-01-01 00:00:00.020170911,1,1970,1,1970
4,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=,explore,Explore,online-playlist,1,187802,1011,brett young,brett young| kelly archer| justin ebach,...,1,0,unknown,7,1970-01-01 00:00:00.020120102,1970-01-01 00:00:00.020171005,1,1970,1,1970
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,EedNarOVNUU5ppYW/ho+KV8V29zuf7m3+Df5qnQ9FT4=,c7bvJHblBfQGFJG+DcltwclubmlYvIRTT964Lu4WWXI=,my library,Local playlist more,local-playlist,1,236982,465,2ne1,unknown,...,14,0,unknown,9,1970-01-01 00:00:00.020150321,1970-01-01 00:00:00.020170923,1,1970,1,1970
99996,xyxz9lf8ipoxELeDLlazmHVZHE1ZYqE6iwfJWkH3Ojc=,MZTwICqr12ijCeBaWeKVJZ67Bz7NI0WA4bqB1IwR4jI=,my library,Local playlist more,local-library,0,228241,465,one direction,wayne hector| john ryan| julian bunetta| ed dr...,...,1,0,unknown,7,1970-01-01 00:00:00.020120117,1970-01-01 00:00:00.020171004,1,1970,1,1970
99997,vDi/nHqBu7wb+DtI2Ix4TupWQatUEFR41mDC0c8Voh8=,xK0clTH8TvjG6sKmJXZuMLzhikhlEsolLtx2AHBpmbQ=,my library,Local playlist more,local-library,1,196812,465,michael giacchino,mikkel eriksen|sia furler|tor erik hermansen,...,13,0,unknown,9,1970-01-01 00:00:00.020110322,1970-01-01 00:00:00.020180403,1,1970,1,1970
99998,xyxz9lf8ipoxELeDLlazmHVZHE1ZYqE6iwfJWkH3Ojc=,SoPkcfpiBjDpZSNPN3HdbPI8WAkuenayPXyBRK91T2s=,my library,Local playlist more,local-library,0,215775,465,katy perry,katy perry| max martin| jordan houston| lukasz...,...,1,0,unknown,7,1970-01-01 00:00:00.020120117,1970-01-01 00:00:00.020171004,1,1970,1,1970


In [None]:
plt.figure(figsize=[20,10])
sns.heatmap(df.corr(), annot=True)
plt.show()

# Modeling

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# create SVM classifier object
svm_classifier = SVC(kernel='linear')

# fit the model on training data
svm_classifier.fit(X_train, y_train)

# predict on validation data
y_pred = svm_classifier.predict(X_valid)

# calculate classification report
report = classification_report(y_valid, y_pred)

# print the report
print(report)
