# Feature Engineered Data - Musify, A Music Recommendation System

### W207 Applied Machine Learning

#### Dhruvi Kothari, Sandeep Kataria, Phillip Ng

In [1]:
import numpy as np
import pandas as pd
import random
import os
import math as mt
import csv
from langid.langid import LanguageIdentifier, model
from sklearn.preprocessing import LabelEncoder

from sklearn.pipeline import Pipeline
from sklearn import preprocessing

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Data Load
# Reads the train, test, songs and members CSV files from `folder` into DataFrames.

# Relative directory that every input and output file path below is built from.
folder = "../data/"
train = pd.read_csv(folder + 'train.csv')
test = pd.read_csv(folder + 'test.csv')
songs = pd.read_csv(folder + 'songs.csv')
#song_extra = pd.read_csv(folder + 'song_extra_info.csv')
# parse_dates asks pandas to parse the two membership date columns as datetimes on load.
members = pd.read_csv(folder + 'members.csv',parse_dates=['registration_init_time','expiration_date'])

## Songs Extra Info Feature Extraction (3 new features)
    1. song_country: Country where the song originated
    2. song_year: Year when the song was published
    3. song_language: Language of the song (extracted using a python package langid: https://pypi.python.org/pypi/langid)

In [3]:
# Read Songs Extra Features extracted in 'songs_extra.csv'

songs_extra = pd.read_csv(folder + 'songs_extra.csv')
# Left join: every row of `songs` is kept; extra columns are attached where song_id matches.
songs = songs.merge(songs_extra, how = 'left', on='song_id')
# Remove the 'language' column from the merged frame.
songs.drop(['language'], axis = 1, inplace = True)

## Songs Features Extraction (9 new features)
    1. genre_ids_len: Count of genres for each song, except when null
    2. artist_name_len: Count of artists for each song, except when null
    3. lyricist_len: Count of lyricists for each song, except when null
    4. composer_len: Count of composers for each song, except when null
    5. is_featured: Indicates whether the song features another artist
    6. song_member_count: Number of times a song was played by members
    7. member_song_count: Number of songs each member listened to
    8. artist_composer: Indicates whether the artist and composer are the same for each song
    9. artist_composer_lyricist: Indicates whether the artist, composer and lyricist are the same for each song

In [4]:
# Feature Extraction Functions

def isrc_to_year(item):
    """Derive a four-digit year from the two-digit year field of an ISRC code.

    The two characters at slice ``[5:7]`` are read as a two-digit year.
    Values above 17 are mapped to the 1900s, all other values to the 2000s.

    Args:
        item: ISRC code. Anything that is not a ``str`` is treated as missing.

    Returns:
        The year as an ``int``, or ``np.nan`` when ``item`` is not a string
        or its year field is not exactly two decimal digits.
    """
    if not isinstance(item, str):
        return np.nan
    year_field = item[5:7]
    # Short or malformed codes yield a missing value instead of a ValueError.
    if len(year_field) != 2 or not year_field.isdecimal():
        return np.nan
    two_digit_year = int(year_field)
    century = 1900 if two_digit_year > 17 else 2000
    return century + two_digit_year

def isrc_country(item):
    """Return the first two characters of an ISRC code.

    Args:
        item: ISRC code. Anything that is not a ``str`` is treated as missing.

    Returns:
        ``item[0:2]`` when ``item`` is a string, otherwise ``np.nan``.
    """
    if not isinstance(item, str):
        return np.nan
    return item[0:2]

def findfeat(songs):
    """Flag rows whose ``artist_name`` contains the substring ``'feat'``.

    The match is a case-sensitive literal substring test. A match at the very
    start of the name counts as featured, and missing names count as not
    featured.

    Args:
        songs: DataFrame with an ``artist_name`` column of strings.

    Returns:
        An int64 Series aligned with ``songs``: 1 where the name contains
        ``'feat'``, 0 otherwise.
    """
    has_feat = songs['artist_name'].str.contains('feat', regex=False, na=False)
    return has_feat.astype('int64')

def feature_count(songs, song_cols):
    """Add a ``<col>_len`` column holding the number of delimited entries per row.

    ``genre_ids`` is split on ``|`` only; every other column is split on runs
    of the separator characters in the pattern below. The input frame is not
    modified.

    Args:
        songs: DataFrame containing the string columns named in ``song_cols``.
        song_cols: Names of the columns to count entries in.

    Returns:
        A copy of ``songs`` with one extra ``<col>_len`` column per entry of
        ``song_cols``.
    """
    result = songs.copy()
    for name in song_cols:
        separator = r'|' if name == 'genre_ids' else r'[;|/\\,+、&]+'
        pieces = result[name].str.split(separator)
        result[name + '_len'] = pieces.apply(len)
    return result

def artist_composer_lyricist(songs):
    """Add int8 flags marking rows whose credit columns hold equal values.

    ``artist_composer`` is 1 where ``artist_name`` equals ``composer``.
    ``artist_composer_lyricist`` is 1 where ``artist_name``, ``composer`` and
    ``lyricist`` are all pairwise equal. The input frame is not modified.

    Args:
        songs: DataFrame with ``artist_name``, ``composer`` and ``lyricist``
            columns.

    Returns:
        A copy of ``songs`` with the two flag columns appended.
    """
    result = songs.copy()
    artist = songs['artist_name']
    composer = songs['composer']
    lyricist = songs['lyricist']

    artist_is_composer = artist == composer
    all_equal = artist_is_composer & (artist == lyricist) & (composer == lyricist)

    result['artist_composer'] = artist_is_composer.astype(np.int8)
    result['artist_composer_lyricist'] = all_equal.astype(np.int8)
    return result

identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
def find_lang(item):
    """Classify ``item`` with the module-level ``identifier``.

    Args:
        item: Text to classify; may be null.

    Returns:
        -1 when ``item`` is null, otherwise the first element of
        ``identifier.classify(item)`` (presumably the language code; confirm
        against the langid documentation).
    """
    if not pd.isnull(item):
        return identifier.classify(item)[0]
    return -1

def extract_datetime(members, date_cols):
    """Replace each date column with integer year, month and day columns.

    Each value is converted with ``str()`` and sliced at fixed positions
    (``[0:4]``, ``[5:7]``, ``[8:10]``), so values must render as
    ``YYYY-MM-DD...``. New columns are named from the first three characters
    of the source column plus ``_Year``, ``_Month`` or ``_Day``. The source
    column is then dropped. The input frame is not modified.

    Args:
        members: DataFrame containing the columns named in ``date_cols``.
        date_cols: Names of the date columns to expand.

    Returns:
        A copy of ``members`` with the date columns replaced by their parts.
    """
    # (column suffix, slice of the rendered date string) in output order.
    parts = (
        ('_Year', slice(0, 4)),
        ('_Month', slice(5, 7)),
        ('_Day', slice(8, 10)),
    )
    expanded = members.copy()
    for source in date_cols:
        prefix = source[:3]
        for suffix, span in parts:
            expanded[prefix + suffix] = expanded[source].apply(
                lambda value, span=span: int(str(value)[span])
            )
        expanded.drop(source, axis=1, inplace=True)
    return expanded

def apply_age(age):
    """Clamp an age to the range 0 to 80.

    Args:
        age: Numeric age value.

    Returns:
        0 for negative ages, 80 for ages of 80 or more, otherwise ``age``
        unchanged.
    """
    if age < 0:
        return 0
    return 80 if age >= 80 else age

def song_member_count(data):
    """Count the non-null ``msno`` values recorded for each ``song_id``.

    Args:
        data: DataFrame with ``song_id`` and ``msno`` columns.

    Returns:
        DataFrame with columns ``song_id`` and ``song_member_count``, one row
        per distinct ``song_id``.
    """
    per_song = data.groupby('song_id')['msno'].count()
    return per_song.reset_index(name='song_member_count')

def member_song_count(data):
    """Count the non-null ``song_id`` values recorded for each ``msno``.

    Args:
        data: DataFrame with ``msno`` and ``song_id`` columns.

    Returns:
        DataFrame with columns ``msno`` and ``member_song_count``, one row per
        distinct ``msno``.
    """
    per_member = data.groupby('msno')['song_id'].count()
    return per_member.reset_index(name='member_song_count')

## Members Features Extraction (7 new features)
    1. member_age: Member's age, limited to the range 0 to 80
    2. reg_Day: Day of the month when the user registered
    3. reg_Month: Month of the year when the user registered
    4. reg_Year: Year when the user registered
    5. exp_Day: Day of the month when the user's membership expired
    6. exp_Month: Month of the year when the user's membership expired
    7. exp_Year: Year when the user's membership expired

In [5]:
# Extract Member Features

#Replace Member's age
# apply_age is applied to every `bd` value; the result is stored as int8 in `member_age`.
members['member_age'] = members.bd.apply(apply_age).astype(np.int8)
# The raw `bd` column is removed once `member_age` exists.
members.drop(['bd'], axis=1, inplace=True)

#Extract date features
# extract_datetime adds <prefix>_Year/_Month/_Day columns and drops the source columns.
date_cols = ['registration_init_time','expiration_date']
members = extract_datetime(members, date_cols)

In [6]:
#Merge Train/Test with songs data

# Left joins keep every train/test row even when the song_id has no match in `songs`.
train_songs = train.merge(songs, on='song_id', how='left')
test_songs = test.merge(songs, on='song_id', how='left')

#Merge Train/Test with members data
# Same left-join strategy, keyed on the member id `msno`.
train_members = train_songs.merge(members, on='msno', how='left')
test_members = test_songs.merge(members, on='msno', how='left')

# Data Prep before feature extraction
# Every missing value (including those introduced by unmatched joins) becomes -1.
train_X = train_members.fillna(-1)
test_X = test_members.fillna(-1)

In [7]:
#Label Encode categorical/object data

# Every column except 'target' is a candidate for encoding.
cols = list(train_X.columns)
cols.remove('target')

for col in cols:
    # Only columns whose dtype in the training frame is object are encoded.
    if train_X[col].dtype == 'object':
        # Convert every value to str first (presumably so the -1 fill value and
        # the original strings share one type before encoding).
        train_X[col] = train_X[col].apply(str)
        test_X[col] = test_X[col].apply(str)
        le = LabelEncoder()
        train_vals = list(train_X[col].unique())
        test_vals = list(test_X[col].unique())
        # Fit on the values of both splits so transform() knows every label it will see.
        le.fit(train_vals + test_vals)
        train_X[col] = le.transform(train_X[col])
        test_X[col] = le.transform(test_X[col])

In [8]:
#Split features and target variables

# Training features are every column except 'target'; `y` is the target as an array.
X = train_X.drop(['target'], axis=1)
y = train_X['target'].values
# The test frame's 'id' column is removed from the features and kept separately in `ids`.
X_test = test_X.drop(['id'], axis=1)
ids = test_X['id'].values

In [9]:
#Prep songs data before feature extraction

def songs_data_prep(data, song_cols):
    """Replace missing values with ``''`` and cast the given columns to ``str``.

    The input frame is not modified.

    Args:
        data: DataFrame containing the columns named in ``song_cols``.
        song_cols: Names of the columns to clean.

    Returns:
        A copy of ``data`` in which every column of ``song_cols`` holds
        strings, with missing values represented by the empty string.
    """
    df = data.copy()
    for col in song_cols:
        # Assign the filled column back explicitly. An in-place fillna on the
        # `df[col]` selection does not reach `df` under pandas Copy-on-Write,
        # which would turn NaN into the literal string 'nan' after astype(str).
        df[col] = df[col].fillna('').astype(str)
    return df

# Columns whose delimited entries are counted and compared below.
song_cols = ['genre_ids', 'artist_name', 'composer', 'lyricist']

# Fill missing values with '' and cast to str so the string operations below apply.
train_songs = songs_data_prep(train_songs, song_cols)
test_songs = songs_data_prep(test_songs, song_cols)

# Count features extraction
train_songs = feature_count(train_songs, song_cols)
test_songs = feature_count(test_songs, song_cols)

# Songs with similar artists
train_songs = artist_composer_lyricist(train_songs)
test_songs = artist_composer_lyricist(test_songs)

# Song featuring artists
train_songs['is_featured'] = findfeat(train_songs)
test_songs['is_featured'] = findfeat(test_songs)

# Merge Test Train for songs and members count features
# sort=False: the two frames do not share all columns, and the column order of
# the combined frame is irrelevant for the group-by counts computed from it.
merged_train_test = pd.concat((train_songs, test_songs), sort=False)

# Build each count table once and reuse it for both splits.
song_counts = song_member_count(merged_train_test)
member_counts = member_song_count(merged_train_test)

#Counts of songs played by members
train_songs = train_songs.merge(song_counts, on='song_id', how='left')
test_songs = test_songs.merge(song_counts, on='song_id', how='left')

#Counts of songs each member played
train_songs = train_songs.merge(member_counts, on='msno', how='left')
test_songs = test_songs.merge(member_counts, on='msno', how='left')

#Replace irrelevant records that dont exist in training set
test_songs['source_screen_name'] = test_songs['source_screen_name'].replace(
    ['People local', 'People global'], np.nan)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [10]:
#Create dataframe with new song features

newfeatcol = ['genre_ids_len', 'artist_name_len', 'composer_len', 'lyricist_len', 'is_featured',
              'artist_composer', 'artist_composer_lyricist', 'song_member_count', 'member_song_count']

#Fill null values
# fillna() returns new frames, so nothing is written in place into a selection
# of train_songs/test_songs (the in-place form raises SettingWithCopyWarning).
train_newfeat = train_songs[newfeatcol].fillna(-1)
test_newfeat = test_songs[newfeatcol].fillna(-1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  **kwargs


## User Context Feature Extraction (46 new features)
    1. source_system_tab: 10 user - source_system_tab context features
    2. source_screen_name: 23 user - source_screen_name context features
    3. source_type: 13 user - source_type context features

In [13]:
#Extract user-context features
# For each member, compute the share of their distinct (member, song) rows that
# falls into each value of the three source columns, then attach those shares
# to every train/test row of that member.

#merge train and test raw features
merged = pd.concat((train[['msno', 'song_id','source_type','source_screen_name','source_system_tab']],
                  test[['msno', 'song_id','source_type','source_screen_name','source_system_tab']]))

#drop duplicates
# Keep only the first row seen for each (msno, song_id) pair.
merged.drop_duplicates(subset=['msno', 'song_id'], keep="first", inplace=True)

#Calculate new features
newfeat = None
groups = merged.groupby('msno')

for feature in ['source_type','source_screen_name','source_system_tab']:
    # normalize=True yields per-member proportions; dropna=False keeps a bucket for missing values.
    valuecount = groups[feature].value_counts(normalize=True,dropna=False)
    # Pivot the category level into columns named <feature><category>.
    valuecount = valuecount.unstack(level=-1).add_prefix(feature)
    if newfeat is None:
        newfeat = valuecount
    else:
        newfeat=newfeat.join(valuecount,how='left')

# Turn the msno index back into a column so it can be used as a merge key.
newfeat.reset_index(inplace=True)
train_context = train[['msno']].merge(newfeat,on='msno',how='left')
test_context = test[['msno']].merge(newfeat, on='msno', how='left')

#Drop msno column from train and test
train_context.drop('msno',axis=1,inplace=True)
test_context.drop('msno',axis=1,inplace=True)

# Fill nulls before feature extraction
# Categories a member never used are absent after unstack; treat them as a share of 0.
train_context.fillna(0,inplace=True)
test_context.fillna(0,inplace=True)

# Merged dataframe with features
# Column-wise concatenation aligns on the row index of the three frames.
X_merged = pd.concat((pd.DataFrame(X), train_newfeat, train_context), axis=1)
X_test_merged = pd.concat((pd.DataFrame(X_test), test_newfeat, test_context), axis=1)

In [14]:
# Preview the first rows of the combined training feature frame.
X_merged.head()

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,song_length,genre_ids,artist_name,composer,lyricist,...,source_screen_nameUnknown,source_system_tabnan,source_system_tabdiscover,source_system_tabexplore,source_system_tablisten with,source_system_tabmy library,source_system_tabnotification,source_system_tabradio,source_system_tabsearch,source_system_tabsettings
0,9176,86884,2,8,7,206471.0,308,3785,16654,48,...,0.0,0.0,0.000146,0.168302,0.004834,0.166252,0.0,0.443826,0.21664,0.0
1,19273,260594,4,9,5,284584.0,98,36868,71,48,...,0.0,0.0,0.093023,0.0,0.0,0.847384,0.0,0.0,0.059593,0.0
2,19273,140755,4,9,5,225396.0,98,24602,51541,48,...,0.0,0.0,0.093023,0.0,0.0,0.847384,0.0,0.0,0.059593,0.0
3,19273,27577,4,9,5,255512.0,7,31652,41992,48,...,0.0,0.0,0.093023,0.0,0.0,0.847384,0.0,0.0,0.059593,0.0
4,9176,38706,2,8,7,187802.0,3,5191,9702,48,...,0.0,0.0,0.000146,0.168302,0.004834,0.166252,0.0,0.443826,0.21664,0.0


In [16]:
# Write the combined training features to Features.csv in `folder`, without the row index.
X_merged.to_csv(folder+'Features.csv', index = False)

In [19]:
# Wrap the target array in a single-column DataFrame named 'target'.
y = pd.DataFrame(y, columns = ['target'])

In [20]:
# Preview the first rows of the target frame.
y.head()

Unnamed: 0,target
0,1
1,1
2,1
3,1
4,1


In [21]:
# Write the targets to targets.csv in `folder`, without the row index.
y.to_csv(folder+'targets.csv', index = False)

In [22]:
# Preview the combined training feature frame again after the exports.
X_merged.head()

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,song_length,genre_ids,artist_name,composer,lyricist,...,source_screen_nameUnknown,source_system_tabnan,source_system_tabdiscover,source_system_tabexplore,source_system_tablisten with,source_system_tabmy library,source_system_tabnotification,source_system_tabradio,source_system_tabsearch,source_system_tabsettings
0,9176,86884,2,8,7,206471.0,308,3785,16654,48,...,0.0,0.0,0.000146,0.168302,0.004834,0.166252,0.0,0.443826,0.21664,0.0
1,19273,260594,4,9,5,284584.0,98,36868,71,48,...,0.0,0.0,0.093023,0.0,0.0,0.847384,0.0,0.0,0.059593,0.0
2,19273,140755,4,9,5,225396.0,98,24602,51541,48,...,0.0,0.0,0.093023,0.0,0.0,0.847384,0.0,0.0,0.059593,0.0
3,19273,27577,4,9,5,255512.0,7,31652,41992,48,...,0.0,0.0,0.093023,0.0,0.0,0.847384,0.0,0.0,0.059593,0.0
4,9176,38706,2,8,7,187802.0,3,5191,9702,48,...,0.0,0.0,0.000146,0.168302,0.004834,0.166252,0.0,0.443826,0.21664,0.0


References: 
https://www.researchgate.net/publication/328838360_KKbox's_Music_Recommendation_Challenge_Solution_with_Feature_engineering_11th_ACM_International_Conference_on_Web_Search_and_Data_Mining_WSDM_2018_February_5-9_2018_Los_Angeles_California_USA_WSDM_Cup