In [212]:
import warnings
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import auc, roc_curve, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn import tree

import h2o
from h2o.frame import H2OFrame
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.grid.grid_search import H2OGridSearch
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
from __future__ import print_function
import graphviz
from collections import Counter
from sklearn.preprocessing import normalize

# Load Dataset

In [2]:
data = pd.read_json('../DS_Challenges/Song_Challenge/song_challenge.json')
data.head()

Unnamed: 0,id,user_id,user_state,user_sign_up_date,song_played,time_played
0,GOQMMKSQQH,122,Louisiana,2015-05-16,Hey Jude,2015-06-11 21:51:35
1,HWKKBQKNWI,3,Ohio,2015-05-01,We Can Work It Out,2015-06-06 16:49:19
2,DKQSXVNJDH,35,New Jersey,2015-05-04,Back In the U.S.S.R.,2015-06-14 02:11:29
3,HLHRIDQTUW,126,Illinois,2015-05-16,P.s. I Love You,2015-06-08 12:26:10
4,SUKJCSBCYW,6,New Jersey,2015-05-01,Sgt. Pepper's Lonely Hearts Club Band,2015-06-28 14:57:00


In [43]:
data['time_played'] = pd.to_datetime(data['time_played'])
data['user_sign_up_date'] = pd.to_datetime(data['user_sign_up_date'])

In [107]:
data = data.set_index('id')

In [44]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   id                 4000 non-null   object        
 1   user_id            4000 non-null   int64         
 2   user_state         4000 non-null   object        
 3   user_sign_up_date  4000 non-null   datetime64[ns]
 4   song_played        4000 non-null   object        
 5   time_played        4000 non-null   datetime64[ns]
dtypes: datetime64[ns](2), int64(1), object(3)
memory usage: 187.6+ KB


In [4]:
len(data['user_id'].unique())

196

In [5]:
len(data['id'].unique())

4000

# Question 1

- What are the top 3 and the bottom 3 states in terms of number of users?

- Answer: 

    **Top 3:** New York, California, Texas
    
    **Bottom 3:** 'New Mexico', 'Idaho', 'North Dakota', 'Connecticut', 'Iowa', 'Rhode Island', 'Nebraska', 'Arizona', 'Kansas'

In [6]:
# top3
state_user_count = data.groupby('user_state')['user_id'].apply(lambda x:x.value_counts()).\
    reset_index().\
    rename(columns = {'user_id':'count'}).\
    rename(columns={'level_1':'user_id'}).\
    groupby('user_state').user_id.apply(lambda x:len(x)).\
    reset_index().\
    sort_values('user_id',ascending = False).\
    rename(columns = {'user_id':'number of users'})
state_user_count.head(3)

Unnamed: 0,user_state,number of users
25,New York,23
4,California,21
35,Texas,15


In [7]:
# bottom 3
state_user_count.tail(9)

Unnamed: 0,user_state,number of users
24,New Mexico,1
9,Idaho,1
27,North Dakota,1
6,Connecticut,1
12,Iowa,1
32,Rhode Island,1
22,Nebraska,1
2,Arizona,1
13,Kansas,1


# Question 2
- What are the top 3 and the bottom 3 states in terms of user engagement?

- Answer: 

    **Top 3**: Nebraska, Alaska, Mississippi
    
    **Bottom 3**: Minnesota, Virginia, Kansas	

Here, we define the user engagement is measured throught the average plays per user in the given state.

Of course, there are other definitions, such as "average play event per hour" (see [this link](https://github.com/stasi009/TakeHomeDataChallenges/blob/master/08.SongChallenge/song_challenge.ipynb) for details).

In [8]:
# create temporary dataframe for play count per state
state_play_count = data.groupby(['user_state','user_id']).agg({'song_played':'count'}).\
    reset_index().\
    groupby('user_state')['song_played'].\
    sum().\
    reset_index()

# merge user count and play count
state_user_play = state_play_count.merge(state_user_count, on = 'user_state')

# calculate average play
state_user_play['average_play'] = state_user_play['song_played']/state_user_play['number of users']

# sort according to average_play
state_user_play = state_user_play.sort_values(by = 'average_play',ascending=False)

In [9]:
state_user_play.head(3)

Unnamed: 0,user_state,song_played,number of users,average_play
22,Nebraska,36,1,36.0
1,Alaska,58,2,29.0
20,Mississippi,85,3,28.333333


In [10]:
# bottom 3
state_user_play.tail(3)

Unnamed: 0,user_state,song_played,number of users,average_play
19,Minnesota,42,4,10.5
37,Virginia,17,2,8.5
13,Kansas,8,1,8.0


# Question 3

- Give him a list of first-sign-up users in each state.

In [121]:
usersignup = data.groupby('user_state').agg({'user_sign_up_date':'min'}).reset_index()
userfirstsign = data[['user_state','user_id','user_sign_up_date']].\
                merge(usersignup,on=['user_state','user_sign_up_date'],how = 'right').\
                drop_duplicates()

In [124]:
userfirstsign.sort_values(by = 'user_sign_up_date',ascending=True)

Unnamed: 0,user_state,user_id,user_sign_up_date
0,Alabama,5,2015-05-01
902,Texas,7,2015-05-01
814,Oregon,1,2015-05-01
773,Ohio,3,2015-05-01
729,North Carolina,2,2015-05-01
607,New Jersey,6,2015-05-01
633,New Mexico,4,2015-05-01
477,Michigan,13,2015-05-02
496,Minnesota,8,2015-05-02
498,Minnesota,21,2015-05-02


# Question 4

- Build a function that takes as an input any of the songs in the data and returns the most likely song to be listened next 
- *(Song Recommendation Model)*

There should be a lot of different methods for this question. The simplest method is to choose the most popular songs. In addition, k-nearest neighbor (KNN) method can also be used. More advancely, collaborative filtering method can be implemented.

Here, I implement a simple version of `collaborative filtering` algorithm for song recommendation. More specifically, the similarity of two songs is calculate using the number of users whole listen them together.

In [130]:
data = data.reset_index()

### method 1: choose the most popular songs

In [168]:
## but this doesn't work well, since almost every user has read 'Come Together' before, 
## so this book has been served as the recommended book for many books
def song_recommendation(data, songname):
    """ function to recommend a song in terms of an input song name """
    tmp = data.loc[data['song_played']==songname]
    tmp = data.loc[data['song_played']==songname]
    userid = tmp['user_id'].to_list()
    song_recommend = data.loc[data['user_id'].\
                     isin(userid)].\
                     groupby('song_played')['id'].\
                     count().reset_index().\
                     sort_values(by = 'id',ascending = False).\
                     head(1)['song_played'].values[0]
    return song_recommend

In [170]:
song_recommendation(data,'ANYTIME AT ALL')

'Come Together'

In [173]:
print('input song\trecommened song')
print("*****************")
for song in data['song_played'].unique():
    recommend = song_recommendation(data,song)
    print(f'{song}\t{recommend}')

input song	recommened song
*****************
Hey Jude	Come Together
We Can Work It Out	Come Together
Back In the U.S.S.R.	Come Together
P.s. I Love You	Come Together
Sgt. Pepper's Lonely Hearts Club Band	Come Together
Sgt. Pepper Inner Groove	Revolution
Hello Goodbye	Come Together
Cry For A Shadow	Come Together
Revolution	Come Together
Let It Be	Come Together
I Feel Fine	Come Together
The Fool On The Hill	Come Together
Get Back	Come Together
Come Together	Come Together
She Loves You	Revolution
While My Guitar Gently Weeps	Come Together
Here Comes The Sun	Come Together
A Day In The Life	Come Together
Getting Better	Come Together
Baby You're A Rich Man	Come Together
The Ballad Of John And Yoko	Come Together
Lucy In The Sky With Diamonds	Come Together
Don't Let Me Down	Come Together
Reprise / Day in the Life	Come Together
Maxwell's Silver Hammer	Come Together
Across The Universe	Revolution
Ob-la-di, Ob-la-da	Come Together
Yesterday	Come Together
Fixing A Hole	Come Together
OH DARLING	Come

### method 2: collaborative filtering algorithm

In [207]:
# step 1: build the song-user matrix

def count_by_song(df):
    """ All data in df comes from the same song """
    return pd.Series(Counter(df.user_id))

song_user = data.groupby(['song_played']).apply(count_by_song).unstack().fillna(0).astype(int)

# each row is a song
# each column represents a user
# [i,j] represents number of times user 'j' plays song 'i'

song_user.head()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,191,192,193,194,195,196,197,198,199,200
song_played,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A Day In The Life,0,0,1,3,0,2,0,0,0,0,...,0,0,3,3,0,2,0,0,2,0
A Hard Day's Night,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
A Saturday Club Xmas/Crimble Medley,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ANYTIME AT ALL,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Across The Universe,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [218]:
# Step 2: build song-song similarity matrix

# normalize each song's vector to have unit norm
# which simplifies 'cosine similarity' to dot-product of two vectors
song_user_norm = normalize(song_user, axis=1)

# songs_similarity is a [S,S] matrix, where 'S' is #songs
# the higher songs_similarity[i,j] indicates the more similar between song[i] and song[j]
similarity = np.dot(song_user_norm,song_user_norm.T)

# transform np.ndarray to pd.DataFrame
similarity_df = pd.DataFrame(similarity, index=song_user.index, columns=song_user.index)
similarity_df

song_played,A Day In The Life,A Hard Day's Night,A Saturday Club Xmas/Crimble Medley,ANYTIME AT ALL,Across The Universe,All My Loving,All You Need Is Love,And Your Bird Can Sing,BAD BOY,Baby You're A Rich Man,...,We Can Work It Out,When I'm 64,While My Guitar Gently Weeps,Wild Honey Pie,With a Little Help From My Friends,YOUR MOTHER SHOULD KNOW,Yellow Submarine,Yesterday,You Never Give Me Your Money,You're Going To Lose That Girl
song_played,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A Day In The Life,1.000000,0.246021,0.067651,0.108483,0.192538,0.341760,0.322896,0.138092,0.223237,0.175762,...,0.462712,0.055237,0.509397,0.225503,0.427027,0.033826,0.331593,0.339179,0.079727,0.0
A Hard Day's Night,0.246021,1.000000,0.000000,0.000000,0.100000,0.136931,0.111803,0.000000,0.000000,0.091287,...,0.259548,0.129099,0.210099,0.000000,0.112987,0.000000,0.050000,0.195468,0.074536,0.0
A Saturday Club Xmas/Crimble Medley,0.067651,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.109435,0.000000,0.000000,0.166667,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
ANYTIME AT ALL,0.108483,0.000000,0.000000,1.000000,0.000000,0.154303,0.094491,0.109109,0.000000,0.000000,...,0.116991,0.000000,0.138107,0.089087,0.190982,0.000000,0.000000,0.146845,0.000000,0.0
Across The Universe,0.192538,0.100000,0.000000,0.000000,1.000000,0.091287,0.000000,0.000000,0.000000,0.000000,...,0.138426,0.000000,0.116722,0.000000,0.075324,0.000000,0.000000,0.043437,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
YOUR MOTHER SHOULD KNOW,0.033826,0.000000,0.000000,0.000000,0.000000,0.144338,0.088388,0.102062,0.000000,0.000000,...,0.164153,0.204124,0.092277,0.000000,0.208422,1.000000,0.079057,0.068680,0.000000,0.0
Yellow Submarine,0.331593,0.050000,0.000000,0.000000,0.000000,0.273861,0.111803,0.258199,0.000000,0.182574,...,0.224942,0.000000,0.326821,0.158114,0.244804,0.079057,1.000000,0.195468,0.074536,0.0
Yesterday,0.339179,0.195468,0.000000,0.146845,0.043437,0.178437,0.291386,0.112154,0.064752,0.000000,...,0.413382,0.168232,0.436026,0.274721,0.417165,0.068680,0.195468,1.000000,0.259010,0.0
You Never Give Me Your Money,0.079727,0.074536,0.000000,0.000000,0.000000,0.068041,0.166667,0.000000,0.111111,0.000000,...,0.180559,0.000000,0.260998,0.078567,0.168430,0.000000,0.074536,0.259010,1.000000,0.0


In [219]:
similarity_df.loc['A Day In The Life'].sort_values(ascending = False)[1:2].reset_index()

Unnamed: 0,song_played,A Day In The Life
0,Revolution,0.594266


In [223]:
# Step 3: find the top-k most similar songs

def find_topk(similarity, song_name, k=1):
    tmp = similarity.loc[song_name].sort_values(ascending = False)[1:k+1].reset_index()
    tmp = tmp.rename(columns = {'song_played':'song_recommended','A Day In The Life':'similarity'})
    return tmp

In [226]:
# an example of k=10, song_name = 'A Day In The Life'
find_topk(similarity = similarity_df, song_name = 'A Day In The Life', k=10)

Unnamed: 0,song_recommended,similarity
0,Revolution,0.594266
1,Come Together,0.576682
2,Get Back,0.552443
3,While My Guitar Gently Weeps,0.509397
4,Back In the U.S.S.R.,0.497328
5,Let It Be,0.496184
6,Here Comes The Sun,0.493598
7,Hello Goodbye,0.491431
8,Hey Jude,0.490221
9,Lucy In The Sky With Diamonds,0.485512


In [253]:
# step 4 find top K most similar of each song
def most_similar_songs(s,topk):
    # [0] must be itself
    similar_ones = s.sort_values(ascending=False)[1:topk+1].index.values
    return pd.Series(similar_ones,index = ["similar#{}".format(i) for i in range(1,topk+1)])

pd.set_option('display.max_rows',100)
similarity_df.apply(most_similar_songs,topk=2,axis=1)


Unnamed: 0_level_0,similar#1,similar#2
song_played,Unnamed: 1_level_1,Unnamed: 2_level_1
A Day In The Life,Revolution,Come Together
A Hard Day's Night,Come Together,Let It Be
A Saturday Club Xmas/Crimble Medley,GIRL,IT WON'T BE LONG
ANYTIME AT ALL,Can't Buy Me Love,Come Together
Across The Universe,Revolution,Birthday
All My Loving,Let It Be,Hey Jude
All You Need Is Love,A Day In The Life,While My Guitar Gently Weeps
And Your Bird Can Sing,All My Loving,IN MY LIFE
BAD BOY,Hey Jude,OH DARLING
Baby You're A Rich Man,Back In the U.S.S.R.,Come Together


# Question 5

- How would you set up a test to check whether your model works well and is improving engagement?

**Answer:**

We need to perform a A/B test ([reference](https://github.com/stasi009/TakeHomeDataChallenges/blob/master/08.SongChallenge/song_challenge.ipynb)):

* randomly split users into two groups, one Control group and one Experiment group
* Control group has no recommendation strategy
* Experiment group recommend the next song
* after running some time, perform a one-tailed t-test on 'average #play per user'
    - $H_0$: population 'average #play per user' is same in two groups
    - $H_a$: experiment group's population 'average #play per user' is higher than control group's