In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [4]:
song_data = pd.read_json('song.json')
song_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 4000 non-null   object
 1   user_id            4000 non-null   int64 
 2   user_state         4000 non-null   object
 3   user_sign_up_date  4000 non-null   object
 4   song_played        4000 non-null   object
 5   time_played        4000 non-null   object
dtypes: int64(1), object(5)
memory usage: 187.6+ KB


In [6]:
song_data.head(n=4)

Unnamed: 0,id,user_id,user_state,user_sign_up_date,song_played,time_played
0,GOQMMKSQQH,122,Louisiana,2015-05-16,Hey Jude,2015-06-11 21:51:35
1,HWKKBQKNWI,3,Ohio,2015-05-01,We Can Work It Out,2015-06-06 16:49:19
2,DKQSXVNJDH,35,New Jersey,2015-05-04,Back In the U.S.S.R.,2015-06-14 02:11:29
3,HLHRIDQTUW,126,Illinois,2015-05-16,P.s. I Love You,2015-06-08 12:26:10


In [7]:
song_data['user_sign_up_date'] = pd.to_datetime(song_data['user_sign_up_date'])
song_data.time_played = pd.to_datetime(song_data.time_played)

In [9]:
song_data.describe(include = 'all', datetime_is_numeric=True)

Unnamed: 0,id,user_id,user_state,user_sign_up_date,song_played,time_played
count,4000,4000.0,4000,4000,4000,4000
unique,4000,,41,,97,
top,MEXSUYUFRT,,New York,,Come Together,
freq,1,,469,,408,
mean,,101.574,,2015-05-11 17:06:00,,2015-06-15 00:48:44.929500160
min,,1.0,,2015-05-01 00:00:00,,2015-06-01 05:02:54
25%,,48.0,,2015-05-06 00:00:00,,2015-06-07 20:02:02
50%,,102.0,,2015-05-12 00:00:00,,2015-06-14 20:04:12.500000
75%,,155.0,,2015-05-19 00:00:00,,2015-06-21 21:36:30.249999872
max,,200.0,,2015-05-20 00:00:00,,2015-06-28 23:46:06


The company CEO asked you for very specific questions:

- What are the top 3 and the bottom 3 states in terms number of users?


- What are the top 3 and the bottom 3 states in terms of user engagement? You can choose how to mathematically define user engagement. What the CEO cares about here is in which states users are using the product a lot/very little.


- The CEO wants to send a gift to the first user who signed-up for each state. That is, the first user who signed-up from California, from Oregon, etc. Can you give him a list of those users?


- Build a function that takes as an input any of the songs in the data and returns the most likely song to be listened next.
That is, if, for instance, a user is currently listening to “Eight Days A Week“, which song has the highest probability of being played right after it by the same user? This is going to be V1 of a song recommendation model.


- How would you set up a test to check whether your model works well?



### What are the top 3 and the bottom 3 states in terms number of users?

In [47]:
g = song_data.groupby('user_state').user_id.agg(lambda x: len(np.unique(x)))
g = g.sort_values(ascending = False)

In [53]:
#top states in terms of user sign up
g[:5]

user_state
New York        23
California      21
Texas           15
Pennsylvania     9
Ohio             9
Name: user_id, dtype: int64

In [28]:
g.iloc[:3, :]['user_state']

0         Kansas
1    Connecticut
2     New Mexico
Name: user_state, dtype: object

In [34]:
g.iloc[-4:, :]

Unnamed: 0,user_state,0
37,Virginia,17
38,New Mexico,17
39,Connecticut,16
40,Kansas,8


In [52]:
#bottom states in terms of user sign up
g[-10:]

user_state
Alaska          2
North Dakota    1
Kansas          1
Iowa            1
Rhode Island    1
Nebraska        1
Idaho           1
Connecticut     1
New Mexico      1
Arizona         1
Name: user_id, dtype: int64

### What are the top 3 and the bottom 3 states in terms of user engagement? You can choose how to mathematically define user engagement. What the CEO cares about here is in which states users are using the product a lot/very little.


User engagement can be defined as no of time a song is played by the user or in terms of time played

In [76]:
g = song_data.groupby(['user_state', 'user_id']).time_played.agg(['min', 'max'])
g.drop(['level_0', 'index'], axis = 1, inplace = True)
g['duration_played'] = g['max']-g['min']

In [84]:
f = g.groupby('user_state').duration_played.sum()/g.groupby('user_state').duration_played.count()
f = f.reset_index()
f = f.sort_values(by='duration_played', ascending = False)

In [99]:
f[:3], f[-3:]

(      user_state  duration_played
 27  North Dakota 27 days 09:12:40
 24    New Mexico 27 days 07:53:28
 2        Arizona 27 days 06:18:28,
        user_state  duration_played
 39  West Virginia 17 days 06:48:28
 19      Minnesota 13 days 10:20:07
 37       Virginia  8 days 23:59:20)

second definition can be on an average how many songs are played in every state

We can extend it to on an average how many songs are played in an hour in each state.

In [112]:
freq_df = song_data.groupby(['user_state', 'song_played']).time_played.count().reset_index()

In [127]:
f = freq_df.groupby('user_state').time_played.sum()/freq_df.groupby('user_state').time_played.count()
f = f.reset_index()
f = f.sort_values(by = 'time_played', ascending = False)

In [128]:
f.head()

Unnamed: 0,user_state,time_played
25,New York,6.424658
4,California,5.985915
28,Ohio,3.8
7,Florida,3.673469
35,Texas,3.538462


#### Or, we can define user engagement by how many songs are played on an average in an hour/minute/day

In [181]:
#extension
freq_df_ext = song_data.groupby(['user_state', 'song_played']).time_played.agg(['min', 'max', 'count']).reset_index()
freq_df_ext['duration_day'] = (freq_df_ext['max']-freq_df_ext['min'])
freq_df_ext['duration_hr'] = (1/3600)*(freq_df_ext['duration_day']/(np.timedelta64(1, 's')))
freq_df_ext['average_duration'] = freq_df_ext['duration_hr']/freq_df_ext['count']

In [182]:
av_event_hr = freq_df_ext.groupby('user_state')['count'].sum()/freq_df_ext.groupby('user_state')['duration_hr'].sum()
av_event_hr = av_event_hr.reset_index()
av_event_hr.rename(columns= {0: 'average_event'}, inplace = True)
av_event_hr = av_event_hr.sort_values(by = 'average_event', ascending= False)

In [183]:
av_event_hr.head(n=3), av_event_hr.tail(n=3)

(     user_state  average_event
 13       Kansas       0.037549
 37     Virginia       0.025662
 6   Connecticut       0.024049,
     user_state  average_event
 38  Washington       0.011611
 14    Kentucky       0.011499
 30      Oregon       0.011333)

### 3. The CEO wants to send a gift to the first user who signed-up for each state. That is, the first user who signed-up from California, from Oregon, etc. Can you give him a list of those users?

In [186]:
sign_up = song_data.groupby('user_state').user_sign_up_date.min().reset_index()
sign_up.head(n=5)

Unnamed: 0,user_state,user_sign_up_date
0,Alabama,2015-05-01
1,Alaska,2015-05-12
2,Arizona,2015-05-12
3,Arkansas,2015-05-08
4,California,2015-05-04


In [249]:
def find_first_signup(df):
    idx = df.user_sign_up_date.argmin()
    return df.iloc[idx,1:4]

first_users = song_data.groupby("user_state").apply(find_first_signup)
first_users.sort_values(by="user_sign_up_date").head()

Unnamed: 0_level_0,user_id,user_state,user_sign_up_date
user_state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Alabama,5,Alabama,2015-05-01
Texas,7,Texas,2015-05-01
Oregon,1,Oregon,2015-05-01
Ohio,3,Ohio,2015-05-01
North Carolina,2,North Carolina,2015-05-01


In [None]:
#alternate solution

df = song_data.groupby(['user_state', 'user_id']).user_sign_up_date.min().reset_index
df.sort_values(by = ['user_state', 'user_sign_up_date'], ascending=[True, True])

ids = []
for state in list(df.user_state.unique()):
    df_state = df.loc[df.user_state==state]
    idx = df_state.user_sign_up_date.argmin()
    ids.append(df_state.iloc[idx,:2])

### Recommendation System
Build a function that takes as an input any of the songs in the data and returns the most likely song to be listened next.
That is, if, for instance, a user is currently listening to “Eight Days A Week“, which song has the highest probability of being played right after it by the same user? This is going to be V1 of a song recommendation model.

one approach can be that; make a group of users who have listened to a particular song, say song A
in that group find out what all other songs are popular
So, if a user listens to song A, we can recommend the most popular song from that group.

This appraoch is called "Similarity Matrix" and quite popular in building recommendation systems

In [None]:
# pivot_table = song_data.pivot(index = 'song_played', column = 'user_id', value = ) 

First thought comes to make a pivot table (for all songs, played by users) but that will pose a problem because, it may be that a user may have played a specific song multiple times.

For that, we will use group by and then make a pivot table

In [253]:
df = song_data.groupby(['song_played', 'user_id']).time_played.count().reset_index()
df = df.sort_values(by=['song_played','time_played'], ascending = [True, False], ignore_index=True)
df = df.pivot_table(index = 'song_played', columns = 'user_id', values = 'time_played', fill_value=0)

In [269]:
df.head()

user_id,1,2,3,4,5,6,7,8,9,10,...,191,192,193,194,195,196,197,198,199,200
song_played,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A Day In The Life,0,0,1,3,0,2,0,0,0,0,...,0,0,3,3,0,2,0,0,2,0
A Hard Day's Night,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
A Saturday Club Xmas/Crimble Medley,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ANYTIME AT ALL,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Across The Universe,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [281]:
(df.loc['A Day In The Life', :]!=0).sum()

104

Now, take an example 'A Day In The Life' : 104 users listened to this song, now we need to have a list of all the songs listened by these 104 users in order of popularity (most listened song by this group to least listened song by this group) so that our recommender system can recommend the next song to a user who is listening to 'A Day In The Life'


We can do this by first normalising our song and transforming it to song-song table (matrix multiplication will do the transformation) Intuitively if we multiply song-user table to user-song table it will give us the trannsformed values of a song to song similarity based on user scores.

In [319]:
from sklearn.preprocessing import normalize
df_norm = normalize(df, axis = 1)
song_sim = pd.DataFrame(df_norm.dot(df_norm.T), index = df.index.values, columns = df.index.values)

In [320]:
song_sim.head()

Unnamed: 0,A Day In The Life,A Hard Day's Night,A Saturday Club Xmas/Crimble Medley,ANYTIME AT ALL,Across The Universe,All My Loving,All You Need Is Love,And Your Bird Can Sing,BAD BOY,Baby You're A Rich Man,...,We Can Work It Out,When I'm 64,While My Guitar Gently Weeps,Wild Honey Pie,With a Little Help From My Friends,YOUR MOTHER SHOULD KNOW,Yellow Submarine,Yesterday,You Never Give Me Your Money,You're Going To Lose That Girl
A Day In The Life,1.0,0.246021,0.067651,0.108483,0.192538,0.34176,0.322896,0.138092,0.223237,0.175762,...,0.462712,0.055237,0.509397,0.225503,0.427027,0.033826,0.331593,0.339179,0.079727,0.0
A Hard Day's Night,0.246021,1.0,0.0,0.0,0.1,0.136931,0.111803,0.0,0.0,0.091287,...,0.259548,0.129099,0.210099,0.0,0.112987,0.0,0.05,0.195468,0.074536,0.0
A Saturday Club Xmas/Crimble Medley,0.067651,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.109435,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0
ANYTIME AT ALL,0.108483,0.0,0.0,1.0,0.0,0.154303,0.094491,0.109109,0.0,0.0,...,0.116991,0.0,0.138107,0.089087,0.190982,0.0,0.0,0.146845,0.0,0.0
Across The Universe,0.192538,0.1,0.0,0.0,1.0,0.091287,0.0,0.0,0.0,0.0,...,0.138426,0.0,0.116722,0.0,0.075324,0.0,0.0,0.043437,0.0,0.0


In [321]:
g = song_sim.iloc[0, :].sort_values(ascending = False)

In [322]:
g[1:2].index.values

array(['Revolution'], dtype=object)

In [353]:
#function for finding next song

def song_reco(song, top_n= 2):
    g = song_sim.loc[song, :].sort_values(ascending = False)
    g = g[1:top_n+1].index.values
    return pd.Series(g, index = ['next_song#{}'.format(i+1) for i in range(top_n)])

In [323]:
song_sim.loc['Revolution', :].sort_values(ascending = False)

Revolution                             1.000000
Come Together                          0.680807
Get Back                               0.660374
While My Guitar Gently Weeps           0.632051
Back In the U.S.S.R.                   0.617417
                                         ...   
A Saturday Club Xmas/Crimble Medley    0.066716
LITTLE CHILD                           0.062901
GIRL                                   0.047175
THE CONTINUING STORY OF BUNG           0.044477
You're Going To Lose That Girl         0.044477
Name: Revolution, Length: 97, dtype: float64

In [356]:
g = song_reco('Revolution', 5)
g

next_song#1                   Come Together
next_song#2                        Get Back
next_song#3    While My Guitar Gently Weeps
next_song#4            Back In the U.S.S.R.
next_song#5               A Day In The Life
dtype: object

#### How would you set up a test to check whether your model works well?

We need to perform a A/B test:

- randomly split users into two groups, one Control group and one Experiment group
    - Control group has no recommendation strategy
    - Experiment group recommend the next song
    - after running some time, perform a one-tailed t-test on 'average #play per hour'
- $H_0$: population 'average #play per hour' is same in two groups
- $H_a$: experiment group's population 'average #play per hour' is higher than control group's