In [96]:
# import essential libraries
import os

import numpy as np
import pandas as pd
import scipy
import scipy.sparse

In [97]:
# Root directory of the input data. It defaults to the original location but can
# be overridden with the DANGAL_DATA_DIR environment variable, so the notebook
# is not tied to a single machine. os.path.join(..., "") guarantees a trailing
# separator, because later cells build file names by plain string concatenation.
input_path = os.path.join(os.environ.get("DANGAL_DATA_DIR", "/home/mahendra/data/dangal/"), "")

In [98]:
# Load the raw event log. Fields are separated by the \x01 control character
# and the column names are supplied explicitly through names=.
# NOTE(review): missing values show up as the literal string "\N" in this dump
# and are read as ordinary strings here - confirm that is intended.
event_columns = ['uid', 'model', 'ev', 'songid', 'top_src', 'top_src_id', 'top_src_type', 'city', 'ts']
events_file = input_path + "uid_events/be3293dc83fa4da4b356c1c01f0cfa91_000000"
uid_events = pd.read_csv(events_file, sep="\x01", names=event_columns)

In [99]:
# Show the first three rows to sanity-check that the columns were parsed as expected.
uid_events.head(3)

Unnamed: 0,uid,model,ev,songid,top_src,top_src_id,top_src_type,city,ts
0,COVG+iS5EmM/rVVq+va4Go/PimSbhDZ1wscgTM48OzI=,redmi 4,home:ui::view,\N,\N,\N,\N,lucknow,1507094456
1,COVG+iS5EmM/rVVq+va4Go/PimSbhDZ1wscgTM48OzI=,redmi 4,language_select:onboarding::unclick,\N,\N,\N,\N,lucknow,1507094461
2,COVG+iS5EmM/rVVq+va4Go/PimSbhDZ1wscgTM48OzI=,redmi 4,language_select:onboarding::click,\N,\N,\N,\N,lucknow,1507094465


In [100]:
# Print the row count, per-column dtypes and memory usage of the raw events frame.
uid_events.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20275013 entries, 0 to 20275012
Data columns (total 9 columns):
uid             object
model           object
ev              object
songid          object
top_src         object
top_src_id      object
top_src_type    object
city            object
ts              int64
dtypes: int64(1), object(8)
memory usage: 1.4+ GB


To prepare the data for matrix factorization, we need the event count for each (uid, city) pair. We get it by grouping on these two columns and then converting the counts to a sparse matrix via a pivot table.

In [101]:
# Group the events by (uid, city); the size of each group is the number of
# events recorded for that uid in that city.
uid_city_grp = uid_events.groupby(['uid', 'city'])
# print() is written as a function call so the cell runs on Python 2 and 3 alike.
print(uid_city_grp.size().head(5))

uid                                           city      
++6hguLjcB3jtJhQWvIL0X4n29dJkmlqdfOpeJ7MCL8=  ranchi         285
++TCdWzyV4m4OkSkmQgGSF/jwEX3cBBq+D+oRAzRSq8=  pitampura     1347
++fTW0762GVu9KcW49AvOmgUmLxE1kg8UzytJ193ALI=  bara bazar      23
                                              barakpur       297
                                              garui          468
dtype: int64


In [102]:
# Flatten the per-(uid, city) group sizes into a frame with the columns
# uid, city and count.
# The counts are recomputed from uid_events rather than from the previous value
# of uid_city_grp, so re-running this cell does not fail once uid_city_grp has
# already been replaced by the resulting DataFrame.
uid_city_grp = (
    uid_events
    .groupby(['uid', 'city'])
    .size()
    .reset_index(name='count')
)
# info() prints its own report and returns None, hence the "None" line in the output.
print(uid_city_grp.info())
print(uid_city_grp.head(3))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30230 entries, 0 to 30229
Data columns (total 3 columns):
uid      30230 non-null object
city     30230 non-null object
count    30230 non-null int64
dtypes: int64(1), object(2)
memory usage: 708.6+ KB
None
                                            uid        city  count
0  ++6hguLjcB3jtJhQWvIL0X4n29dJkmlqdfOpeJ7MCL8=      ranchi    285
1  ++TCdWzyV4m4OkSkmQgGSF/jwEX3cBBq+D+oRAzRSq8=   pitampura   1347
2  ++fTW0762GVu9KcW49AvOmgUmLxE1kg8UzytJ193ALI=  bara bazar     23


In [104]:
# Optional: persist the (uid, city, count) rows as a headerless, tab-separated
# file. The destination is derived from input_path instead of repeating the
# absolute data directory.
uid_city_grp.to_csv(input_path + 'feat/uid_city_count', sep='\t', header=False, index=False)

We use the implicit library to do matrix factorization on a local machine. If the data is bigger (say, more than 50K unique uids or cities), it is better to do it in Spark. You can have a look at this example: https://spark.apache.org/docs/2.2.0/mllib-collaborative-filtering.html

In [105]:
import implicit

In [112]:
# Re-inspect the count frame and report how many distinct values each column has
# (number of unique uids, cities and count values).
print(uid_city_grp.info())
print(uid_city_grp.head(3))
print(uid_city_grp.nunique())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30230 entries, 0 to 30229
Data columns (total 3 columns):
uid      30230 non-null object
city     30230 non-null object
count    30230 non-null int64
dtypes: int64(1), object(2)
memory usage: 708.6+ KB
None
                                            uid        city  count
0  ++6hguLjcB3jtJhQWvIL0X4n29dJkmlqdfOpeJ7MCL8=      ranchi    285
1  ++TCdWzyV4m4OkSkmQgGSF/jwEX3cBBq+D+oRAzRSq8=   pitampura   1347
2  ++fTW0762GVu9KcW49AvOmgUmLxE1kg8UzytJ193ALI=  bara bazar     23
uid      9997
city     2380
count    3764
dtype: int64


^ We have 9997 unique uids and 2380 unique cities. Now let's use pivot_table to convert it into sparse matrix form, where every row corresponds to a uid and every column represents a city.

In [114]:
# Pivot the (uid, city, count) rows into a wide table: one row per uid, one
# column per city, cell value = event count. (uid, city) pairs that never
# occurred come out as NaN and are replaced with 0. reset_index() turns uid
# back into an ordinary first column.
# The steps are chained instead of mutating with inplace=True.
uid_city_table = (
    pd.pivot_table(uid_city_grp, values='count', index=['uid'], columns=['city'], aggfunc=np.sum)
    .fillna(0)
    .reset_index()
)
# print() is written as a function call so the cell runs on Python 2 and 3 alike.
print(uid_city_table.head(3))
print(uid_city_table.info())

city                                           uid   \N  abdullapur  abhaneri  \
0     ++6hguLjcB3jtJhQWvIL0X4n29dJkmlqdfOpeJ7MCL8=  0.0         0.0       0.0   
1     ++TCdWzyV4m4OkSkmQgGSF/jwEX3cBBq+D+oRAzRSq8=  0.0         0.0       0.0   
2     ++fTW0762GVu9KcW49AvOmgUmLxE1kg8UzytJ193ALI=  0.0         0.0       0.0   

city  abhayapuri  abohar  abu  abu road  achalpur  achhnera  ...   yarada  \
0            0.0     0.0  0.0       0.0       0.0       0.0  ...      0.0   
1            0.0     0.0  0.0       0.0       0.0       0.0  ...      0.0   
2            0.0     0.0  0.0       0.0       0.0       0.0  ...      0.0   

city  yaval  yavatmal  yelahanka  yelandur  yeola  zahirabad  zaidpur  \
0       0.0       0.0        0.0       0.0    0.0        0.0      0.0   
1       0.0       0.0        0.0       0.0    0.0        0.0      0.0   
2       0.0       0.0        0.0       0.0    0.0        0.0      0.0   

city  zamania  zira  
0         0.0   0.0  
1         0.0   0.0  
2      

Let's remove the uid column and convert this to a matrix. Quoting directly from implicit's documentation:
"Parameters:	item_users (csr_matrix) – Matrix of confidences for the liked items. This matrix should be a csr_matrix where the rows of the matrix are the item, the columns are the users that liked that item, and the value is the confidence that the user liked the item."
Because implicit's ALS implementation expects items (i.e. cities in our case) to be represented as rows and users as columns, we take the transpose of our table.

Note that Spark's ALS version expects just the opposite of this.

In [116]:
# Preview the dense values that will be fed to ALS: drop the uid column and
# transpose, so rows are cities and columns are uids.
# Dropping 'uid' keeps the remaining columns in their existing order, which is
# the order uid_city_table.columns[1:] exposes.
print(uid_city_table.drop('uid', axis=1).values.T)

[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]


In [117]:
# Build the CSR matrix handed to ALS: rows are cities, columns are uids.
# The uid column is dropped without re-sorting the remaining columns, so row i
# of the matrix is the city at uid_city_table.columns[1:][i].
city_uid_counts = uid_city_table.drop('uid', axis=1).values.T
uid_city_matrix = scipy.sparse.csr_matrix(city_uid_counts)

As with Spark's ALS version, implicit needs user and item identifiers to be integer values, hence we
will keep a map between uids and their integer IDs, and similarly for cities.

In [118]:
# Lookup tables between identifiers and their integer positions.
# Positions come from enumerate() over the actual data rather than a hard-coded
# range(0, 9997), so the maps stay complete if the number of uids changes.

# uid <-> row position in uid_city_table
uid_index_map = {uid: idx for idx, uid in enumerate(uid_city_table['uid'])}
index_uid_map = dict(enumerate(uid_city_table['uid']))

# city <-> position among the city columns (every column after the leading uid column)
city_columns = list(uid_city_table.columns[1:])
item_index_map = {city: idx for idx, city in enumerate(city_columns)}
index_item_map = dict(enumerate(city_columns))

In [119]:
# Spot-check the lookup tables: uid -> index, city -> index, and the
# index -> city round trip (looked up from the map instead of a hard-coded index).
print(uid_index_map['++fTW0762GVu9KcW49AvOmgUmLxE1kg8UzytJ193ALI='])
print(item_index_map['mumbai'])
print(index_item_map[item_index_map['mumbai']])
print(item_index_map['delhi'])

2
1520
mumbai
606


In [138]:
# Configure the first ALS model.
# See the documentation of implicit.als.AlternatingLeastSquares at
# http://implicit.readthedocs.io/en/latest/als.html
alsmodel = implicit.als.AlternatingLeastSquares(
    factors=50,
    regularization=0.05,
    iterations=20,
)



In [139]:
# Fit the ALS model on uid_city_matrix (rows = cities, columns = uids, per the
# transpose applied when the matrix was built).
alsmodel.fit(uid_city_matrix)

In [140]:
# Get the cities most similar to Mumbai. The item index is resolved through
# item_index_map rather than hard-coded, so it stays correct if the set of
# cities (and therefore the column order) changes.
alsmodel.similar_items(item_index_map['mumbai'], N=10)

[(1520, 0.99999999999999989),
 (2319, 0.75685800295630901),
 (309, 0.66043981345789304),
 (2209, 0.59776114966578076),
 (1838, 0.47796748013709783),
 (569, 0.44503563569664439),
 (1821, 0.44149123323093686),
 (429, 0.43729040672992597),
 (2268, 0.4137829723414711),
 (0, 0.32797630525406302)]

In [141]:
# Translate the 20 nearest item indices for Mumbai back into city names.
# similar_items yields (item_index, score) pairs; indices missing from the map
# fall back to ''. Mumbai's index is looked up instead of hard-coded.
mumbai_idx = item_index_map['mumbai']
[index_item_map.get(item_idx, '') for item_idx, _score in alsmodel.similar_items(mumbai_idx, N=20)]

['mumbai',
 'vashi',
 'belapur',
 'thane',
 'pune',
 'dahihanda',
 'powai',
 'borivli',
 'udgir',
 '\\N',
 'ahmedabad',
 'ghatkopar',
 'seohara',
 'malad',
 'uran',
 'diglur',
 'bengaluru',
 'chandkheda',
 'chandigarh',
 'navi mumbai']

The first few similar items look good, but we are getting some outliers too, like ahmedabad, bengaluru and chandigarh. So the question now is how to evaluate which parameters to choose for your ALS model; that is, you need to estimate the best values for factors and regularization. You may alter the number of iterations, but it doesn't impact the end results much because ALS is known to converge quickly, typically within 20 iterations. For model evaluation, you need to decide on a metric by which you can compare two models.

One approach that is widely used in recommender systems is mean percentile ranking. Read this paper (http://yifanhu.net/PUB/cf.pdf), which describes both ALS and the evaluation technique based on mean percentile ranking. Try to implement it yourself.

Another set of parameters, which I found works better for this data, is:
factors=15, regularization=0.3

In [152]:
# Re-create the ALS model with the second parameter set: fewer latent factors
# and stronger regularization than the first model.
alsmodel = implicit.als.AlternatingLeastSquares(
    factors=15,
    regularization=0.3,
    iterations=20,
)



In [153]:
# Fit the re-configured ALS model on the same city-by-uid matrix.
alsmodel.fit(uid_city_matrix)

In [154]:
# Get the cities most similar to Mumbai under the re-tuned model. The item
# index is resolved through item_index_map rather than hard-coded, so it stays
# correct if the set of cities changes.
alsmodel.similar_items(item_index_map['mumbai'], N=10)

[(1520, 1.0),
 (2319, 0.93941511097203689),
 (309, 0.93473712919597118),
 (2268, 0.8750816822050268),
 (2289, 0.73616872115281207),
 (2209, 0.68031698291390208),
 (569, 0.67033938313414998),
 (1382, 0.66956665040985186),
 (99, 0.64465364049714213),
 (1838, 0.54987020235025019)]

In [155]:
# Translate the 20 nearest item indices for Mumbai (re-tuned model) back into
# city names. similar_items yields (item_index, score) pairs; indices missing
# from the map fall back to ''. Mumbai's index is looked up instead of hard-coded.
mumbai_idx = item_index_map['mumbai']
[index_item_map.get(item_idx, '') for item_idx, _score in alsmodel.similar_items(mumbai_idx, N=20)]

['mumbai',
 'vashi',
 'belapur',
 'udgir',
 'uran',
 'thane',
 'dahihanda',
 'malad',
 'andheri',
 'pune',
 'borivli',
 'powai',
 'seohara',
 'bhandup',
 'matheran',
 'shahpura',
 'gevrai',
 'chandigarh',
 '\\N',
 'jiran']

You can see that the above results look much better after parameter tuning. You can use the model's item_factors attribute to get the vector for every city.