# Collaborative filtering using the WALS algorithm | 2

## Create the WALS dataset

In [3]:
# Notebook-wide settings. BUCKET is interpolated into the gsutil commands as {BUCKET}.
# NOTE(review): PROJECT and REGION are not referenced in the cells shown here;
# presumably they are consumed further down - confirm.
PROJECT = 'qwiklabs-gcp-4a684069c4776675'
BUCKET = 'colaborative-filtering-agea'
REGION = 'us-central1'

In [4]:
import pandas as pd
import numpy as np
import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [0]:
# Colab-only step: authenticate the current user through google.colab.auth.
# NOTE(review): presumably required so the gsutil cells can reach the bucket - confirm.
from google.colab import auth
auth.authenticate_user()

### Mapping

In [5]:
def create_mapping(values, filename):
  """Assign a dense integer id to every distinct value and save the table.

  Ids follow the order in which distinct values are returned by
  ``values.unique()``, starting at 0.

  Args:
    values: object exposing ``unique()`` (e.g. a pandas Series).
    filename: path of the text file to (over)write, one ``value,id`` per line.

  Returns:
    dict mapping each distinct value to its integer id.
  """
  with open(filename, 'w') as ofp:
    value_to_id = {}
    for idx, value in enumerate(values.unique()):
      value_to_id[value] = idx
    # One "value,id" line per entry, in id order.
    ofp.writelines(
        '{},{}\n'.format(value, idx) for value, idx in value_to_id.items())
  return value_to_id


In [6]:
# Copy collab_raw.csv from the GCS bucket into the local data/ directory.
!gsutil cp gs://{BUCKET}/collab_raw.csv  'data/collab_raw.csv'  

Copying gs://colaborative-filtering-agea/collab_raw.csv...
==> NOTE: You are downloading one or more large file(s), which would            
run significantly faster if you enabled sliced object downloads. This
feature is enabled by default but requires that compiled crcmod be
installed (see "gsutil help crcmod").

| [1 files][835.9 MiB/835.9 MiB]                                                
Operation completed over 1 objects/835.9 MiB.                                    


In [7]:
# Load the header-less raw CSV. Both id columns are kept as strings so they are
# used verbatim as mapping keys; rating is parsed as a 64-bit float.
# np.float64 replaces the np.float alias, which was removed in NumPy 1.24 and
# resolved to the same dtype in pandas.
df = pd.read_csv('data/collab_raw.csv',
                 header=None,
                 names=['visitorId', 'contentId', 'rating'],
                 dtype={'visitorId': str, 'contentId': str, 'rating': np.float64})

In [8]:
# Build the visitor -> id and content -> id tables and write each one to a CSV file.
user_mapping = create_mapping(values=df['visitorId'], filename='data/users.csv')
item_mapping = create_mapping(values=df['contentId'], filename='data/items.csv')

In [9]:
# Copy the users.csv and items.csv mapping files to the data/ prefix of the bucket.
!gsutil cp 'data/users.csv'  gs://{BUCKET}/data/users.csv   
!gsutil cp 'data/items.csv'  gs://{BUCKET}/data/items.csv  

Copying file://data/users.csv [Content-Type=text/csv]...
/ [1 files][ 15.5 MiB/ 15.5 MiB]                                                
Operation completed over 1 objects/15.5 MiB.                                     
Copying file://data/items.csv [Content-Type=text/csv]...
/ [1 files][281.8 KiB/281.8 KiB]                                                
Operation completed over 1 objects/281.8 KiB.                                    


In [10]:
# Translate the string ids into their integer ids from user_mapping / item_mapping.
# Passing the dict itself (instead of its bound .get method) lets pandas perform
# an index-based lookup rather than calling a Python function once per row.
# A value missing from the dict becomes NaN (with .get it would have been None).
df['userId'] = df['visitorId'].map(user_mapping)
df['itemId'] = df['contentId'].map(item_mapping)

In [12]:
# Keep only the integer ids and the rating, write them as a CSV without header
# or index column, then preview the first rows.
mapped_df = df.loc[:, ['userId', 'itemId', 'rating']]
mapped_df.to_csv(path_or_buf='data/collab_mapped.csv', header=False, index=False)
mapped_df.head()

Unnamed: 0,userId,itemId,rating
0,0,0,0.3
1,1,1,0.3
2,2,2,0.3
3,3,3,0.3
4,4,4,0.3


In [13]:
# Copy the mapped interactions file (collab_mapped.csv) to the root of the bucket.
!gsutil cp 'data/collab_mapped.csv'  gs://{BUCKET}/collab_mapped.csv   

Copying file://data/collab_mapped.csv [Content-Type=text/csv]...
==> NOTE: You are uploading one or more large file(s), which would run          
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

| [1 files][584.7 MiB/584.7 MiB]                                                
Operation completed over 1 objects/584.7 MiB.                                    


In [14]:
# Reload the mapped interactions from disk, derive the matrix dimensions
# (ids are zero-based, hence max + 1) and round ratings to two decimals.
mapped_df = pd.read_csv(
    'data/collab_mapped.csv',
    header=None,
    names=['userId', 'itemId', 'rating'],
)
NITEMS = mapped_df['itemId'].max() + 1
NUSERS = mapped_df['userId'].max() + 1
mapped_df['rating'] = mapped_df['rating'].round(2)
print('{} items, {} users, {} interactions'.format(NITEMS, NUSERS, len(mapped_df)))

18715 items, 1086609 users, 39805654 interactions


In [15]:
# Group the interactions by item so each group holds the rows for one itemId.
grouped_by_items = mapped_df.groupby(by='itemId')

In [16]:
# Print the user ids and ratings of the first six item groups as a sanity check.
# enumerate() replaces the hand-rolled counter that was named `iter`, which
# shadowed the builtin iter() for every later cell in the kernel.
PREVIEW_GROUPS = 6
for position, (item, grouped) in enumerate(grouped_by_items, start=1):
  print(item, grouped['userId'].values, grouped['rating'].values)
  if position >= PREVIEW_GROUPS:
    break

0 [     0   5021  10934 ...   3511  90864 261148] [0.3 0.6 0.3 ... 0.3 0.3 0.3]
1 [     1   3374   5969 ... 220505 194098 158559] [0.3 0.3 0.3 ... 0.3 0.3 0.3]
2 [     2   3871 176572 373585 452550 159525 134694 484080 295122  24213
  68074 524830   7949  56810 226671 414522 488148 370721 330521 146764
 364879 357275 554002 645689 104938 124695 613265 108011 505155   1666
 749902  32414  23650 929761 174658 649411  26677  83009 311890  21238
  97548 559865  88011 425794   1183  12461 280568  46510  88463 912410
 219975 369836 981777 515242 247724 275596 998094  38669 443068  95711
 475341  30630 680153 494881 624552 107050   8898 778535   3381 272900
 124653 661368 247118 549424 317373 284688 164981 132233 180197 296518
 266172 115874 351902  61918 462685 434978 290913 521438 358747 644847
 661394 175949  66788  28899 137625 159507] [0.3 0.3 0.3 0.3 0.3 0.3 0.3 0.3 0.3 0.3 0.3 0.3 0.3 0.3 0.3 0.3 0.6 0.3
 0.3 0.3 0.3 0.6 0.3 0.3 0.3 0.3 0.3 0.3 0.3 0.3 0.3 0.3 0.3 0.6 0.3 0.3
 0.6 0.3 

In [17]:
# Write one tf.train.Example per item to data/users_for_item:
#   key     -> the item id
#   indices -> ids of the users in that item's group
#   values  -> the matching ratings
grouped_by_items = mapped_df.groupby(by='itemId')
with tf.python_io.TFRecordWriter('data/users_for_item') as ofp:
  for item, grouped in grouped_by_items:
    key_feature = tf.train.Feature(int64_list=tf.train.Int64List(value=[item]))
    index_feature = tf.train.Feature(
        int64_list=tf.train.Int64List(value=grouped['userId'].values))
    value_feature = tf.train.Feature(
        float_list=tf.train.FloatList(value=grouped['rating'].values))
    example = tf.train.Example(features=tf.train.Features(feature={
          'key': key_feature,
          'indices': index_feature,
          'values': value_feature,
        }))
    ofp.write(example.SerializeToString())

In [18]:
# Write one tf.train.Example per user to data/items_for_user:
#   key     -> the user id
#   indices -> ids of the items in that user's group
#   values  -> the matching ratings
grouped_by_users = mapped_df.groupby(by='userId')
with tf.python_io.TFRecordWriter('data/items_for_user') as ofp:
  for user, grouped in grouped_by_users:
    key_feature = tf.train.Feature(int64_list=tf.train.Int64List(value=[user]))
    index_feature = tf.train.Feature(
        int64_list=tf.train.Int64List(value=grouped['itemId'].values))
    value_feature = tf.train.Feature(
        float_list=tf.train.FloatList(value=grouped['rating'].values))
    example = tf.train.Example(features=tf.train.Features(feature={
          'key': key_feature,
          'indices': index_feature,
          'values': value_feature,
        }))
    ofp.write(example.SerializeToString())

In [19]:
# Long listing of the local data/ directory, sorted by modification time, oldest first.
!ls -lrt data

total 2046820
-rw-r--r-- 1 root root 876506370 Mar  5 19:05 collab_raw.csv
-rw-r--r-- 1 root root  16272129 Mar  5 19:06 users.csv
-rw-r--r-- 1 root root    288553 Mar  5 19:06 items.csv
-rw-r--r-- 1 root root 613068836 Mar  5 19:11 collab_mapped.csv
-rw-r--r-- 1 root root 276544461 Mar  5 19:13 users_for_item
-rw-r--r-- 1 root root 313242326 Mar  5 19:18 items_for_user


In [20]:
# Copy both TFRecord files (users_for_item, items_for_user) to the data/ prefix of the bucket.
!gsutil cp 'data/users_for_item'  gs://{BUCKET}/data/users_for_item   
!gsutil cp 'data/items_for_user'  gs://{BUCKET}/data/items_for_user   

Copying file://data/users_for_item [Content-Type=application/octet-stream]...
==> NOTE: You are uploading one or more large file(s), which would run          
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

| [1 files][263.7 MiB/263.7 MiB]                                                
Operation completed over 1 objects/263.7 MiB.                                    
Copying file://data/items_for_user [Content-Type=application/octet-stream