<a href="https://colab.research.google.com/github/marianoogimenez/google-asl-project/blob/master/recommendation-system/collaborative-filtering/create-wals-dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Collaborative filtering using WALS algorithm  | 2

## Create WALS dataset

In [0]:
# Google Cloud settings for this notebook.
# BUCKET is interpolated into the `gsutil` commands in later cells;
# PROJECT and REGION are not referenced in the cells shown here.
PROJECT = 'qwiklabs-gcp-4a684069c4776675'
BUCKET = 'colaborative-filtering-agea'
REGION = 'us-central1'

In [0]:
from google.colab import auth
import pandas as pd
import numpy as np
import tensorflow as tf

In [0]:
# Authenticate the current Colab user.
# NOTE(review): presumably required so the gsutil cells below can access the bucket - confirm.
auth.authenticate_user()

### Mapping

In [0]:
def create_mapping(values, filename):
  """Assign a dense integer id to each distinct value and save the mapping.

  Ids start at 0 and follow the order in which `values.unique()` yields
  the distinct values. Every pair is written to `filename` as one
  `value,id` line; an existing file is overwritten.

  Args:
    values: object exposing `.unique()` (a pandas Series in this notebook).
    filename: path of the text file that receives the mapping.

  Returns:
    dict mapping each distinct value to its integer id.
  """
  with open(filename, 'w') as ofp:
    value_to_id = dict(
        (value, idx) for idx, value in enumerate(values.unique()))
    ofp.writelines(
        '{},{}\n'.format(value, idx) for value, idx in value_to_id.items())
  return value_to_id


In [23]:
# Download the raw interactions CSV from the bucket to the local data/ directory.
!gsutil cp gs://{BUCKET}/collab_raw.csv  'data/collab_raw.csv'  

Copying gs://colaborative-filtering-agea/collab_raw.csv...
/ [1 files][ 21.5 KiB/ 21.5 KiB]                                                
Operation completed over 1 objects/21.5 KiB.                                     


In [0]:
# Load the raw interactions. The file has no header row, so column names are
# supplied here; ids are read as strings and the rating as a float.
# The builtin `float` is used instead of `np.float`: that name was only an
# alias for `float`, was deprecated in NumPy 1.20 and removed in NumPy 1.24.
df = pd.read_csv('data/collab_raw.csv',
                 header=None,
                 names=['visitorId', 'contentId', 'rating'],
                 dtype={'visitorId': str, 'contentId': str, 'rating': float})

In [0]:
# Build the visitorId -> integer id and contentId -> integer id lookups.
# Each call also writes its mapping as `value,id` lines to the given file under data/.
user_mapping = create_mapping(df['visitorId'], 'data/users.csv')
item_mapping = create_mapping(df['contentId'], 'data/items.csv')

In [26]:
# Upload the user and item id-mapping files to the bucket under data/.
!gsutil cp 'data/users.csv'  gs://{BUCKET}/data/users.csv   
!gsutil cp 'data/items.csv'  gs://{BUCKET}/data/items.csv  

Copying file://data/users.csv [Content-Type=text/csv]...
/ [1 files][ 11.6 KiB/ 11.6 KiB]                                                
Operation completed over 1 objects/11.6 KiB.                                     
Copying file://data/items.csv [Content-Type=text/csv]...
/ [1 files][ 12.0 KiB/ 12.0 KiB]                                                
Operation completed over 1 objects/12.0 KiB.                                     


In [0]:
# Attach the integer ids: each visitorId / contentId is looked up in the
# mapping built from that same column.
df = df.assign(
    userId=df['visitorId'].map(user_mapping.get),
    itemId=df['contentId'].map(item_mapping.get),
)

In [28]:
# Keep only the integer-id columns plus the rating, save them as a CSV with
# neither header row nor index column, then preview the first rows.
mapped_df = df.loc[:, ['userId', 'itemId', 'rating']]
mapped_df.to_csv('data/collab_mapped.csv', header=False, index=False)
mapped_df.head()

Unnamed: 0,userId,itemId,rating
0,0,0,0.3
1,1,1,0.3
2,2,2,0.3
3,3,3,0.3
4,4,4,0.3


In [29]:
# Upload the id-mapped interactions CSV to the top level of the bucket.
!gsutil cp 'data/collab_mapped.csv'  gs://{BUCKET}/collab_mapped.csv   

Copying file://data/collab_mapped.csv [Content-Type=text/csv]...
/ [1 files][ 11.5 KiB/ 11.5 KiB]                                                
Operation completed over 1 objects/11.5 KiB.                                     


In [30]:
# Reload the mapped interactions from disk (the file has no header row).
mapped_df = pd.read_csv(
    'data/collab_mapped.csv',
    header=None,
    names=['userId', 'itemId', 'rating'],
)

# Ids start at 0, so the largest id plus one is used as the count.
NITEMS = mapped_df['itemId'].max() + 1
NUSERS = mapped_df['userId'].max() + 1

# Round ratings to two decimals before they are serialized further down.
mapped_df['rating'] = mapped_df['rating'].round(2)

summary = '{} items, {} users, {} interactions'
print(summary.format(NITEMS, NUSERS, len(mapped_df)))

884 items, 998 users, 1000 interactions


In [0]:
# Group the interactions by item id; iterating yields (itemId, rows for that item).
grouped_by_items = mapped_df.groupby('itemId')

In [32]:
# Preview the first six item groups: item id, the user ids that interacted
# with it, and the matching ratings.
# The counter comes from enumerate() rather than a variable named `iter`,
# which would rebind the builtin `iter` for the rest of the kernel session.
for n_shown, (item, grouped) in enumerate(grouped_by_items, start=1):
  print(item, grouped['userId'].values, grouped['rating'].values)
  if n_shown > 5:
    break

0 [ 0 26] [0.3 0.3]
1 [1] [0.3]
2 [2] [0.3]
3 [3] [0.3]
4 [4] [0.3]
5 [  5 341] [0.3 0.3]


In [0]:
# Write one tf.train.Example per item to data/users_for_item:
#   key     -> the item id
#   indices -> ids of the users that have a row for this item
#   values  -> the ratings of those rows, in the same order as `indices`
# NOTE(review): tf.python_io is the TF 1.x module path - confirm the runtime's TensorFlow version.
grouped_by_items = mapped_df.groupby('itemId')
with tf.python_io.TFRecordWriter('data/users_for_item') as ofp:
  for item, grouped in grouped_by_items:
    feature_map = {
        'key': tf.train.Feature(
            int64_list=tf.train.Int64List(value=[item])),
        'indices': tf.train.Feature(
            int64_list=tf.train.Int64List(value=grouped['userId'].values)),
        'values': tf.train.Feature(
            float_list=tf.train.FloatList(value=grouped['rating'].values)),
    }
    example = tf.train.Example(features=tf.train.Features(feature=feature_map))
    ofp.write(example.SerializeToString())

In [0]:
# Write one tf.train.Example per user to data/items_for_user:
#   key     -> the user id
#   indices -> ids of the items that have a row for this user
#   values  -> the ratings of those rows, in the same order as `indices`
# NOTE(review): tf.python_io is the TF 1.x module path - confirm the runtime's TensorFlow version.
grouped_by_users = mapped_df.groupby('userId')
with tf.python_io.TFRecordWriter('data/items_for_user') as ofp:
  for user, grouped in grouped_by_users:
    feature_map = {
        'key': tf.train.Feature(
            int64_list=tf.train.Int64List(value=[user])),
        'indices': tf.train.Feature(
            int64_list=tf.train.Int64List(value=grouped['itemId'].values)),
        'values': tf.train.Feature(
            float_list=tf.train.FloatList(value=grouped['rating'].values)),
    }
    example = tf.train.Example(features=tf.train.Features(feature=feature_map))
    ofp.write(example.SerializeToString())

In [35]:
# List the local data/ directory (long format, oldest first) to check the generated files.
!ls -lrt data

total 196
-rw-r--r-- 1 root root 22036 Mar  3 18:44 collab_raw.csv
-rw-r--r-- 1 root root 11864 Mar  3 18:44 users.csv
-rw-r--r-- 1 root root 12274 Mar  3 18:44 items.csv
-rw-r--r-- 1 root root 11784 Mar  3 18:44 collab_mapped.csv
-rw-r--r-- 1 root root 64088 Mar  3 18:44 users_for_item
-rw-r--r-- 1 root root 71583 Mar  3 18:44 items_for_user


In [36]:
# Upload both TFRecord files to the bucket under data/.
!gsutil cp 'data/users_for_item'  gs://{BUCKET}/data/users_for_item   
!gsutil cp 'data/items_for_user'  gs://{BUCKET}/data/items_for_user   

Copying file://data/users_for_item [Content-Type=application/octet-stream]...
/ [1 files][ 62.6 KiB/ 62.6 KiB]                                                
Operation completed over 1 objects/62.6 KiB.                                     
Copying file://data/items_for_user [Content-Type=application/octet-stream]...
/ [1 files][ 69.9 KiB/ 69.9 KiB]                                                
Operation completed over 1 objects/69.9 KiB.                                     
