In [1]:
PROJECT = 'qwiklabs-gcp-4a684069c4776675'
BUCKET = 'colaborative-filtering-agea'
REGION = 'us-central1'

In [2]:
import tensorflow as tf
from tensorflow.contrib.factorization import WALSMatrixFactorization
import google.datalab.bigquery as bq
import numpy as np

  from ._conv import register_converters as _register_converters


In [9]:
!gsutil cp gs://{BUCKET}/wals/data/* data

Copying gs://colaborative-filtering-agea/wals/data/batch_pred.txt...
Copying gs://colaborative-filtering-agea/wals/data/collab_mapped.csv...         
==> NOTE: You are downloading one or more large file(s), which would            
run significantly faster if you enabled sliced object downloads. This
feature is enabled by default but requires that compiled crcmod be
installed (see "gsutil help crcmod").

Copying gs://colaborative-filtering-agea/wals/data/collab_raw.csv...            
Copying gs://colaborative-filtering-agea/wals/data/items.csv...                 
- [4 files][  1.4 GiB/  1.4 GiB]    2.3 MiB/s                                   
==> NOTE: You are performing a sequence of gsutil operations that may
run significantly faster if you instead use gsutil -m cp ... Please
see the -m section under "gsutil help options" for further information
about when gsutil -m can be advantageous.

Copying gs://colaborative-filtering-agea/wals/data/items_for_user...
Copying gs://colaborative-fil

In [10]:
def get_factors(args):
  with tf.Session() as sess:
    estimator = WALSMatrixFactorization(
      num_rows=args['nusers'], num_cols=args['nitems'],
      embedding_dimension=args['n_embeds'],
      model_dir=args['model_dir']
    )
    row_factors = estimator.get_row_factors()[0]
    col_factors = estimator.get_col_factors()[0]
    return row_factors, col_factors

In [11]:
args = {
    'model_dir': 'gs://{}/wals/model_trained'.format(BUCKET),
    'nitems': 18716,
    'nusers': 1086609,
    'n_embeds': 10
  }

user_embeddings, item_embeddings = get_factors(args)

Instructions for updating:
Please replace uses of any Estimator from tf.contrib.learn with an Estimator from tf.estimator.*
Instructions for updating:
When switching to tf.estimator.Estimator, use tf.estimator.RunConfig instead.
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f129a7b67b8>, '_evaluation_master': '', '_save_checkpoints_steps': None, '_num_worker_replicas': 0, '_task_id': 0, '_tf_random_seed': None, '_is_chief': True, '_train_distribute': None, '_environment': 'local', '_task_type': None, '_session_config': None, '_keep_checkpoint_every_n_hours': 10000, '_model_dir': 'gs://colaborative-filtering-agea/wals/model_trained', '_log_step_count_steps': 100, '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1.0
}
, '_num_ps_replicas': 0, '_save_summary_steps': 100, '_keep_checkpoint_max': 5, '_master': '', '_save_checkpoints_secs': 600}


In [12]:
import csv
def index_to_id(filename):
  with open(filename, 'r') as csvfile:
    index_to_id = {}
    readCSV = csv.reader(csvfile, delimiter=',')
    for row in readCSV:
        index_to_id[row[1]] = row[0]
  return index_to_id

In [13]:
def create_factors_file(indexes,values, filename):
  with open(filename, 'w') as ofp:
    for id, value in enumerate(values):
      value = ','.join(map(str,value))
      ofp.write('{},{}\n'.format(indexes.get(str(id)), value))

In [14]:
user_indexes = index_to_id('data/users.csv')
create_factors_file(user_indexes,user_embeddings,'data/user-factors.csv')

In [15]:
!head data/user-factors.csv

3127735,-0.0017153154,0.0032848527,0.0076523665,-0.0019144089,-0.0059423726,-0.0017860208,0.0027878233,0.0008396567,-0.0009451813,-0.0009942508
1402547,-0.0017607884,0.0021899592,0.00084968493,-0.0033975916,0.0017300133,-0.00020751615,0.00089092355,0.0024477844,0.0010480098,0.00034418874
4424328,4.5968216e-05,-0.00017753341,0.00012805944,-4.1691e-05,-0.00015069611,-0.00010710357,-0.00011879814,1.8828776e-05,-6.2402294e-05,-6.806698e-05
3604872,-0.0014606222,-0.00011086347,0.0011216926,-0.0027857579,0.002231331,0.0020026544,-0.0056454516,0.0020387652,0.003127153,6.9451125e-05
3627874,-0.00323687,-0.0013578553,0.009516894,0.0031726437,0.0010611685,0.0012763955,-0.005059517,0.0004372349,0.006737452,0.0007977917
3862440,0.0011209525,0.0030286198,0.006886179,-0.0051683662,-0.001690426,0.0059568845,-0.011305199,-0.006216809,-0.0020964425,0.0032636446
4961169,-0.0016805363,-0.0013381562,-0.00063323614,-0.00081844523,-0.000707734,0.0017801853,-0.00014434256,0.0032968502,0.0029611434,-0.0

In [16]:
item_indexes = index_to_id('data/items.csv')
create_factors_file(item_indexes,user_embeddings,'data/items-factors.csv')

In [17]:
!head data/items-factors.csv

JVAFAfO5p,-0.0017153154,0.0032848527,0.0076523665,-0.0019144089,-0.0059423726,-0.0017860208,0.0027878233,0.0008396567,-0.0009451813,-0.0009942508
YhYp5WCm-,-0.0017607884,0.0021899592,0.00084968493,-0.0033975916,0.0017300133,-0.00020751615,0.00089092355,0.0024477844,0.0010480098,0.00034418874
ekUWVdEXY,4.5968216e-05,-0.00017753341,0.00012805944,-4.1691e-05,-0.00015069611,-0.00010710357,-0.00011879814,1.8828776e-05,-6.2402294e-05,-6.806698e-05
Y-vZd0XoQ,-0.0014606222,-0.00011086347,0.0011216926,-0.0027857579,0.002231331,0.0020026544,-0.0056454516,0.0020387652,0.003127153,6.9451125e-05
2nvx6mPGW,-0.00323687,-0.0013578553,0.009516894,0.0031726437,0.0010611685,0.0012763955,-0.005059517,0.0004372349,0.006737452,0.0007977917
9P5DtZj9t,0.0011209525,0.0030286198,0.006886179,-0.0051683662,-0.001690426,0.0059568845,-0.011305199,-0.006216809,-0.0020964425,0.0032636446
0bMjAUf8i,-0.0016805363,-0.0013381562,-0.00063323614,-0.00081844523,-0.000707734,0.0017801853,-0.00014434256,0.0032968502,0.0

In [18]:
!gsutil cp data/*-factors.csv gs://{BUCKET}/data

Copying file://data/items-factors.csv [Content-Type=text/csv]...
Copying file://data/user-factors.csv [Content-Type=text/csv]...                 
==> NOTE: You are uploading one or more large file(s), which would run          
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

\ [2 files][299.5 MiB/299.5 MiB]                                                
Operation completed over 2 objects/299.5 MiB.                                    


In [3]:
def get_factors_columns_casted(factors_table, factors_name):
  return  ",'|',".join(["cast({}._{} as STRING) ".format(factors_table,i,factors_name,i) for i in range(10)])

In [4]:
def get_articles_vector_average():
  vector_columns = ["v{}".format(i) for i in range(50)]
  vector_columns_str = ",".join(vector_columns)
  average_columns_str = ",'|',".join(map(lambda x: 'CAST(AVG({}) as STRING)'.format(x),vector_columns))
  query = """
      SELECT
        user_id,
        CONCAT({}) AS doc2vec_avg
      FROM
        `AGEA_ASL.Dataset_D` 
      GROUP BY user_id
  """
  return query.format(average_columns_str)

In [13]:
def get_hybrid_dataset_query():
  vector_columns = ["d.v{}".format(i) for i in range(50)]
  vector_columns_with_alias = ",'|',".join(["CAST(d.v{} as STRING)".format(i,i) for i in range(50)])
  
  user_factors_columns  =  get_factors_columns_casted('user_factors','user_factor')
  item_factors_columns  =  get_factors_columns_casted('item_factors','item_factor')
  
  query = """
      SELECT
        d.user_id,
        d.content_id,
        a.title,
        a.section_1,
        a.tag_1,
        users.gender,
        users.age,
        CONCAT({}) as d2v,
        CONCAT({}) as user_factors,
        CONCAT({}) as item_factors,
        FARM_FINGERPRINT(CONCAT(CAST(d.user_id AS STRING), CAST(d.content_id AS STRING))) AS hash_id,
        LEAD(d.content_id,1) OVER (PARTITION BY d.user_id ORDER BY CAST(day_write AS DATETIME) DESC) AS next_article,
        article_vector_average.doc2vec_avg
      FROM
        `AGEA_ASL.Dataset_D` d 
        JOIN `AGEA_ASL.Dataset_A` a ON a.content_id = d.content_id
        JOIN `AGEA_ASL.user_factors` user_factors ON d.user_id = user_factors.user_id
        JOIN `AGEA_ASL.item_factors` item_factors ON d.content_id = item_factors.content_id
        JOIN ({}) as article_vector_average ON article_vector_average.user_id = d.user_id
        JOIN `AGEA_ASL.users_data` users ON users.user_id = d.user_id
      LIMIT 10
  """.format(vector_columns_with_alias,user_factors_columns,item_factors_columns, get_articles_vector_average())
  return query



In [None]:
hybrid_query = get_hybrid_dataset_query()
hybrid_dataset = bq.Query(hybrid_query).execute().result().to_dataframe()

In [None]:
print(hybrid_query)

In [None]:
hybrid_dataset.head()

In [None]:
df.to_csv('data/hybrid_dataset.csv',index=False)

In [None]:
!gsutil cp data/hybrid_dataset.csv gs://{BUCKET}/wals/data

In [None]:
!head data/hybrid_dataset.csv

In [18]:
",'|',".join(["d.v{}".format(i,i) for i in range(50)])

"d.v0,'|',d.v1,'|',d.v2,'|',d.v3,'|',d.v4,'|',d.v5,'|',d.v6,'|',d.v7,'|',d.v8,'|',d.v9,'|',d.v10,'|',d.v11,'|',d.v12,'|',d.v13,'|',d.v14,'|',d.v15,'|',d.v16,'|',d.v17,'|',d.v18,'|',d.v19,'|',d.v20,'|',d.v21,'|',d.v22,'|',d.v23,'|',d.v24,'|',d.v25,'|',d.v26,'|',d.v27,'|',d.v28,'|',d.v29,'|',d.v30,'|',d.v31,'|',d.v32,'|',d.v33,'|',d.v34,'|',d.v35,'|',d.v36,'|',d.v37,'|',d.v38,'|',d.v39,'|',d.v40,'|',d.v41,'|',d.v42,'|',d.v43,'|',d.v44,'|',d.v45,'|',d.v46,'|',d.v47,'|',d.v48,'|',d.v49"