<a href="https://colab.research.google.com/github/nisarahamedk/kaggle-riid/blob/master/notebooks/train_user_state_dict_for_inference_encdec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install gcsfs
!pip install kaggle
!pip install datatable
# !pip install -U modin[all] # -U for upgrade in case you have an older version
!pip install --upgrade pandas==1.1.5
!pip install --upgrade numpy==1.19.4
!pip install tensorflow==2.4.0
!pip install sparse

In [2]:
# Build ~/.kaggle/kaggle.json from environment variables instead of embedding
# the API token in the notebook, where it is published with every commit.
# SECURITY: the token that used to be hard-coded in this cell has been exposed
# and should be revoked and regenerated in the Kaggle account settings.
import json
import os
from pathlib import Path

kaggle_dir = Path.home() / ".kaggle"
kaggle_dir.mkdir(parents=True, exist_ok=True)  # unlike plain `mkdir`, safe to re-run
kaggle_credentials = kaggle_dir / "kaggle.json"
kaggle_credentials.write_text(
    json.dumps(
        {
            # A KeyError here means the variables were not exported before
            # the notebook server was started.
            "username": os.environ["KAGGLE_USERNAME"],
            "key": os.environ["KAGGLE_KEY"],
        }
    )
)
kaggle_credentials.chmod(0o600)  # owner-only, same as the original `chmod 600`

In [3]:
!kaggle datasets download -d rohanrao/riiid-train-data-multiple-formats

Downloading riiid-train-data-multiple-formats.zip to /home/nisarkavungal_gmx_com/kaggle-riid/notebooks
100%|█████████████████████████████████████▉| 4.16G/4.17G [02:25<00:00, 43.3MB/s]
100%|██████████████████████████████████████| 4.17G/4.17G [02:25<00:00, 30.7MB/s]


In [5]:
!unzip riiid-train-data-multiple-formats

Archive:  riiid-train-data-multiple-formats.zip
  inflating: riiid_train.feather     
  inflating: riiid_train.h5          
  inflating: riiid_train.jay         
  inflating: riiid_train.parquet     
  inflating: riiid_train.pkl.gzip    


In [1]:
import pickle
import math

import gcsfs
import pandas as pd
import numpy as np
from tqdm import tqdm, trange
from datatable import dt, f, by, count
import tensorflow as tf
import sparse


np.random.seed(42)

In [2]:
SEQ_LEN = 512

In [3]:
table = dt.fread("riiid_train.jay")

In [4]:
table = table[f.content_type_id==0, :] # removing lectures
table.head()

Unnamed: 0_level_0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
Unnamed: 0_level_1,▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪,▪▪▪▪,▪,▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪▪▪▪▪,▪
0,0,0,115,5692,0,1,3,1,,
1,1,56943,115,5716,0,2,2,1,37000.0,0.0
2,2,118363,115,128,0,0,0,1,55000.0,0.0
3,3,131167,115,7860,0,3,0,1,19000.0,0.0
4,4,137965,115,7922,0,4,1,1,11000.0,0.0
5,5,157063,115,156,0,5,2,1,5000.0,0.0
6,6,176092,115,51,0,6,0,1,17000.0,0.0
7,7,194190,115,50,0,7,3,1,17000.0,0.0
8,8,212463,115,7896,0,8,2,1,16000.0,0.0
9,9,230983,115,7863,0,9,0,1,16000.0,0.0


Maximum values of each column, used to determine the correct data type to use.

In [5]:
max_dict = table[:, [dt.max(f.timestamp), dt.max(f.prior_question_elapsed_time), dt.max(f.user_id), dt.max(f.content_id), dt.max(f.task_container_id)]].to_dict()

In [6]:
max_dict

{'timestamp': [87425772049],
 'prior_question_elapsed_time': [300000.0],
 'user_id': [2147482888],
 'content_id': [13522],
 'task_container_id': [9999]}

In [5]:
# NOTE(review): "last" here means last in the table's row order, so this
# presumably relies on each user's rows being stored chronologically — confirm.
user_table = table[-(SEQ_LEN-1):, :, by("user_id")] # last SEQ_LEN-1 rows of all the user ids, this is the history we need at inference time

In [6]:
user_table.head()

Unnamed: 0_level_0,user_id,row_id,timestamp,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
Unnamed: 0_level_1,▪▪▪▪,▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪,▪,▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪▪▪▪▪,▪
0,115,0,0,5692,0,1,3,1,,
1,115,1,56943,5716,0,2,2,1,37000.0,0.0
2,115,2,118363,128,0,0,0,1,55000.0,0.0
3,115,3,131167,7860,0,3,0,1,19000.0,0.0
4,115,4,137965,7922,0,4,1,1,11000.0,0.0
5,115,5,157063,156,0,5,2,1,5000.0,0.0
6,115,6,176092,51,0,6,0,1,17000.0,0.0
7,115,7,194190,50,0,7,3,1,17000.0,0.0
8,115,8,212463,7896,0,8,2,1,16000.0,0.0
9,115,9,230983,7863,0,9,0,1,16000.0,0.0


In [7]:

# Target dtype for each column of the train table. The maxima printed in
# `max_dict` above (user_id 2147482888, content_id 13522, task_container_id
# 9999) fit the int32 / int16 choices made here.
# NOTE(review): prior_question_had_explanation has missing values in the table
# preview above, which int8 cannot represent — fill them before casting.
dtypes_train = {
    'row_id': 'int64',
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'content_type_id': 'int8',
    'task_container_id': 'int16',
    'user_answer': 'int8',
    'answered_correctly': 'int8',
    'prior_question_elapsed_time': 'float32',
    'prior_question_had_explanation': 'int8'
    }

# Target dtypes for questions.csv. Only "part" is filled in; the empty strings
# are placeholders and must be replaced before being passed to `astype`.
dtypes_questions = {
    "question_id": "",
    "bundle_id": "",
    "correct_answer": "",
    "part": "int16",
    "tags": "",
}

# Target dtypes for the lectures table; every entry is still a placeholder.
dtypes_lectures = {
    "lecture_id": "",
    "part": "",
    "tag": "",
    "type_of": "",
}

In [8]:
# Keep only the question rows and the columns the model consumes, then hand
# the result over to pandas.
required_cols = [
    f.timestamp,
    f.user_id,
    f.content_id,
    f.task_container_id,
    f.prior_question_elapsed_time,
    f.prior_question_had_explanation,
    f.answered_correctly,
]
question_rows = user_table[f.content_type_id == 0, required_cols]
train_df = question_rows.to_pandas()
train_df.head()

Unnamed: 0,timestamp,user_id,content_id,task_container_id,prior_question_elapsed_time,prior_question_had_explanation,answered_correctly
0,0,115,5692,1,,,1
1,56943,115,5716,2,37000.0,False,1
2,118363,115,128,0,55000.0,False,1
3,131167,115,7860,3,19000.0,False,1
4,137965,115,7922,4,11000.0,False,1


In [9]:
del table
del user_table

In [10]:

DATA_PATH = 'gs://kds-e7d6db6554e83e3f4182aa828879e31bf5c122e568c9ee97ab5d891f'
# Question metadata indexed by question_id; missing values (the NaN tags) are
# replaced by the string "-1" so every tags entry can be split and parsed.
questions_df = (
    pd.read_csv(DATA_PATH + "/questions.csv", index_col="question_id")
    .fillna("-1")
)
questions_df.head()

Unnamed: 0_level_0,bundle_id,correct_answer,part,tags
question_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0,0,1,51 131 162 38
1,1,1,1,131 36 81
2,2,0,1,131 101 162 92
3,3,0,1,131 149 162 29
4,4,3,1,131 5 162 38


In [11]:
# Join the question metadata (bundle_id, correct_answer, part, tags) onto each
# interaction: content_id is matched against the question_id index of questions_df.
train_df = train_df.join(questions_df, on="content_id")

In [12]:
# 0, 1 and 2 are reserved for special tokens, so every categorical id is
# shifted up by 3. Missing values stay missing (see the NaN in the preview).
# NOTE: this cell is not idempotent — running it a second time shifts the
# columns by another 3.
indicator_cols = ["content_id", "task_container_id", "part", "answered_correctly", "prior_question_had_explanation"]
for c in indicator_cols:
  train_df[c] = train_df[c] + 3
train_df.head()

Unnamed: 0,timestamp,user_id,content_id,task_container_id,prior_question_elapsed_time,prior_question_had_explanation,answered_correctly,bundle_id,correct_answer,part,tags
0,0,115,5695,4,,,4,5692,3,8,151
1,56943,115,5719,5,37000.0,3.0,4,5716,2,8,168
2,118363,115,131,3,55000.0,3.0,4,128,0,4,131 149 92
3,131167,115,7863,6,19000.0,3.0,4,7860,0,4,131 104 81
4,137965,115,7925,7,11000.0,3.0,4,7922,1,4,131 149 92


In [13]:
def _shift_tag_ids(tag_string):
    """Add 3 to every space-separated integer tag id in `tag_string`."""
    shifted = []
    for tag in tag_string.split(" "):
        shifted.append(str(int(tag) + 3))
    return " ".join(shifted)


# The tag ids get the same +3 shift as the other categorical columns.
train_df["tags"] = train_df["tags"].apply(_shift_tag_ids)

In [14]:
train_df = train_df[["user_id", "timestamp", "content_id", "task_container_id", "prior_question_elapsed_time", "prior_question_had_explanation", "part", "answered_correctly", "tags"]]

In [15]:
train_df.head()

Unnamed: 0,user_id,timestamp,content_id,task_container_id,prior_question_elapsed_time,prior_question_had_explanation,part,answered_correctly,tags
0,115,0,5695,4,,,8,4,154
1,115,56943,5719,5,37000.0,3.0,8,4,171
2,115,118363,131,3,55000.0,3.0,4,4,134 152 95
3,115,131167,7863,6,19000.0,3.0,4,4,134 107 84
4,115,137965,7925,7,11000.0,3.0,4,4,134 152 95


In [16]:
import gc
gc.collect()

76

Store the processed csv

In [23]:
# index=False: the RangeIndex carries no information, and writing it makes the
# following pd.read_csv("train_df.csv") return an extra "Unnamed: 0" column,
# which shifts every positional column access further down the notebook.
train_df.to_csv("train_df.csv", index=False)

In [7]:
train_df = pd.read_csv("train_df.csv")

### Group by User

In [17]:
user_groups = train_df.groupby("user_id")

In [18]:
user_groups.head()

Unnamed: 0,user_id,timestamp,content_id,task_container_id,prior_question_elapsed_time,prior_question_had_explanation,part,answered_correctly,tags
0,115,0,5695,4,,,8,4,154
1,115,56943,5719,5,37000.0,3,8,4,171
2,115,118363,131,3,55000.0,3,4,4,134 152 95
3,115,131167,7863,6,19000.0,3,4,4,134 107 84
4,115,137965,7925,7,11000.0,3,4,4,134 152 95
...,...,...,...,...,...,...,...,...,...
50842984,2147482888,0,6150,3,,,8,4,119
50842985,2147482888,21320,4795,4,15000.0,3,8,4,82
50842986,2147482888,44458,5741,5,18000.0,3,8,4,11
50842987,2147482888,68053,6105,6,21000.0,3,8,3,136


### Creating Sparse arrays with "sparse" package

- A different array is created for each data type, to get the most efficient memory footprint.
- All are converted to sparse multidimensional arrays using the "sparse" package

In [85]:
def preprocess2(rows):
    """Split one user's interaction rows into one array per integer width.

    Grouping the features by dtype lets each group be stored in the narrowest
    type that holds it.

    Args:
        rows: DataFrame of a single user's interactions with the columns
            "timestamp", "prior_question_elapsed_time", "content_id",
            "task_container_id", "prior_question_had_explanation", "part"
            and "answered_correctly".

    Returns:
        Tuple ``(int64_feat, int32_feat, int16_feat, int8_feat)`` where, for
        ``n = len(rows)``:
          * int64_feat, shape (n,): timestamp.
          * int32_feat, shape (n,): prior_question_elapsed_time, with
            missing values encoded as -1.
          * int16_feat, shape (2, n): content_id, task_container_id.
          * int8_feat, shape (3, n): prior_question_had_explanation (missing
            values encoded as -1), part, answered_correctly.
    """
    int64_feat = rows["timestamp"].values.astype(np.int64)
    # NaN has no integer representation, so casting it directly gives an
    # undefined (platform-dependent) value. Fill it with -1 first, the same
    # sentinel used for prior_question_had_explanation below.
    int32_feat = (
        rows["prior_question_elapsed_time"].fillna(-1).values.astype(np.int32)
    )
    int16_feat = np.stack([
        rows["content_id"].values.astype(np.int16),
        rows["task_container_id"].values.astype(np.int16),
    ])
    int8_feat = np.stack([
        rows["prior_question_had_explanation"].fillna(-1).values.astype(np.int8),
        rows["part"].values.astype(np.int8),
        rows["answered_correctly"].values.astype(np.int8),
    ])

    return int64_feat, int32_feat, int16_feat, int8_feat
    

In [86]:
ug, udf = next(iter(user_groups))

In [87]:
out1 = preprocess2(udf)

In [88]:
sum([out1[i].nbytes for i in range(4)])

874

In [89]:
out_sparse = sparse.COO.from_numpy(out1[0])

In [90]:
out_sparse

0,1
Format,coo
Data Type,int64
Shape,"(46,)"
nnz,45
Density,0.9782608695652174
Read-only,True
Size,720
Storage ratio,2.0


Split each feature into its most efficient data type

In [91]:
user_groups_arr = user_groups.apply(preprocess2)

In [92]:
user_groups_arr.values[0][0]

array([        0,     56943,    118363,    131167,    137965,    157063,
          176092,    194190,    212463,    230983,    255381,    280033,
          302994,    328686,    352686,    376162,    398020,    418008,
          437272,    468511,    490100,    510583,    534187,    557677,
          575289,    597863,    621464,    645415,    670520,    692971,
          710402,    732421,   1219624,   1252621,   1284094,   1320874,
         1359412,   1415188,   1468285, 667861680, 667971812, 667971812,
       667971812, 668090043, 668090043, 668090043])

In [93]:
# del train_df
# del user_groups

In [94]:
import gc
gc.collect()

36120

In [95]:
user_groups_arr.memory_usage()/1e6

6.298496

In [96]:
# Left-pad every user's timestamp history with zeros to the common length
# SEQ_LEN - 1 (the same length used when slicing user_table), then stack the
# histories into one (n_users, SEQ_LEN - 1) array.
int64_features = np.stack([
    np.pad(feats[0], (SEQ_LEN - 1 - feats[0].shape[0], 0), "constant")
    for feats in user_groups_arr.values
])

In [97]:
int64_features.shape

(393656, 511)

In [98]:
int64_sparse = sparse.COO.from_numpy(int64_features)

In [102]:
int64_sparse.nbytes

1211775624

In [100]:
sparse.save_npz("data/int64_feat.npz", int64_sparse)

In [104]:
del int64_features
del int64_sparse

In [105]:
# Left-pad every user's prior_question_elapsed_time history with zeros to the
# common length SEQ_LEN - 1, then stack into (n_users, SEQ_LEN - 1).
int32_features = np.stack([
    np.pad(feats[1], (SEQ_LEN - 1 - feats[1].shape[0], 0), "constant")
    for feats in user_groups_arr.values
])

In [106]:
int32_features.shape

(393656, 511)

In [107]:
int32_sparse = sparse.COO.from_numpy(int32_features)

In [108]:
sparse.save_npz("data/int32_feat.npz", int32_sparse)

In [109]:
del int32_features
del int32_sparse

In [110]:
# Each user's int16 block is (2, history_len); left-pad only the time axis
# with zeros to SEQ_LEN - 1, then stack into (n_users, 2, SEQ_LEN - 1).
int16_features = np.stack([
    np.pad(feats[2], ((0, 0), (SEQ_LEN - 1 - feats[2].shape[1], 0)), "constant")
    for feats in user_groups_arr.values
])

In [111]:
int16_features = np.transpose(int16_features, (0, 2, 1))
int16_features.shape

(393656, 511, 2)

In [112]:
int16_sparse = sparse.COO.from_numpy(int16_features)

In [113]:
sparse.save_npz("data/int16_feat.npz", int16_sparse)

In [114]:
# Each user's int8 block is (3, history_len); left-pad only the time axis
# with zeros to SEQ_LEN - 1, then stack into (n_users, 3, SEQ_LEN - 1).
int8_features = np.stack([
    np.pad(feats[3], ((0, 0), (SEQ_LEN - 1 - feats[3].shape[1], 0)), "constant")
    for feats in user_groups_arr.values
])

In [115]:
int8_features = np.transpose(int8_features, (0, 2, 1))
int8_features.shape

(393656, 511, 3)

In [116]:
int8_sparse = sparse.COO.from_numpy(int8_features)

In [117]:
sparse.save_npz("data/int8_feat.npz", int8_sparse)

In [118]:
user_index = {user_id: idx for idx, user_id in enumerate(user_groups_arr.index)}
# user_index

In [119]:
len(user_index)

393656

In [120]:
# The handle is deliberately not called `f`: `f` is datatable's
# column-expression namespace (from datatable import f), and rebinding it
# would break every datatable cell that is re-run after this one.
with open("data/user_index.pkl", "wb") as index_file:
  pickle.dump(user_index, index_file)

#### tags

In [19]:
def preprocess3(rows, depth=190):
    """Multi-hot encode the tags of every question in one user's history.

    Args:
        rows: DataFrame with a "tags" column holding space-separated integer
            tag ids, one string per question.
        depth: Width of the encoding. Tag ids outside ``[0, depth)`` are
            encoded as all zeros, i.e. silently dropped. The default keeps
            the width this notebook has always produced.

    Returns:
        uint8 array of shape ``(len(rows), depth)`` whose entry ``[i, t]`` is
        the number of times tag ``t`` is listed for question ``i``.
    """
    tag_strings = rows["tags"].values

    # Ragged int32 tensor: one variable-length list of tag ids per question.
    tag_ids = tf.strings.to_number(tf.strings.split(tag_strings), out_type=tf.int32)
    # A ragged [[2], [3, 4]] one-hot encodes to
    # [[0,0,1,..], [[0,0,0,1,..], [0,0,0,0,1,..]]]; summing over the tag axis
    # leaves one row per question with a 1 for each of its tags.
    # NOTE(review): tf.one_hot yields an all-zero vector for ids >= depth. The
    # shifted tag id 190 does occur in this data (e.g. '134 190 84'), so with
    # the default depth it is lost. Passing depth=191 keeps it, but it also
    # changes the output width, so every consumer of the array must agree.
    multi_hot = tf.reduce_sum(tf.one_hot(tag_ids, depth=depth), axis=1)

    return multi_hot.numpy().astype(np.uint8)

In [22]:
out1 = preprocess3(udf)

In [23]:
out1.shape

(46, 190)

In [58]:
out1.nbytes

8740

In [59]:
sparse.COO.from_numpy(out1).nbytes

2499

In [26]:
user_groups_arr_tags = user_groups.apply(preprocess3)

In [27]:
user_groups_arr_tags.memory_usage()

6298496

In [38]:
import sparse

In [60]:
user_groups_arr_tags.values[2].shape

(19, 190)

In [62]:
np.pad(user_groups_arr_tags.values[0], ((4, 0), (0, 0)), "constant", constant_values=1)

array([[1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)

The below cell leads to memory error.

- use sparse stack on sparse arrays.
- sparse.from_numpy() to create the individual sparse arrays from padded np array.

In [70]:
tags_np = user_groups_arr_tags.values
# Each user's tag matrix is left-padded with zero rows to SEQ_LEN - 1 and
# converted to sparse *before* stacking, so the dense
# (n_users, SEQ_LEN - 1, n_tags) array is never materialised.
tags_features = sparse.stack([
    sparse.COO.from_numpy(
        np.pad(user_tags, ((SEQ_LEN - 1 - user_tags.shape[0], 0), (0, 0)), "constant")
    )
    for user_tags in tags_np
])

In [80]:
tags_features.shape

(393656, 511, 190)

In [81]:
sparse.save_npz("data/tags_feat.npz", tags_features)

#### Upload to Kaggle

In [121]:
!kaggle datasets init -p data/

Data package template written to: data/dataset-metadata.json


In [122]:

# id and title only alphanumeric and "-"
meta = """
{
  "licenses": [
    {
      "name": "CC0-1.0"
    }
  ], 
  "id": "nisarahamedk/riid-state-dict-512-0-7",
  "title": "riid-state-dict-512-0-7"
}
"""
# The handle is deliberately not called `f`: `f` is datatable's
# column-expression namespace (from datatable import f), and rebinding it
# would break every datatable cell that is re-run after this one.
with open("data/dataset-metadata.json", "w") as metadata_file:
  metadata_file.write(meta)

In [123]:
# create
!kaggle datasets create -p data/ --dir-mode tar -u

Starting upload for file int32_feat.npz
100%|██████████████████████████████████████| 55.3M/55.3M [00:10<00:00, 5.43MB/s]
Upload successful: int32_feat.npz (55MB)
Starting upload for file int16_feat.npz
100%|████████████████████████████████████████| 154M/154M [00:26<00:00, 6.13MB/s]
Upload successful: int16_feat.npz (154MB)
Starting upload for file .ipynb_checkpoints.tar
100%|████████████████████████████████████████| 10.0k/10.0k [00:12<00:00, 841B/s]
Upload successful: .ipynb_checkpoints.tar (10KB)
Starting upload for file int64_feat.npz
100%|████████████████████████████████████████| 151M/151M [00:23<00:00, 6.70MB/s]
Upload successful: int64_feat.npz (151MB)
Starting upload for file int8_feat.npz
100%|██████████████████████████████████████| 39.2M/39.2M [00:13<00:00, 3.13MB/s]
Upload successful: int8_feat.npz (39MB)
Starting upload for file user_index.pkl
100%|███████████████████████████████████████| 3.63M/3.63M [00:06<00:00, 560kB/s]
Upload successful: user_index.pkl (4MB)
Starting uplo

### Zarr arrays

In [124]:
!pip install zarr

Collecting zarr
  Downloading zarr-2.6.1-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 20.4 MB/s eta 0:00:01
[?25hCollecting asciitree
  Downloading asciitree-0.3.3.tar.gz (4.0 kB)
Collecting fasteners
  Downloading fasteners-0.16-py2.py3-none-any.whl (28 kB)
Collecting numcodecs>=0.6.4
  Downloading numcodecs-0.7.2-cp38-cp38-manylinux2010_x86_64.whl (6.0 MB)
[K     |████████████████████████████████| 6.0 MB 13.8 MB/s eta 0:00:01
Building wheels for collected packages: asciitree
  Building wheel for asciitree (setup.py) ... [?25ldone
[?25h  Created wheel for asciitree: filename=asciitree-0.3.3-py3-none-any.whl size=5035 sha256=1fa8db18ebb33da484717bbf6f9a191a0215021f2fc5569f528219408f3d0bd2
  Stored in directory: /home/nisarkavungal_gmx_com/.cache/pip/wheels/a3/d7/75/19cd0d2a893cad4bb0b2b16dd572ad2916d19c0d5ee9612511
Successfully built asciitree
Installing collected packages: asciitree, fasteners, numcodecs, zarr
Successfully installed asciitree-0.3.3 f

In [142]:
import zarr
import numcodecs

In [152]:
states = zarr.zeros((393656, 511, 9), dtype=object, object_codec=numcodecs.Pickle())

In [136]:
ug, udf = next(iter(user_groups))

In [153]:
udf.values.shape

(46, 9)

In [154]:
states[0, -46:, :] = udf.values

In [157]:
states[0, -46:, :][:5]

array([[115, 0, 5695, 4, nan, nan, 8, 4, '154'],
       [115, 56943, 5719, 5, 37000.0, 3, 8, 4, '171'],
       [115, 118363, 131, 3, 55000.0, 3, 4, 4, '134 152 95'],
       [115, 131167, 7863, 6, 19000.0, 3, 4, 4, '134 107 84'],
       [115, 137965, 7925, 7, 11000.0, 3, 4, 4, '134 152 95']],
      dtype=object)

In [None]:
for i, (ug, udf) in enumerate(user_groups):
    states[i, -len(udf):, :] = udf.values

In [None]:
states

#### Old

In [21]:
def preprocess(rows):
    """Return the raw value matrix of one user's rows, unchanged.

    This is a pass-through. An earlier variant of this function (a stacked
    dense feature matrix concatenated with one-hot tags and a leading
    start-token row) was abandoned and survived only as commented-out code;
    see preprocess1 in this notebook for the closest live equivalent.

    Args:
        rows: Any object exposing a ``values`` attribute, in practice the
            DataFrame of one user's interactions.

    Returns:
        ``rows.values``.
    """
    raw_values = rows.values
    return raw_values

In [25]:
from scipy import sparse

In [26]:
def preprocess1(rows):
    """Encode one user's rows as a dense feature matrix plus sparse tag matrix.

    Args:
        rows: DataFrame of a single user's interactions with the columns
            "timestamp", "content_id", "task_container_id",
            "prior_question_elapsed_time", "prior_question_had_explanation",
            "part", "answered_correctly" and "tags" (space-separated ids).

    Returns:
        Tuple ``(reg_features, tags_sparse)``:
          * reg_features: array of shape (len(rows), 7) with the seven
            non-tag columns in the order listed above. np.stack promotes the
            differently typed columns to one common dtype.
          * tags_sparse: scipy COO matrix of shape (len(rows), 190) holding
            the int8 multi-hot tag encoding.
    """
    # Imported locally under an unambiguous name: the notebook binds the bare
    # name `sparse` to the PyData `sparse` package in some cells and to
    # `scipy.sparse` in others, so a global lookup depends on execution order.
    from scipy import sparse as scipy_sparse

    reg_features = np.stack([
        rows["timestamp"].values.astype(dtypes_train["timestamp"]),
        rows["content_id"].values.astype(dtypes_train["content_id"]),
        rows["task_container_id"].values.astype(dtypes_train["task_container_id"]),
        # Missing elapsed times are NOT filled here; they stay NaN in the float column.
        rows["prior_question_elapsed_time"].values.astype(dtypes_train["prior_question_elapsed_time"]),
        # Missing "had explanation" flags are encoded as -1.
        rows["prior_question_had_explanation"].fillna(-1).values.astype(dtypes_train["prior_question_had_explanation"]),
        rows["part"].values.astype(dtypes_questions["part"]),
        rows["answered_correctly"].values.astype(dtypes_train["answered_correctly"])
    ], axis=1)

    tags = rows["tags"].values

    # Ragged int32 tensor: one variable-length list of tag ids per question.
    tags = tf.strings.to_number(tf.strings.split(tags), out_type=tf.int32)
    # A ragged [[2], [3, 4]] one-hot encodes to
    # [[0,0,1,..], [[0,0,0,1,..], [0,0,0,0,1,..]]]; summing over the tag axis
    # leaves one row per question with a 1 for each of its tags.
    # NOTE(review): ids >= 190 are encoded as all zeros by tf.one_hot, so the
    # shifted tag id 190 that occurs in this data is silently dropped.
    tags = tf.reduce_sum(tf.one_hot(tags, depth=190), axis=1) # shape [seq_len, 190]
    tags = tags.numpy().astype(np.int8)

    tags_sparse = scipy_sparse.coo_matrix(tags)

    return (reg_features, tags_sparse)

In [66]:
ug_stacked = np.stack([np.pad(user_groups_arr.values[i][0], ((int(511-user_groups_arr.values[i][0].shape[0]), 0), (0, 0)), "constant") for i in range(len(user_groups_arr.values))])

In [69]:
np.save("data/state_np_stacked", ug_stacked)

In [None]:
ug_stacked_ragged = tf.ragged.constant([user_groups_arr.values[i][0] for i in range(len(user_groups_arr.values))])

In [33]:
# Collect each user's rows as one raw value matrix, keyed by user_id.
# NOTE(review): in the current notebook `user_groups` is already the result of
# train_df.groupby("user_id"); this cell presumably dates from a session in
# which `user_groups` was a plain DataFrame — confirm before re-running.
user_groups_arr = user_groups.groupby("user_id").apply(lambda row: row.values)

In [43]:
np.save("data/state_np1.py", user_groups_arr.values)

In [36]:
len(user_groups_arr.index)

393656

In [38]:
user_groups_arr.memory_usage()/1e6

16.784256

In [35]:
user_groups_arr[115].shape

(46, 9)

In [59]:
!mkdir data
with open("data/state_dict_series.pkl", "wb") as f:
  pickle.dump(user_groups_arr, f)

mkdir: cannot create directory ‘data’: File exists


In [63]:
pip install tables

Collecting tables
  Downloading tables-3.6.1-cp38-cp38-manylinux1_x86_64.whl (4.3 MB)
[K     |████████████████████████████████| 4.3 MB 13.4 MB/s eta 0:00:01
[?25hCollecting numexpr>=2.6.2
  Downloading numexpr-2.7.2-cp38-cp38-manylinux2010_x86_64.whl (472 kB)
[K     |████████████████████████████████| 472 kB 91.9 MB/s eta 0:00:01
Installing collected packages: numexpr, tables
Successfully installed numexpr-2.7.2 tables-3.6.1
Note: you may need to restart the kernel to use updated packages.


In [64]:
user_groups_arr.to_hdf("data/states_series.h5", key="states")

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->values] [items->None]

  pytables.to_hdf(


In [2]:
user_groups_arr = pd.read_hdf("data/states_series.h5", "states")

In [3]:
user_groups_arr.head()

user_id
115     [[115, 0, 5695, 4, nan, nan, 8, 4, 154], [115,...
124     [[124, 0, 7903, 3, nan, nan, 4, 4, 134 96 84],...
2746    [[2746, 0, 5276, 3, nan, nan, 8, 3, 48], [2746...
5382    [[5382, 0, 5003, 3, nan, nan, 8, 4, 29], [5382...
8623    [[8623, 0, 3918, 3, nan, nan, 8, 4, 99], [8623...
dtype: object

In [65]:
user_groups_df = user_groups_arr.to_frame()

In [75]:
user_groups_df.head()

Unnamed: 0_level_0,0
user_id,Unnamed: 1_level_1
115,"[[115, 0, 5695, 4, nan, nan, 8, 4, 154], [115,..."
124,"[[124, 0, 7903, 3, nan, nan, 4, 4, 134 96 84],..."
2746,"[[2746, 0, 5276, 3, nan, nan, 8, 3, 48], [2746..."
5382,"[[5382, 0, 5003, 3, nan, nan, 8, 4, 29], [5382..."
8623,"[[8623, 0, 3918, 3, nan, nan, 8, 4, 99], [8623..."


In [82]:
user_groups_df = user_groups_df.rename(columns={0: "state"}).reset_index()

In [7]:
# user_groups_df.to_feather("data/states_df.feather")

In [50]:
user_groups_dict = user_groups_arr.to_dict()

In [26]:
# user_groups_dict

In [52]:
# Longest stored history, and how many users have a history of exactly 511 rows.
history_lengths = [state.shape[0] for state in user_groups_dict.values()]
max_seq_len = max(history_lengths, default=0)
c = sum(1 for length in history_lengths if length == 511)

max_seq_len, c

(511, 43558)

In [53]:
import pickle
!mkdir data
with open("data/state_dict.pkl", "wb") as f:
  pickle.dump(user_groups_dict, f)

mkdir: cannot create directory ‘data’: File exists


In [54]:

# Round-trip check: reload the pickle written above and inspect one user.
# SECURITY: pickle.load executes arbitrary code from the file; only load
# pickles this notebook produced itself.
# The handle is deliberately not called `f`, which is datatable's
# column-expression namespace (from datatable import f).
with open("data/state_dict.pkl", "rb") as state_file:
  state_dict = pickle.load(state_file)

state_dict[115].shape

(46, 9)

In [8]:
user_groups_arr.head()

user_id
115     [[115, 0, 5695, 4, nan, nan, 8, 4, 154], [115,...
124     [[124, 0, 7903, 3, nan, nan, 4, 4, 134 96 84],...
2746    [[2746, 0, 5276, 3, nan, nan, 8, 3, 48], [2746...
5382    [[5382, 0, 5003, 3, nan, nan, 8, 4, 29], [5382...
8623    [[8623, 0, 3918, 3, nan, nan, 8, 4, 99], [8623...
dtype: object

In [40]:
user_groups_arr.memory_usage()/1e6

16.784256

In [39]:
user_groups_arr[115]

array([[115, 0, 5695, 4, nan, nan, 8, 4, '154'],
       [115, 56943, 5719, 5, 37000.0, 3, 8, 4, '171'],
       [115, 118363, 131, 3, 55000.0, 3, 4, 4, '134 152 95'],
       [115, 131167, 7863, 6, 19000.0, 3, 4, 4, '134 107 84'],
       [115, 137965, 7925, 7, 11000.0, 3, 4, 4, '134 152 95'],
       [115, 157063, 159, 8, 5000.0, 3, 4, 4, '134 104 165 41'],
       [115, 176092, 54, 9, 17000.0, 3, 4, 4, '134 190 84'],
       [115, 194190, 53, 10, 17000.0, 3, 4, 4, '134 104 41'],
       [115, 212463, 7899, 11, 16000.0, 3, 4, 4, '134 107 165 84'],
       [115, 230983, 7866, 12, 16000.0, 3, 4, 4, '134 71 95'],
       [115, 255381, 155, 13, 17000.0, 3, 4, 3, '134 8 165 84'],
       [115, 280033, 107, 14, 22000.0, 3, 4, 3, '134 96 165 84'],
       [115, 302994, 111, 15, 23000.0, 3, 4, 3, '134 107 95'],
       [115, 328686, 7903, 16, 21000.0, 3, 4, 4, '134 96 84'],
       [115, 352686, 7904, 17, 24000.0, 3, 4, 4, '134 133 95'],
       [115, 376162, 7974, 18, 22000.0, 3, 4, 4, '134 96 95'],
     

In [22]:
from scipy import sparse

In [34]:
user_groups_arr1 = user_groups_arr.apply(lambda row: sparse.coo_matrix(row[:, :8].astype(np.float32)))

In [41]:
user_groups_arr2 = user_groups_arr.apply(lambda row: row[:, 8])

In [43]:
user_groups_arr2[115]

array(['154', '171', '134 152 95', '134 107 84', '134 152 95',
       '134 104 165 41', '134 190 84', '134 104 41', '134 107 165 84',
       '134 71 95', '134 8 165 84', '134 96 165 84', '134 107 95',
       '134 96 84', '134 133 95', '134 96 95', '12 13 84',
       '64 113 165 32', '12 13 95', '13 181 32', '134 8 165 41',
       '134 107 84', '13 167 165 84', '134 114 84', '134 39 165 84',
       '13 97 95', '13 97 165 95', '13 167 84', '134 102 32', '12 13 95',
       '134 8 84', '134 8 95', '134 96 165 32', '134 190 32',
       '134 96 165 95', '134 133 165 41', '64 113 165 95', '54 134 105',
       '54 134 95', '146 143 84 32', '139 165 95 32', '160 95 32',
       '139 95 32', '77 106 32', '139 106 32', '139 106 32'], dtype=object)

In [46]:
# The handle is deliberately not called `f`: `f` is datatable's
# column-expression namespace (from datatable import f), and rebinding it
# would break every datatable cell that is re-run after this one.
with open("data/state_feat1.pkl", "wb") as feat_file: # 4.96 GB
  pickle.dump(user_groups_arr1.to_dict(), feat_file)

In [44]:
user_groups_arr1.to_hdf("data/state_feat1.h5", key="states1")
user_groups_arr2.to_hdf("data/state_feat2.h5", key="states2")

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->values] [items->None]

  pytables.to_hdf(


OverflowError: value too large to convert to int

In [40]:
user_groups_arr1[115].toarray() == user_groups_arr[115][:, :8].astype(np.float32)

array([[ True,  True,  True,  True, False, False,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True],
       [ True,  True,  Tr

In [32]:
s = sparse.coo_matrix(user_groups_arr[115][:, :7].astype(np.float32))

In [33]:
s.toarray()

array([[1.1500000e+02, 0.0000000e+00, 5.6950000e+03, 4.0000000e+00,
                  nan,           nan, 8.0000000e+00],
       [1.1500000e+02, 5.6943000e+04, 5.7190000e+03, 5.0000000e+00,
        3.7000000e+04, 3.0000000e+00, 8.0000000e+00],
       [1.1500000e+02, 1.1836300e+05, 1.3100000e+02, 3.0000000e+00,
        5.5000000e+04, 3.0000000e+00, 4.0000000e+00],
       [1.1500000e+02, 1.3116700e+05, 7.8630000e+03, 6.0000000e+00,
        1.9000000e+04, 3.0000000e+00, 4.0000000e+00],
       [1.1500000e+02, 1.3796500e+05, 7.9250000e+03, 7.0000000e+00,
        1.1000000e+04, 3.0000000e+00, 4.0000000e+00],
       [1.1500000e+02, 1.5706300e+05, 1.5900000e+02, 8.0000000e+00,
        5.0000000e+03, 3.0000000e+00, 4.0000000e+00],
       [1.1500000e+02, 1.7609200e+05, 5.4000000e+01, 9.0000000e+00,
        1.7000000e+04, 3.0000000e+00, 4.0000000e+00],
       [1.1500000e+02, 1.9419000e+05, 5.3000000e+01, 1.0000000e+01,
        1.7000000e+04, 3.0000000e+00, 4.0000000e+00],
       [1.1500000e+02, 2

In [19]:
!pip install scipy

Collecting scipy
  Downloading scipy-1.5.4-cp38-cp38-manylinux1_x86_64.whl (25.8 MB)
[K     |████████████████████████████████| 25.8 MB 4.2 MB/s eta 0:00:01
Installing collected packages: scipy
Successfully installed scipy-1.5.4
