In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sps

Here we do the following with the categorical features of the users:
1. Fill the NaNs of each column with its mode (only C5 and C6 contain NaNs).
2. One-hot encode each feature and store the result as a sparse matrix.

In [2]:
# Read the first seven columns of the user table; the cells below use
# `user_id` and the categorical columns C1..C6 from it.
df_users = pd.read_csv('users.csv', usecols=list(range(7)))

# Fill value for each of C1..C6, in column order (described in the notebook
# text as each column's mode).
modes = [
    "lo4tWwhoe/Q=",
    "TxgzQwAiMiM=",
    "coyERY/CJaE=",
    "yk3oTcFByB8=",
    "zAYUYHDsV6s=",
    "7Xs3zpQ+FE8=",
]

# Replace missing values column by column with the matching fill value.
for position in range(1, 7):
    column = 'C{}'.format(position)
    df_users[column] = df_users[column].fillna(modes[position - 1])

In [5]:
# Check that user_id is non-decreasing down the rows.
# `Series.is_monotonic` was deprecated in pandas 1.5 and removed in 2.0;
# `is_monotonic_increasing` is the equivalent, supported spelling.
df_users.user_id.is_monotonic_increasing

True

In [4]:
# One look-up table per categorical column: category value -> one-hot column
# index, numbered in order of first appearance in the column.
c_dict = {}
for col_number in range(1, 7):
    col_name = 'C{}'.format(col_number)
    c_dict[col_name] = {
        value: position
        for position, value in enumerate(df_users[col_name].unique())
    }

# Replace Modes: if a NaN key is present in a table, point it at the same
# index as that column's fill value from `modes`.
for col_number in range(1, 7):
    table = c_dict['C{}'.format(col_number)]
    if np.nan in table:
        table[np.nan] = table[modes[col_number - 1]]

In [5]:
# Cache of finished one-hot matrices, keyed by column name.
sparses = {}

In [6]:
for col_name in c_dict:
    # Report progress
    print( col_name )
    # Skip columns that are already encoded, so the cell can be re-run
    if col_name in sparses:
        continue
    # Look-up table: category value -> one-hot column index
    lut = c_dict[col_name]
    n_rows = len(df_users)
    # Column index of the single active entry of every row.
    # A value missing from the table still raises KeyError, as before.
    col_indices = np.fromiter(
        ( lut[value] for value in df_users[col_name] ),
        dtype=np.intp, count=n_rows
    )
    # Build the CSR matrix in one shot from (row, col) coordinates instead of
    # filling a DOK matrix element by element (same result, far less
    # Python-level work).  The builtin `bool` replaces the `np.bool` alias,
    # which was removed in NumPy 1.24.
    sparses[col_name] = sps.csr_matrix(
        ( np.ones(n_rows, dtype=bool), (np.arange(n_rows), col_indices) ),
        shape=(n_rows, len(lut)),
        dtype=bool
    )

C1
C2
C3
C4
C5
C6


In [7]:
# Concatenate the six one-hot blocks side by side (C1, C2, ..., C6) into a
# single boolean CSR matrix.  `format='csr'` makes hstack return CSR directly,
# and the builtin `bool` replaces the `np.bool` alias removed in NumPy 1.24.
sparse_all = sps.hstack(
    [ sparses['C{}'.format(idx+1)] for idx in range(6) ],
    format='csr',
    dtype=bool
)

# Sparse matrix of the one-hot encoded categorical features

In [8]:
# Persist the combined one-hot matrix to disk in SciPy's .npz format.
sps.save_npz(file='users_cats.npz', matrix=sparse_all)

In [9]:
# Read the saved one-hot matrix back from disk.
X = sps.load_npz(file='users_cats.npz')

In [16]:
# Preview rows 10..39 as a dense unsigned-integer array.
X[10:40].toarray().astype(np.uint)

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ..., 
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]], dtype=uint64)

# Data Generating Function for AutoEncoder

In [36]:
# Encoder Data Generator
class EncoderDataGenerator(keras.utils.Sequence):
    """Keras `Sequence` that yields `(X, X)` batches for auto-encoder training.

    Each batch is built from a dense matrix, a sparse matrix, or both stacked
    column-wise (dense columns first). Sparse rows are densified only for the
    rows of the current batch.

    Parameters
    ----------
    X_dense : indexable with an integer index array and exposing `.shape`, or None
        Dense features (presumably a NumPy array - confirm against callers).
    X_sparse : scipy sparse matrix supporting row indexing, or None
        Sparse features.
    batch_size : int
        Number of rows per batch.
    shuffle : bool
        Whether to reshuffle the row order at construction and after each epoch.

    Raises
    ------
    ValueError
        If both inputs are None, or if both are given with different row counts.
    """
    def __init__(self, X_dense, X_sparse, batch_size = 32, shuffle = True):
        'Initialization'
        # NOTE(review): newer Keras releases expect Sequence subclasses to call
        # the base initializer; it is a no-op on older ones - confirm for the
        # Keras version in use.
        super().__init__()

        if X_dense is None and X_sparse is None:
            raise ValueError( 'At least one of X_dense / X_sparse must be given' )
        if (
            X_dense is not None and X_sparse is not None
            and X_dense.shape[0] != X_sparse.shape[0]
        ):
            raise ValueError( 'X_dense and X_sparse must have the same number of rows' )

        self.batch_size = batch_size
        self.shuffle = shuffle

        self.X_dense = X_dense
        self.X_sparse = X_sparse

        # Number of samples, taken from whichever input is available
        self.n = X_dense.shape[0] if X_dense is not None else X_sparse.shape[0]

        # Build the initial (possibly shuffled) index order
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        # Floor division: a trailing partial batch is dropped
        return self.n // self.batch_size

    def __getitem__(self, index):
        'Generate one batch of data'
        # Row indices belonging to this batch
        indices = self.indices[ index*self.batch_size : (index+1)*self.batch_size ]
        # `.toarray()` yields a plain ndarray, whereas `.todense()` returns the
        # legacy np.matrix type, which would leak into the batch.
        if self.X_dense is None:
            _X = self.X_sparse[indices].toarray()
        elif self.X_sparse is None:
            _X = self.X_dense[indices]
        else:
            _X = np.hstack( (self.X_dense[indices], self.X_sparse[indices].toarray()) )
        # Cast to float64; the `np.float` alias was removed in NumPy 1.24
        _X = _X.astype( np.float64 )
        # Auto-encoder: the input doubles as the reconstruction target
        return _X, _X

    def on_epoch_end(self):
        'Updates indices after each epoch'
        self.indices = np.arange(self.n)
        if self.shuffle:
            np.random.shuffle(self.indices)

In [43]:
# Feed the one-hot matrix `X` loaded from 'users_cats.npz' in an earlier cell.
# The original referenced `X_sparse`, which no visible cell defines, so the
# cell failed on a fresh kernel.
data_generator = EncoderDataGenerator( None, X, batch_size = 2**10 )
# Alternative set-ups (they need an `X_dense` array, not defined in this notebook):
# data_generator = EncoderDataGenerator( X_dense, None, batch_size = 32 )
# data_generator = EncoderDataGenerator( X_dense, X, batch_size = 32 )