In [38]:

from random import shuffle

import keras
from keras.models import Model
from keras.layers import Input, Dense, Dropout, Embedding
from keras.callbacks import ModelCheckpoint
import pandas as pd
import numpy as np
import scipy.sparse

In [48]:
# Load the member shortlist; missing cells are replaced by empty strings.
data_path = '/users/PAS1315/osu9187/wsdm/New_Data/'
members = pd.read_csv(data_path + 'mem_shortlist.csv').fillna('')
print("Data Loaded")

# Each date column is reduced to a year: the first four characters of
# str(value) converted to int. The raw date columns are dropped afterwards.
for year_column, date_column in (('registration_year', 'registration_init_time'),
                                 ('expiration_year', 'expiration_date')):
    members[year_column] = members[date_column].apply(
        lambda stamp: int(str(stamp)[:4]))
members = members.drop(['registration_init_time', 'expiration_date'], axis=1)

print(members.head())
print(len(members))

Data Loaded
                                           msno  city  bd gender  \
0  XQxgAYj3klVKjR3oxPPXYYFp4soD4TuBghkhMTD4oTw=     1   0          
1  UizsfmJb9mV54qE9hCYyU07Va97c0lCRLEQX3ae+ztM=     1   0          
2  D8nEhsIOBSoE6VthTaqDX8U6lqjJ7dLdr72mOyLya2A=     1   0          
3  mCuD+tZ1hERA/o5GPqk38e041J8ZsBaLcu7nGoIIvhI=     1   0          
4  q4HRBfVSssAFS9iRfxWrohxuk9kCYMKjHOEagUMV6rQ=     1   0          

   registered_via  registration_year  expiration_year  
0               7               2011             2017  
1               7               2015             2017  
2               4               2016             2017  
3               9               2015             2015  
4               4               2017             2017  
34403


In [40]:
def get_unique_entities(data, column):
    """Return the distinct values found in ``data[column]``.

    Every cell is treated as one atomic value, in order of first appearance
    (this is what ``.unique()`` returns).

    NOTE(review): cells holding several values joined by '|' are NOT split
    here; such a cell counts as a single entity. Split upstream if needed.
    """
    return data[column].unique()

In [41]:
def _mapper_index(mapper, element):
    """Return the one-hot position of ``element`` in ``mapper``.

    Values missing from the mapper fall back to the ``'<unk>'`` entry when the
    mapper has one; otherwise the original ``KeyError`` is re-raised.
    """
    try:
        return mapper[element]
    except KeyError:
        if '<unk>' in mapper:
            return mapper['<unk>']
        raise


def to_one_hot(batch_rows, mappers):
    """One-hot encode every column of a batch.

    Parameters
    ----------
    batch_rows : pandas.DataFrame
        Batch to encode; column ``i`` is encoded with ``mappers[i]``.
    mappers : sequence of dict
        One value -> index dictionary per column, in column order. The width
        of each encoded array is ``len(mappers[i])``.

    Returns
    -------
    tuple
        ``(first_column_encoding, [encodings of the remaining columns])``.
        Each encoding is a float array of shape ``(rows, len(mapper))``.

    Raises
    ------
    KeyError
        If a value is missing from its mapper and the mapper has no
        ``'<unk>'`` entry.
    IndexError
        If the batch has no columns or there are fewer mappers than columns.
    """
    batch_size, num_columns = batch_rows.shape
    row_positions = np.arange(batch_size)

    one_hot = []
    for i in range(num_columns):
        mapper = mappers[i]
        encoded = np.zeros((batch_size, len(mapper)))
        # dtype=int keeps the index array valid even for an empty batch
        # (an empty list would otherwise become a float array).
        hot_positions = np.array(
            [_mapper_index(mapper, element) for element in batch_rows.iloc[:, i]],
            dtype=int)
        # Set one cell per row: (row r, column hot_positions[r]).
        encoded[row_positions, hot_positions] = 1
        one_hot.append(encoded)

    return (one_hot[0], one_hot[1:])

def generate_mapper(data, column):
    """Build a value -> index dictionary for one column.

    Index 0 is reserved for the ``'<unk>'`` placeholder; every distinct value
    returned by ``get_unique_entities(data, column)`` gets the next free
    index, in the order the values are returned.

    A column value that is already a key (in practice the literal string
    ``'<unk>'``) is skipped, so it keeps the reserved index instead of
    overwriting it. Overwriting would make two keys share an index or push an
    index past ``len(mapper) - 1``.
    """
    mapper = {'<unk>': 0}
    for u in get_unique_entities(data, column):
        if u not in mapper:
            mapper[u] = len(mapper)
    return mapper

In [42]:
# One mapper per column, listed in the order the batch columns are encoded:
# the input column ('msno') first, then the three target columns.
mappers = list(
    generate_mapper(members, column_name)
    for column_name in ('msno', 'city', 'registered_via', 'registration_year'))

msno_mapper, city_mapper, reg_via_mapper, reg_year_mapper = mappers

In [43]:
def batch_generator(data, input_columns, target_columns, mappers, batch_size):
    """Endlessly yield one-hot encoded mini-batches drawn from ``data``.

    The rows are shuffled once, when the generator is first advanced, and the
    same order is replayed on every pass. Each pass yields
    ``ceil(len(data) / batch_size)`` batches; the last one may be smaller,
    but no batch is ever empty.

    Parameters
    ----------
    data : pandas.DataFrame
        Source rows.
    input_columns, target_columns : list of str
        Column names; they are concatenated in this order before encoding.
    mappers : sequence of dict
        Value -> index dictionaries, one per column of
        ``input_columns + target_columns``, in that order.
    batch_size : int
        Maximum number of rows per batch; must be positive.

    Yields
    ------
    Whatever ``to_one_hot`` returns for the batch.
    NOTE(review): ``to_one_hot`` treats only the FIRST column as the model
    input, so more than one input column looks unsupported -- confirm.

    Raises
    ------
    ValueError
        On first iteration, if ``batch_size`` is not positive or ``data`` has
        no rows. Previously a negative size looped forever without yielding
        and a zero size raised ``ZeroDivisionError``.
    """
    num_rows = data.shape[0]
    if batch_size <= 0:
        raise ValueError("batch_size must be positive, got %r" % (batch_size,))
    if num_rows == 0:
        raise ValueError("cannot generate batches from an empty data set")

    # Select the needed columns once instead of on every batch.
    selected = data[input_columns + target_columns]
    permutation = np.random.permutation(num_rows)
    # Ceiling division: an exact multiple of batch_size must not produce a
    # trailing empty batch.
    num_batches = (num_rows + batch_size - 1) // batch_size

    while True:
        for batch_number in range(num_batches):
            start = batch_number * batch_size
            batch_indices = permutation[start:start + batch_size]
            yield to_one_hot(selected.iloc[batch_indices], mappers)

In [44]:
class LossHistory(keras.callbacks.Callback):
    """Keras callback that records the training loss reported after each epoch.

    ``losses`` is (re)created as an empty list when training begins, so it
    only exists after a fit has started.
    """

    def on_train_begin(self, logs=None):
        # Start a fresh history for every training run.
        self.losses = []

    def on_epoch_end(self, epoch, logs=None):
        # `logs` may be None; in that case (or if 'loss' is absent) None is
        # appended, matching dict.get semantics.
        self.losses.append((logs or {}).get('loss'))

In [45]:
# Batches of 64 rows: 'msno' is the model input, the other three columns are
# the prediction targets. Column order must match the order of `mappers`.
batch_size = 64
input_generator = batch_generator(
    data=members,
    input_columns=['msno'],
    target_columns=['city', 'registered_via', 'registration_year'],
    mappers=mappers,
    batch_size=batch_size,
)

In [46]:
# Hyper-parameters of the network.
input_col = 'msno'
num_hidden_units = 128
hidden_activation = 'relu'
dropout = 0.5
batch_size = 64

# mappers[0] sizes the one-hot input; mappers[1..3] size the softmax heads.
input_shape = len(mappers[0])
output_shapes = [
    len(mappers[1]),
    len(mappers[2]),
    len(mappers[3]),
]

# One shared hidden layer (dense + dropout) feeding three softmax outputs.
input_features = Input(shape=(input_shape,))
dropout_layer = Dropout(dropout)
hidden_dense = Dense(num_hidden_units, activation=hidden_activation)
hidden = dropout_layer(hidden_dense(input_features))

output_0, output_1, output_2 = (
    Dense(units, activation='softmax')(hidden) for units in output_shapes)

model = Model(inputs=[input_features],
              outputs=[output_0, output_1, output_2])
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
print("model compiled")

model compiled


In [47]:
lossHistory = LossHistory()
# NOTE(review): with period=5 and epochs=1 below, no checkpoint is expected to
# be written during this run -- confirm this is intended. The target directory
# must already exist when a checkpoint is saved.
weights_saver = ModelCheckpoint(filepath='./model_user_embeddings/weights.{epoch:02d}.hdf5', verbose=1, period=5)

# Ceiling integer division: one step per batch, including the final partial
# batch. Plain `/` floors in Python 2 (dropping that batch from the epoch) and
# produces a float in Python 3.
steps_per_epoch = (members.shape[0] + batch_size - 1) // batch_size

model.fit_generator(input_generator, steps_per_epoch=steps_per_epoch, epochs=1, callbacks=[lossHistory, weights_saver])

Epoch 1/1


<keras.callbacks.History at 0xa0a2050>

In [49]:
# Per-epoch training losses collected by the LossHistory callback.
print(lossHistory.losses)

[5.5930583445941489]
