In [None]:
%matplotlib inline
import pandas as pd

In [None]:
# Load the merged dataset into the notebook-level frame `df`; the path is
# relative to the kernel's working directory.
df = pd.read_csv('merged_dataset.csv')
# Show which columns are available.
print(df.columns)


In [None]:
# Number of rows in the loaded frame.
len(df)

In [None]:
def premium_boost(x):
    """Return the score multiplier for a row: 2 when ``x['user_premium']`` is truthy, otherwise 1."""
    if x['user_premium']:
        return 2
    return 1

def clicked(x):
    """Tell whether the row's ``interaction_type`` is 1 (the code this notebook treats as a click)."""
    interaction = x['interaction_type']
    return interaction == 1

def bookmarked(x):
    """Tell whether the row's ``interaction_type`` is 2 (the code this notebook treats as a bookmark)."""
    interaction = x['interaction_type']
    return interaction == 2

def replied(x):
    """Tell whether the row's ``interaction_type`` is 3 (the code this notebook treats as a reply)."""
    interaction = x['interaction_type']
    return interaction == 3

def recruiter_interest(x):
    """Tell whether the row's ``interaction_type`` is 5 (the code this notebook treats as recruiter interest)."""
    interaction = x['interaction_type']
    return interaction == 5

def deleted(x):
    """Tell whether the row's ``interaction_type`` is 4 (the code this notebook treats as a deletion)."""
    interaction = x['interaction_type']
    return interaction == 4

def user_success(x):
    """Compute the user-side score of one interaction row.

    Points are summed over the interaction predicates and then multiplied by
    ``premium_boost(x)``:

    * click: +1
    * bookmark or reply: +5
    * recruiter interest: +20
    * delete: -10

    Parameters
    ----------
    x : mapping-like row exposing ``'interaction_type'`` and ``'user_premium'``.

    Returns
    -------
    The boosted point total; 0 when none of the predicates match.

    Notes
    -----
    The previous one-line version referenced ``deleted`` without calling it.
    A function object is always truthy, so every row that was not a click,
    bookmark, reply or recruiter interest (for example interaction type 0)
    was scored -10. The unparenthesised ``a if c else b + ...`` chain also
    nested the terms instead of summing them; each term is now added
    explicitly.
    """
    points = 0
    if clicked(x):
        points += 1
    if bookmarked(x) or replied(x):
        points += 5
    if recruiter_interest(x):
        points += 20
    if deleted(x):  # must be called: the bare name is always truthy
        points -= 10
    return premium_boost(x) * points

def item_success(x):
    """Return the item-side score of a row: 5 when ``x["item_is_payed"]`` is truthy, otherwise 0."""
    if x["item_is_payed"]:
        return 5
    return 0

def score(x):
    """Total score of one row: the item-side score plus the user-side score."""
    item_part = item_success(x)
    user_part = user_success(x)
    return item_part + user_part

In [None]:
#scores = df.apply(score, axis=1)

In [None]:
#print(scores.mean())
#print(scores.max())
#print(scores.min())
#scores.plot.hist()

In [None]:
# Re-check the column names before the encoding steps below.
print(df.columns)

In [None]:
# Replace the country labels with the integer labels 0..3 by converting each
# column to a categorical and renaming its categories.
# NOTE(review): the explicit list [0,1,2,3] hard-codes exactly four distinct
# countries per column; pandas rejects a list whose length differs from the
# number of categories - confirm against the data.
df['user_country'] = df['user_country'].astype('category').cat.rename_categories([0,1,2,3])
df['item_country'] = df['item_country'].astype('category').cat.rename_categories([0,1,2,3])

In [None]:
# Display the frame after the country columns were recoded.
df

In [None]:
def n_hot_encode(column, frame=None):
    """Build an n-hot (multi-hot) vector for every row of one column.

    Each cell of ``frame[column]`` is treated as an iterable of elements.
    The vocabulary is the set of distinct elements over all rows, kept in
    first-seen order so the position of every element is reproducible from
    one run to the next (plain ``set`` iteration order is not).

    Parameters
    ----------
    column : str
        Name of the column to encode.
    frame : pandas.DataFrame, optional
        Frame to read from. Defaults to the notebook-level ``df``, which is
        what the original single-argument version always used.

    Returns
    -------
    list of list of int
        One vector per row, in row order. Entry ``k`` is 1 when the row
        contains the ``k``-th vocabulary element and 0 otherwise.

    Notes
    -----
    NOTE(review): ``df`` is loaded with ``pd.read_csv`` without converters.
    If the cells are plain strings rather than lists, iterating them yields
    single characters, so the vocabulary becomes a character set - confirm
    that this is intended.
    """
    if frame is None:
        frame = df  # fall back to the notebook-level frame

    # Materialise the cells once instead of doing a Series lookup per
    # (row, vocabulary element) pair.
    rows = list(frame[column])

    # dict.fromkeys de-duplicates while preserving first-seen order.
    vocabulary = list(dict.fromkeys(elem for row in rows for elem in row))

    vectors = []
    for row in rows:
        present = set(row)  # constant-time membership tests for this row
        vectors.append([1 if elem in present else 0 for elem in vocabulary])

    return vectors

In [None]:
from multiprocessing import Pool

# Encode the four multi-valued columns in parallel, one task per column.
# NOTE(review): the workers read the notebook-level `df` inside
# n_hot_encode; this presumably relies on the worker processes inheriting
# it (fork start method) - confirm on other platforms.
encoded_columns = [
    'user_title',
    'item_title',
    'item_tags',
    'user_edu_fieldofstudies'
]

# The context manager shuts the worker processes down once the blocking
# map() has returned; previously the pool was never closed.
with Pool(4) as p:
    results = p.map(n_hot_encode, encoded_columns)

# map() preserves input order, so results[i] belongs to encoded_columns[i].
for column_name, vectors in zip(encoded_columns, results):
    df['nhot_' + column_name] = vectors

In [None]:
# Row count per interaction type, followed by the column list, which now
# includes the nhot_* columns.
print(df.interaction_type.value_counts())
print(df.columns)

In [None]:
# save df with nhot encoded columns
# The nhot_* cells are Python lists, so they are written out as text; the
# loading cell parses them back with ast.literal_eval. The index is written
# as well and shows up as 'Unnamed: 0' when the file is read again.
df.to_csv('nhot_df.csv')

In [None]:
%%time
import ast
import pandas as pd
import numpy as np

# The arrays are preallocated, so the row and feature counts must be known
# up front; they are named here instead of being inline literals.
# (26614314 x 73 float64 values is roughly 15.5 GB.)
n_rows = 26614314
n_features = 73
data = np.zeros(shape=(n_rows, n_features))
target = np.zeros(shape=(n_rows,))
next_index = 0
chunk_size = 1000000

# Columns that are not copied verbatim into the feature vector: the CSV
# index, the raw multi-valued columns, their n-hot encodings (appended
# separately in full_encoding), the label and the two timestamps.
excluded_columns = ['Unnamed: 0', 'user_title', 'item_title', 'user_edu_fieldofstudies', 'item_tags', 'nhot_user_title', 'nhot_item_title', 'nhot_user_edu_fieldofstudies', 'nhot_item_tags', 'interaction_type', 'item_created_at', 'interaction_created_at']


def full_encoding(row):
    """Flatten one CSV row into a single list of feature values.

    The values of the notebook-level ``columns`` array come first, followed
    by the four n-hot vectors, which are stored in the CSV as text and are
    parsed back into lists with ``ast.literal_eval``.
    """
    regular_columns = [row[col] for col in columns]
    nhot_columns = \
        ast.literal_eval(row['nhot_user_title']) + \
        ast.literal_eval(row['nhot_item_title']) + \
        ast.literal_eval(row['nhot_user_edu_fieldofstudies']) + \
        ast.literal_eval(row['nhot_item_tags'])
    return regular_columns + nhot_columns


# Defined once above rather than once per chunk: only `columns` depends on
# the chunk, and full_encoding reads it at call time.
for df_chunk in pd.read_csv('nhot_df.csv', chunksize=chunk_size):
    # np.setdiff1d returns the remaining names sorted and de-duplicated,
    # which fixes the order of the regular feature columns.
    columns = np.setdiff1d(df_chunk.columns.values, excluded_columns)

    for index, row in df_chunk.iterrows():
        # Raises if the file holds more than n_rows rows or a row does not
        # flatten to exactly n_features values.
        data[next_index] = full_encoding(row)
        target[next_index] = row['interaction_type']
        next_index += 1

# Rows that were never filled would otherwise stay all-zero with target 0
# and silently enter training as extra class-0 samples.
if next_index != n_rows:
    print('expected %d rows but read %d; trimming arrays' % (n_rows, next_index))
    data = data[:next_index]
    target = target[:next_index]


In [None]:
# import numpy as np
# columns = np.setdiff1d(df.columns.values, ['Unnamed: 0', 'user_title', 'item_title', 'user_edu_fieldofstudies', 'item_tags', 'nhot_user_title', 'nhot_item_title', 'nhot_user_edu_fieldofstudies', 'nhot_item_tags', 'interaction_type', 'item_created_at', 'interaction_created_at'])
# columns

In [None]:
# def full_encoding(row):
#     regular_columns = [row[col] for col in columns]
#     nhot_columns = list(row['nhot_user_title']) + list(row['nhot_item_title']) + list(row['nhot_user_edu_fieldofstudies']) + list(row['nhot_item_tags'])
#     return regular_columns + nhot_columns

In [None]:
#encoded_df = df.apply(full_encoding, axis=1)

In [None]:
# data = np.zeros(shape=(len(encoded_df), len(encoded_df[0])))

# # fill array with data taken from the df
# for i, row in enumerate(encoded_df):
#     data[i] = np.asarray(row)

In [None]:
# target = df['interaction_type'].values

In [None]:
%%time
# save array to be used for training
print(data.shape)

# `with` closes each file as soon as it is written; previously both handles
# stayed open for the lifetime of the kernel.
with open('data.npy', 'wb') as save_x_file:
    np.save(save_x_file, data)

with open('target.npy', 'wb') as save_y_file:
    np.save(save_y_file, target)

In [1]:
%%time
# load array for training
import numpy as np

# Read both arrays fully into memory. `with` closes the file handles
# afterwards; previously they were left open.
with open('data.npy', 'rb') as save_x_file:
    data = np.load(save_x_file)

with open('target.npy', 'rb') as save_y_file:
    target = np.load(save_y_file)

CPU times: user 140 ms, sys: 7.58 s, total: 7.72 s
Wall time: 7.37 s


In [2]:
import tensorflow as tf
# Create a TensorFlow session configured with log_device_placement=True.
# NOTE(review): `sess` is not referenced again in the visible cells - confirm
# whether it is still needed.
sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
from keras.layers import Dense, Dropout
from keras.models import Sequential
from sklearn.model_selection import train_test_split

  return f(*args, **kwds)
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
# remove rows with nan values
# NOT NECESSARY BECAUSE WE'RE NOT USING THE TIMESTAMPS WHICH CONTAINED nan VALUES

#print(encoded_df.shape)
#clean_df = encoded_df[np.isnan(data).any(axis=1)]
#print(clean_df.shape)

In [4]:
%%time
# resample the data to get more balanced classes
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.datasets import make_imbalance
from collections import Counter

def ratio_multiplier(y):
    """Compute the per-class sample counts to keep when under-sampling.

    Classes 0 and 4 are scaled to 10% of their current size (truncated to an
    integer); classes 1, 2, 3 and 5 keep their size. A label outside 0-5
    raises ``KeyError``.

    Returns a ``Counter`` mapping each label found in ``y`` to its target
    count.
    """
    keep_fraction = {0: 0.1, 1: 1, 2: 1, 3: 1, 4: 0.1, 5: 1}
    return Counter({
        label: int(count * keep_fraction[label])
        for label, count in Counter(y).items()
    })

# Class counts before resampling.
print(sorted(Counter(target).items()))
# Step 1: under-sample to the per-class counts computed by ratio_multiplier.
x_resampled, y_resampled = RandomUnderSampler(random_state=42, ratio=ratio_multiplier).fit_sample(data, target)
print(sorted(Counter(y_resampled).items()))
# Step 2: over-sample so the classes end up with equal counts (see the
# printed counts).
# NOTE(review): this happens before the train/test split in a later cell, so
# duplicated rows can land on both sides of the split - confirm that this is
# acceptable for the validation numbers.
x_resampled, y_resampled = RandomOverSampler(random_state=42).fit_sample(x_resampled, y_resampled)
print(sorted(Counter(y_resampled).items()))

[(0.0, 20631504), (1.0, 4717347), (2.0, 264646), (3.0, 91526), (4.0, 906799), (5.0, 2492)]
[(0.0, 2063150), (1.0, 4717347), (2.0, 264646), (3.0, 91526), (4.0, 90679), (5.0, 2492)]
[(0.0, 4717347), (1.0, 4717347), (2.0, 4717347), (3.0, 4717347), (4.0, 4717347), (5.0, 4717347)]
CPU times: user 42.3 s, sys: 8.24 s, total: 50.5 s
Wall time: 49.9 s


In [5]:
from keras.utils import to_categorical

# One-hot encode the class labels for the softmax output layer.
# Not idempotent: the name is rebound, so re-running this cell would encode
# the already-encoded array again.
y_resampled = to_categorical(y_resampled)

In [6]:
# Hold out 20% of the resampled rows. The split is seeded (same seed as the
# samplers above) so that re-running the notebook yields the same partition;
# previously it changed on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x_resampled,
    y_resampled,
    test_size=0.2,
    random_state=42
)

In [7]:
# Fully connected classifier: 1024 -> 1024 -> 512 hidden units with dropout,
# followed by a softmax over the one-hot encoded classes.
input_dim = x_resampled.shape[1]
model = Sequential()

# Only the first layer declares the input size. The second Dense layer used
# to repeat input_dim, which a Sequential model ignores for non-first layers
# and which wrongly suggested it received the raw features.
model.add(Dense(1024, activation='relu', input_dim=input_dim))
model.add(Dropout(0.25))
model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.25))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(y_resampled.shape[1], activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])

In [None]:
# Alternative, smaller classifier: 1024 -> 1024 hidden units with dropout and
# a softmax output. Running this cell rebinds `model`, replacing whatever
# model was built before it.
input_dim = x_resampled.shape[1]
model = Sequential()

# Only the first layer declares the input size. The second Dense layer used
# to repeat input_dim, which a Sequential model ignores for non-first layers.
model.add(Dense(1024, activation='relu', input_dim=input_dim))
model.add(Dropout(0.25))
model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.25))
#model.add(Dense(256, activation='relu'))
#model.add(Dropout(0.5))
model.add(Dense(y_resampled.shape[1], activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])

In [8]:
# add a callback to stop if we start overfitting

from keras.callbacks import EarlyStopping
# Monitor the validation loss with patience=1 and min_delta=1e-5.
# NOTE(review): with patience=1 a single non-improving epoch presumably ends
# training - confirm that this is not too aggressive.
callbacks = [EarlyStopping(monitor='val_loss', min_delta=0.00001, patience=1, verbose=0, mode='auto')]

In [None]:
#1024,1024
# Train whichever `model` was built last, for up to 20 epochs, with the
# early-stopping callback. The held-out split doubles as validation data, so
# the reported val_* metrics are not an independent test score.
history = model.fit(
    x_train, 
    y_train, 
    validation_data=(x_test, y_test), 
    epochs=20, 
    batch_size=128,
    callbacks=callbacks)

In [None]:
#512,512,256
# NOTE(review): the label above does not match either architecture defined
# in the visible cells (1024/1024/512 and 1024/1024) - presumably left over
# from an earlier experiment.
# Same training call as the previous cell; `history` is overwritten.
history = model.fit(
    x_train, 
    y_train, 
    validation_data=(x_test, y_test), 
    epochs=20, 
    batch_size=128,
    callbacks=callbacks)

Train on 22643265 samples, validate on 5660817 samples
Epoch 1/20
Epoch 2/20
    1792/22643265 [..............................] - ETA: 5:27:23 - loss: 0.9862 - acc: 0.5876 

In [None]:
import matplotlib.pyplot as plt

# Per-epoch metrics recorded by the last model.fit call.
history_dict = history.history
loss = history_dict['loss']
val_loss = history_dict['val_loss']
acc = history_dict['acc']
val_acc = history_dict['val_acc']
epochs = range(1, len(loss) + 1)

# Loss and accuracy live on different scales, so each gets its own axes
# instead of sharing a single unlabelled plot.
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 4))

loss_ax.plot(epochs, loss, label='training loss')
loss_ax.plot(epochs, val_loss, label='validation loss')
loss_ax.set(title='Loss per epoch', xlabel='epoch', ylabel='loss')
loss_ax.legend()

acc_ax.plot(epochs, acc, label='training acc')
acc_ax.plot(epochs, val_acc, label='validation acc')
acc_ax.set(title='Accuracy per epoch', xlabel='epoch', ylabel='accuracy')
acc_ax.legend()

plt.show()

In [None]:
#sample = np.expand_dims(data[0], axis=0)  # predict one instance
# Class probabilities for the first two held-out rows.
print(model.predict(x_test[:2]))  # predict some instances

In [None]:
# Probability vector for the second held-out row (the slice keeps the batch
# dimension that predict() expects).
arr = model.predict(x_test[1:2])[0]
# Highest and lowest class probability together with their class indices.
print(arr.max(), np.argmax(arr), arr.min(), np.argmin(arr))

In [None]:
# Class probabilities for every held-out row.
predictions = model.predict(x_test)

In [None]:
# Number of distinct probability vectors among the predictions; a very small
# count would mean the model outputs nearly the same thing for every row.
np.unique(predictions, axis=0).shape