In [25]:
import pandas as pd
import numpy as np
import datetime
import time
import math
import geocoder

In [29]:
DISCOUNT = 90 #events become half as valuable after 3 months

activity_df = pd.read_csv('activity_data.csv')
hotel_df = pd.read_csv('hotel_data.csv').set_index('hotel_id')

names = {'parent_brand_name':['Hyatt Hotels Corporation'
,'Preferred Hotel Group','Wyndham Hotel Group'
,'Choice Hotels International, Inc.','InterContinental Hotels Group PLC'
,'Hilton Worldwide','Marriott International, Inc.'],
        'hotel_type':['B&B','Hostel','Condo','Hotel']} 

def encode_values(data, feature_list): #returns columns of one-hot-encoded data with preselected features
    keylist = []
    for key in feature_list.keys():
        if key in data.columns:
            data.loc[(~data[key].isin(feature_list[key]))&(~data[key].isnull()),key] ='other'
            data.loc[data[key].isnull(),key] = 'null'
            keylist.append(key)
    return pd.get_dummies(data, columns = keylist).select_dtypes('uint8')

In [30]:
to_timestamp = lambda x: datetime.datetime.strptime(x,'%Y-%m-%d').timestamp()
activity_df['timestamp'] = activity_df['date'].apply(to_timestamp)
#PUT IN GEOLOCATOR LATER FOR HOTELS AND FOR USERS

def encode(a):
    encodings = {'view':1, 'price_click':2, 'hotel_website_click':3, 'booking':4}
    return encodings[a]

activity_df['user_action'] = activity_df['user_action'].apply(encode)
activity_df = pd.get_dummies(activity_df, columns = ['device'])
activity_df['event_count'] = 1

def geocode(a):
    g=geocoder.arcgis(a)
    return g.lat, g.lng

#activity_df['usr_lat'], activity_df['usr_long'] = activity_df['user_country'].apply(geocode)
activity_df = activity_df.drop(columns=['user_country','date'])

current_time = time.time()
decay = lambda coef,x: coef*math.e**((current_time-x)/-(86400000*DISCOUNT/0.693147181))
cols = list(set(activity_df.columns)-set(['user_id','hotel_id','timestamp','event_count']))

for col in cols:
    activity_df[col] = decay(activity_df[col], activity_df['timestamp'])
   
#VECTOR EMBEDDING OF HOTEL NAME USING NLP AS FEATURE LATER ON
hotel_df = hotel_df.join(encode_values(hotel_df, names))
hotel_df = hotel_df.drop(columns = ['brand_name']+list(names.keys()))

df = activity_df.join(hotel_df, how = 'inner', on = 'hotel_id')
df = df.drop(columns = ['timestamp'])
df = df.groupby(['user_id','hotel_id']).sum()
df['user_action'] = df['user_action']/df['event_count']
df.count()

user_action                                            1220985
device_android_browser                                 1220985
device_android_hybrid_app                              1220985
device_android_native_app                              1220985
device_android_tablet_browser                          1220985
device_android_tablet_hybrid_app                       1220985
device_android_tablet_native_app                       1220985
device_ipad_browser                                    1220985
device_ipad_hybrid_app                                 1220985
device_ipad_native_app                                 1220985
device_iphone_browser                                  1220985
device_iphone_hybrid_app                               1220985
device_iphone_native_app                               1220985
device_linux                                           1220985
device_osx                                             1220985
device_other                                           

In [24]:
rows = df.sample(frac =.10)
hotel_ids = list(set(rows.index.get_level_values('hotel_id')))
user_ids = list(set(rows.index.get_level_values('user_id')))
print(len(hotel_ids))
print(len(user_ids))

arr =pd.DataFrame(0, index = user_ids, columns = hotel_ids)
arr.head()

for uid in arr.index:
    for hid in arr.columns:
        if df.index.isin([(7, 5000)]).any():
            arr.loc[uid,hid] = df[(uid,hid),]

912
103664


Unnamed: 0,251906,313346,3235844,247814,1007626,9015308,6717455,9984018,10149907,2600980,...,10022886,8085477,1644524,1732589,2109422,2441200,8722421,208888,251899,247805
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
262145,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
262150,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
