In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Read the raw train and test user tables (paths are relative to the working directory).
train_users = pd.read_csv('train_users_2.csv')
test_users = pd.read_csv('test_users.csv')


# Labels: the country_destination column of the train table.
train_users_labels = train_users.loc[:, 'country_destination']
print (train_users_labels.head(n=5))

# Attributes: every train column except the label. The label is dropped by name
# rather than by the positional slice 0:15, so the split no longer depends on the
# column order or count of the CSV. drop() also returns a new frame, so the columns
# added in later cells are not written into a slice of the raw table.
train_users_attrs = train_users.drop(['country_destination'], axis=1)
print(train_users_attrs.head(n=5))

# From here on `train_users` holds the attributes only.
train_users = train_users_attrs

0      NDF
1      NDF
2       US
3    other
4       US
Name: country_destination, dtype: object
           id date_account_created  timestamp_first_active date_first_booking  \
0  gxn3p5htnn           2010-06-28          20090319043255                NaN   
1  820tgsjxq7           2011-05-25          20090523174809                NaN   
2  4ft3gnwmtx           2010-09-28          20090609231247         2010-08-02   
3  bjjt8pjhuk           2011-12-05          20091031060129         2012-09-08   
4  87mebub9p4           2010-09-14          20091208061105         2010-02-18   

      gender  age signup_method  signup_flow language affiliate_channel  \
0  -unknown-  NaN      facebook            0       en            direct   
1       MALE   38      facebook            0       en               seo   
2     FEMALE   56         basic            3       en            direct   
3     FEMALE   42      facebook            0       en            direct   
4  -unknown-   41         basic           

In [4]:
# Break date_account_created ('YYYY-MM-DD' text) into integer year / month / day
# columns in both train and test, then discard the original text column.

date_acc_created = np.array(
    [[int(part) for part in str(stamp).split('-')]
     for stamp in train_users['date_account_created']])
for col_pos, col_name in enumerate(['created_year', 'created_month', 'created_day']):
    train_users[col_name] = date_acc_created[:, col_pos]
train_users = train_users.drop(['date_account_created'], axis=1)

date_acc_created_test = np.array(
    [[int(part) for part in str(stamp).split('-')]
     for stamp in test_users['date_account_created']])
for col_pos, col_name in enumerate(['created_year', 'created_month', 'created_day']):
    test_users[col_name] = date_acc_created_test[:, col_pos]
test_users = test_users.drop(['date_account_created'], axis=1)

In [5]:
# Collapse both forms of "gender not given" (the literal '-unknown-' and real nulls)
# into the single placeholder -1, in train and test alike.
for frame in (train_users, test_users):
    gender_missing = (frame['gender'] == '-unknown-') | frame['gender'].isnull()
    frame.loc[gender_missing, 'gender'] = -1

In [6]:
# Numeric gender codes used for both train and test:
# FEMALE -> 0, MALE -> 1, OTHER -> 2; the -1 placeholder maps to itself.
gender_translation = dict([('FEMALE', 0),
                           ('MALE', 1),
                           ('OTHER', 2),
                           (-1, -1)])
# Replace every gender value by its code. A plain dict lookup is used, so a value
# without a code raises KeyError.
for frame in (train_users, test_users):
    frame['gender'] = frame['gender'].map(gender_translation.__getitem__)

In [7]:
# Count rows carrying the -1 placeholder (gender not given) versus rows with a real code.
nan_gender_count = int((train_users['gender'] == -1).sum())
valid_gender_count = len(train_users['gender']) - nan_gender_count

# Frequency of every gender code (placeholder included), most frequent first.
# Series.value_counts() and a zip over index/values are used instead of
# pd.value_counts() and Series.iteritems(): they behave the same here and are
# available in both old and current pandas releases.
count_map = train_users['gender'].value_counts()
print ("Existing gender value distribution")
for k, v in zip(count_map.index, count_map.values):
    if k == -1:
        continue
    # Share of this code among the rows whose gender is known.
    print (k, ":", float(v)/float(valid_gender_count))

Existing gender value distribution
(0, ':', 0.5353209412124351)
(1, ':', 0.46228441870536585)
(2, ':', 0.002394640082198993)


In [8]:
# Impute missing gender (-1) in train so the filled rows follow the class proportions
# of the rows whose gender is known (taken from `count_map`).
# Missing rows are filled in row order: the first quota goes to the most frequent
# class, the next quota to the second most frequent, and so on.
# NOTE(review): filling in row order ties the imputed gender to row position;
# confirm that this is acceptable, or switch to a seeded random assignment.
gender_values = train_users['gender'].values.copy()
missing_pos = np.flatnonzero(gender_values == -1)
observed_counts = count_map[count_map.index != -1]

if len(missing_pos) > 0 and len(observed_counts) > 0:
    # Per-class quota, truncated to whole rows.
    quotas = (len(missing_pos) * observed_counts.values.astype(float)
              / float(observed_counts.values.sum())).astype(int)
    fill_codes = np.repeat(observed_counts.index.values, quotas)
    # Truncation can leave a few missing rows without a quota. They go to the most
    # frequent class, instead of patching one hard-coded row number afterwards
    # (which also overwrote that row when it was not missing).
    leftover = len(missing_pos) - len(fill_codes)
    most_common = observed_counts.index.values[np.argmax(observed_counts.values)]
    fill_codes = np.concatenate([fill_codes, np.repeat(most_common, leftover)])
    gender_values[missing_pos] = fill_codes
    # Assign the column back as a whole: writing through `Series.values` is not
    # guaranteed to reach the data held by the frame.
    train_users['gender'] = gender_values

In [9]:
# Summary statistics of the train gender codes after imputation.
train_users['gender'].describe()

count    213451.000000
mean          0.467072
std           0.503691
min           0.000000
25%           0.000000
50%           0.000000
75%           1.000000
max           2.000000
Name: gender, dtype: float64

In [10]:
# Same gender analysis and imputation as for train, now on the test frame.
nan_gender_count = int((test_users['gender'] == -1).sum())
valid_gender_count = len(test_users['gender']) - nan_gender_count
# Frequency of every gender code (placeholder included), most frequent first.
count_map = test_users['gender'].value_counts()
print ("Existing gender value distribution")
for k, v in zip(count_map.index, count_map.values):
    if k == -1:
        continue
    print (k, ":", float(v)/float(valid_gender_count))

# Fill missing rows (-1) in row order so they follow the observed class proportions.
# NOTE(review): filling in row order ties the imputed gender to row position;
# confirm that this is acceptable, or switch to a seeded random assignment.
gender_values = test_users['gender'].values.copy()
missing_pos = np.flatnonzero(gender_values == -1)
observed_counts = count_map[count_map.index != -1]

if len(missing_pos) > 0 and len(observed_counts) > 0:
    # Per-class quota, truncated to whole rows.
    quotas = (len(missing_pos) * observed_counts.values.astype(float)
              / float(observed_counts.values.sum())).astype(int)
    fill_codes = np.repeat(observed_counts.index.values, quotas)
    # Rows left over by truncation go to the most frequent class, instead of
    # patching one hard-coded row number afterwards.
    leftover = len(missing_pos) - len(fill_codes)
    most_common = observed_counts.index.values[np.argmax(observed_counts.values)]
    fill_codes = np.concatenate([fill_codes, np.repeat(most_common, leftover)])
    gender_values[missing_pos] = fill_codes
    # Assign the column back as a whole: writing through `Series.values` is not
    # guaranteed to reach the data held by the frame.
    test_users['gender'] = gender_values

Existing gender value distribution
(0, ':', 0.5116944601469757)
(1, ':', 0.486468343697004)
(2, ':', 0.0018371961560203504)


In [11]:
# Summary statistics of the raw train ages, before any cleaning.
train_users.age.describe()

count    125461.000000
mean         49.668335
std         155.666612
min           1.000000
25%          28.000000
50%          34.000000
75%          43.000000
max        2014.000000
Name: age, dtype: float64

In [12]:
# Ages above 95 or below 16 are treated as invalid and blanked out (NaN)
# in both train and test.

for frame in (train_users, test_users):
    age_out_of_range = (frame['age'] > 95) | (frame['age'] < 16)
    frame.loc[age_out_of_range, 'age'] = np.nan

In [13]:
# Fill the remaining missing ages with the median age of the same frame
# (train and test each use their own median).
train_age_median = train_users['age'].median()
test_age_median = test_users['age'].median()
print (train_age_median)
print (test_age_median)
train_users['age'] = train_users['age'].fillna(train_age_median)
test_users['age'] = test_users['age'].fillna(test_age_median)

34.0
31.0


In [14]:
# Numeric codes for the signup method (applied to both train and test):
# the code is the position of the name in this list, starting at 0.
signup_translation = {name: code for code, name in enumerate(
    ['facebook', 'google', 'basic', 'weibo'])}
# Replace every signup_method value by its code (KeyError for a value without a code).
for frame in (train_users, test_users):
    frame['signup_method'] = frame['signup_method'].map(signup_translation.__getitem__)

In [15]:
# Treat the '-unknown-' language placeholder as English ("en") in both train and test.
# Previously only the test frame was normalised, so a placeholder in train would have
# had no code when the language column is encoded.
for frame in (train_users, test_users):
    frame.loc[frame['language'] == '-unknown-', 'language'] = "en"

In [16]:
# Numeric codes for the language column: the code is the 1-based position of the
# language tag in this list.
language_encoding = {tag: code for code, tag in enumerate(
    ['en', 'zh', 'fr', 'es', 'ko',
     'de', 'it', 'ru', 'pt', 'ja',
     'sv', 'nl', 'tr', 'da', 'pl',
     'cs', 'no', 'el', 'th', 'id',
     'hu', 'fi', 'ca', 'is', 'hr'], 1)}

# Replace every language tag by its code (KeyError for a tag without a code).
for frame in (train_users, test_users):
    frame['language'] = frame['language'].map(language_encoding.__getitem__)


In [17]:
# Numeric codes for affiliate_channel: the code is the 1-based position of the
# channel name in this list.
affiliate_channel_encoding = {channel: code for code, channel in enumerate(
    ['direct', 'sem-brand', 'sem-non-brand', 'other',
     'api', 'seo', 'content', 'remarketing'], 1)}
# Replace every affiliate_channel value by its code (KeyError for a value without a code).
for frame in (train_users, test_users):
    frame['affiliate_channel'] = frame['affiliate_channel'].map(
        affiliate_channel_encoding.__getitem__)

In [18]:
# Numeric codes for affiliate_provider: the code is the 1-based position of the
# provider name in this list.
affiliate_provider_encoding = {provider: code for code, provider in enumerate(
    ['direct', 'google', 'other', 'craigslist', 'bing', 'facebook',
     'vast', 'padmapper', 'facebook-open-graph', 'yahoo', 'gsp', 'meetup',
     'email-marketing', 'naver', 'baidu', 'yandex', 'wayn', 'daum'], 1)}

# Replace every affiliate_provider value by its code (KeyError for a value without a code).
for frame in (train_users, test_users):
    frame['affiliate_provider'] = frame['affiliate_provider'].map(
        affiliate_provider_encoding.__getitem__)


In [19]:
# A missing first_affiliate_tracked value is recorded as the category "untracked".
train_users['first_affiliate_tracked'] = train_users['first_affiliate_tracked'].fillna("untracked")
test_users['first_affiliate_tracked'] = test_users['first_affiliate_tracked'].fillna("untracked")
# Numeric codes for first_affiliate_tracked: the code is the 1-based position of
# the category in this list.
first_affiliate_tracked_encoding = {category: code for code, category in enumerate(
    ['untracked', 'linked', 'omg', 'tracked-other',
     'product', 'marketing', 'local ops'], 1)}
# Replace every first_affiliate_tracked value by its code (KeyError for a value without a code).
for frame in (train_users, test_users):
    frame['first_affiliate_tracked'] = frame['first_affiliate_tracked'].map(
        first_affiliate_tracked_encoding.__getitem__)


In [20]:
# Numeric codes for signup_app: the code is the 1-based position of the app name
# in this list.
signup_app_encoding = {app: code for code, app in enumerate(
    ['Web', 'iOS', 'Android', 'Moweb'], 1)}
# Replace every signup_app value by its code (KeyError for a value without a code).
for frame in (train_users, test_users):
    frame['signup_app'] = frame['signup_app'].map(signup_app_encoding.__getitem__)


In [21]:
# Numeric codes for first_device_type: the code is the 1-based position of the
# device name in this list.
first_device_type_encoding = {device: code for code, device in enumerate(
    ['Mac Desktop', 'iPhone', 'Windows Desktop',
     'Android Phone', 'iPad', 'Android Tablet',
     'Other/Unknown', 'Desktop (Other)', 'SmartPhone (Other)'], 1)}
# Replace every first_device_type value by its code (KeyError for a value without a code).
for frame in (train_users, test_users):
    frame['first_device_type'] = frame['first_device_type'].map(
        first_device_type_encoding.__getitem__)

In [22]:
# Numeric codes for first_browser.
# 'IBrowse' used to be listed twice (53 and 55); a dict literal keeps only the last
# value for a repeated key, so the shadowed 53 entry has been removed. The effective
# mapping is unchanged: 'IBrowse' is 55 and code 53 is not assigned to any browser.
first_browser_encoding = {
    'Chrome': 1,
    'Safari': 2,
    'Firefox': 3,
    '-unknown-': 4,
    'IE': 5,
    'Mobile Safari': 6,
    'Chrome Mobile': 7,
    'Android Browser': 8,
    'AOL Explorer': 9,
    'Opera': 10,
    'Silk': 11,
    'Chromium': 12,
    'BlackBerry Browser': 13,
    'Maxthon': 14,
    'IE Mobile': 15,
    'Apple Mail': 16,
    'Sogou Explorer': 17,
    'Mobile Firefox': 18,
    'RockMelt': 19,
    'SiteKiosk': 20,
    'Iron': 21,
    'IceWeasel': 22,
    'Pale Moon': 23,
    'SeaMonkey': 24,
    'Yandex.Browser': 25,
    'CometBird': 26,
    'Camino': 27,
    'TenFourFox': 28,
    'wOSBrowser': 29,
    'CoolNovo': 30,
    'Avant Browser': 31,
    'Opera Mini': 32,
    'Mozilla': 33,
    'Comodo Dragon': 34,
    'TheWorld Browser': 35,
    'Crazy Browser': 36,
    'Flock': 37,
    'OmniWeb': 38,
    'SlimBrowser': 39,
    'Opera Mobile': 40,
    'Conkeror': 41,
    'Outlook 2007': 42,
    'Palm Pre web browser': 43,
    'Stainless': 44,
    'NetNewsWire': 45,
    'Kindle Browser': 46,
    'Epic': 47,
    'Googlebot': 48,
    'Arora': 49,
    'Google Earth': 50,
    'IceDragon': 51,
    'PS Vita browser': 52,
    'UC Browser': 54,
    'IBrowse': 55,
    'Nintendo Browser': 56}


# Replace every first_browser value by its code (KeyError for a value without a code).
for frame in (train_users, test_users):
    frame['first_browser'] = frame['first_browser'].map(first_browser_encoding.__getitem__)

In [23]:
# Load the sessions table (path is relative to the working directory).
sessions = pd.read_csv('sessions.csv')

In [24]:
# Frequency of each user_id in the sessions data: index = user_id, value = number
# of session rows, most frequent first. The name `df` is kept because the
# session_count cells read it.
df = sessions['user_id'].value_counts()
# print() call form: valid on Python 2 and 3 alike, matching the other cells.
print(df)

mxqbh3ykxl    2722
0hjoc5q8nf    2644
mjbl6rrj52    2476
l5lgm3w5pc    2424
wg9413iaux    2362
ht8alhs4lt    2335
wyv1imf8qw    2323
monrpvx2md    2264
9z4gim1s4l    2264
h0cjxc177k    2246
a0uhiojrra    2137
vcmr2jh5ix    2085
1m6xnhstmb    2019
p1183hxzc4    1938
e8h4qghxlg    1923
gey51ednme    1919
5vpuk5mssg    1876
j2cvctvqve    1861
yu5bdalz2b    1811
ejpe95pcyo    1797
r541x78s24    1792
qkbkunyzq7    1780
n4s6g3grzf    1779
bfiueza7rt    1753
b1io359wpg    1752
8ikl7vnfa3    1732
e81qfos71y    1701
s5ez13snz0    1685
93dulcecw0    1614
r0rgjqbsvp    1612
              ... 
vlji8fg52x       1
4s2v2hmngj       1
n2rrpf1t3h       1
ua4bebdziw       1
gks02el96u       1
e7l7yocdtk       1
ztvrwgyxm2       1
w5sn4qqiav       1
9o5gi1x2i4       1
kl81vani0y       1
1uaksuktr5       1
c9vanbl9nh       1
n6tcyc7thd       1
cgdsmvs4sw       1
f9ohif5u6w       1
wiru94r12h       1
l28osl4y6x       1
t9o5rwmg1k       1
hjhljq8k89       1
ah2mvtfp74       1
q7xk33e009       1
d8rix1ykp3  

In [25]:
# session_count for train users: number of session rows recorded for the user's id
# in `df` (user_id -> count); users without any session row get 0.
# A single Series.map lookup replaces the previous loop, which scanned the whole
# frame once for every user present in the sessions data.
train_users['session_count'] = train_users['id'].map(df).fillna(0).astype('int64')

In [26]:
# session_count for test users: number of session rows recorded for the user's id
# in `df` (user_id -> count); users without any session row get 0.
# A single Series.map lookup replaces the previous loop, which scanned the whole
# frame once for every user present in the sessions data.
test_users['session_count'] = test_users['id'].map(df).fillna(0).astype('int64')

In [31]:
# Decode timestamp_first_active: its text form is cut at fixed positions into six
# integer fields (characters 0-4, 4-6, 6-8, 8-10, 10-12, 12-14). Only the first
# three fields are stored, as tfa_year / tfa_month / tfa_day; the original column
# is then dropped. Done for train first, then for test.
tfa = np.array(
    [[int(stamp[start:stop])
      for start, stop in ((0, 4), (4, 6), (6, 8), (8, 10), (10, 12), (12, 14))]
     for stamp in train_users['timestamp_first_active'].astype(str)])
for col_pos, col_name in enumerate(['tfa_year', 'tfa_month', 'tfa_day']):
    train_users[col_name] = tfa[:, col_pos]
train_users = train_users.drop(['timestamp_first_active'], axis=1)

tfa = np.array(
    [[int(stamp[start:stop])
      for start, stop in ((0, 4), (4, 6), (6, 8), (8, 10), (10, 12), (12, 14))]
     for stamp in test_users['timestamp_first_active'].astype(str)])
for col_pos, col_name in enumerate(['tfa_year', 'tfa_month', 'tfa_day']):
    test_users[col_name] = tfa[:, col_pos]
test_users = test_users.drop(['timestamp_first_active'], axis=1)

In [32]:
# Split date_first_booking ('YYYY-MM-DD' text) into integer dfb_year / dfb_month /
# dfb_day columns in train and test, then drop the original column. A missing
# booking date is first replaced by '0-0-0', so it becomes 0 / 0 / 0.
# NOTE(review): if a missing booking date goes hand in hand with the 'NDF' label,
# these columns may leak the target - confirm before using them as features.
train_users['date_first_booking'] = train_users['date_first_booking'].fillna('0-0-0')
dfb = np.array(
    [[int(piece) for piece in str(booked).split('-')]
     for booked in train_users['date_first_booking']])
for col_pos, col_name in enumerate(['dfb_year', 'dfb_month', 'dfb_day']):
    train_users[col_name] = dfb[:, col_pos]
train_users = train_users.drop(['date_first_booking'], axis=1)

test_users['date_first_booking'] = test_users['date_first_booking'].fillna('0-0-0')
dfb = np.array(
    [[int(piece) for piece in str(booked).split('-')]
     for booked in test_users['date_first_booking']])
for col_pos, col_name in enumerate(['dfb_year', 'dfb_month', 'dfb_day']):
    test_users[col_name] = dfb[:, col_pos]
test_users = test_users.drop(['date_first_booking'], axis=1)

In [37]:
# Numeric codes for the country_destination label: the code is the 0-based
# position of the destination in this list.
country_destination_encoding = {destination: code for code, destination in enumerate(
    ['NDF', 'US', 'other', 'FR', 'IT', 'GB',
     'ES', 'CA', 'DE', 'NL', 'AU', 'PT'])}

# Replace every destination label by its code (KeyError for a label without a code).
train_users_labels = train_users_labels.map(country_destination_encoding.__getitem__)

In [39]:
# Attach the encoded labels to the train frame as the column 'country_destination'.
# `.values` assigns by position, ignoring the label Series' index.
train_users['country_destination'] = train_users_labels.values

In [49]:
# Remove the 'id' column from the train frame. test_users is not modified here.
train_users=train_users.drop(['id'], axis=1)

In [50]:
# Write both preprocessed frames to CSV files in the working directory,
# without the row index.
train_users.to_csv('preprocessed_airbnb_train.csv',index=False)
test_users.to_csv('preprocessed_airbnb_test.csv', index=False)

In [51]:
# Final (rows, columns) of both frames. print() call form: valid on Python 2 and 3
# alike and prints the same text for a single tuple argument.
print(train_users.shape)
print(test_users.shape)

(213451, 22)
(62096, 22)
