In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
import datetime
import seaborn as sns
%matplotlib inline

In [4]:
# Load the raw training users. The columns at positions 1-3
# (date_account_created, timestamp_first_active, date_first_booking)
# are parsed as datetimes while reading.
date_column_positions = [1, 2, 3]
train_users = pd.read_csv('../all/train_users_2.csv', parse_dates=date_column_positions)

In [5]:
# Quick overview of the raw training data: row count, per-column null
# counts, and summary statistics of the numeric columns.
n_train_rows = train_users.shape[0]
display("%d users in the training set." % n_train_rows)
display("Number of Nulls in each columns:")
display(train_users.isnull().sum())
display("Describe :")
display(train_users.describe())

# List the distinct values of each categorical column and of the target
# (country_destination), one column per printed paragraph.
inspected_columns = [
    'gender',
    'signup_method',
    'language',
    'affiliate_provider',
    'affiliate_channel',
    'first_affiliate_tracked',
    'signup_app',
    'first_device_type',
    'first_browser',
    'country_destination',
]
for column_name in inspected_columns:
    print("{} : {}\n".format(column_name, train_users[column_name].unique()))

'213451 users in the training set.'

'Number of Nulls in each columns:'

id                              0
date_account_created            0
timestamp_first_active          0
date_first_booking         124543
gender                          0
age                         87990
signup_method                   0
signup_flow                     0
language                        0
affiliate_channel               0
affiliate_provider              0
first_affiliate_tracked      6065
signup_app                      0
first_device_type               0
first_browser                   0
country_destination             0
dtype: int64

'Describe :'

Unnamed: 0,age,signup_flow
count,125461.0,213451.0
mean,49.668335,3.267387
std,155.666612,7.637707
min,1.0,0.0
25%,28.0,0.0
50%,34.0,0.0
75%,43.0,0.0
max,2014.0,25.0


gender : ['-unknown-' 'MALE' 'FEMALE' 'OTHER']

signup_method : ['facebook' 'basic' 'google']

language : ['en' 'fr' 'de' 'es' 'it' 'pt' 'zh' 'ko' 'ja' 'ru' 'pl' 'el' 'sv' 'nl'
 'hu' 'da' 'id' 'fi' 'no' 'tr' 'th' 'cs' 'hr' 'ca' 'is']

affiliate_provider : ['direct' 'google' 'other' 'craigslist' 'facebook' 'vast' 'bing' 'meetup'
 'facebook-open-graph' 'email-marketing' 'yahoo' 'padmapper' 'gsp' 'wayn'
 'naver' 'baidu' 'yandex' 'daum']

affiliate_channel : ['direct' 'seo' 'other' 'sem-non-brand' 'content' 'sem-brand'
 'remarketing' 'api']

first_affiliate_tracked : ['untracked' 'omg' nan 'linked' 'tracked-other' 'product' 'marketing'
 'local ops']

signup_app : ['Web' 'Moweb' 'iOS' 'Android']

first_device_type : ['Mac Desktop' 'Windows Desktop' 'iPhone' 'Other/Unknown'
 'Desktop (Other)' 'Android Tablet' 'iPad' 'Android Phone'
 'SmartPhone (Other)']

first_browser : ['Chrome' 'IE' 'Firefox' 'Safari' '-unknown-' 'Mobile Safari'
 'Chrome Mobile' 'RockMelt' 'Chromium' 'Android Browser' 'AO

In [6]:
# Compare date_account_created with timestamp_first_active, both expressed
# as whole days elapsed since a fixed reference date.
START_DATE = datetime.datetime(2008, 1, 1)
created_on = train_users['date_account_created']
first_active_at = train_users['timestamp_first_active']
days_to_creation = (created_on - START_DATE).dt.days
days_to_first_activity = (first_active_at - START_DATE).dt.days
print("Correlation : ", days_to_creation.corr(days_to_first_activity))

# The two columns are almost perfectly correlated, so the calendar features
# are taken from date_account_created only; timestamp_first_active
# contributes just its hour of day.
for part in ('year', 'month', 'day'):
    train_users['dac_' + part] = getattr(created_on.dt, part)

train_users['tfa_hour'] = first_active_at.dt.hour

Correlation :  0.9992703793478075


In [7]:
# Replace missing and implausible ages with the sentinel -1.
# An age is kept only when it lies in [MIN_VALID_AGE, MAX_VALID_AGE];
# everything else (NaN, below 14, above 100 such as the 2014 seen in
# describe()) becomes MISSING_AGE.
MIN_VALID_AGE = 14
MAX_VALID_AGE = 100
MISSING_AGE = -1

# The result is assigned back to the frame rather than calling
# fillna(..., inplace=True) on the column selection: with pandas
# Copy-on-Write that in-place call acts on a temporary object and leaves
# train_users unchanged, so NaN ages would slip through the range filters.
# Series.between is inclusive on both ends and is False for NaN, so a single
# where() covers the missing and the out-of-range cases.
raw_age = train_users['age']
train_users['age'] = raw_age.where(
    raw_age.between(MIN_VALID_AGE, MAX_VALID_AGE), MISSING_AGE
)

In [8]:
# Count, per user, how many of the two profile fields are missing:
# age (sentinel -1) and gender ('-unknown-'). The result is 0, 1 or 2.
age_is_missing = (train_users['age'] == -1).to_numpy()
gender_is_missing = (train_users['gender'] == '-unknown-').to_numpy()
train_users['Missing_Age_gen'] = age_is_missing.astype(int) + gender_is_missing.astype(int)

In [9]:
# Integer-encode these three categorical columns with a LabelEncoder.
labelCol = ['language', 'affiliate_provider', 'first_browser']
le = preprocessing.LabelEncoder()

for col in labelCol:
    # The encoder is re-fitted for every column: it learns that column's
    # distinct values and replaces each entry by the index of its value.
    train_users[col] = le.fit_transform(train_users[col].values)

In [10]:
# One-hot encode the remaining categorical columns. Each category becomes
# its own indicator column named "<column>_<category>", and the original
# columns are replaced by these indicators.
New_Combined_columns = [
    "gender",
    "signup_method",
    "affiliate_channel",
    "first_affiliate_tracked",
    "signup_app",
    "first_device_type",
]
train_users = pd.get_dummies(
    data=train_users,
    columns=New_Combined_columns,
    prefix=New_Combined_columns,
)

In [11]:
# Drop the three raw date/time columns from the feature table.
remove_columns = [
    'date_account_created',
    'timestamp_first_active',
    'date_first_booking',
]
train_users = train_users.drop(columns=remove_columns)

In [12]:
# Preview the first five rows of the processed feature table.
train_users.head(n=5)

Unnamed: 0,id,age,signup_flow,language,affiliate_provider,first_browser,country_destination,dac_year,dac_month,dac_day,...,signup_app_iOS,first_device_type_Android Phone,first_device_type_Android Tablet,first_device_type_Desktop (Other),first_device_type_Mac Desktop,first_device_type_Other/Unknown,first_device_type_SmartPhone (Other),first_device_type_Windows Desktop,first_device_type_iPad,first_device_type_iPhone
0,gxn3p5htnn,-1.0,0,5,4,8,NDF,2010,6,28,...,0,0,0,0,1,0,0,0,0,0
1,820tgsjxq7,38.0,0,5,8,8,NDF,2011,5,25,...,0,0,0,0,1,0,0,0,0,0
2,4ft3gnwmtx,56.0,3,5,4,21,US,2010,9,28,...,0,0,0,0,0,0,0,1,0,0
3,bjjt8pjhuk,42.0,0,5,4,17,other,2011,12,5,...,0,0,0,0,1,0,0,0,0,0
4,87mebub9p4,41.0,0,5,4,8,US,2010,9,14,...,0,0,0,0,1,0,0,0,0,0


In [13]:
# Write the processed feature table to disk, omitting the row index.
processed_train_path = '../tempData/Processed_Train_User.csv'
train_users.to_csv(processed_train_path, index=False)