In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import gc
import sys

from pandas.io.json import json_normalize
from datetime import datetime
from sklearn import preprocessing

import os
# Show which files are present in the ../input directory.
print(os.listdir("../input"))

In [None]:
def load_df(csv_path, nrows=None,
            json_columns=('device', 'geoNetwork', 'totals', 'trafficSource')):
    """Read a CSV file and expand its JSON-encoded columns into flat columns.

    Every column listed in ``json_columns`` is parsed with ``json.loads`` while
    the file is read, flattened with ``json_normalize``, and re-attached to the
    frame as ``"<column>.<subcolumn>"`` columns. The original JSON column is
    dropped.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.
    nrows : int, optional
        Number of rows to read; ``None`` (default) reads the whole file.
    json_columns : iterable of str, optional
        Names of the columns that hold JSON objects. The default is the set of
        four columns this function has always expanded.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with the JSON columns replaced by their flattened
        sub-columns. ``fullVisitorId`` is read as text.
    """
    json_columns = list(json_columns)

    # Prefer the public ``pd.json_normalize`` (pandas >= 1.0). Only fall back to
    # the name imported from ``pandas.io.json`` at the top of the notebook when
    # the public function does not exist (older pandas).
    normalize = getattr(pd, 'json_normalize', None)
    if normalize is None:
        normalize = json_normalize

    df = pd.read_csv(csv_path,
                     converters={column: json.loads for column in json_columns},
                     dtype={'fullVisitorId': 'str'},  # Important!! keep the id as text, not a number
                     nrows=nrows)

    for column in json_columns:
        # Normalise from a plain list so the result always carries a fresh
        # 0..n-1 index, matching the default index produced by read_csv.
        column_as_df = normalize(df[column].tolist())
        column_as_df.columns = [f"{column}.{subcolumn}" for subcolumn in column_as_df.columns]
        df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
    print(f"Loaded {os.path.basename(csv_path)}. Shape: {df.shape}")
    return df

In [None]:
%%time
train = load_df('../input/train.csv')
test = load_df('../input/test.csv')

print('train date:', min(train['date']), 'to', max(train['date']))
print('test date:', min(test['date']), 'to', max(test['date']))

In [None]:
# List the columns that exist in train but not in test.
for column_name in train.columns:
    if column_name in test.columns:
        continue
    print(column_name)

In [None]:
# Within `totals`, the sub-column transactionRevenue holds the revenue we are trying to predict.
# Keep an independent copy of the rows where it is present.
train_rev = train.loc[train['totals.transactionRevenue'].notnull()].copy()
print(train_rev.shape[0])
train_rev.head()

In [None]:
# Fill missing revenue with 0 and move the target to log1p scale.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on the selected column: the in-place form acts on an
# intermediate object and is not guaranteed to update `train` (recent pandas
# warns about it, and under Copy-on-Write it has no effect).
train['totals.transactionRevenue'] = np.log1p(
    train['totals.transactionRevenue'].fillna(0).astype(float))
print(train['totals.transactionRevenue'].describe())

# Combine train and test, then clean

In [None]:
# Stack train and test into one frame so features are engineered identically on both.
# pd.concat replaces DataFrame.append, which was deprecated and then removed in pandas 2.0.
all_data = pd.concat([train, test], sort=False).reset_index(drop=True)

In [None]:
# DataFrame.info() writes its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line.
all_data.info()

In [None]:
# Count missing values per train column, ascending, and show only columns that have any.
null_cnt = train.isna().sum().sort_values()
print(null_cnt.loc[null_cnt.gt(0)])

In [None]:
# fillna numeric feature
# Each result is assigned back to its column rather than calling
# fillna(..., inplace=True) on the selected column: the in-place form acts on an
# intermediate object and is not guaranteed to update `all_data`.
numeric_fill_values = {
    'totals.pageviews': 1,
    'totals.newVisits': 0,
    'totals.bounces': 0,
}
for column_name, fill_value in numeric_fill_values.items():
    all_data[column_name] = all_data[column_name].fillna(fill_value).astype(int)

# fillna boolean feature
all_data['trafficSource.isTrueDirect'] = all_data['trafficSource.isTrueDirect'].fillna(False)

In [None]:
# Drop columns that hold a single distinct non-null value.
constant_column = [name for name, distinct in all_data.nunique().items() if distinct == 1]
#for c in constant_column:
#    print(c + ':', train[c].unique())

print('drop columns:', constant_column)
all_data = all_data.drop(columns=constant_column)

## date

In [None]:
# Derive calendar features from the YYYYMMDD `date` column.
# The column is parsed once with pd.to_datetime and read through the vectorised
# .dt accessors, instead of running datetime.strptime plus four Python-level
# apply() passes over every row.
format_str = '%Y%m%d'
visit_date = pd.to_datetime(all_data['date'].astype(str), format=format_str)
all_data['_month'] = visit_date.dt.month
all_data['_quarterMonth'] = visit_date.dt.day // 8   # buckets 0..3: days 1-7, 8-15, 16-23, 24-31
all_data['_day'] = visit_date.dt.day
all_data['_weekday'] = visit_date.dt.weekday        # Monday == 0, same as datetime.weekday()

# The raw date is no longer needed; release the parsed helper series as well.
all_data.drop(['date'], axis=1, inplace=True)
del visit_date

## channelGrouping
* The channel via which the user came to the Store.

In [None]:
# Number of rows per channelGrouping value in the combined train+test frame.
print(all_data['channelGrouping'].value_counts())
#print('-' * 30)
#print(train_rev['channelGrouping'].value_counts())

## fullVisitorId
* A unique identifier for each user of the Google Merchandise Store.

## visitId
* An identifier for this session. This is part of the value usually stored as the _utmb cookie. This is only unique to the user.   
For a completely unique ID, you should use a combination of fullVisitorId and visitId.

## newVisits


In [None]:
def _print_id_counts(label, frame):
    """Print the row count and the distinct fullVisitorId / visitId counts of ``frame``."""
    print(f'{label} all:', len(frame))
    print(f'{label} unique fullVisitorId:', frame['fullVisitorId'].nunique())
    print(f'{label} unique visitId:', frame['visitId'].nunique())


_print_id_counts('train', train)
print('-' * 30)
_print_id_counts('test', test)
#print('common fullVisitorId:', len(pd.merge(train, test, how='inner', on='fullVisitorId'))) # 183434

In [None]:
# Five most frequent visitNumber values, then the full distributions of the two filled flags.
print(all_data['visitNumber'].value_counts().head(5))
print('-' * 30)
print(all_data['totals.newVisits'].value_counts())
print('-' * 30)
print(all_data['totals.bounces'].value_counts())

In [None]:
#maxVisitNumber = max(all_data['visitNumber'])
#fvid = all_data[all_data['visitNumber'] == maxVisitNumber]['fullVisitorId']
#all_data[all_data['fullVisitorId'] == fvid.values[0]].sort_values(by='visitNumber')

In [None]:
# Hour of day (0-23) at which the visit started, stored as text so the later
# one-hot step treats it as a categorical feature.
# The hour is taken in UTC: datetime.fromtimestamp() converts to the *local*
# timezone of whichever machine runs the notebook, which made this feature
# depend on the host configuration. The conversion is also vectorised now.
all_data['_visitStartHour'] = (
    pd.to_datetime(all_data['visitStartTime'], unit='s').dt.hour.astype(str))

## sessionId
*  A unique identifier for this visit to the store.

In [None]:
# Compare the number of sessionId entries in train with the number of distinct values.
print('train all sessionId:', train['sessionId'].size)
print('train unique sessionId:', train['sessionId'].nunique())

## socialEngagementType
* Engagement type, either "Socially Engaged" or "Not Socially Engaged".

In [None]:
#all_data['socialEngagementType'].value_counts()

## device

In [None]:
# Distinct browsers in train, then the ten most frequent browsers overall.
print('unique browser count:', train['device.browser'].nunique())
print('-' * 30)
print(all_data['device.browser'].value_counts().head(10))
#print('-' * 30)
#print(train_rev['device.browser'].value_counts()[:10])

In [None]:
# Cross-tabulate device category (rows) against the isMobile flag (columns).
pd.crosstab(index=all_data['device.deviceCategory'],
            columns=all_data['device.isMobile'],
            margins=False)

In [None]:
#pd.crosstab(train_rev['device.deviceCategory'], train_rev['device.isMobile'], margins=False)

#all_data['isMobile'] = True
#all_data.loc[all_data['deviceCategory'] == 'desktop', 'isMobile'] = False

In [None]:
# Distinct operating systems in train, then the ten most frequent overall.
print('unique operatingSystem count:', train['device.operatingSystem'].nunique())
print('-' * 30)
print(all_data['device.operatingSystem'].value_counts().head(10))
#print('-' * 30)
#print(train_rev['device.operatingSystem'].value_counts()[:10])

## geoNetwork

In [None]:
# Most frequent values at each geographic level, from finest to coarsest.
print(all_data['geoNetwork.city'].value_counts().head(10))
print('-' * 30)
print(all_data['geoNetwork.region'].value_counts().head(10))
print('-' * 30)
print(all_data['geoNetwork.subContinent'].value_counts().head(10))
print('-' * 30)
print(all_data['geoNetwork.continent'].value_counts())

#a = all_data[all_data['continent'] == '(not set)']
#a[a['city'] != '(not set)'][['city', 'region']]

In [None]:
# Ten most frequent metro values.
print(all_data['geoNetwork.metro'].value_counts().head(10))
#print('-' * 30)
#print(train_rev['geoNetwork.metro'].value_counts()[:10])

In [None]:
# Ten most frequent network domains.
print(all_data['geoNetwork.networkDomain'].value_counts().head(10))
#print('-' * 30)
#print(train_rev['geoNetwork.networkDomain'].value_counts()[:10])

## totals

In [None]:
# Ten most frequent hit counts (still in their raw form at this point).
print(all_data['totals.hits'].value_counts().head(10))

all_data['totals.hits'] = all_data['totals.hits'].astype(int)

# Broadcast the mean / total number of hits of each calendar bucket back onto its rows.
# Each entry is (new column, grouping key, aggregation).
hit_aggregates = [
    ('_meanHitsPerDay', '_day', 'mean'),
    ('_meanHitsPerWeekday', '_weekday', 'mean'),
    ('_meanHitsPerMonth', '_month', 'mean'),
    ('_sumHitsPerDay', '_day', 'sum'),
    ('_sumHitsPerWeekday', '_weekday', 'sum'),
    ('_sumHitsPerMonth', '_month', 'sum'),
]
for feature_name, group_key, how in hit_aggregates:
    all_data[feature_name] = all_data.groupby([group_key])['totals.hits'].transform(how)

In [None]:
# Ten most frequent pageview counts.
print(all_data['totals.pageviews'].value_counts().head(10))

# Make sure the column is integer-typed.
pageviews_as_int = all_data['totals.pageviews'].astype(int)
all_data['totals.pageviews'] = pageviews_as_int
del pageviews_as_int

In [None]:
#print(all_data['totals.visits'].value_counts())

## trafficSource

In [None]:
# adContent frequencies overall (top ten) and among the revenue sessions.
print(all_data['trafficSource.adContent'].value_counts().head(10))
print('-' * 30)
print(train_rev['trafficSource.adContent'].value_counts())

# 0/1 flag: the ad content is exactly 'Google Merchandise Collection'.
all_data['_adContentGMC'] = (
    all_data['trafficSource.adContent'].eq('Google Merchandise Collection').astype(np.uint8))

In [None]:
# Ten most frequent campaign values.
print(all_data['trafficSource.campaign'].value_counts().head(10))
#print('-' * 30)
#print(train_rev['trafficSource.campaign'].value_counts()[:10])

# 0/1 flag: the campaign is anything other than the '(not set)' placeholder.
all_data['_withCampaign'] = all_data['trafficSource.campaign'].ne('(not set)').astype(np.uint8)

In [None]:
#print(all_data['campaignCode'].value_counts())

In [None]:
# Distribution of the isTrueDirect flag (missing values were filled with False earlier).
print(all_data['trafficSource.isTrueDirect'].value_counts())

In [None]:
# Ten most frequent keyword values.
print(all_data['trafficSource.keyword'].value_counts().head(10))
#print('-' * 30)
#print(train_rev['trafficSource.keyword'].value_counts()[:10])

In [None]:
# Distribution of trafficSource.medium over all rows ...
print(all_data['trafficSource.medium'].value_counts())
print('-' * 30)
# ... and over the revenue sessions only, for comparison.
print(train_rev['trafficSource.medium'].value_counts())

In [None]:
# Ten most frequent referral paths.
print(all_data['trafficSource.referralPath'].value_counts().head(10))
#print('-' * 30)
#print(train_rev['trafficSource.referralPath'].value_counts()[:10])

In [None]:
# Ten most frequent traffic sources.
print(all_data['trafficSource.source'].value_counts().head(10))
#print('-' * 30)
#print(train_rev['trafficSource.source'].value_counts()[:10])

# 0/1 flag: the source is exactly 'mall.googleplex.com'.
all_data['_sourceGpmall'] = all_data['trafficSource.source'].eq('mall.googleplex.com').astype(np.uint8)

## Purchase history per visitor (disabled)

In [None]:
# Disabled experiment: the code below is wrapped in a string literal assigned to `_`,
# so none of it executes.
# As written, it would number each visitor's revenue sessions in visitStartTime order
# (`_buyCount`), merge that onto all_data by (fullVisitorId, visitId), forward-fill it
# across each visitor's sessions, fill the rest with 0, and derive
# `_buyRate` = `_buyCount` / `visitNumber`.
_='''
train_rev = train_rev.sort_values(['visitStartTime']).reset_index()
train_rev['_buyCount'] = train_rev.groupby('fullVisitorId').cumcount() + 1
all_data = pd.merge(all_data, train_rev[['_buyCount','fullVisitorId','visitId']], 
                    on=['fullVisitorId','visitId'], how='left')
for fvId in train_rev['fullVisitorId'].unique():
    visitor_data = all_data[all_data['fullVisitorId'] == fvId].sort_values(['visitStartTime'])['_buyCount'].reset_index()
    all_data.loc[all_data['fullVisitorId'] == fvId, '_buyCount'] = visitor_data['_buyCount'].fillna(method='ffill').values
all_data['_buyCount'].fillna(0, inplace=True)
all_data['_buyRate'] = all_data['_buyCount'] / all_data['visitNumber']
'''

In [None]:
#all_data[all_data['fullVisitorId'] == '7813149961404844386']

## Remaining missing values and feature selection

In [None]:
# Count missing values per column of the combined frame and show only columns that have any.
null_cnt = all_data.isna().sum().sort_values()
print(null_cnt.loc[null_cnt.gt(0)])

In [None]:
# Disabled alternative encoding: kept as a string literal assigned to `_`, so it never runs.
# The quoted code would drop the session identifiers and four adwordsClickInfo columns, then
# replace every object-dtype column except fullVisitorId with integer codes from
# pd.factorize, after filling missing values with 'unknown'.
_='''
all_data.drop([
    'sessionId','visitId','visitStartTime',
    'trafficSource.adwordsClickInfo.adNetworkType',
    'trafficSource.adwordsClickInfo.gclId',
    'trafficSource.adwordsClickInfo.page',
    'trafficSource.adwordsClickInfo.slot'],axis=1,inplace=True)

for i, t in all_data.loc[:, all_data.columns != 'fullVisitorId'].dtypes.iteritems():
    if t == object:
        #all_data = pd.concat([all_data, pd.get_dummies(all_data[i].astype(str), prefix=i)], axis=1)
        #all_data.drop(i, axis=1, inplace=True)
        all_data[i].fillna('unknown', inplace=True)
        all_data[i] = pd.factorize(all_data[i])[0]
        #all_data[i] = all_data[i].astype('category')
'''

In [None]:
# Columns kept for modelling: the visitor id, the target, and the selected features.
c = ['fullVisitorId',
     'visitNumber',
     'channelGrouping',
     'device.deviceCategory',
     'device.operatingSystem',
     'geoNetwork.subContinent',
     'totals.transactionRevenue',
     'totals.newVisits',
     'totals.hits',
     'totals.pageviews',
     'trafficSource.medium',
     'trafficSource.isTrueDirect',
     '_month',
     '_quarterMonth',
     '_weekday',
     '_visitStartHour',
     '_meanHitsPerDay','_meanHitsPerWeekday','_meanHitsPerMonth',
     '_sumHitsPerDay','_sumHitsPerWeekday','_sumHitsPerMonth',
     #'_buyCount',
     #'_buyRate'
     '_adContentGMC',
     '_withCampaign',
     '_sourceGpmall']
all_data = all_data[c]

# One-hot encode every object-dtype column except the visitor id.
# - `.items()` replaces `.iteritems()`, which was removed in pandas 2.0.
# - A single get_dummies call replaces the per-column concat/drop loop. The
#   result is the same: untouched columns first, then the dummy columns of each
#   encoded column in order, each prefixed with its source column name.
# - Values are cast to str first, as before, so missing values become their own
#   'nan' category instead of being dropped.
object_columns = [name for name, dtype in all_data.dtypes.items()
                  if dtype == object and name != 'fullVisitorId']
if object_columns:
    all_data = pd.get_dummies(
        all_data.astype({name: str for name in object_columns}),
        columns=object_columns)

# Modeling

In [None]:
# Summary of the frame after feature selection and one-hot encoding.
all_data.info()

In [None]:
# Split the combined frame back apart: rows with a target value form train,
# rows without one form test (which then loses the empty target column).
train = all_data.loc[all_data['totals.transactionRevenue'].notnull()]
test = (all_data.loc[all_data['totals.transactionRevenue'].isnull()]
        .drop(columns=['totals.transactionRevenue']))

In [None]:
# Keep the visitor ids aside for building the submission later.
train_id, test_id = train['fullVisitorId'], test['fullVisitorId']

# Regression target (removed from `train` by pop) and a 0/1 "has revenue" target.
Y_train_reg = train.pop('totals.transactionRevenue')
Y_train_cls = Y_train_reg.fillna(0).gt(0).astype(np.uint8)

# Feature matrices: everything except the visitor id.
X_train = train.drop(columns=['fullVisitorId'])
X_test = test.drop(columns=['fullVisitorId'])

print(X_train.shape, X_test.shape)

In [None]:
import sys
import gc

# Release the large intermediate frames now that the feature matrices, targets
# and id series have been extracted, then force a garbage-collection pass.
del all_data, train, test, train_rev
gc.collect()

# Diagnostic: list the ten names returned by dir() with the largest
# sys.getsizeof() value, largest first. Each name is resolved with eval().
print(pd.DataFrame([[val for val in dir()], [sys.getsizeof(eval(val)) for val in dir()]],
                   index=['name','size']).T.sort_values('size', ascending=False).reset_index(drop=True)[:10])

In [None]:
from sklearn import ensemble, metrics

In [None]:
%%time
reg = ensemble.GradientBoostingRegressor(n_estimators=500, learning_rate=0.1, max_depth=3, verbose=1, random_state=42)
reg.fit(X_train, Y_train_cls)
pred_reg = reg.predict(X_test)

print(len(pred_reg), len(pred_reg[pred_reg > 0.1]))

In [None]:
%%time

reg = ensemble.GradientBoostingRegressor(n_estimators=1000, learning_rate=0.5, max_depth=3, verbose=1, random_state=42)
reg.fit(X_train[Y_train_reg > 0], Y_train_reg[Y_train_reg > 0])

pred = np.zeros(len(pred_reg))
for i in np.arange(len(pred_reg)):
    #if pred_reg[i] >= 0.1:
        pred[i] = reg.predict([X_test.iloc[i]])[0] * pred_reg[i]

In [None]:
# Per-session predictions (log1p scale), floored at zero and with missing values set to 0.
submission = pd.DataFrame({'fullVisitorId':test_id, 'PredictedLogRevenue':pred})
submission["PredictedLogRevenue"] = submission["PredictedLogRevenue"].clip(lower=0).fillna(0.0)

# Aggregate to one row per visitor.
# The session values are on a log1p scale, so they must not be added directly
# (a sum of logs is the log of a product, not of a total). Convert each session
# back to revenue with expm1, sum per visitor, then return to log1p scale.
submission_sum = (
    np.log1p(np.expm1(submission['PredictedLogRevenue'])
             .groupby(submission['fullVisitorId'])
             .sum())
    .rename('PredictedLogRevenue')
    .reset_index()
)
submission_sum.to_csv("submission.csv", index=False)
submission_sum[submission_sum['PredictedLogRevenue'] > 0.0]

In [None]:
# Summary statistics of the per-visitor predictions written to submission.csv.
submission_sum['PredictedLogRevenue'].describe()