In [6]:
import pandas as pd
import numpy as np
import json
from sklearn.preprocessing import LabelEncoder

label = LabelEncoder()

dir='/Users/xinwang/ai/dataset/kaggle/GStore/'
train_file = 'train.csv'
test_file = 'test.csv'

train = pd.read_csv(dir + train_file, low_memory=False)
test = pd.read_csv(dir + test_file, low_memory=False)


print('train.shape',train.shape)
print('test.shape',test.shape)

train.shape (903653, 12)
test.shape (804684, 12)


In [305]:
print(len(test['fullVisitorId'].unique()))

617242


In [11]:
train['datetime'] = pd.to_datetime(train['date'], format='%Y%m%d',errors='ignore')

train['datetime'].days

AttributeError: 'Series' object has no attribute 'days'

In [5]:

cate_features = []
numeric_features = []

train['fullVisitorId'] = train['fullVisitorId'].astype(str)
test['fullVisitorId'] = test['fullVisitorId'].astype(str)

train['channelGrouping'] = label.fit_transform(train['channelGrouping'])
test['channelGrouping'] = label.fit_transform(test['channelGrouping'])

cate_features.append('channelGrouping')

def label_transform(df, col_list):
    for col in col_list:
        df[col] = label.fit_transform(df[col])

    return df

####################################
date_features = ['date','year','month','day','week','weekofyear','dayofweek','quarter','month_start','month_end']

def process_date(df):
    df['date'] = pd.to_datetime(df['date'], format='%Y%m%d',errors='ignore')
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day

    df['week'] = df['date'].dt.week
    df['weekofyear'] = df['date'].dt.weekofyear
    df['dayofweek'] = df['date'].dt.dayofweek
    df['quarter'] = df['date'].dt.quarter
    df['month_start'] = df['date'].dt.is_month_start
    df['month_end'] = df['date'].dt.is_month_end

    df = label_transform(df, date_features)
    
    return df

train = process_date(train)
test = process_date(test)
cate_features += date_features


AttributeError: Can only use .dt accessor with datetimelike values

In [None]:


################device####################
device_features = ['browser','operatingSystem','isMobile','deviceCategory']

def process_device(df):
    df['browser'] = df['device'].apply(lambda x: json.loads(x)['browser'])
    df['operatingSystem'] = df['device'].apply(lambda x: json.loads(x)['operatingSystem'])
    df['isMobile'] = df['device'].apply(lambda x: json.loads(x)['isMobile'])
    df['deviceCategory'] = df['device'].apply(lambda x: json.loads(x)['deviceCategory'])

    df = label_transform(df, device_features)

    return df

train = process_device(train)
test = process_device(test)
cate_features += device_features



###############geoNetwork#####################
geo_features = ['continent','subContinent','country','region','metro','city','networkDomain']

def process_geo(df):
    df['continent'] = df['geoNetwork'].apply(lambda x: json.loads(x)['continent'])
    df['subContinent'] = df['geoNetwork'].apply(lambda x: json.loads(x)['subContinent'])
    df['country'] = df['geoNetwork'].apply(lambda x: json.loads(x)['country'])
    df['region'] = df['geoNetwork'].apply(lambda x: json.loads(x)['region'])
    df['metro'] = df['geoNetwork'].apply(lambda x: json.loads(x)['metro'])
    df['city'] = df['geoNetwork'].apply(lambda x: json.loads(x)['city'])
    df['networkDomain'] = df['geoNetwork'].apply(lambda x: json.loads(x)['networkDomain'])

    df = label_transform(df, geo_features)

    return df

train = process_geo(train)
test = process_geo(test)
cate_features += geo_features



################totals####################
view_features = ['hits','pageviews','newVisits','bounces','visitNumber']

def process_totals(df):
    df['hits'] = df['totals'].apply(lambda x: json.loads(x)['hits']).astype(int)
    df['pageviews'] = df['totals'].apply(lambda x: json.loads(x)['pageviews'] if x.find('pageviews')>=0 else 0).astype(int)
    df['bounces'] = df['totals'].apply(lambda x: json.loads(x)['bounces'] if x.find('bounces')>=0 else 0).astype(int)
    df['newVisits'] = df['totals'].apply(lambda x: json.loads(x)['newVisits'] if x.find('newVisits')>=0 else 0).astype(int)
    df['transactionRevenue'] = df['totals'].apply(lambda x: json.loads(x)['transactionRevenue'] if x.find('transactionRevenue')>=0 else 0).astype(int)

    return df

train = process_totals(train)
test = process_totals(test)
numeric_features += view_features



################totals####################
last_time_features = ['last_seconds','last_minutes']

def process_last_time(df):
    df['last_seconds'] = df['visitStartTime']-df['visitId']
    df['last_minutes'] = (df['visitStartTime']-df['visitId'])/60
    df['last_minutes'] = df['last_minutes'].astype(np.int64)

    return df

train = process_last_time(train)
test = process_last_time(test)
numeric_features += last_time_features


################Multip transaction####################
def process_transaction_count(df):
    transaction_df = train[['fullVisitorId','transactionRevenue']].groupby('fullVisitorId')
        
    transac_count_df = transaction_df.count()
    transac_count_df['fullVisitorId'] = transac_count_df.index
    transac_count_df.rename(columns={
        'transactionRevenue':'transaction_count'
    }, inplace=True)
    transac_count_df['transaction_count'] = transac_count_df['transaction_count'].astype(int)

    df = pd.merge(df, transac_count_df, on='fullVisitorId')

    return df

train = process_transaction_count(train)
test = process_transaction_count(test)
numeric_features.append('transaction_count')

def process_buy_times(df):
    transaction_df = df[['fullVisitorId','transactionRevenue']].groupby('fullVisitorId')

    def count_buy_times(x):
        buy_times = sum(x['transactionRevenue']>0)

        return buy_times

    buy_times_group = transaction_df.apply(count_buy_times)
    buy_times_df = pd.DataFrame({
        'fullVisitorId':buy_times_group.index,
        'buy_times':buy_times_group.values
    })

    df = pd.merge(df, buy_times_df, on='fullVisitorId')

    return df

train = process_buy_times(train)
test = process_buy_times(test)
numeric_features.append('buy_times')


def parse_adwordsClickInfo_field(x, field):
    jo = json.loads(x)
    
    if x.find('adwordsClickInfo')>=0:
        adwordsClickInfo = jo['adwordsClickInfo']
        
        if str(adwordsClickInfo).find(field)>=0:
            return adwordsClickInfo[field]

    return 0

def parse_adwordsClickInfo_page(x):
    return parse_adwordsClickInfo_field(x, 'page')

def parse_adwordsClickInfo_slot(x):
    return parse_adwordsClickInfo_field(x, 'slot')

def parse_adwordsClickInfo_gclId(x):
    return parse_adwordsClickInfo_field(x, 'gclId')

def parse_adwordsClickInfo_adNetworkType(x):
    return parse_adwordsClickInfo_field(x, 'adNetworkType')

def parse_adwordsClickInfo_isVideoAd(x):
    return parse_adwordsClickInfo_field(x, 'isVideoAd')

traffic_features = ['campaign','source','medium','keyword','adwordsClickInfo_gclId_prefix','adwordsClickInfo_slot',
                    'adwordsClickInfo_gclId','adwordsClickInfo_adNetworkType']

def process_traffic(df):
    df['campaign'] = df['trafficSource'].apply(lambda x: json.loads(x)['campaign']).astype(str)
    # need to merge nearly same record
    df['source'] = df['trafficSource'].apply(lambda x: json.loads(x)['source']).astype(str)
    df['medium'] = df['trafficSource'].apply(lambda x: json.loads(x)['medium']).astype(str)
    # need to merge some keywords
    df['keyword'] = df['trafficSource'].apply(lambda x: json.loads(x)['keyword'] if x.find('keyword')>=0 else 0).astype(str)

    df['adwordsClickInfo_page'] = df['trafficSource'].apply(parse_adwordsClickInfo_page).astype(int)
    df['adwordsClickInfo_slot'] = df['trafficSource'].apply(parse_adwordsClickInfo_slot).astype(str)
    df['adwordsClickInfo_gclId'] = df['trafficSource'].apply(parse_adwordsClickInfo_gclId).astype(str)
    df['adwordsClickInfo_gclId_prefix'] = df['adwordsClickInfo_gclId'].apply(lambda x: x.split('_')[0] if type(x)!=int and x.find('_')>=0 else 0).astype(str)
    df['adwordsClickInfo_adNetworkType'] = df['trafficSource'].apply(parse_adwordsClickInfo_adNetworkType).astype(str)

    df = label_transform(df, traffic_features)
    
    return df

train = process_traffic(train)
test = process_traffic(test)

cate_features += traffic_features
numeric_features.append('adwordsClickInfo_page')



###################################################### 
target = 'revenue'

def process_revenue(df):
    revenue_df = df[['fullVisitorId','transactionRevenue']].groupby('fullVisitorId').agg('sum')
    revenue_df['fullVisitorId'] = revenue_df.index
    revenue_df[target] = revenue_df['transactionRevenue'].apply(lambda x: np.log(x+1))
    
    revenue_df.drop('transactionRevenue', axis=1, inplace=True)
    
    df = pd.merge(df, revenue_df, on='fullVisitorId')

    return df


train = process_revenue(train)
test = process_revenue(test)

removed_columns = ['device','geoNetwork','socialEngagementType','totals','trafficSource']
train.drop(removed_columns, axis=1, inplace=True)
train.columns


print(len(train['fullVisitorId'].unique()))

In [302]:
d_rows = train[train['fullVisitorId'].duplicated(keep=False)]


def merge_data(x):
    mode_array = x.mode().values
    
    return mode_array[0]

duplicated_ids = d_rows['fullVisitorId'].unique()
print('duplicated id size',len(duplicated_ids))
train_copy = train.copy()

duplicated_df = pd.DataFrame({
    'fullVisitorId':duplicated_ids
})

temp = train_copy.apply(lambda x: print(x.name), axis=0)

temp.head()

# for id in unique_ids:
#     temp_mode = train[train['fullVisitorId'] == id].apply(merge_data, axis=0)
    
#     train = train.append(temp_mode,ignore_index=True)

    
# train.drop(d_rows.index, inplace=True)

# print('train.shape',train.shape)
# print('d_rows.shape',d_rows.shape)

# print(len(d_rows['fullVisitorId'].unique()))

duplicated id size 609
channelGrouping
date
fullVisitorId
sessionId
visitId
visitNumber
visitStartTime
year
month
day
week
weekofyear
dayofweek
quarter
month_start
month_end
browser
operatingSystem
isMobile
deviceCategory
continent
subContinent
country
region
metro
city
networkDomain
hits
pageviews
bounces
newVisits
transactionRevenue
last_seconds
last_minutes
transaction_count
buy_times
campaign
source
medium
keyword
adwordsClickInfo_page
adwordsClickInfo_slot
adwordsClickInfo_gclId
adwordsClickInfo_gclId_prefix
adwordsClickInfo_adNetworkType
revenue


channelGrouping    None
date               None
fullVisitorId      None
sessionId          None
visitId            None
dtype: object

In [291]:
train = pd.read_csv(dir + train_file, low_memory=False)
test = pd.read_csv(dir + test_file, low_memory=False)


def process_totals(df):
    df['hits'] = df['totals'].apply(lambda x: json.loads(x)['hits']).astype(int)
    df['pageviews'] = df['totals'].apply(lambda x: json.loads(x)['pageviews'] if x.find('pageviews')>=0 else 0).astype(int)
    df['bounces'] = df['totals'].apply(lambda x: json.loads(x)['bounces'] if x.find('bounces')>=0 else 0).astype(int)
    df['newVisits'] = df['totals'].apply(lambda x: json.loads(x)['newVisits'] if x.find('newVisits')>=0 else 0).astype(int)
    df['transactionRevenue'] = df['totals'].apply(lambda x: json.loads(x)['transactionRevenue'] if x.find('transactionRevenue')>=0 else 0).astype(int)

    return df

train = process_totals(train)


target = 'revenue'

def process_revenue(df):
    revenue_df = df[['fullVisitorId','transactionRevenue']].groupby('fullVisitorId').agg('sum')
    revenue_df['fullVisitorId'] = revenue_df.index
    revenue_df[target] = revenue_df['transactionRevenue'].apply(lambda x: np.log(x+1))
    
    revenue_df.drop('transactionRevenue', axis=1, inplace=True)
    
    df = pd.merge(df, revenue_df, on='fullVisitorId')

    return df


train = process_revenue(train)

transactionRevenue = train[target].unique()

print(transactionRevenue)

[ 0.         17.97189053 17.08317659 17.44940573 19.54128281 18.03545936
 17.37314175 20.16840125 15.94135859 19.79624272 16.99689222 17.38447974
 18.21611359 20.11871544 18.58058627 17.59308749 19.64142217 17.35589003
 17.38193988 19.64159916 17.33211782 20.26992265 18.45256694 19.53736214
 15.45450747 18.32472036 16.17542081 17.06122308 19.28268423 16.94969962
 20.20223835 17.98527177 17.61749548 17.2296242  16.21249642 17.03398634
 16.6469577  16.01051055 17.2404245  19.27917286 16.29707839 17.86851217
 19.00378982 18.20699236 19.37376618 18.48139949 17.46226511 17.7963131
 17.2806213  18.3878476  19.39444762 18.47061312 20.79744524 15.75999126
 19.63056604 19.51515114 17.117992   17.3412827  17.72753358 18.37954621
 18.50823365 19.52045902 17.66926868 16.63629409 18.5474896  15.82774348
 16.40252251 18.1974122  18.12751702 17.86868586 16.42484486 16.11609375
 17.26518062 16.86098497 18.52648117 18.97387597 16.64813555 17.45204356
 19.31284271 17.57601278 16.68284117 16.81074276 15.

Defaulting to column, but this will raise an ambiguity error in a future version


In [290]:

print('transactionRevenue.mean',transactionRevenue.mean())

transactionRevenue.mean 17.629718972025067


In [None]:
# groupby works well 
from tqdm import tqdm

print('unique fullVisitorId size ',len(train['fullVisitorId'].unique()))

d_rows = train[train['fullVisitorId'].duplicated(keep=False)]
revisted_df = train.loc[d_rows.index]


def merge_func(df):
    def merge_one_column(x):
        new_merged_value=x.mode().values[0]
        return new_merged_value
    
    temp = df.apply(merge_one_column)
    return temp

tqdm.pandas()

merged_df = revisted_df.loc[0:100].groupby('fullVisitorId').progress_apply(merge_func)
merged_df['fullVisitorId'] = merged_df.index

print('*'*80)
print(merged_df.shape)
merged_df.head(20)

In [None]:
# Another solution of groupby mode 
# works well
from tqdm import tqdm

def merge_func(group):
    new_value = group.mode().values[0]
    print('',group,group.name,group.values,new_value)
    print('-'*50)
    
    return new_value

tqdm.pandas()

merged_df = revisted_df.loc[0:10].groupby('fullVisitorId').agg(merge_func)
merged_df['fullVisitorId'] = merged_df.index

print(merged_df.shape)

In [317]:
from joblib import Parallel, delayed
from multiprocessing import Pool, cpu_count
import tqdm
import time


def applyParallel(dfGrouped, func):
    with Pool(cpu_count()) as p:
        ret_list = list(tqdm.tqdm(
            p.imap(func, [group for name, group in dfGrouped]), 
            total=10))

    df = pd.concat(ret_list, axis=1)
    df = df.T
    return df


def merge_group_func(df):
    def merge_one_column(x):
        new_merged_value=x.mode().values[0]
        return new_merged_value

    temp = df.apply(merge_one_column)

    return temp

def merge_duplicated_into_new_row(df):
    print()
    print('unique fullVisitorId size ',df.shape, len(df['fullVisitorId'].unique()))

    d_rows = df[df['fullVisitorId'].duplicated(keep=False)]
    revisted_df = df.loc[d_rows.index]
    print('revisted_df.shape',revisted_df.shape)

    new_merged_df = applyParallel(revisted_df.groupby('fullVisitorId'),merge_group_func)
    print('new_merged_df.shape',new_merged_df.shape)

    df.drop(d_rows.index, inplace=True)
    
    df = pd.concat([df,new_merged_df], axis=0)
    del d_rows
    del new_merged_df
    
    print('df.shape',df.shape)

    return df
    

train = merge_duplicated_into_new_row(train)
print('merge_duplicated_into_new_row done')

print()
print('train.shape', train.shape)
train.head()


unique fullVisitorId size  (9999, 12) 9250
revisted_df.shape (1358, 12)



  0%|          | 0/10 [00:00<?, ?it/s][A
104it [00:00, 980.97it/s]             [A
202it [00:00, 993.60it/s][A
304it [00:00, 991.00it/s][A
401it [00:00, 985.50it/s][A
508it [00:00, 993.78it/s][A
609it [00:00, 1013.71it/s][A

new_merged_df.shape (609, 12)
df.shape (9250, 12)
merge_duplicated_into_new_row done

train.shape (9250, 12)


Unnamed: 0,channelGrouping,date,device,fullVisitorId,geoNetwork,sessionId,socialEngagementType,totals,trafficSource,visitId,visitNumber,visitStartTime
0,Organic Search,20160902,"{""browser"": ""Chrome"", ""browserVersion"": ""not available in demo dataset"", ""browserSize"": ""not available in demo dataset"", ""operatingSystem"": ""Windows"", ""operatingSystemVersion"": ""not available in demo dataset"", ""isMobile"": false, ""mobileDeviceBranding"": ""not available in demo dataset"", ""mobileDeviceModel"": ""not available in demo dataset"", ""mobileInputSelector"": ""not available in demo dataset"", ""mobileDeviceInfo"": ""not available in demo dataset"", ""mobileDeviceMarketingName"": ""not available in demo dataset"", ""flashVersion"": ""not available in demo dataset"", ""language"": ""not available in demo dataset"", ""screenColors"": ""not available in demo dataset"", ""screenResolution"": ""not available in demo dataset"", ""deviceCategory"": ""desktop""}",1131660440785968503,"{""continent"": ""Asia"", ""subContinent"": ""Western Asia"", ""country"": ""Turkey"", ""region"": ""Izmir"", ""metro"": ""(not set)"", ""city"": ""Izmir"", ""cityId"": ""not available in demo dataset"", ""networkDomain"": ""ttnet.com.tr"", ""latitude"": ""not available in demo dataset"", ""longitude"": ""not available in demo dataset"", ""networkLocation"": ""not available in demo dataset""}",1131660440785968503_1472830385,Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""1"", ""pageviews"": ""1"", ""bounces"": ""1"", ""newVisits"": ""1""}","{""campaign"": ""(not set)"", ""source"": ""google"", ""medium"": ""organic"", ""keyword"": ""(not provided)"", ""adwordsClickInfo"": {""criteriaParameters"": ""not available in demo dataset""}}",1472830385,1,1472830385
1,Organic Search,20160902,"{""browser"": ""Firefox"", ""browserVersion"": ""not available in demo dataset"", ""browserSize"": ""not available in demo dataset"", ""operatingSystem"": ""Macintosh"", ""operatingSystemVersion"": ""not available in demo dataset"", ""isMobile"": false, ""mobileDeviceBranding"": ""not available in demo dataset"", ""mobileDeviceModel"": ""not available in demo dataset"", ""mobileInputSelector"": ""not available in demo dataset"", ""mobileDeviceInfo"": ""not available in demo dataset"", ""mobileDeviceMarketingName"": ""not available in demo dataset"", ""flashVersion"": ""not available in demo dataset"", ""language"": ""not available in demo dataset"", ""screenColors"": ""not available in demo dataset"", ""screenResolution"": ""not available in demo dataset"", ""deviceCategory"": ""desktop""}",377306020877927890,"{""continent"": ""Oceania"", ""subContinent"": ""Australasia"", ""country"": ""Australia"", ""region"": ""not available in demo dataset"", ""metro"": ""not available in demo dataset"", ""city"": ""not available in demo dataset"", ""cityId"": ""not available in demo dataset"", ""networkDomain"": ""dodo.net.au"", ""latitude"": ""not available in demo dataset"", ""longitude"": ""not available in demo dataset"", ""networkLocation"": ""not available in demo dataset""}",377306020877927890_1472880147,Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""1"", ""pageviews"": ""1"", ""bounces"": ""1"", ""newVisits"": ""1""}","{""campaign"": ""(not set)"", ""source"": ""google"", ""medium"": ""organic"", ""keyword"": ""(not provided)"", ""adwordsClickInfo"": {""criteriaParameters"": ""not available in demo dataset""}}",1472880147,1,1472880147
2,Organic Search,20160902,"{""browser"": ""Chrome"", ""browserVersion"": ""not available in demo dataset"", ""browserSize"": ""not available in demo dataset"", ""operatingSystem"": ""Windows"", ""operatingSystemVersion"": ""not available in demo dataset"", ""isMobile"": false, ""mobileDeviceBranding"": ""not available in demo dataset"", ""mobileDeviceModel"": ""not available in demo dataset"", ""mobileInputSelector"": ""not available in demo dataset"", ""mobileDeviceInfo"": ""not available in demo dataset"", ""mobileDeviceMarketingName"": ""not available in demo dataset"", ""flashVersion"": ""not available in demo dataset"", ""language"": ""not available in demo dataset"", ""screenColors"": ""not available in demo dataset"", ""screenResolution"": ""not available in demo dataset"", ""deviceCategory"": ""desktop""}",3895546263509774583,"{""continent"": ""Europe"", ""subContinent"": ""Southern Europe"", ""country"": ""Spain"", ""region"": ""Community of Madrid"", ""metro"": ""(not set)"", ""city"": ""Madrid"", ""cityId"": ""not available in demo dataset"", ""networkDomain"": ""unknown.unknown"", ""latitude"": ""not available in demo dataset"", ""longitude"": ""not available in demo dataset"", ""networkLocation"": ""not available in demo dataset""}",3895546263509774583_1472865386,Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""1"", ""pageviews"": ""1"", ""bounces"": ""1"", ""newVisits"": ""1""}","{""campaign"": ""(not set)"", ""source"": ""google"", ""medium"": ""organic"", ""keyword"": ""(not provided)"", ""adwordsClickInfo"": {""criteriaParameters"": ""not available in demo dataset""}}",1472865386,1,1472865386
3,Organic Search,20160902,"{""browser"": ""UC Browser"", ""browserVersion"": ""not available in demo dataset"", ""browserSize"": ""not available in demo dataset"", ""operatingSystem"": ""Linux"", ""operatingSystemVersion"": ""not available in demo dataset"", ""isMobile"": false, ""mobileDeviceBranding"": ""not available in demo dataset"", ""mobileDeviceModel"": ""not available in demo dataset"", ""mobileInputSelector"": ""not available in demo dataset"", ""mobileDeviceInfo"": ""not available in demo dataset"", ""mobileDeviceMarketingName"": ""not available in demo dataset"", ""flashVersion"": ""not available in demo dataset"", ""language"": ""not available in demo dataset"", ""screenColors"": ""not available in demo dataset"", ""screenResolution"": ""not available in demo dataset"", ""deviceCategory"": ""desktop""}",4763447161404445595,"{""continent"": ""Asia"", ""subContinent"": ""Southeast Asia"", ""country"": ""Indonesia"", ""region"": ""not available in demo dataset"", ""metro"": ""not available in demo dataset"", ""city"": ""not available in demo dataset"", ""cityId"": ""not available in demo dataset"", ""networkDomain"": ""unknown.unknown"", ""latitude"": ""not available in demo dataset"", ""longitude"": ""not available in demo dataset"", ""networkLocation"": ""not available in demo dataset""}",4763447161404445595_1472881213,Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""1"", ""pageviews"": ""1"", ""bounces"": ""1"", ""newVisits"": ""1""}","{""campaign"": ""(not set)"", ""source"": ""google"", ""medium"": ""organic"", ""keyword"": ""google + online"", ""adwordsClickInfo"": {""criteriaParameters"": ""not available in demo dataset""}}",1472881213,1,1472881213
4,Organic Search,20160902,"{""browser"": ""Chrome"", ""browserVersion"": ""not available in demo dataset"", ""browserSize"": ""not available in demo dataset"", ""operatingSystem"": ""Android"", ""operatingSystemVersion"": ""not available in demo dataset"", ""isMobile"": true, ""mobileDeviceBranding"": ""not available in demo dataset"", ""mobileDeviceModel"": ""not available in demo dataset"", ""mobileInputSelector"": ""not available in demo dataset"", ""mobileDeviceInfo"": ""not available in demo dataset"", ""mobileDeviceMarketingName"": ""not available in demo dataset"", ""flashVersion"": ""not available in demo dataset"", ""language"": ""not available in demo dataset"", ""screenColors"": ""not available in demo dataset"", ""screenResolution"": ""not available in demo dataset"", ""deviceCategory"": ""mobile""}",27294437909732085,"{""continent"": ""Europe"", ""subContinent"": ""Northern Europe"", ""country"": ""United Kingdom"", ""region"": ""not available in demo dataset"", ""metro"": ""not available in demo dataset"", ""city"": ""not available in demo dataset"", ""cityId"": ""not available in demo dataset"", ""networkDomain"": ""unknown.unknown"", ""latitude"": ""not available in demo dataset"", ""longitude"": ""not available in demo dataset"", ""networkLocation"": ""not available in demo dataset""}",27294437909732085_1472822600,Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""1"", ""pageviews"": ""1"", ""bounces"": ""1""}","{""campaign"": ""(not set)"", ""source"": ""google"", ""medium"": ""organic"", ""keyword"": ""(not provided)"", ""adwordsClickInfo"": {""criteriaParameters"": ""not available in demo dataset""}, ""isTrueDirect"": true}",1472822600,2,1472822600


In [None]:
transaction_value_counts_df = pd.DataFrame({
    'count':train['transaction_count'].value_counts().values,
    'value':train['transaction_count'].value_counts().index,
    'percent': 100.0*(train['transaction_count'].value_counts().values/train['transaction_count'].count())
})

transaction_value_counts_df.head(20)