In [1]:
import pandas as pd
import json 
import numpy as np
import pylab

In [2]:
"""
A function to flatten the columns
"""
def flatten_all(inputfile, outputfile,
                fcol_to_flat, scol_to_flat, col_to_del):
    """
    Flatten all the columns with dictionaries in.
    Append all useful column at the end of data frame
    Return a data frame and write into csv
    """
    raw = pd.read_csv(inputfile, delimiter=',', skipinitialspace=True)
    # deal the float column
    f_col = raw[fcol_to_flat].apply(json.loads)
    df = raw.join(pd.DataFrame(f_col.tolist()).astype('float'))
    # deal with the string column
    for col in scol_to_flat:
        s_col = raw[col].apply(json.loads)
        df = df.join(pd.DataFrame(s_col.tolist()))
    df = df.drop(col_to_del, axis=1)
    df = df.drop(scol_to_flat, axis=1)
    df = df.drop(fcol_to_flat, axis=1)
    df.to_csv(outputfile, sep=',')
    return df

In [3]:
fcol_to_flat = 'totals'
scol_to_flat = ['device', 'trafficSource', 'geoNetwork']
col_to_del = ['campaignCode', 
              'cityId', 'latitude', 'longitude', 'networkLocation', 
             'browserSize', 'browserVersion', 'flashVersion', 'language',
              'mobileDeviceBranding', 'mobileDeviceInfo', 
              'mobileDeviceMarketingName', 'mobileDeviceModel',
              'mobileInputSelector','operatingSystemVersion',
              'screenColors', 'screenResolution']
inputfile = '../Data/train.csv'
outputfile = '../Data/train_flatten.csv'

In [4]:
df = flatten_all(inputfile, outputfile, 
                 fcol_to_flat, scol_to_flat, col_to_del)

  if (yield from self.run_code(code, result)):


In [5]:
df = df.drop(columns="socialEngagementType")

In [6]:
df['is_transaction'] = 1 - df['transactionRevenue'].isna()

In [7]:
df[['bounces', 'hits', 'pageviews', 'visitNumber']] = df[['bounces', 'hits', 'pageviews', 'visitNumber']].fillna(0)

In [8]:
grouped = df.groupby('is_transaction')
grouped.sum()

Unnamed: 0_level_0,date,visitId,visitNumber,visitStartTime,bounces,hits,newVisits,pageviews,transactionRevenue,visits,isMobile
is_transaction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,17990748664278,1324821431601705,1999663,1324821432451797,450630.0,3735977.0,698595.0,3152241.0,0.0,892138.0,238103.0
1,232214043878,17109860183354,47018,17109860232366,0.0,417698.0,4465.0,326225.0,1540071000000.0,11515.0,1020.0


## Explore meaningful columns
visitNumber, bounces, hits, pageviews, **isMobile**

In [9]:
## Aggregate
df_yixin = df[['fullVisitorId','is_transaction','bounces', 'hits', 'newVisits', 'pageviews', 'transactionRevenue', 'visits', 'visitNumber','isMobile']]
grouped_visitor = df_yixin.groupby('fullVisitorId')

In [10]:
grouped_visitor.max()[['is_transaction', 'visitNumber']]

Unnamed: 0_level_0,is_transaction,visitNumber
fullVisitorId,Unnamed: 1_level_1,Unnamed: 2_level_1
5103959234087,0,1
10278554503158,0,1
20424342248747,0,1
27376579751715,0,1
33471059618621,0,1
35794135966385,0,1
39460501403861,0,1
40862739425590,0,2
45417921646651,0,1
47810105303746,0,1


In [11]:
grouped_visitor.mean()[['bounces', 'hits','newVisits', 'pageviews','visits','isMobile']]

Unnamed: 0_level_0,bounces,hits,newVisits,pageviews,visits,isMobile
fullVisitorId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
5103959234087,0.0,10.0,1.0,8.0,1.0,1.0
10278554503158,0.0,11.0,1.0,8.0,1.0,0.0
20424342248747,0.0,17.0,1.0,13.0,1.0,0.0
27376579751715,0.0,6.0,1.0,5.0,1.0,0.0
33471059618621,1.0,1.0,1.0,1.0,1.0,0.0
35794135966385,1.0,1.0,1.0,1.0,1.0,1.0
39460501403861,0.0,2.0,1.0,2.0,1.0,0.0
40862739425590,0.0,2.5,1.0,2.5,1.0,0.0
45417921646651,0.0,2.0,1.0,2.0,1.0,0.0
47810105303746,1.0,1.0,1.0,1.0,1.0,0.0


In [12]:
df_yixin_agg = grouped_visitor.max()[['is_transaction', 'visitNumber']].join(grouped_visitor.mean()[['bounces', 'hits','newVisits', 'pageviews','visits','isMobile']], on='fullVisitorId')

In [13]:
grouped = df_yixin_agg.groupby('is_transaction')

In [14]:
grouped.mean()

Unnamed: 0_level_0,visitNumber,bounces,hits,newVisits,pageviews,visits,isMobile
is_transaction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,1.352033,0.526448,3.723878,1.0,3.189217,1.0,0.268259
1,4.061933,0.070504,26.501574,1.0,20.785063,1.0,0.095408


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.covariance import EllipticEnvelope
from sklearn.metrics import precision_recall_fscore_support

In [16]:
X_train, X_test, y_train, y_test = train_test_split(
    df[['visitNumber','bounces','hits','pageviews','isMobile']].fillna(0), df['is_transaction'], test_size=0.33, random_state=42)

In [17]:
X_train.iloc[0:10,:]

Unnamed: 0,visitNumber,bounces,hits,pageviews,isMobile
756972,1,1.0,1.0,1.0,True
899004,1,0.0,8.0,8.0,True
870223,1,1.0,1.0,1.0,True
335571,1,1.0,1.0,1.0,False
631586,1,1.0,1.0,1.0,True
268691,1,0.0,2.0,2.0,True
549083,1,1.0,1.0,1.0,False
833861,1,0.0,18.0,15.0,True
238057,1,1.0,1.0,1.0,False
235172,1,1.0,1.0,1.0,False


In [18]:
y_predicted = estimator.fit_predict(X_train, y_train)

NameError: name 'estimator' is not defined

In [19]:
y_predicted

NameError: name 'y_predicted' is not defined

In [20]:
X_train, X_test, y_train, y_test = train_test_split(
    df[['visitNumber','bounces','hits','pageviews','isMobile']].fillna(0), df['is_transaction'], test_size=0.33, random_state=42)
estimator = EllipticEnvelope()
y_predicted = estimator.fit_predict(X_train, y_train)
precision_recall_fscore_support(y_train, y_predicted, average='macro')





























  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


(0.00029054600016025905, 0.02051747224741912, 0.000572978111029889, None)

# preprocessing and first fit

In [21]:
def extract_traffic_source_columns(df):
    df['isTrueDirect']=df['isTrueDirect'].fillna(False)
    df['isTrueDirect']=df['isTrueDirect'].astype(int)
    
    is_source_mall_googleplex = (df['source']=='mall.googleplex.com').astype(int)
    is_source_mall_googleplex.name = 'is_source_googleplex'    
    
    is_source_direct = (df['source']=='(direct)').astype(int)
    is_source_direct.name = 'is_source_direct'
    
    is_source_google = (df['source']=='(direct)').astype(int)
    is_source_google.name = 'is_source_google'
    
    is_medium_referral = (df['medium']=='referral').astype(int)
    is_medium_referral.name = 'is_medium_referral'
    is_medium_none = (df['medium']=='(none)').astype(int)
    is_medium_none.name = 'is_medium_none'
    is_medium_organic = (df['medium']=='organic').astype(int)
    is_medium_organic.name="is_medium_organic"
    
    df = df.drop(columns = ['adContent','adwordsClickInfo','campaign', 'keyword', 'medium', 'referralPath',
                      'source'])
    
    return pd.concat([df,is_source_mall_googleplex,is_source_direct,is_source_google,is_medium_referral,
                    is_medium_none,is_medium_organic],axis=1)

In [22]:
def get_region_columns(df):
    df['is_bayarea'] = (df['metro'] =='San Francisco-Oakland-San Jose CA').astype(int)
    dummies = pd.get_dummies(df['continent'])
    dummies = dummies.drop(['(not set)'],axis=1)
    df = df.drop(['continent','city','subContinent','region','country','metro','networkDomain'],axis=1) 
    new_df = pd.concat([df,dummies],axis=1)
    return new_df

In [23]:
def extract_activities(df):
    df['is_transaction'] = 1 - df['transactionRevenue'].isna()
    df[['bounces', 'hits', 'pageviews', 'visitNumber']] = df[['bounces', 'hits', 'pageviews', 'visitNumber']].fillna(0)
    return df
    

In [24]:
def time_value(df):
    import datetime as dt
    time = pd.to_datetime(df.loc[:,'visitStartTime'], unit='s')
    year = time.dt.year
    year = pd.get_dummies(year)
    month = time.dt.month
    q1 = (month <= 3) & (month >= 1)
    q2 = (month <= 6) & (month >= 4)
    q3 = (month <= 9) & (month >= 7)
    q4 = (month <= 12) & (month >= 10)
    dow = time.dt.weekday
    weekday = (dow <= 5) & (dow >=1)
    hour = time.dt.hour
    h_0_6 = (hour <= 6) & (hour >= 0)
    h_7_12 = (hour <= 12) & (hour >= 7)
    h_13_17 = (hour <= 17) & (hour >= 13)
    h_18_23 = (hour <= 23) & (hour >= 18)
    new_df = pd.DataFrame()
    new_df['q1'] = q1.astype(int)
    new_df['q2'] = q2.astype(int)
    new_df['q3'] = q3.astype(int)
    new_df['q4'] = q4.astype(int)
    new_df['weekday'] = weekday.astype(int)
    new_df['h_0_6'] = h_0_6.astype(int)
    new_df['h_7_12'] = h_7_12.astype(int)
    new_df['h_13_17'] = h_13_17.astype(int)
    new_df['h_18_23'] = h_18_23.astype(int)
    return new_df

def device(df):
    browser_chrome = (df['browser']=='Chrome').astype(int)
    is_mobile = df['isMobile'].astype(int)
    system_google = ((df['operatingSystem']=='Android')|(df['operatingSystem']=='Chrome OS')).astype(int)
    new_df = pd.DataFrame()
    new_df['browser_chrome'] = browser_chrome
    new_df['is_mobile'] = is_mobile
    new_df['system_google'] = system_google
    return new_df

In [25]:
def preprocessing(inputfile):
    df = pd.read_csv(inputfile)
    print("flatten done\n"+df.columns)
    df = extract_traffic_source_columns(df)
    print("Congrats Brian\n"+df.columns)
    df = get_region_columns(df)
    print("Congrats Ziyu\n"+df.columns)
    df = extract_activities(df)
    print("Congrats Yixin\n"+df.columns)
    col_time = time_value(df)
    col_device = device(df)
    
    df_base_model = df[['is_transaction','bounces', 'hits', 'pageviews', 'visitNumber',
                       'is_source_googleplex','is_source_direct','is_source_google','is_medium_referral',
                    'is_medium_none','is_medium_organic',
                        'is_bayarea','Africa','Americas','Asia','Europe','Oceania',
                        'next_session_1','last_session_1'
                       ]]
    df_base_model = pd.concat([df_base_model,col_time],axis=1)
    df_base_model = pd.concat([df_base_model,col_device],axis=1)
    print("Congrats Xi\n"+df_base_model.columns)
    return df_base_model

In [26]:
inputfile = '../Data/train_flatten_with_ts.csv'
df_base_model = preprocessing(inputfile)

  if (yield from self.run_code(code, result)):


Index(['flatten done\nUnnamed: 0', 'flatten done\nUnnamed: 0.1',
       'flatten done\nchannelGrouping', 'flatten done\ndate',
       'flatten done\nfullVisitorId', 'flatten done\nsessionId',
       'flatten done\nsocialEngagementType', 'flatten done\nvisitId',
       'flatten done\nvisitNumber', 'flatten done\nvisitStartTime',
       'flatten done\nbounces', 'flatten done\nhits',
       'flatten done\nnewVisits', 'flatten done\npageviews',
       'flatten done\ntransactionRevenue', 'flatten done\nvisits',
       'flatten done\nbrowser', 'flatten done\ndeviceCategory',
       'flatten done\nisMobile', 'flatten done\noperatingSystem',
       'flatten done\nadContent', 'flatten done\nadwordsClickInfo',
       'flatten done\ncampaign', 'flatten done\nisTrueDirect',
       'flatten done\nkeyword', 'flatten done\nmedium',
       'flatten done\nreferralPath', 'flatten done\nsource',
       'flatten done\ncity', 'flatten done\ncontinent',
       'flatten done\ncountry', 'flatten done\nmetro',

In [27]:
df_base_model.columns

Index(['is_transaction', 'bounces', 'hits', 'pageviews', 'visitNumber',
       'is_source_googleplex', 'is_source_direct', 'is_source_google',
       'is_medium_referral', 'is_medium_none', 'is_medium_organic',
       'is_bayarea', 'Africa', 'Americas', 'Asia', 'Europe', 'Oceania',
       'next_session_1', 'last_session_1', 'q1', 'q2', 'q3', 'q4', 'weekday',
       'h_0_6', 'h_7_12', 'h_13_17', 'h_18_23', 'browser_chrome', 'is_mobile',
       'system_google'],
      dtype='object')

In [28]:
df_base_model.drop(columns='is_transaction')

Unnamed: 0,bounces,hits,pageviews,visitNumber,is_source_googleplex,is_source_direct,is_source_google,is_medium_referral,is_medium_none,is_medium_organic,...,q3,q4,weekday,h_0_6,h_7_12,h_13_17,h_18_23,browser_chrome,is_mobile,system_google
0,1.0,1.0,1.0,1,0,0,0,0,0,1,...,0,1,1,0,0,1,0,1,0,0
1,0.0,10.0,8.0,1,0,0,0,0,0,1,...,1,0,0,0,0,0,1,1,1,1
2,0.0,11.0,8.0,1,0,0,0,0,0,1,...,0,1,1,1,0,0,0,1,0,0
3,0.0,17.0,13.0,1,0,0,0,0,0,1,...,0,1,1,0,1,0,0,1,0,0
4,0.0,3.0,2.0,1,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
5,0.0,6.0,5.0,1,0,0,0,0,0,1,...,0,0,0,1,0,0,0,1,0,0
6,1.0,1.0,1.0,1,0,0,0,1,0,0,...,0,1,0,0,0,0,1,0,0,0
7,1.0,1.0,1.0,1,0,1,1,0,1,0,...,0,0,1,0,1,0,0,1,1,1
8,0.0,2.0,2.0,1,0,0,0,1,0,0,...,0,0,0,0,0,1,0,1,0,0
9,0.0,2.0,2.0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,1,1,0,0


In [29]:
X_train, X_test, y_train, y_test = train_test_split(
    df_base_model.drop(columns='is_transaction'), df_base_model['is_transaction'], test_size=0.33, random_state=42)
estimator = EllipticEnvelope()
y_predicted = estimator.fit_predict(X_train, y_train)
precision_recall_fscore_support(y_train, y_predicted, average='macro')





























































































  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


(0.0034018349476908996, 0.24316760680397043, 0.006709801969744959, None)

In [None]:
df_base_model.columns

In [None]:
len(df_base_model.columns)