In [None]:
import pandas as pd
import json 
import numpy as np
import pylab

In [None]:
"""
A function to flatten the columns
"""
def flatten_all(inputfile, outputfile,
                fcol_to_flat, scol_to_flat, col_to_del):
    """
    Flatten all the columns with dictionaries in.
    Append all useful column at the end of data frame
    Return a data frame and write into csv
    """
    raw = pd.read_csv(inputfile, delimiter=',', skipinitialspace=True)
    # deal the float column
    f_col = raw[fcol_to_flat].apply(json.loads)
    df = raw.join(pd.DataFrame(f_col.tolist()).astype('float'))
    # deal with the string column
    for col in scol_to_flat:
        s_col = raw[col].apply(json.loads)
        df = df.join(pd.DataFrame(s_col.tolist()))
    df = df.drop(col_to_del, axis=1)
    df = df.drop(scol_to_flat, axis=1)
    df = df.drop(fcol_to_flat, axis=1)
    df.to_csv(outputfile, sep=',')
    return df

In [None]:
fcol_to_flat = 'totals'
scol_to_flat = ['device', 'trafficSource', 'geoNetwork']
col_to_del = ['campaignCode', 
              'cityId', 'latitude', 'longitude', 'networkLocation', 
             'browserSize', 'browserVersion', 'flashVersion', 'language',
              'mobileDeviceBranding', 'mobileDeviceInfo', 
              'mobileDeviceMarketingName', 'mobileDeviceModel',
              'mobileInputSelector','operatingSystemVersion',
              'screenColors', 'screenResolution']
inputfile = './data/train.csv'
outputfile = './data/train_flatten.csv'

In [None]:
df = flatten_all(inputfile, outputfile, 
                 fcol_to_flat, scol_to_flat, col_to_del)

In [None]:
df = df.drop(columns="socialEngagementType")

In [None]:
df['is_transaction'] = 1 - df['transactionRevenue'].isna()

In [None]:
df[['bounces', 'hits', 'pageviews', 'visitNumber']] = df[['bounces', 'hits', 'pageviews', 'visitNumber']].fillna(0)

In [None]:
grouped = df.groupby('is_transaction')
grouped.sum()

## Explore meaningful columns
visitNumber, bounces, hits, pageviews, **isMobile**

In [None]:
## Aggregate
df_yixin = df[['fullVisitorId','is_transaction','bounces', 'hits', 'newVisits', 'pageviews', 'transactionRevenue', 'visits', 'visitNumber','isMobile']]
grouped_visitor = df_yixin.groupby('fullVisitorId')

In [None]:
grouped_visitor.max()[['is_transaction', 'visitNumber']]

In [None]:
grouped_visitor.mean()[['bounces', 'hits','newVisits', 'pageviews','visits','isMobile']]

In [None]:
df_yixin_agg = grouped_visitor.max()[['is_transaction', 'visitNumber']].join(grouped_visitor.mean()[['bounces', 'hits','newVisits', 'pageviews','visits','isMobile']], on='fullVisitorId')

In [None]:
grouped = df_yixin_agg.groupby('is_transaction')

In [None]:
grouped.mean()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.covariance import EllipticEnvelope
from sklearn.metrics import precision_recall_fscore_support

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    df[['visitNumber','bounces','hits','pageviews','isMobile']].fillna(0), df['is_transaction'], test_size=0.33, random_state=42)

In [None]:
X_train.iloc[0:10,:]

In [None]:
y_predicted = estimator.fit_predict(X_train, y_train)

In [None]:
y_predicted

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    df[['visitNumber','bounces','hits','pageviews','isMobile']].fillna(0), df['is_transaction'], test_size=0.33, random_state=42)
estimator = EllipticEnvelope()
y_predicted = estimator.fit_predict(X_train, y_train)
precision_recall_fscore_support(y_train, y_predicted, average='macro')

# preprocessing and first fit

In [None]:
def extract_traffic_source_columns(df):
    df['isTrueDirect']=df['isTrueDirect'].fillna(False)
    df['isTrueDirect']=df['isTrueDirect'].astype(int)
    
    is_source_mall_googleplex = (df['source']=='mall.googleplex.com').astype(int)
    is_source_mall_googleplex.name = 'is_source_googleplex'    
    
    is_source_direct = (df['source']=='(direct)').astype(int)
    is_source_direct.name = 'is_source_direct'
    
    is_source_google = (df['source']=='(direct)').astype(int)
    is_source_google.name = 'is_source_google'
    
    is_medium_referral = (df['medium']=='referral').astype(int)
    is_medium_referral.name = 'is_medium_referral'
    is_medium_none = (df['medium']=='(none)').astype(int)
    is_medium_none.name = 'is_medium_none'
    is_medium_organic = (df['medium']=='organic').astype(int)
    is_medium_organic.name="is_medium_organic"
    
    df = df.drop(columns = ['adContent','adwordsClickInfo','campaign', 'keyword', 'medium', 'referralPath',
                      'source'])
    
    return pd.concat([df,is_source_mall_googleplex,is_source_direct,is_source_google,is_medium_referral,
                    is_medium_none,is_medium_organic],axis=1)

In [None]:
def get_region_columns(df):
    df['is_bayarea'] = (df['metro'] =='San Francisco-Oakland-San Jose CA').astype(int)
    dummies = pd.get_dummies(df['continent'])
    dummies = dummies.drop(['(not set)'],axis=1)
    df = df.drop(['continent','city','subContinent','region','country','metro','networkDomain'],axis=1) 
    new_df = pd.concat([df,dummies],axis=1)
    return new_df

In [None]:
def extract_activities(df):
    df['is_transaction'] = 1 - df['transactionRevenue'].isna()
    df[['bounces', 'hits', 'pageviews', 'visitNumber']] = df[['bounces', 'hits', 'pageviews', 'visitNumber']].fillna(0)
    return df
    

In [None]:
def time_value(df):
    import datetime as dt
    time = pd.to_datetime(df.loc[:,'visitStartTime'], unit='s')
    year = time.dt.year
    year = pd.get_dummies(year)
    month = time.dt.month
    q1 = (month <= 3) & (month >= 1)
    q2 = (month <= 6) & (month >= 4)
    q3 = (month <= 9) & (month >= 7)
    q4 = (month <= 12) & (month >= 10)
    dow = time.dt.weekday
    weekday = (dow <= 5) & (dow >=1)
    hour = time.dt.hour
    h_0_6 = (hour <= 6) & (hour >= 0)
    h_7_12 = (hour <= 12) & (hour >= 7)
    h_13_17 = (hour <= 17) & (hour >= 13)
    h_18_23 = (hour <= 23) & (hour >= 18)
    new_df = pd.DataFrame()
    new_df['q1'] = q1.astype(int)
    new_df['q2'] = q2.astype(int)
    new_df['q3'] = q3.astype(int)
    new_df['q4'] = q4.astype(int)
    new_df['weekday'] = weekday.astype(int)
    new_df['h_0_6'] = h_0_6.astype(int)
    new_df['h_7_12'] = h_7_12.astype(int)
    new_df['h_13_17'] = h_13_17.astype(int)
    new_df['h_18_23'] = h_18_23.astype(int)
    return new_df

def device(df):
    browser_chrome = (df['browser']=='Chrome').astype(int)
    is_mobile = df['isMobile'].astype(int)
    system_google = ((df['operatingSystem']=='Android')|(df['operatingSystem']=='Chrome OS')).astype(int)
    new_df = pd.DataFrame()
    new_df['browser_chrome'] = browser_chrome
    new_df['is_mobile'] = is_mobile
    new_df['system_google'] = system_google
    return new_df

In [None]:
def preprocessing(inputfile):
    df = pd.read_csv(inputfile)
    print("flatten done\n"+df.columns)
    df = extract_traffic_source_columns(df)
    print("Congrats Brian\n"+df.columns)
    df = get_region_columns(df)
    print("Congrats Ziyu\n"+df.columns)
    df = extract_activities(df)
    print("Congrats Yixin\n"+df.columns)
    col_time = time_value(df)
    col_device = device(df)
    
    df_base_model = df[['is_transaction','bounces', 'hits', 'pageviews', 'visitNumber',
                       'is_source_googleplex','is_source_direct','is_source_google','is_medium_referral',
                    'is_medium_none','is_medium_organic',
                        'is_bayarea','Africa','Americas','Asia','Europe','Oceania'
                       ]]
    df_base_model = pd.concat([df_base_model,col_time],axis=1)
    df_base_model = pd.concat([df_base_model,col_device],axis=1)
    print("Congrats Xi\n"+df_base_model.columns)
    return df_base_model

In [None]:
inputfile = './data/train_flatten.csv'
df_base_model = preprocessing(inputfile)

In [None]:
df.columns

In [157]:
df_base_model.drop(columns='is_transaction')

Unnamed: 0,bounces,hits,pageviews,visitNumber,is_source_googleplex,is_source_direct,is_source_google,is_medium_referral,is_medium_none,is_medium_organic,...,q3,q4,weekday,h_0_6,h_7_12,h_13_17,h_18_23,browser_chrome,is_mobile,system_google
0,1.0,1.0,1.0,1,0,0,0,0,0,1,...,1,0,1,0,0,1,0,1,0,0
1,1.0,1.0,1.0,1,0,0,0,0,0,1,...,1,0,1,1,0,0,0,0,0,0
2,1.0,1.0,1.0,1,0,0,0,0,0,1,...,1,0,1,1,0,0,0,1,0,0
3,1.0,1.0,1.0,1,0,0,0,0,0,1,...,1,0,1,1,0,0,0,0,0,0
4,1.0,1.0,1.0,2,0,0,0,0,0,1,...,1,0,1,0,0,1,0,1,1,1
5,1.0,1.0,1.0,1,0,0,0,0,0,1,...,1,0,1,0,1,0,0,1,0,0
6,1.0,1.0,1.0,1,0,0,0,0,0,1,...,1,0,1,0,1,0,0,1,0,0
7,1.0,1.0,1.0,1,0,0,0,0,0,1,...,1,0,1,0,1,0,0,1,0,0
8,1.0,1.0,1.0,1,0,0,0,0,0,1,...,1,0,1,0,1,0,0,0,0,0
9,1.0,1.0,1.0,1,0,0,0,0,0,1,...,1,0,1,0,1,0,0,0,0,0


In [158]:
X_train, X_test, y_train, y_test = train_test_split(
    df_base_model.drop(columns='is_transaction'), df_base_model['is_transaction'], test_size=0.33, random_state=42)
estimator = EllipticEnvelope()
y_predicted = estimator.fit_predict(X_train, y_train)
precision_recall_fscore_support(y_train, y_predicted, average='macro')

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


(0.0008490698769827433, 0.05995421364087944, 0.0016744265722328682, None)

In [159]:
df_base_model.columns

Index(['is_transaction', 'bounces', 'hits', 'pageviews', 'visitNumber',
       'is_source_googleplex', 'is_source_direct', 'is_source_google',
       'is_medium_referral', 'is_medium_none', 'is_medium_organic',
       'is_bayarea', 'Africa', 'Americas', 'Asia', 'Europe', 'Oceania', 'q1',
       'q2', 'q3', 'q4', 'weekday', 'h_0_6', 'h_7_12', 'h_13_17', 'h_18_23',
       'browser_chrome', 'is_mobile', 'system_google'],
      dtype='object')

In [161]:
len(df_base_model.columns)

29