In [9]:
import pandas as pd
import json 
import numpy as np

In [10]:
def extract_traffic_source_columns(df):
    df['isTrueDirect']=df['isTrueDirect'].fillna(False)
    df['isTrueDirect']=df['isTrueDirect'].astype(int)
    
    is_source_mall_googleplex = (df['source']=='mall.googleplex.com').astype(int)
    is_source_mall_googleplex.name = 'is_source_googleplex'    
    
    is_source_direct = (df['source']=='(direct)').astype(int)
    is_source_direct.name = 'is_source_direct'
    
    is_source_google = (df['source']=='(direct)').astype(int)
    is_source_google.name = 'is_source_google'
    
    is_medium_referral = (df['medium']=='referral').astype(int)
    is_medium_referral.name = 'is_medium_referral'
    is_medium_none = (df['medium']=='(none)').astype(int)
    is_medium_none.name = 'is_medium_none'
    is_medium_organic = (df['medium']=='organic').astype(int)
    is_medium_organic.name="is_medium_organic"
    
    df = df.drop(columns = ['adContent','adwordsClickInfo','campaign', 'keyword', 'medium', 'referralPath',
                      'source'])
    
    return pd.concat([df,is_source_mall_googleplex,is_source_direct,is_source_google,is_medium_referral,
                    is_medium_none,is_medium_organic],axis=1)

In [11]:
def get_region_columns(df):
    df['is_bayarea'] = (df['metro'] =='San Francisco-Oakland-San Jose CA').astype(int)
    dummies = pd.get_dummies(df['continent'])
    dummies = dummies.drop(['(not set)'],axis=1)
    df = df.drop(['continent','city','subContinent','region','country','metro','networkDomain'],axis=1) 
    new_df = pd.concat([df,dummies],axis=1)
    return new_df

In [12]:
def extract_activities(df):
    df['is_transaction'] = 1 - df['transactionRevenue'].isna()
    df[['bounces', 'hits', 'pageviews', 'visitNumber']] = df[['bounces', 'hits', 'pageviews', 'visitNumber']].fillna(0)
    df['transactionRevenue'] = df['transactionRevenue'].fillna(0)
    return df

In [13]:
def time_value(df):
    import datetime as dt
    time = pd.to_datetime(df.loc[:,'visitStartTime'], unit='s')
    year = time.dt.year
    year = pd.get_dummies(year)
    month = time.dt.month
    q1 = (month <= 3) & (month >= 1)
    q2 = (month <= 6) & (month >= 4)
    q3 = (month <= 9) & (month >= 7)
    q4 = (month <= 12) & (month >= 10)
    dow = time.dt.weekday
    weekday = (dow <= 5) & (dow >=1)
    hour = time.dt.hour
    h_0_6 = (hour <= 6) & (hour >= 0)
    h_7_12 = (hour <= 12) & (hour >= 7)
    h_13_17 = (hour <= 17) & (hour >= 13)
    h_18_23 = (hour <= 23) & (hour >= 18)
    new_df = pd.DataFrame()
    new_df['q1'] = q1.astype(int)
    new_df['q2'] = q2.astype(int)
    new_df['q3'] = q3.astype(int)
    new_df['q4'] = q4.astype(int)
    new_df['weekday'] = weekday.astype(int)
    new_df['h_0_6'] = h_0_6.astype(int)
    new_df['h_7_12'] = h_7_12.astype(int)
    new_df['h_13_17'] = h_13_17.astype(int)
    new_df['h_18_23'] = h_18_23.astype(int)
    new_df['month'] = month
    new_df['dow'] = dow
    new_df['hour'] = hour
    return new_df

def device(df):
    browser_chrome = (df['browser']=='Chrome').astype(int)
    is_mobile = df['isMobile'].astype(int)
    system_google = ((df['operatingSystem']=='Android')|(df['operatingSystem']=='Chrome OS')).astype(int)
    new_df = pd.DataFrame()
    new_df['browser_chrome'] = browser_chrome
    new_df['is_mobile'] = is_mobile
    new_df['system_google'] = system_google
    return new_df

# preprocessing

In [14]:
def preprocessing(inputfile):
    df = pd.read_csv(inputfile)
    
    df.sort_values(['fullVisitorId', 'visitStartTime'], ascending=True, inplace=True)
    df['last_session_1'] = (
    df['visitStartTime'] - df[['fullVisitorId', 'visitStartTime']].groupby('fullVisitorId')['visitStartTime'].shift(1)
)
    df['next_session_1']=(
    -1*(df['visitStartTime'] - df[['fullVisitorId', 'visitStartTime']].groupby('fullVisitorId')['visitStartTime'].shift(-1))
)

    #df[['next_session_1','last_session_1']] = df[['next_session_1','last_session_1']]/86400
    df[['next_session_1','last_session_1']] = df[['next_session_1','last_session_1']].fillna(0)
    print(df.columns)
    df = extract_traffic_source_columns(df)
    # print("Congrats Brian\n"+df.columns)
    df = get_region_columns(df)
    # print("Congrats Ziyu\n"+df.columns)
    df = extract_activities(df)
    # print("Congrats Yixin\n"+df.columns)
    col_time = time_value(df)
    col_device = device(df)
    
    df_base_model = df[['fullVisitorId', 'is_transaction', 'transactionRevenue', # add three columns
                        'bounces', 'hits', 'pageviews', 'visitNumber',
                       'is_source_googleplex','is_source_direct','is_source_google','is_medium_referral',
                    'is_medium_none','is_medium_organic',
                        'is_bayarea','Africa','Americas','Asia','Europe','Oceania',
                        'next_session_1','last_session_1'
                       ]]
    df_base_model = pd.concat([df_base_model,col_time],axis=1)
    df_base_model = pd.concat([df_base_model,col_device],axis=1)
    # print("Congrats Xi\n"+df_base_model.columns)
    return df_base_model

In [15]:
inputfile = '../data/train_flatten.csv'
df_base_model = preprocessing(inputfile)

Index(['Unnamed: 0', 'channelGrouping', 'date', 'fullVisitorId', 'sessionId',
       'socialEngagementType', 'visitId', 'visitNumber', 'visitStartTime',
       'bounces', 'hits', 'newVisits', 'pageviews', 'transactionRevenue',
       'visits', 'browser', 'deviceCategory', 'isMobile', 'operatingSystem',
       'adContent', 'adwordsClickInfo', 'campaign', 'isTrueDirect', 'keyword',
       'medium', 'referralPath', 'source', 'city', 'continent', 'country',
       'metro', 'networkDomain', 'region', 'subContinent', 'last_session_1',
       'next_session_1'],
      dtype='object')


# aggregate in user level

In [16]:
df_base_model.transactionRevenue = df_base_model.transactionRevenue.fillna(0)

In [17]:
df_base_model.columns

Index(['fullVisitorId', 'is_transaction', 'transactionRevenue', 'bounces',
       'hits', 'pageviews', 'visitNumber', 'is_source_googleplex',
       'is_source_direct', 'is_source_google', 'is_medium_referral',
       'is_medium_none', 'is_medium_organic', 'is_bayarea', 'Africa',
       'Americas', 'Asia', 'Europe', 'Oceania', 'next_session_1',
       'last_session_1', 'q1', 'q2', 'q3', 'q4', 'weekday', 'h_0_6', 'h_7_12',
       'h_13_17', 'h_18_23', 'month', 'dow', 'hour', 'browser_chrome',
       'is_mobile', 'system_google'],
      dtype='object')

In [18]:
id_variables = ['fullVisitorId']
y_c_variables = ['transactionRevenue']
y_d_variables = ['is_transaction'] # classification
c_variables = ['bounces', 'hits', 'pageviews'] # continuous
i_variables = ['visitNumber']
t_variables = ['next_session_1', 'last_session_1']
d_variables = ['is_source_googleplex',
       'is_source_direct', 'is_source_google', 'is_medium_referral',
       'is_medium_none', 'is_medium_organic', 'is_bayarea', 'Africa',
       'Americas', 'Asia', 'Europe', 'Oceania', 'q1', 'q2', 'q3', 'q4', 'weekday', 'h_0_6', 'h_7_12',
       'h_13_17', 'h_18_23', 'month', 'dow', 'hour', 'browser_chrome',
       'is_mobile', 'system_google'] # dummies

In [19]:
def agg_time(x):
    if len(x)==1:
        return 0
    else:
        return np.log(1+sum(x)/(len(x)-1))

In [20]:
def make_agg(v):
    if v in y_c_variables:
        return {
            v: {v + '_' + 'total': lambda x: np.log(1 + sum(x))}
               }
    if v in y_d_variables:
        return {
            v: {v: 'max'}
        }
    if v in c_variables:
        return {
            v: {v + '_' + 'mean': 'mean'}
        }
    if v in i_variables:
        return {
            v: {v + '_' + 'total': 'max'}
        }
    if v in t_variables:
        return {
            v: {v + '_' + 'adjmean': lambda x: agg_time(x)}
        }
    if v in d_variables:
        return {
            v: {
                v + '_' + 'freq': 'sum',
                v + '_' + 'rate': 'mean'
            }
        }

In [21]:
aggregation = dict()
for v in df_base_model.columns:
    if v not in id_variables:
        agg_new = make_agg(v)
        aggregation = {**aggregation, **agg_new}

In [22]:
df_user = df_base_model.groupby('fullVisitorId', as_index=False).agg(aggregation)

  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)


In [23]:
df_user.columns = df_user.columns.droplevel(0)
new_col = ['fullVisitorId']
new_col.extend(df_user.columns[1:])
df_user.columns = new_col

In [24]:
df_user.head()

Unnamed: 0,fullVisitorId,is_transaction,transactionRevenue_total,bounces_mean,hits_mean,pageviews_mean,visitNumber_total,is_source_googleplex_freq,is_source_googleplex_rate,is_source_direct_freq,...,dow_freq,dow_rate,hour_freq,hour_rate,browser_chrome_freq,browser_chrome_rate,is_mobile_freq,is_mobile_rate,system_google_freq,system_google_rate
0,4823595352351,0,0.0,1.0,1.0,1.0,1,0,0.0,0,...,1,1.0,14,14.0,1,1.0,0,0.0,0,0.0
1,5103959234087,0,0.0,0.0,10.0,8.0,1,0,0.0,0,...,6,6.0,22,22.0,1,1.0,1,1.0,1,1.0
2,10278554503158,0,0.0,0.0,11.0,8.0,1,0,0.0,0,...,4,4.0,5,5.0,1,1.0,0,0.0,0,0.0
3,20424342248747,0,0.0,0.0,17.0,13.0,1,0,0.0,0,...,3,3.0,7,7.0,1,1.0,0,0.0,0,0.0
4,26722803385797,0,0.0,0.0,3.0,2.0,1,0,0.0,0,...,0,0.0,10,10.0,0,0.0,0,0.0,0,0.0


In [25]:
df_user.to_csv('../data/train_user.csv', sep=',')

# try classification model

In [26]:
from sklearn.model_selection import train_test_split
from sklearn.covariance import EllipticEnvelope
from sklearn.metrics import precision_recall_fscore_support

In [27]:
X_train, X_test, y_train, y_test = train_test_split(
    df_user.drop(['is_transaction', 'fullVisitorId', 'transactionRevenue_total'], axis=1), 
    df_user['is_transaction'], test_size=0.33, random_state=42)
estimator = EllipticEnvelope()
y_predicted = estimator.fit_predict(X_train, y_train)
precision_recall_fscore_support(y_train, y_predicted, average='macro')

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


(0.0012341916922707981, 0.07780827194376234, 0.002429841338577941, None)

# try regression model

In [29]:
from sklearn.linear_model import LinearRegression
X_train, X_test, y_train, y_test = train_test_split(
    df_user.drop(['is_transaction', 'fullVisitorId', 'transactionRevenue_total'], axis=1), 
    df_user['transactionRevenue_total'], test_size=0.33, random_state=42)

In [30]:
from sklearn.metrics import mean_squared_error

In [31]:
def fit_regression(model, X, y) -> float:
    "Fit a regression model, returning mse. Use code above an example."
    X_train, X_test, y_train, y_test = train_test_split(X, 
                                                        y, 
                                                        test_size=0.30,
                                                        random_state=42)

    ### BEGIN SOLUTION
    reg = model()
    reg.fit(X_train, y_train)
    y_predicted = reg.predict(X_test)
    mse = mean_squared_error(y_test, y_predicted)
    return mse
    ### END SOLUTION

In [32]:
fit_regression(LinearRegression, df_user.drop(['is_transaction', 'fullVisitorId', 'transactionRevenue_total'], axis=1),
              df_user['transactionRevenue_total'])

3.347180975435582

In [33]:
glm = LinearRegression()
reg = glm.fit(X_train, y_train)
reg.score(X_train, y_train)

0.22106861818361134

# try logistic regression

In [34]:
from sklearn.linear_model import LogisticRegression
X_train, X_test, y_train, y_test = train_test_split(
    df_user.drop(['is_transaction', 'fullVisitorId', 'transactionRevenue_total'], axis=1), 
    df_user['is_transaction'], test_size=0.33, random_state=42)

In [36]:
from sklearn.linear_model import LogisticRegression

In [37]:
def fit_classification(model, X, y) -> float:
    "Fit a regression model, returning mse. Use code above an example."
    X_train, X_test, y_train, y_test = train_test_split(X, 
                                                        y, 
                                                        test_size=0.30,
                                                        random_state=42)

    ### BEGIN SOLUTION
    reg = model()
    reg.fit(X_train, y_train)
    y_pred = reg.predict(X_test)
    confusion_matrix = confusion_matrix(y_test, y_pred)
    return (confusion_matrix, reg.score(X_test, y_test))
    ### END SOLUTION

In [38]:
X = df_user.drop(['is_transaction', 'fullVisitorId', 'transactionRevenue_total'], axis=1)
y = df_user['is_transaction']

In [42]:
from matplotlib import pyplot as plt

In [None]:
plt.plot

In [39]:
X.columns

Index(['bounces_mean', 'hits_mean', 'pageviews_mean', 'visitNumber_total',
       'is_source_googleplex_freq', 'is_source_googleplex_rate',
       'is_source_direct_freq', 'is_source_direct_rate',
       'is_source_google_freq', 'is_source_google_rate',
       'is_medium_referral_freq', 'is_medium_referral_rate',
       'is_medium_none_freq', 'is_medium_none_rate', 'is_medium_organic_freq',
       'is_medium_organic_rate', 'is_bayarea_freq', 'is_bayarea_rate',
       'Africa_freq', 'Africa_rate', 'Americas_freq', 'Americas_rate',
       'Asia_freq', 'Asia_rate', 'Europe_freq', 'Europe_rate', 'Oceania_freq',
       'Oceania_rate', 'next_session_1_adjmean', 'last_session_1_adjmean',
       'q1_freq', 'q1_rate', 'q2_freq', 'q2_rate', 'q3_freq', 'q3_rate',
       'q4_freq', 'q4_rate', 'weekday_freq', 'weekday_rate', 'h_0_6_freq',
       'h_0_6_rate', 'h_7_12_freq', 'h_7_12_rate', 'h_13_17_freq',
       'h_13_17_rate', 'h_18_23_freq', 'h_18_23_rate', 'month_freq',
       'month_rate', 'dow_

In [40]:
fit_classification(LogisticRegression, X, y)



(array([[213711,    519],
        [  2176,    735]]), 0.9875887096402798)