In [23]:
import pandas as pd
import json 
import numpy as np
import pylab
import warnings; warnings.simplefilter('ignore')

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.covariance import EllipticEnvelope
from sklearn.metrics import precision_recall_fscore_support

# preprocessing and first fit

In [2]:
def extract_traffic_source_columns(df):
    df['isTrueDirect']=df['isTrueDirect'].fillna(False)
    df['isTrueDirect']=df['isTrueDirect'].astype(int)
    
    is_source_mall_googleplex = (df['source']=='mall.googleplex.com').astype(int)
    is_source_mall_googleplex.name = 'is_source_googleplex'    
    
    is_source_direct = (df['source']=='(direct)').astype(int)
    is_source_direct.name = 'is_source_direct'
    
    is_source_google = (df['source']=='(direct)').astype(int)
    is_source_google.name = 'is_source_google'
    
    is_medium_referral = (df['medium']=='referral').astype(int)
    is_medium_referral.name = 'is_medium_referral'
    is_medium_none = (df['medium']=='(none)').astype(int)
    is_medium_none.name = 'is_medium_none'
    is_medium_organic = (df['medium']=='organic').astype(int)
    is_medium_organic.name="is_medium_organic"
    
    df = df.drop(columns = ['adContent','adwordsClickInfo','campaign', 'keyword', 'medium', 'referralPath',
                      'source'])
    
    return pd.concat([df,is_source_mall_googleplex,is_source_direct,is_source_google,is_medium_referral,
                    is_medium_none,is_medium_organic],axis=1)

In [3]:
def get_region_columns(df):
    df['is_bayarea'] = (df['metro'] =='San Francisco-Oakland-San Jose CA').astype(int)
    dummies = pd.get_dummies(df['continent'])
    dummies = dummies.drop(['(not set)'],axis=1)
    df = df.drop(['continent','city','subContinent','region','country','metro','networkDomain'],axis=1) 
    new_df = pd.concat([df,dummies],axis=1)
    return new_df

In [4]:
def extract_activities(df):
    df['is_transaction'] = 1 - df['transactionRevenue'].isna()
    df[['bounces', 'hits', 'pageviews', 'visitNumber']] = df[['bounces', 'hits', 'pageviews', 'visitNumber']].fillna(0)
    df['transactionRevenue'] = df['transactionRevenue'].fillna(0)
    return df
    

In [5]:
def time_value(df):
    import datetime as dt
    time = pd.to_datetime(df.loc[:,'visitStartTime'], unit='s')
    year = time.dt.year
    year = pd.get_dummies(year)
    month = time.dt.month
    q1 = (month <= 3) & (month >= 1)
    q2 = (month <= 6) & (month >= 4)
    q3 = (month <= 9) & (month >= 7)
    q4 = (month <= 12) & (month >= 10)
    dow = time.dt.weekday
    weekday = (dow <= 5) & (dow >=1)
    hour = time.dt.hour
    h_0_6 = (hour <= 6) & (hour >= 0)
    h_7_12 = (hour <= 12) & (hour >= 7)
    h_13_17 = (hour <= 17) & (hour >= 13)
    h_18_23 = (hour <= 23) & (hour >= 18)
    new_df = pd.DataFrame()
    new_df['q1'] = q1.astype(int)
    new_df['q2'] = q2.astype(int)
    new_df['q3'] = q3.astype(int)
    new_df['q4'] = q4.astype(int)
    new_df['weekday'] = weekday.astype(int)
    new_df['h_0_6'] = h_0_6.astype(int)
    new_df['h_7_12'] = h_7_12.astype(int)
    new_df['h_13_17'] = h_13_17.astype(int)
    new_df['h_18_23'] = h_18_23.astype(int)
    new_df['month'] = month
    new_df['dow'] = dow
    new_df['hour'] = hour
    return new_df

def device(df):
    browser_chrome = (df['browser']=='Chrome').astype(int)
    is_mobile = df['isMobile'].astype(int)
    system_google = ((df['operatingSystem']=='Android')|(df['operatingSystem']=='Chrome OS')).astype(int)
    new_df = pd.DataFrame()
    new_df['browser_chrome'] = browser_chrome
    new_df['is_mobile'] = is_mobile
    new_df['system_google'] = system_google
    return new_df

In [27]:
df = pd.read_csv(inputfile)

In [40]:
def normalization(col):
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    col = np.array(col.fillna(0)).reshape(-1,1)
    col = scaler.fit_transform(col)
    return col
    

In [41]:
new_rev = normalization(df.transactionRevenue)

In [42]:
new_rev.shape

(903653, 1)

In [45]:
def preprocessing(inputfile):
    df = pd.read_csv(inputfile)
    
    df.sort_values(['fullVisitorId', 'visitStartTime'], ascending=True, inplace=True)
    df['last_session_1'] = (
    df['visitStartTime'] - df[['fullVisitorId', 'visitStartTime']].groupby('fullVisitorId')['visitStartTime'].shift(1)
)
    df['next_session_1']=(
    -1*(df['visitStartTime'] - df[['fullVisitorId', 'visitStartTime']].groupby('fullVisitorId')['visitStartTime'].shift(-1))
)

    #df[['next_session_1','last_session_1']] = df[['next_session_1','last_session_1']]/86400
    df[['next_session_1','last_session_1']] = df[['next_session_1','last_session_1']].fillna(0)
    print(df.columns)
    df = extract_traffic_source_columns(df)
    # print("Congrats Brian\n"+df.columns)
    df = get_region_columns(df)
    # print("Congrats Ziyu\n"+df.columns)
    df = extract_activities(df)
    # print("Congrats Yixin\n"+df.columns)
    col_time = time_value(df)
    col_device = device(df)
    
    df_base_model = df[['bounces', 'hits', 'pageviews', 'visitNumber',
                       'is_source_googleplex','is_source_direct','is_source_google','is_medium_referral',
                    'is_medium_none','is_medium_organic',
                        'is_bayarea','Africa','Americas','Asia','Europe','Oceania',
                        'next_session_1','last_session_1'
                       ]]
    df_base_model = pd.concat([df_base_model,col_time],axis=1)
    df_base_model = pd.concat([df_base_model,col_device],axis=1)
    df_istransaction = df['is_transaction']
    df_transaction = normalization(df['transactionRevenue'])
    # print("Congrats Xi\n"+df_base_model.columns)
    return df_base_model, df_istransaction, df_transaction

In [47]:
inputfile = './data/train_flatten.csv'
df_base_model, df_istransaction, df_transaction = preprocessing(inputfile)

Index(['Unnamed: 0', 'channelGrouping', 'date', 'fullVisitorId', 'sessionId',
       'socialEngagementType', 'visitId', 'visitNumber', 'visitStartTime',
       'bounces', 'hits', 'newVisits', 'pageviews', 'transactionRevenue',
       'visits', 'browser', 'deviceCategory', 'isMobile', 'operatingSystem',
       'adContent', 'adwordsClickInfo', 'campaign', 'isTrueDirect', 'keyword',
       'medium', 'referralPath', 'source', 'city', 'continent', 'country',
       'metro', 'networkDomain', 'region', 'subContinent', 'last_session_1',
       'next_session_1'],
      dtype='object')


# classification

In [24]:
X_train, X_test, y_train, y_test = train_test_split(
    df_base_model, df_istransaction, test_size=0.33, random_state=42)
estimator = EllipticEnvelope()
y_predicted = estimator.fit_predict(X_train, y_train)
precision_recall_fscore_support(y_train, y_predicted, average='macro')

(0.0034006114861020883, 0.24308015217106127, 0.006707388805936383, None)

# regression on revenue

In [48]:
from sklearn.linear_model import LinearRegression
X_train, X_test, y_train, y_test = train_test_split(
    df_base_model, df_transaction, test_size=0.33, random_state=42)

In [49]:
glm = LinearRegression()
reg = glm.fit(X_train, y_train)
reg.score(X_train, y_train)

0.025812613428511932

In [159]:
df_base_model.columns

Index(['is_transaction', 'bounces', 'hits', 'pageviews', 'visitNumber',
       'is_source_googleplex', 'is_source_direct', 'is_source_google',
       'is_medium_referral', 'is_medium_none', 'is_medium_organic',
       'is_bayarea', 'Africa', 'Americas', 'Asia', 'Europe', 'Oceania', 'q1',
       'q2', 'q3', 'q4', 'weekday', 'h_0_6', 'h_7_12', 'h_13_17', 'h_18_23',
       'browser_chrome', 'is_mobile', 'system_google'],
      dtype='object')

In [161]:
len(df_base_model.columns)

29

In [164]:
sum(y_predicted)

484373

In [165]:
sum(y_train)

7717

In [166]:
sum(abs(y_predicted - y_train))

610388