# Google Analytics Customer Revenue Prediction

## Data Understanding and Exploration

#### Importing Libraries

In [92]:
import numpy as np
import pandas as pd

import plotly.offline as py
import plotly.graph_objs as go

# Initialise plotly's notebook mode once, before any figure is drawn with py.iplot.
# NOTE(review): connected=True presumably pulls plotly.js from a CDN, so rendering
# would need network access — confirm against the plotly.offline docs.
py.init_notebook_mode(connected=True)
# %matplotlib inline

### Loading Training data

In [93]:
# Load the preprocessed training sessions.
# - low_memory=False makes pandas parse each column in one pass, avoiding the
#   mixed-type inference that chunked parsing can produce.
# - fullVisitorId is pinned to str: the ids are long digit strings, some with a
#   leading zero (e.g. 0824839726118485274), so they must never be inferred as
#   numbers. Pinning the dtype makes that explicit instead of relying on inference.
train_df = pd.read_csv(
    '../Data/preprocessed/train.csv',
    dtype={'fullVisitorId': str},
    low_memory=False,
)
# Preview the first rows (rich display as the cell's last expression).
train_df.head()

Unnamed: 0,channelGrouping,date,fullVisitorId,sessionId,socialEngagementType,visitId,visitNumber,visitStartTime,device.browser,device.browserSize,...,trafficSource.adwordsClickInfo.isVideoAd,trafficSource.adwordsClickInfo.page,trafficSource.adwordsClickInfo.slot,trafficSource.campaign,trafficSource.campaignCode,trafficSource.isTrueDirect,trafficSource.keyword,trafficSource.medium,trafficSource.referralPath,trafficSource.source
0,Organic Search,20160902,1131660440785968503,1131660440785968503_1472830385,Not Socially Engaged,1472830385,1,1472830385,Chrome,not available in demo dataset,...,,,,(not set),,,(not provided),organic,,google
1,Organic Search,20160902,377306020877927890,377306020877927890_1472880147,Not Socially Engaged,1472880147,1,1472880147,Firefox,not available in demo dataset,...,,,,(not set),,,(not provided),organic,,google
2,Organic Search,20160902,3895546263509774583,3895546263509774583_1472865386,Not Socially Engaged,1472865386,1,1472865386,Chrome,not available in demo dataset,...,,,,(not set),,,(not provided),organic,,google
3,Organic Search,20160902,4763447161404445595,4763447161404445595_1472881213,Not Socially Engaged,1472881213,1,1472881213,UC Browser,not available in demo dataset,...,,,,(not set),,,google + online,organic,,google
4,Organic Search,20160902,27294437909732085,27294437909732085_1472822600,Not Socially Engaged,1472822600,2,1472822600,Chrome,not available in demo dataset,...,,,,(not set),,True,(not provided),organic,,google


#### Column names and data types

In [94]:
# List every column of the training frame together with the dtype pandas inferred for it.
train_df.dtypes

channelGrouping                                       object
date                                                   int64
fullVisitorId                                         object
sessionId                                             object
socialEngagementType                                  object
visitId                                                int64
visitNumber                                            int64
visitStartTime                                         int64
device.browser                                        object
device.browserSize                                    object
device.browserVersion                                 object
device.deviceCategory                                 object
device.flashVersion                                   object
device.isMobile                                         bool
device.language                                       object
device.mobileDeviceBranding                           object
device.mobileDeviceInfo 

In [95]:
# For every column, print its five most frequent values so that constant,
# placeholder-only and mostly-empty columns are easy to spot.
columns = train_df.columns
for col in columns:
    print("Column name = {}".format(col))
    # Series.value_counts() is the supported spelling; the top-level
    # pd.value_counts() is deprecated since pandas 2.1. NaN is excluded and
    # counts are sorted in descending order by default.
    print(train_df[col].value_counts().head())
    print("---------------------------------------------")

Column name = channelGrouping
Organic Search    381561
Social            226117
Direct            143026
Referral          104838
Paid Search        25326
Name: channelGrouping, dtype: int64
---------------------------------------------
Column name = date
20161128    4807
20161115    4685
20161114    4466
20161130    4435
20161026    4375
Name: date, dtype: int64
---------------------------------------------
Column name = fullVisitorId
1957458976293878100    278
0824839726118485274    255
3608475193341679870    201
1856749147915772585    199
3269834865385146569    155
Name: fullVisitorId, dtype: int64
---------------------------------------------
Column name = sessionId
5357626062210322502_1490770441    2
4812091368309143012_1491374749    2
3479032932032964519_1481788514    2
6344542374277652593_1495522728    2
9697929460014674172_1475733990    2
Name: sessionId, dtype: int64
---------------------------------------------
Column name = socialEngagementType
Not Socially Engaged    903653

not available in demo dataset    903653
Name: trafficSource.adwordsClickInfo.criteriaParameters, dtype: int64
---------------------------------------------
Column name = trafficSource.adwordsClickInfo.gclId
Cj0KEQjwmIrJBRCRmJ_x7KDo-9oBEiQAuUPKMufMpuG3ZdwYO8GTsjiBFd5MPHStZa9y_9NCrI8X97oaAglc8P8HAQ    70
Cj0KEQjw1ee_BRD3hK6x993YzeoBEiQA5RH_BEA562M9tvl_mtnAFvtDnDqOQRp1RvxMMgwjcX1LAfwaAj4o8P8HAQ    41
CJH1vbf94M8CFUElgQodyakHgQ                                                                    29
Cj0KEQiAw_DEBRChnYiQ_562gsEBEiQA4LcssmB_RWgvpPnltzlzj5rGwqx5lk87wC5CjfcqzneNZewaAiAp8P8HAQ    27
CjwKEAiAj7TCBRCp2Z22ue-zrj4SJACG7SBEJui6ggr6ocA-eDC2-lX7W1m5IA1c_qNbzwZVTqUanxoCb5rw_wcB      24
Name: trafficSource.adwordsClickInfo.gclId, dtype: int64
---------------------------------------------
Column name = trafficSource.adwordsClickInfo.isVideoAd
False    21460
Name: trafficSource.adwordsClickInfo.isVideoAd, dtype: int64
---------------------------------------------
Column name = trafficSource.

### Useless Columns
From the exploration above, we found that the following columns carry no useful information. <br>
Each of these columns either consists of NaN values, holds values that are not provided in the dataset, or contains only a single value.
    1. socialEngagementType 
    2. device.browserSize
    3. device.browserVersion
    4. device.flashVersion
    5. device.language
    6. device.mobileDeviceBranding
    7. device.mobileDeviceInfo
    8. device.mobileDeviceMarketingName
    9. device.mobileDeviceModel
    10. device.mobileInputSelector
    11. device.operatingSystemVersion
    12. device.screenColors
    13. device.screenResolution
    14. geoNetwork.cityId
    15. geoNetwork.latitude
    16. geoNetwork.longitude
    17. geoNetwork.networkLocation
    18. totals.visits
    19. trafficSource.adwordsClickInfo.criteriaParameters
    20. trafficSource.adwordsClickInfo.isVideoAd

### Exploring NaN or Null Values

In [105]:
# Percentage of missing values per column, keeping only columns that have at
# least one missing value, sorted from the most to the least incomplete.
# The null counts are computed once and reused for both the filter mask and
# the ratio, so the full frame is scanned a single time.
null_counts = train_df.isnull().sum()
nan_columns = (
    null_counts[null_counts != 0] / train_df.shape[0] * 100
).sort_values(ascending=False)

In [106]:
# Bar chart of the share of missing values per column (one bar per column that
# has at least one NaN; nan_columns already holds the percentages, sorted
# in descending order).
nan_trace = [go.Bar(
    x = nan_columns.index,
    y = nan_columns,
    marker = dict(
        color = 'rgba(122, 120, 168, 0.8)',
        line = dict(
            color = 'rgba(71, 58, 131, 0.8)',
            width = 3)
    )
)]

# Title typo fixed ("vaues" -> "values") and axis titles added so the figure
# can be read on its own when the notebook is skimmed.
layout = go.Layout(
    title= "Columns containing NaN values (Ratio in %)",
    xaxis = dict(title = "Column"),
    yaxis = dict(title = "Missing values (%)"),
)
nan_fig = go.Figure(data=nan_trace, layout=layout)
py.iplot(nan_fig, filename='nan-plot')