# Google Analytics Revenue Prediction

## Import Data

**Environment**

In [1]:
import numpy as np
import pandas as pd
# `display` and `HTML` come from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import HTML, display

# Pandas defaults: allow up to 500 columns/rows before output is truncated
pd.options.display.max_columns = 500
pd.options.display.max_rows = 500

# Make jupyter bigger: inject CSS that sets the `.container` element to 100% width
display(HTML('<style>.container { width:100% !important; }</style>'))

In [2]:
from myfunctions import *

### Import dataset

In [3]:
# Import dataset by chunks
size = 300000  # Number of rows for each chunk

# `fullVisitorId` is an identifier, not a quantity, so it is read as text.
# With `chunksize` every chunk is parsed on its own; without an explicit dtype
# the inferred type of this column could differ from chunk to chunk, and a
# chunk inferred as numeric would lose any leading zeros in the IDs.
df_chunk = pd.read_csv(
    r'data/newtrain.csv',
    chunksize=size,
    low_memory=False,
    dtype={'fullVisitorId': str},
)

# Exhaust the reader: each element is one chunk in DataFrame format
chunk_list = list(df_chunk)

# concat the list into a single dataframe
df = pd.concat(chunk_list)

### Basic info

In [4]:
# Preview the first five rows of the loaded frame
df.head(5)

Unnamed: 0,channelGrouping,date,fullVisitorId,sessionId,socialEngagementType,visitId,visitNumber,visitStartTime,device.browser,device.browserSize,...,trafficSource.adwordsClickInfo.isVideoAd,trafficSource.adwordsClickInfo.page,trafficSource.adwordsClickInfo.slot,trafficSource.campaign,trafficSource.campaignCode,trafficSource.isTrueDirect,trafficSource.keyword,trafficSource.medium,trafficSource.referralPath,trafficSource.source
0,Organic Search,20160902,1131660440785968503,1131660440785968503_1472830385,Not Socially Engaged,1472830385,1,1472830385,Chrome,,...,,,,,,,,organic,,google
1,Organic Search,20160902,377306020877927890,377306020877927890_1472880147,Not Socially Engaged,1472880147,1,1472880147,Firefox,,...,,,,,,,,organic,,google
2,Organic Search,20160902,3895546263509774583,3895546263509774583_1472865386,Not Socially Engaged,1472865386,1,1472865386,Chrome,,...,,,,,,,,organic,,google
3,Organic Search,20160902,4763447161404445595,4763447161404445595_1472881213,Not Socially Engaged,1472881213,1,1472881213,UC Browser,,...,,,,,,,google + online,organic,,google
4,Organic Search,20160902,27294437909732085,27294437909732085_1472822600,Not Socially Engaged,1472822600,2,1472822600,Chrome,,...,,,,,,True,,organic,,google


In [5]:
# (rows, columns) of the frame right after loading
df.shape

(903653, 55)

In [6]:
# Data type of every column as parsed from the CSV
df.dtypes

channelGrouping                                       object
date                                                   int64
fullVisitorId                                         object
sessionId                                             object
socialEngagementType                                  object
visitId                                                int64
visitNumber                                            int64
visitStartTime                                         int64
device.browser                                        object
device.browserSize                                   float64
device.browserVersion                                float64
device.deviceCategory                                 object
device.flashVersion                                  float64
device.isMobile                                         bool
device.language                                      float64
device.mobileDeviceBranding                          float64
device.mobileDeviceInfo 

In [7]:
# Number of missing values in each column
df.isna().sum()

channelGrouping                                           0
date                                                      0
fullVisitorId                                             0
sessionId                                                 0
socialEngagementType                                      0
visitId                                                   0
visitNumber                                               0
visitStartTime                                            0
device.browser                                            8
device.browserSize                                   903653
device.browserVersion                                903653
device.deviceCategory                                     0
device.flashVersion                                  903653
device.isMobile                                           0
device.language                                      903653
device.mobileDeviceBranding                          903653
device.mobileDeviceInfo                 

### Delete empty columns

In [8]:
# Search the empty columns: a column counts as empty when every value is null.
# The check is derived from the frame itself rather than compared against a
# hard-coded row count, so it stays correct if the number of rows changes.
empty_cols = df.columns[df.isnull().all()].tolist()

In [9]:
# Show the names of the columns that contain only nulls
empty_cols

['device.browserSize',
 'device.browserVersion',
 'device.flashVersion',
 'device.language',
 'device.mobileDeviceBranding',
 'device.mobileDeviceInfo',
 'device.mobileDeviceMarketingName',
 'device.mobileDeviceModel',
 'device.mobileInputSelector',
 'device.operatingSystemVersion',
 'device.screenColors',
 'device.screenResolution',
 'geoNetwork.cityId',
 'geoNetwork.latitude',
 'geoNetwork.longitude',
 'geoNetwork.networkLocation',
 'trafficSource.adwordsClickInfo.criteriaParameters']

In [10]:
# Remove every column listed in `empty_cols` and rebind `df` to the result
df = df.drop(empty_cols, axis="columns")

In [11]:
# (rows, columns) once the empty columns have been dropped
df.shape

(903653, 38)

In [12]:
# Missing values per column in the remaining columns
df.isna().sum()

channelGrouping                                      0
date                                                 0
fullVisitorId                                        0
sessionId                                            0
socialEngagementType                                 0
visitId                                              0
visitNumber                                          0
visitStartTime                                       0
device.browser                                       8
device.deviceCategory                                0
device.isMobile                                      0
device.operatingSystem                            4695
geoNetwork.city                                 542491
geoNetwork.continent                              1468
geoNetwork.country                                1468
geoNetwork.metro                                709995
geoNetwork.networkDomain                        244881
geoNetwork.region                               536056
geoNetwork

### Export dataframe 

In [13]:
# Write the current frame to CSV; the row index is not written
df.to_csv(path_or_buf="data/df2.csv", index=False)