#### The output .csv file will contain the following fields
    - channelGrouping
    - fullVisitorId
    - visitId
    - visitNumber
    - visitStartTime
    - browser
    - deviceCategory
    - operatingSystem
    - city
    - country
    - region
    - subContinent
    - hits
    - newVisits
    - *timeOnSite*
    - totalTransactionRevenue
    - *transactions*
    - adHasContent
    - adPosition
    - campaign
    - isTrueDirect
    - medium
    - source

In [1]:
import pandas as pd
import glob
import json
import numpy as np

pd.set_option('display.max_columns', 30)

In [2]:
# recursive flatten function from https://towardsdatascience.com/flattening-json-objects-in-python-f5343c794b10
# Call example:
  # flattenCol = [flatten_json(json.loads(d)) for d in df['trafficSource']]
  # dataFrameOfFlattenCol = pd.DataFrame(flattenCol)
def flatten_json(y):
    out = {}

    def flatten(x, name=''):
        if type(x) is dict:
            for a in x:
                flatten(x[a], name + a + '_')
        elif type(x) is list:
            i = 0
            for a in x:
                flatten(a, name + str(i) + '_')
                i += 1
        else:
            out[name[:-1]] = x

    flatten(y)
    return out

In [None]:
# include here the bash code to separate the .csv files

In [23]:
allFiles = glob.glob("../data/splitted/*.csv")
df = pd.DataFrame({'channelGrouping': [],
                    'device': [],
                    'fullVisitorId': [],
                    'geoNetwork': [],
                    'socialEngagementType': [],
                    'totals': [],
                    'trafficSource': [],
                    'visitId': [],
                    'visitNumber': [],
                    'visitStartTime': []})

for fl in allFiles:
    temp = pd.read_csv(fl, usecols=["channelGrouping", "customDimensions", "date", "device", "fullVisitorId", "geoNetwork", "hits",
                                      "socialEngagementType", "totals", "trafficSource", "visitId", "visitNumber",
                                      "visitStartTime"], header=0, dtype={'fullVisitorId': 'str', 'visitId': 'str'})
    # we leave out hits, customDimensions and date
    temp.drop(columns=["hits", "customDimensions", "date"], axis=1, inplace=True)
    df = df.append(temp, ignore_index=True)

In [4]:
df.describe(include='all')

Unnamed: 0,channelGrouping,device,fullVisitorId,geoNetwork,socialEngagementType,totals,trafficSource,visitId,visitNumber,visitStartTime
count,1508338,1508338,1508338.0,1508338,1508338,1508338,1508338,1508338.0,1508338.0,1508338.0
unique,8,285,1182994.0,56046,1,229104,68175,1470043.0,,
top,Organic Search,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",1.957458976293878e+18,"{""continent"": ""Americas"", ""subContinent"": ""Nor...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""1"", ""pageviews"": ""1"",...","{""campaign"": ""(not set)"", ""source"": ""google"", ...",1513124981.0,,
freq,651980,445205,342.0,117818,1508338,313596,413294,28.0,,
mean,,,,,,,,,2.334044,1498491000.0
std,,,,,,,,,9.35342,16213460.0
min,,,,,,,,,1.0,1470035000.0
25%,,,,,,,,,1.0,1483363000.0
50%,,,,,,,,,1.0,1500241000.0
75%,,,,,,,,,1.0,1512654000.0


In [5]:
df.head(5)

Unnamed: 0,channelGrouping,device,fullVisitorId,geoNetwork,socialEngagementType,totals,trafficSource,visitId,visitNumber,visitStartTime
0,Social,"{""browser"": ""Safari"", ""browserVersion"": ""not a...",5327547126432548805,"{""continent"": ""Europe"", ""subContinent"": ""Easte...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""1"", ""pageviews"": ""1"",...","{""referralPath"": ""/yt/about/ru/"", ""campaign"": ...",1477148076,1.0,1477148000.0
1,Social,"{""browser"": ""Safari"", ""browserVersion"": ""not a...",3580786241354367532,"{""continent"": ""Asia"", ""subContinent"": ""Western...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""1"", ""pageviews"": ""1"",...","{""referralPath"": ""/yt/about/tr/"", ""campaign"": ...",1477132413,1.0,1477132000.0
2,Social,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",9359518372269416507,"{""continent"": ""Europe"", ""subContinent"": ""South...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""1"", ""pageviews"": ""1"",...","{""referralPath"": ""/yt/about/sr/"", ""campaign"": ...",1477172134,1.0,1477172000.0
3,Social,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",268491622685377080,"{""continent"": ""Europe"", ""subContinent"": ""North...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""1"", ""pageviews"": ""1"",...","{""referralPath"": ""/yt/about/sv/"", ""campaign"": ...",1477184802,1.0,1477185000.0
4,Social,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",1608015818215545613,"{""continent"": ""Asia"", ""subContinent"": ""Souther...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""1"", ""pageviews"": ""1"",...","{""referralPath"": ""/yt/about/en-GB/"", ""campaign...",1477140925,1.0,1477141000.0


In [6]:
df.shape

(1508338, 10)

In [7]:
df = df.replace({"(not set)": np.nan,
                 "(none)": np.nan,
                 "not available in demo dataset": np.nan})

In [8]:
# De misc mantenemos:
  # fullVisitorId
  # visitStartTime
  # channelGrouping

df["visitStartTime"] = pd.to_datetime(df["visitStartTime"], unit='s')
df.drop(['socialEngagementType'], axis=1, inplace=True)

In [9]:
# De device mantenemos:
  # browser
  # deviceCategory
  # operatingSystem

deviceFlatten = [flatten_json(json.loads(d)) for d in df['device']]

devi = pd.DataFrame(deviceFlatten)

# quest: language?
devi.drop(columns=['browserSize', 'browserVersion', 'flashVersion', 'language', 
    'mobileDeviceBranding', 'mobileDeviceInfo', 'mobileDeviceMarketingName',
    'mobileDeviceModel', 'mobileInputSelector', 'operatingSystemVersion',
    'screenColors','screenResolution'], inplace=True)

#we map our data to its corresponding value
isMobile = {False : 0, True : 1}
devi["isMobile"] = devi["isMobile"].map(isMobile)

devi.drop(columns=['isMobile'], inplace=True, axis=1)
df.drop(columns=['device'], inplace=True, axis=1)
df = df.join(devi)

In [10]:
# De geoNetwork mantenemos:
  # subcontinent
  # coutry
  # region (que contiene ciudad si no está disponible)
  # city

geoNetworkFlatten = [flatten_json(json.loads(d)) for d in df['geoNetwork']]
geo = pd.DataFrame(geoNetworkFlatten)
geo.drop(['latitude','longitude','networkLocation','cityId','continent','metro', 'networkDomain'], axis=1, inplace=True)

# TO DO: check that it works in the same way
# def region_country(geo):
#     if geo['region'] == np.nan:
#         return geo['country']
#     else:
#         return geo['region']
# geo['region']=geo.apply(region_country,axis=1)
# geo.loc[geo['city'] == "not available in demo dataset",'city'] = np.nan

geo['region'].fillna(geo['country'])

df.drop(columns=['geoNetwork'], inplace=True, axis=1)
df = df.join(geo)

In [11]:
# De totals mantenemos:
  # hits
  # transactionRevenue
  # newVisits

#Transform json
totalsList = [flatten_json(json.loads(d)) for d in df['totals']]
totals_df = pd.DataFrame(totalsList)

#Remove visits column as it does not provide any information
# Remove sessionQualityDim: An estimate of how close a particular session was to transacting, ranging
    # from 1 to 100, calculated for each session. A value closer to 1 indicates a low session quality, or
    # far from transacting, while a value closer to 100 indicates a high session quality, or very close to
    # transacting. A value of 0 indicates that Session Quality is not calculated for the selected time range.
# Remove transactionRevenue (deprecated) to use totalTransactionRevenue instead
totals_df = totals_df.drop(['visits', 'sessionQualityDim', 'transactionRevenue'], axis=1)

#Change nan in "newVisits" for 0. Binary, 1 it's new visit, 0 it's not. 
totals_df['newVisits'] = totals_df['newVisits'].fillna(0)

#Change nan in "transactionRevenue" for 0. 
totals_df['totalTransactionRevenue'] = totals_df['totalTransactionRevenue'].fillna(0)

#Change nan in "bounces" for 0
totals_df['bounces'] = totals_df['bounces'].fillna(0)

#There are some nan in pageviews but it is not trivial to find a value we can change these nan by logically.
#The corrlation with hits is 0.984, so we can keep hits that does not have any nan value. Keeping both could be redundant.
totals_df = totals_df.drop(['pageviews'], axis=1)

#Not 100% proven yet, but i have a big feeling that bounces is mainly 1 when there is only 1 hit by defenition.
#This means that even though the correlation doesn't provide a big value beetween them two, bounces is not giving much 
#additional info. Meaning using it is not relevant.
totals_df = totals_df.drop(['bounces'], axis=1)

df.drop(['totals'], inplace=True, axis=1)
df = df.join(totals_df)
# df = pd.concat([df, totals_df])

In [17]:
# De trafficSource mantenemos:
  # adHasContent
  # adPosition
  # campaign
  # isTrueDirect
  # medium
  # source

trafficFlatten = [flatten_json(json.loads(d)) for d in df['trafficSource']]
traffic = pd.DataFrame(trafficFlatten)
traffic.drop(['adwordsClickInfo_criteriaParameters', 'adwordsClickInfo_adNetworkType',
    'adwordsClickInfo_gclId', 'adwordsClickInfo_isVideoAd', 'adwordsClickInfo_page',
    'keyword', 'referralPath'], axis=1, inplace=True)

# adContent to bool adHasContent
traffic['adContent'] = traffic['adContent'].map(lambda x: 0 if str(x) == 'NoneType' else 1)

# rename to a simpler name
traffic.rename(index=str, columns={
    "adContent": "adHasContent",
    "adwordsClickInfo_slot": "adPosition"}, inplace=True)

# set (none)s and (non set)s to NaNs
# This column will contain:
    # cpc -> Cost Per Click
    # affiliate -> Affiliate marketing is a type of performance-based marketing in which a business rewards one or more affiliates for each visitor or customer brought by the affiliate's own marketing efforts.
    # cpm -> Cost per impression
# entender qué implican estos valores. ¿cpc vienen de search y cpm de webs? ¿Se juntan? ¿Importan?

# TO DO: traffic['source'] regex for the main domain (last whatevs.com), and drop less than 1000
# traffic['source'] = traffic['source'].str.extract('{"browser": "(.+?)"', expand=True)
df.drop(columns=['trafficSource'], inplace=True, axis=1)
df = df.join(traffic)

In [18]:
df = df.replace({"(not set)": np.nan,
                 "(none)": np.nan,
                 "not available in demo dataset": np.nan})

In [19]:
df.describe(include='all')

Unnamed: 0,channelGrouping,fullVisitorId,visitId,visitNumber,visitStartTime,browser,deviceCategory,operatingSystem,city,country,region,subContinent,hits,newVisits,timeOnSite,totalTransactionRevenue,transactions,adHasContent,adPosition,campaign,isTrueDirect,medium,source
count,1508338,1508338.0,1508338.0,1508338.0,1508338,1508327,1508338,1497880,626663,1506104,640963,1506104,1508338.0,1508338.0,735401.0,1508338.0,16382.0,0.0,0.0,0.0,0.0,0.0,0.0
unique,8,1182994.0,1470043.0,,1471295,120,3,23,926,227,467,22,291.0,2.0,4649.0,7700.0,13.0,,,,,,
top,Organic Search,1.957458976293878e+18,1513124981.0,,2017-12-13 00:29:41,Chrome,desktop,Windows,Mountain View,United States,California,Northern America,1.0,1.0,5.0,0.0,1.0,,,,,,
freq,651980,342.0,28.0,,28,1035956,1033252,547046,65105,634591,182335,679631,763842.0,1154448.0,8493.0,1491991.0,15944.0,,,,,,
first,,,,,2016-08-01 07:00:12,,,,,,,,,,,,,,,,,,
last,,,,,2018-05-01 06:56:58,,,,,,,,,,,,,,,,,,
mean,,,,2.334044,,,,,,,,,,,,,,,,,,,
std,,,,9.35342,,,,,,,,,,,,,,,,,,,
min,,,,1.0,,,,,,,,,,,,,,,,,,,
25%,,,,1.0,,,,,,,,,,,,,,,,,,,


In [20]:
df.head(5)

Unnamed: 0,channelGrouping,fullVisitorId,visitId,visitNumber,visitStartTime,browser,deviceCategory,operatingSystem,city,country,region,subContinent,hits,newVisits,timeOnSite,totalTransactionRevenue,transactions,adHasContent,adPosition,campaign,isTrueDirect,medium,source
0,Social,5327547126432548805,1477148076,1.0,2016-10-22 14:54:36,Safari,desktop,Macintosh,,Russia,,Eastern Europe,1,1,,0,,,,,,,
1,Social,3580786241354367532,1477132413,1.0,2016-10-22 10:33:33,Safari,desktop,Macintosh,,Turkey,,Western Asia,1,1,,0,,,,,,,
2,Social,9359518372269416507,1477172134,1.0,2016-10-22 21:35:34,Chrome,desktop,Windows,,Serbia,,Southern Europe,1,1,,0,,,,,,,
3,Social,268491622685377080,1477184802,1.0,2016-10-23 01:06:42,Chrome,desktop,Windows,,Sweden,,Northern Europe,1,1,,0,,,,,,,
4,Social,1608015818215545613,1477140925,1.0,2016-10-22 12:55:25,Chrome,desktop,Windows,Bengaluru,India,Karnataka,Southern Asia,1,1,,0,,,,,,,


In [21]:
df[df['isTrueDirect'].isna()==False]

Unnamed: 0,channelGrouping,fullVisitorId,visitId,visitNumber,visitStartTime,browser,deviceCategory,operatingSystem,city,country,region,subContinent,hits,newVisits,timeOnSite,totalTransactionRevenue,transactions,adHasContent,adPosition,campaign,isTrueDirect,medium,source


In [22]:
df.to_csv("../data/train_v2_cleaned.csv")
# df.to_pickle("../data/train_v2_cleaned.pkl")