In [1]:
import pandas as pd
import glob
import json
import numpy as np

pd.set_option('display.max_columns', 30)

In [2]:
# recursive flatten function from https://towardsdatascience.com/flattening-json-objects-in-python-f5343c794b10
# Call example:
  # flattenCol = [flatten_json(json.loads(d)) for d in df['trafficSource']]
  # dataFrameOfFlattenCol = pd.DataFrame(flattenCol)
def flatten_json(y):
    out = {}

    def flatten(x, name=''):
        if type(x) is dict:
            for a in x:
                flatten(x[a], name + a + '_')
        elif type(x) is list:
            i = 0
            for a in x:
                flatten(a, name + str(i) + '_')
                i += 1
        else:
            out[name[:-1]] = x

    flatten(y)
    return out

In [3]:
allFiles = glob.glob("../data/splitted/*.csv")
df = pd.DataFrame({'channelGrouping': [],
                    'device': [],
                    'fullVisitorId': [],
                    'geoNetwork': [],
                    'socialEngagementType': [],
                    'totals': [],
                    'trafficSource': [],
                    'visitId': [],
                    'visitNumber': [],
                    'visitStartTime': []})

for fl in allFiles:
    temp = pd.read_csv(fl, usecols=["channelGrouping", "customDimensions", "date", "device", "fullVisitorId", "geoNetwork", "hits",
                                      "socialEngagementType", "totals", "trafficSource", "visitId", "visitNumber",
                                      "visitStartTime"], header=0, dtype={'fullVisitorId': 'str', 'visitId': 'str'})
    # we leave out hits, customDimensions and date
    temp.drop(columns=["hits", "customDimensions", "date"], axis=1, inplace=True)
    df = df.append(temp, ignore_index=True)

In [4]:
df.describe(include='all')

Unnamed: 0,channelGrouping,device,fullVisitorId,geoNetwork,socialEngagementType,totals,trafficSource,visitId,visitNumber,visitStartTime
count,299998,299998,299998.0,299998,299998,299998,299998,299998.0,299998.0,299998.0
unique,8,158,262775.0,21938,1,59988,16425,289633.0,,
top,Organic Search,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",1.957458976293878e+18,"{""continent"": ""Americas"", ""subContinent"": ""Nor...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""1"", ""pageviews"": ""1"",...","{""campaign"": ""(not set)"", ""source"": ""google"", ...",1513124981.0,,
freq,127120,88673,83.0,23064,299998,64219,73007,28.0,,
mean,,,,,,,,,2.304489,1497589000.0
std,,,,,,,,,9.343674,16244280.0
min,,,,,,,,,1.0,1470380000.0
25%,,,,,,,,,1.0,1481307000.0
50%,,,,,,,,,1.0,1498062000.0
75%,,,,,,,,,1.0,1511830000.0


In [5]:
df.head(5)

Unnamed: 0,channelGrouping,device,fullVisitorId,geoNetwork,socialEngagementType,totals,trafficSource,visitId,visitNumber,visitStartTime
0,Organic Search,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",538928163114544921,"{""continent"": ""Americas"", ""subContinent"": ""Nor...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""15"", ""pageviews"": ""9""...","{""campaign"": ""(not set)"", ""source"": ""google"", ...",1496074043,1.0,1496074000.0
1,Organic Search,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",5863936343673668667,"{""continent"": ""Asia"", ""subContinent"": ""Souther...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""15"", ""pageviews"": ""12...","{""campaign"": ""(not set)"", ""source"": ""google"", ...",1496087011,2.0,1496087000.0
2,Organic Search,"{""browser"": ""Safari"", ""browserVersion"": ""not a...",9993197829693068679,"{""continent"": ""Americas"", ""subContinent"": ""Nor...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""15"", ""pageviews"": ""10...","{""campaign"": ""(not set)"", ""source"": ""google"", ...",1496099391,9.0,1496099000.0
3,Referral,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",635666386021193101,"{""continent"": ""Asia"", ""subContinent"": ""Eastern...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""16"", ""pageviews"": ""16...","{""referralPath"": ""/analytics/web/"", ""campaign""...",1496049643,13.0,1496050000.0
4,Organic Search,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",7772696018151280169,"{""continent"": ""Europe"", ""subContinent"": ""South...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""16"", ""pageviews"": ""12...","{""campaign"": ""(not set)"", ""source"": ""google"", ...",1496053146,1.0,1496053000.0


In [6]:
df.shape

(299998, 10)

In [19]:
df = df.replace({"(not set)": np.nan,
                 "(none)": np.nan,
                 "not available in demo dataset": np.nan})

In [8]:
# De misc mantenemos:
  # fullVisitorId
  # visitStartTime
  # channelGrouping

df["visitStartTime"] = pd.to_datetime(df["visitStartTime"], unit='s')
df.drop(['socialEngagementType'], axis=1, inplace=True)

In [9]:
# De device mantenemos:
  # browser
  # deviceCategory
  # operatingSystem

deviceFlatten = [flatten_json(json.loads(d)) for d in df['device']]

devi = pd.DataFrame(deviceFlatten)

# quest: language?
devi.drop(columns=['browserSize', 'browserVersion', 'flashVersion', 'language', 
    'mobileDeviceBranding', 'mobileDeviceInfo', 'mobileDeviceMarketingName',
    'mobileDeviceModel', 'mobileInputSelector', 'operatingSystemVersion',
    'screenColors','screenResolution'], inplace=True)

#we map our data to its corresponding value
isMobile = {False : 0, True : 1}
devi["isMobile"] = devi["isMobile"].map(isMobile)

devi.drop(columns=['isMobile'], inplace=True, axis=1)
df.drop(columns=['device'], inplace=True, axis=1)
df = df.join(devi)

In [10]:
# De geoNetwork mantenemos:
  # subcontinent
  # coutry
  # region (que contiene ciudad si no está disponible)
  # city

geoNetworkFlatten = [flatten_json(json.loads(d)) for d in df['geoNetwork']]
geo = pd.DataFrame(geoNetworkFlatten)
geo.drop(['latitude','longitude','networkLocation','cityId','continent','metro', 'networkDomain'], axis=1, inplace=True)

# TO DO: check that it works in the same way
# def region_country(geo):
#     if geo['region'] == np.nan:
#         return geo['country']
#     else:
#         return geo['region']
# geo['region']=geo.apply(region_country,axis=1)
# geo.loc[geo['city'] == "not available in demo dataset",'city'] = np.nan

geo['region'].fillna(geo['country'])

df.drop(columns=['geoNetwork'], inplace=True, axis=1)
df = df.join(geo)

In [11]:
# De totals mantenemos:
  # hits
  # transactionRevenue
  # newVisits

#Transform json
totalsList = [flatten_json(json.loads(d)) for d in df['totals']]
totals_df = pd.DataFrame(totalsList)

#Remove visits column as it does not provide any information
# Remove sessionQualityDim: An estimate of how close a particular session was to transacting, ranging
    # from 1 to 100, calculated for each session. A value closer to 1 indicates a low session quality, or
    # far from transacting, while a value closer to 100 indicates a high session quality, or very close to
    # transacting. A value of 0 indicates that Session Quality is not calculated for the selected time range.
# Remove transactionRevenue (deprecated) to use totalTransactionRevenue instead
totals_df = totals_df.drop(['visits', 'sessionQualityDim', 'transactionRevenue'], axis=1)

#Change nan in "newVisits" for 0. Binary, 1 it's new visit, 0 it's not. 
totals_df['newVisits'] = totals_df['newVisits'].fillna(0)

#Change nan in "transactionRevenue" for 0. 
totals_df['totalTransactionRevenue'] = totals_df['totalTransactionRevenue'].fillna(0)

#Change nan in "bounces" for 0
totals_df['bounces'] = totals_df['bounces'].fillna(0)

#There are some nan in pageviews but it is not trivial to find a value we can change these nan by logically.
#The corrlation with hits is 0.984, so we can keep hits that does not have any nan value. Keeping both could be redundant.
totals_df = totals_df.drop(['pageviews'], axis=1)

#Not 100% proven yet, but i have a big feeling that bounces is mainly 1 when there is only 1 hit by defenition.
#This means that even though the correlation doesn't provide a big value beetween them two, bounces is not giving much 
#additional info. Meaning using it is not relevant.
totals_df = totals_df.drop(['bounces'], axis=1)

df.drop(['totals'], inplace=True, axis=1)
df = df.join(totals_df)
# df = pd.concat([df, totals_df])

In [12]:
# De trafficSource mantenemos:
  # adHasContent
  # adPosition
  # campaign
  # isTrueDirect
  # medium
  # source

trafficFlatten = [flatten_json(json.loads(d)) for d in df['trafficSource']]
traffic = pd.DataFrame(trafficFlatten)
traffic.drop(['adwordsClickInfo_criteriaParameters', 'adwordsClickInfo_adNetworkType',
    'adwordsClickInfo_gclId', 'adwordsClickInfo_isVideoAd', 'adwordsClickInfo_page',
    'keyword', 'referralPath', 'campaignCode'], axis=1, inplace=True)

# adContent to bool adHasContent
traffic['adContent'] = traffic['adContent'].map(lambda x: 0 if str(x) == 'NoneType' else 1)

# rename to a simpler name
traffic.rename(index=str, columns={
    "adContent": "adHasContent",
    "adwordsClickInfo_slot": "adPosition"}, inplace=True)

# set (none)s and (non set)s to NaNs
# This column will contain:
    # cpc -> Cost Per Click
    # affiliate -> Affiliate marketing is a type of performance-based marketing in which a business rewards one or more affiliates for each visitor or customer brought by the affiliate's own marketing efforts.
    # cpm -> Cost per impression
# entender qué implican estos valores. ¿cpc vienen de search y cpm de webs? ¿Se juntan? ¿Importan?

# TO DO: traffic['source'] regex for the main domain (last whatevs.com), and drop less than 1000
# traffic['source'] = traffic['source'].str.extract('{"browser": "(.+?)"', expand=True)
df.drop(columns=['trafficSource'], inplace=True, axis=1)
df = df.join(traffic)

In [21]:
df = df.replace({"(not set)": np.nan,
                 "(none)": np.nan,
                 "not available in demo dataset": np.nan})

In [13]:
df.describe(include='all')

Unnamed: 0,channelGrouping,fullVisitorId,visitId,visitNumber,visitStartTime,browser,deviceCategory,operatingSystem,city,country,region,subContinent,hits,newVisits,timeOnSite,totalTransactionRevenue,transactions,adHasContent,adPosition,campaign,isTrueDirect,medium,source
count,299998,299998.0,299998.0,299998.0,299998,299998,299998,299998,299998,299998,299998,299998,299998.0,299998.0,144892.0,299998.0,3167.0,0.0,0.0,0.0,0.0,0.0,0.0
unique,8,262775.0,289633.0,,289671,50,3,23,526,206,354,23,194.0,2.0,3187.0,2193.0,6.0,,0.0,0.0,0.0,0.0,0.0
top,Organic Search,1.957458976293878e+18,1513124981.0,,2017-12-13 00:29:41,Chrome,desktop,Windows,not available in demo dataset,United States,not available in demo dataset,Northern America,1.0,1.0,4.0,0.0,1.0,,,,,,
freq,127120,83.0,28.0,,28,204515,204996,108558,163654,124336,163654,133306,153332.0,231521.0,2067.0,296844.0,3066.0,,,,,,
first,,,,,2016-08-05 07:00:08,,,,,,,,,,,,,,,,,,
last,,,,,2018-04-30 06:59:53,,,,,,,,,,,,,,,,,,
mean,,,,2.304489,,,,,,,,,,,,,,,,,,,
std,,,,9.343674,,,,,,,,,,,,,,,,,,,
min,,,,1.0,,,,,,,,,,,,,,,,,,,
25%,,,,1.0,,,,,,,,,,,,,,,,,,,


In [14]:
df.head(5)

Unnamed: 0,channelGrouping,fullVisitorId,visitId,visitNumber,visitStartTime,browser,deviceCategory,operatingSystem,city,country,region,subContinent,hits,newVisits,timeOnSite,totalTransactionRevenue,transactions,adHasContent,adPosition,campaign,isTrueDirect,medium,source
0,Organic Search,538928163114544921,1496074043,1.0,2017-05-29 16:07:23,Chrome,tablet,Android,not available in demo dataset,United States,not available in demo dataset,Northern America,15,1,178,0,,,,,,,
1,Organic Search,5863936343673668667,1496087011,2.0,2017-05-29 19:43:31,Chrome,desktop,Windows,Bengaluru,India,Karnataka,Southern Asia,15,0,2289,0,,,,,,,
2,Organic Search,9993197829693068679,1496099391,9.0,2017-05-29 23:09:51,Safari,mobile,iOS,San Jose,United States,California,Northern America,15,0,340,0,,,,,,,
3,Referral,635666386021193101,1496049643,13.0,2017-05-29 09:20:43,Chrome,desktop,Windows,Osaka,Japan,Osaka Prefecture,Eastern Asia,16,0,1880,0,,,,,,,
4,Organic Search,7772696018151280169,1496053146,1.0,2017-05-29 10:19:06,Chrome,desktop,Windows,not available in demo dataset,Spain,not available in demo dataset,Southern Europe,16,1,560,0,,,,,,,


In [29]:
df[df['isTrueDirect'].isna()==False]

Unnamed: 0,channelGrouping,fullVisitorId,visitId,visitNumber,visitStartTime,browser,deviceCategory,operatingSystem,city,country,region,subContinent,hits,newVisits,timeOnSite,totalTransactionRevenue,transactions,adHasContent,adPosition,campaign,isTrueDirect,medium,source


In [None]:
# df.to_csv("../data/train_v2_cleaned.csv")
# df.to_pickle("../data/train_v2_cleaned.pkl")