## Turn this into a good-looking-informative-cell
#### The output .csv file will contain the following fields
    - channelGrouping
    - fullVisitorId
    - visitId
    - visitNumber
    - visitStartTime
    - browser
    - deviceCategory
    - operatingSystem
    - city
    - country
    - region
    - subContinent
    - hits
    - newVisits
    - *timeOnSite*
    - totalTransactionRevenue
    - *transactions dividir esto*
    - adHasContent
    - adPosition
    - campaign
    - isTrueDirect
    - medium
    - source

### Funciones usadas en el Notebook

In [1]:
# recursive flatten function from https://towardsdatascience.com/flattening-json-objects-in-python-f5343c794b10
# Call example:
  # flattenCol = [flatten_json(json.loads(d)) for d in df['trafficSource']]
  # dataFrameOfFlattenCol = pd.DataFrame(flattenCol)
def flatten_json(y):
    out = {}

    def flatten(x, name=''):
        if type(x) is dict:
            for a in x:
                flatten(x[a], name + a + '_')
        elif type(x) is list:
            i = 0
            for a in x:
                flatten(a, name + str(i) + '_')
                i += 1
        else:
            out[name[:-1]] = x

    flatten(y)
    return out

def region_country(geo):
    if geo['region'] == '(not set)':
        return geo['country']
    elif geo['region'] == 'not available in demo dataset':
        return geo['country']
    else:
        return geo['region']

### Importación de librerías y datos (separándolos a través de comandos bash)

In [2]:
import pandas as pd
import glob
import json
import numpy as np

pd.set_option('display.max_columns', 30)

In [3]:
allFiles = glob.glob("../data/splitted_train/*.csv")
df = pd.DataFrame({'channelGrouping': [],
                    'device': [],
                    'fullVisitorId': [],
                    'geoNetwork': [],
                    'socialEngagementType': [],
                    'totals': [],
                    'trafficSource': [],
                    'visitNumber': [],
                    'visitStartTime': []})

for fl in allFiles:
    temp = pd.read_csv(fl, usecols=["channelGrouping", "date", "device", "fullVisitorId", "geoNetwork",
                                    "socialEngagementType", "totals", "trafficSource", "visitId", "visitNumber",
                                    "visitStartTime"], header=0, dtype={'fullVisitorId': 'str'})
    # we leave out hits, customDimensions and date
    temp.drop(columns=["date", "visitId"], axis=1, inplace=True)
    df = df.append(temp, ignore_index=True)

In [4]:
df.describe(include='all')

Unnamed: 0,channelGrouping,device,fullVisitorId,geoNetwork,socialEngagementType,totals,trafficSource,visitNumber,visitStartTime
count,1708337,1708337,1708337.0,1708337,1708337,1708337,1708337,1708337.0,1708337.0
unique,8,298,1323730.0,59955,1,252374,76637,,
top,Organic Search,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",1.957458976293878e+18,"{""continent"": ""Americas"", ""subContinent"": ""Nor...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""1"", ""pageviews"": ""1"",...","{""campaign"": ""(not set)"", ""source"": ""google"", ...",,
freq,738963,504589,400.0,132962,1708337,350177,460901,,
mean,,,,,,,,2.33517,1498352000.0
std,,,,,,,,9.354034,16249370.0
min,,,,,,,,1.0,1470035000.0
25%,,,,,,,,1.0,1482738000.0
50%,,,,,,,,1.0,1499832000.0
75%,,,,,,,,1.0,1512513000.0


In [5]:
df.head(5)

Unnamed: 0,channelGrouping,device,fullVisitorId,geoNetwork,socialEngagementType,totals,trafficSource,visitNumber,visitStartTime
0,Organic Search,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",3623819892424331961,"{""continent"": ""Europe"", ""subContinent"": ""Easte...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""15"", ""pageviews"": ""13...","{""campaign"": ""(not set)"", ""source"": ""google"", ...",1.0,1496073000.0
1,Organic Search,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",538928163114544921,"{""continent"": ""Americas"", ""subContinent"": ""Nor...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""15"", ""pageviews"": ""9""...","{""campaign"": ""(not set)"", ""source"": ""google"", ...",1.0,1496074000.0
2,Organic Search,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",5863936343673668667,"{""continent"": ""Asia"", ""subContinent"": ""Souther...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""15"", ""pageviews"": ""12...","{""campaign"": ""(not set)"", ""source"": ""google"", ...",2.0,1496087000.0
3,Organic Search,"{""browser"": ""Safari"", ""browserVersion"": ""not a...",9993197829693068679,"{""continent"": ""Americas"", ""subContinent"": ""Nor...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""15"", ""pageviews"": ""10...","{""campaign"": ""(not set)"", ""source"": ""google"", ...",9.0,1496099000.0
4,Referral,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",635666386021193101,"{""continent"": ""Asia"", ""subContinent"": ""Eastern...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""16"", ""pageviews"": ""16...","{""referralPath"": ""/analytics/web/"", ""campaign""...",13.0,1496050000.0


In [6]:
df.shape

(1708337, 9)

In [7]:
# df = df.replace({
# MERGE    "(not set)": np.nan,                  "(none)": np.nan,
#               KEEP   "not available in demo dataset": np.nan})

De las columnas unidimensionales mantenemos:
- fullVisitorId
- visitStartTime
- channelGrouping

In [8]:
df["visitStartTime"] = pd.to_datetime(df["visitStartTime"], unit='s')
df.drop(['socialEngagementType'], axis=1, inplace=True)

De la columna 'device' mantenemos:
- browser
- deviceCategory
- operatingSystem

In [9]:
deviceFlatten = [flatten_json(json.loads(d)) for d in df['device']]
devi = pd.DataFrame(deviceFlatten)

devi.drop(columns=['browserSize', 'browserVersion', 'flashVersion', 'language', 
    'mobileDeviceBranding', 'mobileDeviceInfo', 'mobileDeviceMarketingName',
    'mobileDeviceModel', 'mobileInputSelector', 'operatingSystemVersion',
    'screenColors','screenResolution'], inplace=True)

#we map our data to its corresponding value
isMobile = {False : 0, True : 1}
devi["isMobile"] = devi["isMobile"].map(isMobile)

devi.drop(columns=['isMobile'], inplace=True, axis=1)
df.drop(columns=['device'], inplace=True, axis=1)
df = df.join(devi)

De la columna 'geoNetwork' mantenemos:
- subcontinent
- coutry
- region (que contiene ciudad si no está disponible)
- city

In [10]:
geoNetworkFlatten = [flatten_json(json.loads(d)) for d in df['geoNetwork']]
geo = pd.DataFrame(geoNetworkFlatten)
geo.drop(['latitude','longitude','networkLocation','cityId','continent','metro', 'networkDomain'], axis=1, inplace=True)

geo['region']=geo.apply(region_country,axis=1)
# geo.loc[geo['city'] == "not available in demo dataset",'city'] = np.nan
# geo['region'].fillna(geo['country'])

df.drop(columns=['geoNetwork'], inplace=True, axis=1)
df = df.join(geo)

De la columna 'trafficSource' mantenemos:
- adContent
- adPosition
- campaign
- isTrueDirect
- medium
- source

In [11]:
trafficFlatten = [flatten_json(json.loads(d)) for d in df['trafficSource']]
traffic = pd.DataFrame(trafficFlatten)
traffic.drop(['adwordsClickInfo_criteriaParameters', 'adwordsClickInfo_adNetworkType',
    'adwordsClickInfo_gclId', 'adwordsClickInfo_isVideoAd', 'adwordsClickInfo_page',
    'keyword', 'referralPath'], axis=1, inplace=True)

# rename to a simpler name
traffic.rename(columns={"adwordsClickInfo_slot": "adPosition"}, inplace=True)

# TO DO: traffic['source'] regex for the main domain (last whatevs.com), and drop less than 1000
# traffic['source'] = traffic['source'].str.extract('{"browser": "(.+?)"', expand=True)

df.drop(columns=['trafficSource'], inplace=True, axis=1)
df = df.join(traffic)

De la columna 'totals' mantenemos:
- hits
- transactionRevenue
- newVisits

In [12]:
#Transform json
totalsList = [flatten_json(json.loads(d)) for d in df['totals']]
totals_df = pd.DataFrame(totalsList)

#Remove visits column as it does not provide any information
# Remove sessionQualityDim: An estimate of how close a particular session was to transacting, ranging
    # from 1 to 100, calculated for each session. A value closer to 1 indicates a low session quality, or
    # far from transacting, while a value closer to 100 indicates a high session quality, or very close to
    # transacting. A value of 0 indicates that Session Quality is not calculated for the selected time range.
# Remove transactionRevenue (deprecated) to use totalTransactionRevenue instead
totals_df = totals_df.drop(['visits', 'sessionQualityDim', 'transactionRevenue', 'hits', 'newVisits'], axis=1)

#Change nan in "transactionRevenue" for 0. 
totals_df['totalTransactionRevenue'] = totals_df['totalTransactionRevenue'].fillna(0)

#Change nan in "bounces" for 0
totals_df['bounces'] = totals_df['bounces'].fillna(0)

#There are some nan in pageviews but it is not trivial to find a value we can change these nan by logically.
#The corrlation with hits is 0.984, so we can keep hits that does not have any nan value. Keeping both could be redundant.
totals_df = totals_df.drop(['pageviews'], axis=1)

#Not 100% proven yet, but i have a big feeling that bounces is mainly 1 when there is only 1 hit by defenition.
#This means that even though the correlation doesn't provide a big value beetween them two, bounces is not giving much 
#additional info. Meaning using it is not relevant.
totals_df = totals_df.drop(['bounces'], axis=1)

totals_df['totalTransactionRevenue'] = totals_df['totalTransactionRevenue'].apply(lambda x: float(x)/1000000)

df.drop(['totals'], inplace=True, axis=1)
df = df.join(totals_df)

In [13]:
traffic.shape

(1708337, 7)

In [14]:
df.shape

(1708337, 21)

In [15]:
# df = df.replace({"(not set)": np.nan,
#                  "(none)": np.nan,
#                  "not available in demo dataset": np.nan})

In [16]:
list(df)

['channelGrouping',
 'fullVisitorId',
 'visitNumber',
 'visitStartTime',
 'browser',
 'deviceCategory',
 'operatingSystem',
 'city',
 'country',
 'region',
 'subContinent',
 'adContent',
 'adPosition',
 'campaign',
 'campaignCode',
 'isTrueDirect',
 'medium',
 'source',
 'timeOnSite',
 'totalTransactionRevenue',
 'transactions']

In [17]:
print(df.shape)

(1708337, 21)


In [18]:
df.describe(include='all')

Unnamed: 0,channelGrouping,fullVisitorId,visitNumber,visitStartTime,browser,deviceCategory,operatingSystem,city,country,region,subContinent,adContent,adPosition,campaign,campaignCode,isTrueDirect,medium,source,timeOnSite,totalTransactionRevenue,transactions
count,1708337,1708337.0,1708337.0,1708337,1708337,1708337,1708337,1708337,1708337,1708337,1708337,64737,75274,1708337,1,534518,1708337,1708337,834043.0,1708337.0,18559.0
unique,8,1323730.0,,1667423,129,3,24,956,228,707,23,76,3,33,1,1,7,345,4774.0,,13.0
top,Organic Search,1.957458976293878e+18,,2017-12-13 00:31:38,Chrome,desktop,Windows,not available in demo dataset,United States,United States,Northern America,Google Merchandise Store,RHS,(not set),11251kjhkvahf,True,organic,google,5.0,,1.0
freq,738963,400.0,,28,1173056,1171579,619720,932959,717217,358714,768345,39566,42750,1604526,1,534518,591783,658384,9862.0,,18048.0
first,,,,2016-08-01 07:00:12,,,,,,,,,,,,,,,,,
last,,,,2018-05-01 06:56:58,,,,,,,,,,,,,,,,,
mean,,,2.33517,,,,,,,,,,,,,,,,,1.547767,
std,,,9.354034,,,,,,,,,,,,,,,,,68.81097,
min,,,1.0,,,,,,,,,,,,,,,,,0.0,
25%,,,1.0,,,,,,,,,,,,,,,,,0.0,


In [None]:
df.head(5)

Unnamed: 0,channelGrouping,fullVisitorId,visitNumber,visitStartTime,browser,deviceCategory,operatingSystem,city,country,region,subContinent,adContent,adPosition,campaign,campaignCode,isTrueDirect,medium,source,timeOnSite,totalTransactionRevenue,transactions
0,Organic Search,3623819892424331961,1.0,2017-05-29 15:56:49,Chrome,mobile,Android,not available in demo dataset,Czechia,Czechia,Eastern Europe,,,(not set),,,organic,google,284,0.0,
1,Organic Search,538928163114544921,1.0,2017-05-29 16:07:23,Chrome,tablet,Android,not available in demo dataset,United States,United States,Northern America,,,(not set),,,organic,google,178,0.0,
2,Organic Search,5863936343673668667,2.0,2017-05-29 19:43:31,Chrome,desktop,Windows,Bengaluru,India,Karnataka,Southern Asia,,,(not set),,True,organic,google,2289,0.0,
3,Organic Search,9993197829693068679,9.0,2017-05-29 23:09:51,Safari,mobile,iOS,San Jose,United States,California,Northern America,,,(not set),,True,organic,google,340,0.0,
4,Referral,635666386021193101,13.0,2017-05-29 09:20:43,Chrome,desktop,Windows,Osaka,Japan,Osaka Prefecture,Eastern Asia,,,(not set),,True,referral,analytics.google.com,1880,0.0,


In [None]:
# df.to_csv("../data/train_v2_cleaned.csv")
# df.to_pickle("../data/train_v2_cleaned.pkl")

In [24]:
graDF =df.drop(['fullVisitorId', 'campaignCode'], axis=1) 

In [25]:
df.to_csv("../data/train_v2_graphext.csv")