## Limpieza de los datos
Se toman los archivos ```train.csv``` y ```test.csv``` divididos en fragmentos de 10.000 líneas cada uno. Estos archivos se encuentran en la carpeta ```../data/splitted_train/``` y ```../data/splitted_test/```, respectivamente.

La salida contendrá las siguientes columnas:
    * fullVisitorIdv - A unique identifier for each user of the Google Merchandise Store. 
    * channelGrouping - The channel via which the user came to the Store.
    * date - The date on which the user visited the Store.
    * device - The specifications for the device used to access the Store.
        - browser
        - deviceCategory
        - operatingSystem
    * geoNetwork - This section contains information about the geography of the user.
        - city
        - country
        - region
        - subContinent
    * sessionId - A unique identifier for this visit to the store.
    * socialEngagementType - Engagement type, either "Socially Engaged" or "Not Socially Engaged".
    * totals - This section contains aggregate values across the session
        - timeOnSite
        - totalTransactionRevenue
        - transactions
    * trafficSource - This section contains information about the Traffic Source from which the session originated.
        - adContent
        - adPosition
        - campaign
        - isTrueDirect
        - medium
        - source
    * visitNumber - The session number for this user. If this is the first session, then this is set to 1.
    * visitStartTime - The timestamp (expressed as POSIX time).
    * prevPurchases - 1 if the user has already bought in out store

### Funciones y librerías usadas en el Notebook

In [1]:
import pandas as pd
import glob
import json
import numpy as np

pd.set_option('display.max_columns', 30)

In [2]:
# recursive flatten function from https://towardsdatascience.com/flattening-json-objects-in-python-f5343c794b10
# Call example:
  # flattenCol = [flatten_json(json.loads(d)) for d in df['trafficSource']]
  # dataFrameOfFlattenCol = pd.DataFrame(flattenCol)
def flatten_json(y):
    out = {}
    def flatten(x, name=''):
        if type(x) is dict:
            for a in x:
                flatten(x[a], name + a + '_')
        elif type(x) is list:
            i = 0
            for a in x:
                flatten(a, name + str(i) + '_')
                i += 1
        else:
            out[name[:-1]] = x

    flatten(y)
    return out

# fill the non available regions with the country name
def region_country(geo):
    if geo['region'] == '(not set)':
        return geo['country']
    elif geo['region'] == 'not available in demo dataset':
        return geo['country']
    else:
        return geo['region']

bought = []
def prev_purchases(df):
    #  modify the global array to transfer information between train and test
    global bought
    prevPurchases = np.zeros(df.shape[0])
    clientBuys = df['totalTransactionRevenue'] > 0
    for r in range(df.shape[0]):
        a = df.iloc[r]
        if clientBuys[r] and a['fullVisitorId'] not in bought:
            bought.append(a['fullVisitorId'])
            prevPurchases[r] = 0
        elif clientBuys[r] and a['fullVisitorId'] in bought:
            prevPurchases[r] = prevPurchases[r] + 1;
        else:
            prevPurchases[r] = 0
    return prevPurchases

### Limpieza de train

In [3]:
allFiles = glob.glob("../data/splitted_train/*.csv")
df = pd.DataFrame({'channelGrouping': [],
                    'device': [],
                    'fullVisitorId': [],
                    'geoNetwork': [],
                    'socialEngagementType': [],
                    'totals': [],
                    'trafficSource': [],
                    'visitNumber': [],
                    'visitStartTime': []})

for fl in allFiles:
    # we leave out hits, customDimensions, visitId and date
    temp = pd.read_csv(fl, usecols=["channelGrouping", "device", "fullVisitorId", "geoNetwork",
                                    "socialEngagementType", "totals", "trafficSource", "visitNumber",
                                    "visitStartTime"], header=0, dtype={'fullVisitorId': 'str'})
    df = df.append(temp, ignore_index=True)

In [4]:
df.describe(include='all')

Unnamed: 0,channelGrouping,device,fullVisitorId,geoNetwork,socialEngagementType,totals,trafficSource,visitNumber,visitStartTime
count,1708337,1708337,1708337.0,1708337,1708337,1708337,1708337,1708337.0,1708337.0
unique,8,298,1323730.0,59955,1,252374,76637,,
top,Organic Search,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",1.957458976293878e+18,"{""continent"": ""Americas"", ""subContinent"": ""Nor...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""1"", ""pageviews"": ""1"",...","{""campaign"": ""(not set)"", ""source"": ""google"", ...",,
freq,738963,504589,400.0,132962,1708337,350177,460901,,
mean,,,,,,,,2.33517,1498352000.0
std,,,,,,,,9.354034,16249370.0
min,,,,,,,,1.0,1470035000.0
25%,,,,,,,,1.0,1482738000.0
50%,,,,,,,,1.0,1499832000.0
75%,,,,,,,,1.0,1512513000.0


In [5]:
df.head(5)

Unnamed: 0,channelGrouping,device,fullVisitorId,geoNetwork,socialEngagementType,totals,trafficSource,visitNumber,visitStartTime
0,Organic Search,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",3623819892424331961,"{""continent"": ""Europe"", ""subContinent"": ""Easte...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""15"", ""pageviews"": ""13...","{""campaign"": ""(not set)"", ""source"": ""google"", ...",1.0,1496073000.0
1,Organic Search,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",538928163114544921,"{""continent"": ""Americas"", ""subContinent"": ""Nor...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""15"", ""pageviews"": ""9""...","{""campaign"": ""(not set)"", ""source"": ""google"", ...",1.0,1496074000.0
2,Organic Search,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",5863936343673668667,"{""continent"": ""Asia"", ""subContinent"": ""Souther...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""15"", ""pageviews"": ""12...","{""campaign"": ""(not set)"", ""source"": ""google"", ...",2.0,1496087000.0
3,Organic Search,"{""browser"": ""Safari"", ""browserVersion"": ""not a...",9993197829693068679,"{""continent"": ""Americas"", ""subContinent"": ""Nor...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""15"", ""pageviews"": ""10...","{""campaign"": ""(not set)"", ""source"": ""google"", ...",9.0,1496099000.0
4,Referral,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",635666386021193101,"{""continent"": ""Asia"", ""subContinent"": ""Eastern...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""16"", ""pageviews"": ""16...","{""referralPath"": ""/analytics/web/"", ""campaign""...",13.0,1496050000.0


In [6]:
df.shape

(1708337, 9)

De las columnas unidimensionales mantenemos:
- fullVisitorId
- visitStartTime
- channelGrouping

In [7]:
df["visitStartTime"] = pd.to_datetime(df["visitStartTime"], unit='s')
df.drop(['socialEngagementType'], axis=1, inplace=True)

De la columna 'device' mantenemos:
- browser
- deviceCategory
- operatingSystem

In [8]:
deviceFlatten = [flatten_json(json.loads(d)) for d in df['device']]
devi = pd.DataFrame(deviceFlatten)

devi.drop(columns=['browserSize', 'browserVersion', 'flashVersion', 'language', 
    'mobileDeviceBranding', 'mobileDeviceInfo', 'mobileDeviceMarketingName',
    'mobileDeviceModel', 'mobileInputSelector', 'operatingSystemVersion',
    'screenColors','screenResolution'], inplace=True)

# we map our data to its corresponding value
isMobile = {False : 0, True : 1}
devi["isMobile"] = devi["isMobile"].map(isMobile)

devi.drop(columns=['isMobile'], inplace=True, axis=1)
df.drop(columns=['device'], inplace=True, axis=1)
df = df.join(devi)

De la columna 'geoNetwork' mantenemos:
- subcontinent
- coutry
- region (que contiene ciudad si no está disponible)
- city

In [9]:
geoNetworkFlatten = [flatten_json(json.loads(d)) for d in df['geoNetwork']]
geo = pd.DataFrame(geoNetworkFlatten)
geo.drop(['latitude','longitude','networkLocation','cityId','continent','metro', 'networkDomain'], axis=1, inplace=True)

geo['region'] = geo.apply(region_country,axis=1)

df.drop(columns=['geoNetwork'], inplace=True, axis=1)
df = df.join(geo)

De la columna 'trafficSource' mantenemos:
- adContent
- adPosition
- campaign
- isTrueDirect
- medium
- source

In [10]:
trafficFlatten = [flatten_json(json.loads(d)) for d in df['trafficSource']]
traffic = pd.DataFrame(trafficFlatten)
traffic.drop(['adwordsClickInfo_criteriaParameters', 'adwordsClickInfo_adNetworkType',
    'adwordsClickInfo_gclId', 'adwordsClickInfo_isVideoAd', 'adwordsClickInfo_page',
    'keyword', 'referralPath', 'campaignCode'], axis=1, inplace=True)

# rename to a simpler name
traffic.rename(columns={"adwordsClickInfo_slot": "adPosition"}, inplace=True)

# possible regex transformation on traffic['source'] for storing the main domain (last whatevs.com)
# traffic['source'] = traffic['source'].str.extract('{"browser": "(.+?)"', expand=True)

df.drop(columns=['trafficSource'], inplace=True, axis=1)
df = df.join(traffic)

De la columna 'totals' mantenemos:
- hits
- transactionRevenue

In [11]:
#Transform json
totalsList = [flatten_json(json.loads(d)) for d in df['totals']]
totals_df = pd.DataFrame(totalsList)

# Remove visits column as it does not provide any information
# Remove sessionQualityDim: An estimate of how close a particular session was to transacting, ranging
    # from 1 to 100, calculated for each session. A value closer to 1 indicates a low session quality, or
    # far from transacting, while a value closer to 100 indicates a high session quality, or very close to
    # transacting. A value of 0 indicates that Session Quality is not calculated for the selected time range
# Remove transactionRevenue (deprecated) to use totalTransactionRevenue instead
totals_df = totals_df.drop(['visits', 'sessionQualityDim', 'transactionRevenue', 'hits', 'newVisits'], axis=1)

# Change nan in "transactionRevenue" and "bounces" for 0 
totals_df['totalTransactionRevenue'] = totals_df['totalTransactionRevenue'].fillna(0)
totals_df['bounces'] = totals_df['bounces'].fillna(0)

# There are some nan in pageviews but it is not trivial to find a value we can change these nan by logically.
# The corrlation with hits is 0.984, so we can keep hits that does not have any nan value. Keeping both could be redundant
totals_df = totals_df.drop(['pageviews'], axis=1)

# Bounces is mainly 1 when there is only 1 hit by defenition.
# This means that even though the correlation doesn't provide a big value beetween them two, bounces is not giving much 
# additional info
totals_df = totals_df.drop(['bounces'], axis=1)

totals_df['totalTransactionRevenue'] = totals_df['totalTransactionRevenue'].apply(lambda x: float(x)/1000000)

df.drop(['totals'], inplace=True, axis=1)
df = df.join(totals_df)

Creamos una columna extra indicando si el cliente ha comprado previamente o no

In [12]:
df = df.sort_values(by=['visitStartTime']).reset_index(drop=True)
df['prevPurchases'] = prev_purchases(df)

In [13]:
# df = df.replace({"(not set)": np.nan,
#                  "(none)": np.nan,
#                  "not available in demo dataset": np.nan})

In [14]:
print(df.shape)
print(list(df))

(1708337, 21)
['channelGrouping', 'fullVisitorId', 'visitNumber', 'visitStartTime', 'browser', 'deviceCategory', 'operatingSystem', 'city', 'country', 'region', 'subContinent', 'adContent', 'adPosition', 'campaign', 'isTrueDirect', 'medium', 'source', 'timeOnSite', 'totalTransactionRevenue', 'transactions', 'prevPurchases']


In [15]:
df.describe(include='all')

Unnamed: 0,channelGrouping,fullVisitorId,visitNumber,visitStartTime,browser,deviceCategory,operatingSystem,city,country,region,subContinent,adContent,adPosition,campaign,isTrueDirect,medium,source,timeOnSite,totalTransactionRevenue,transactions,prevPurchases
count,1708337,1708337.0,1708337.0,1708337,1708337,1708337,1708337,1708337,1708337,1708337,1708337,64737,75274,1708337,534518,1708337,1708337,834043.0,1708337.0,18559.0,1708337.0
unique,8,1323730.0,,1667423,129,3,24,956,228,707,23,76,3,33,1,7,345,4774.0,,13.0,
top,Organic Search,1.957458976293878e+18,,2017-12-13 00:29:41,Chrome,desktop,Windows,not available in demo dataset,United States,United States,Northern America,Google Merchandise Store,RHS,(not set),True,organic,google,5.0,,1.0,
freq,738963,400.0,,28,1173056,1171579,619720,932959,717217,358714,768345,39566,42750,1604526,534518,591783,658384,9862.0,,18048.0,
first,,,,2016-08-01 07:00:12,,,,,,,,,,,,,,,,,
last,,,,2018-05-01 06:56:58,,,,,,,,,,,,,,,,,
mean,,,2.33517,,,,,,,,,,,,,,,,1.547767,,0.00138907
std,,,9.354034,,,,,,,,,,,,,,,,68.81097,,0.03724435
min,,,1.0,,,,,,,,,,,,,,,,0.0,,0.0
25%,,,1.0,,,,,,,,,,,,,,,,0.0,,0.0


In [16]:
df.to_csv("../data/train_v2_cleaned.csv")
df.to_pickle("../data/train_v2_cleaned.pkl")

### Limpieza de test
Repetimos en test.csv los pasos hechos sobre train

In [17]:
allFiles = glob.glob("../data/splitted_test/*.csv")
df = pd.DataFrame({'channelGrouping': [],
                    'device': [],
                    'fullVisitorId': [],
                    'geoNetwork': [],
                    'socialEngagementType': [],
                    'totals': [],
                    'trafficSource': [],
                    'visitNumber': [],
                    'visitStartTime': []})

for fl in allFiles:
    # we leave out hits, customDimensions, visitId and date
    temp = pd.read_csv(fl, usecols=["channelGrouping", "device", "fullVisitorId", "geoNetwork",
                                    "socialEngagementType", "totals", "trafficSource", "visitNumber",
                                    "visitStartTime"], header=0, dtype={'fullVisitorId': 'str'})
    df = df.append(temp, ignore_index=True)

In [18]:
df.describe(include='all')

Unnamed: 0,channelGrouping,device,fullVisitorId,geoNetwork,socialEngagementType,totals,trafficSource,visitNumber,visitStartTime
count,401589,401589,401589.0,401589,401589,401589,401589,401589.0,401589.0
unique,8,164,296530.0,22905,1,97778,12963,,
top,Organic Search,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",6.501071168742028e+17,"{""continent"": ""Americas"", ""subContinent"": ""Nor...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""1"", ""pageviews"": ""1"",...","{""referralPath"": ""(not set)"", ""campaign"": ""(no...",,
freq,198378,118612,105.0,33765,401589,133919,149584,,
mean,,,,,,,,2.486104,1532128000.0
std,,,,,,,,10.699105,4309457.0
min,,,,,,,,1.0,1525158000.0
25%,,,,,,,,1.0,1528266000.0
50%,,,,,,,,1.0,1531978000.0
75%,,,,,,,,2.0,1535926000.0


In [19]:
df.head(5)

Unnamed: 0,channelGrouping,device,fullVisitorId,geoNetwork,socialEngagementType,totals,trafficSource,visitNumber,visitStartTime
0,Direct,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",3404236376816187578,"{""continent"": ""Americas"", ""subContinent"": ""Nor...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""21"", ""pageviews"": ""16...","{""referralPath"": ""(not set)"", ""campaign"": ""(no...",1.0,1536607000.0
1,Direct,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",4914017278166893777,"{""continent"": ""Americas"", ""subContinent"": ""Nor...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""21"", ""pageviews"": ""19...","{""referralPath"": ""(not set)"", ""campaign"": ""(no...",2.0,1536603000.0
2,Organic Search,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",6232847580219746322,"{""continent"": ""Europe"", ""subContinent"": ""Easte...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""21"", ""pageviews"": ""14...","{""referralPath"": ""(not set)"", ""campaign"": ""(no...",2.0,1536593000.0
3,Direct,"{""browser"": ""Android Webview"", ""browserVersion...",67325434172403072,"{""continent"": ""Asia"", ""subContinent"": ""Eastern...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""21"", ""pageviews"": ""17...","{""referralPath"": ""(not set)"", ""campaign"": ""(no...",1.0,1536623000.0
4,Direct,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",180007823412001644,"{""continent"": ""Europe"", ""subContinent"": ""Weste...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""21"", ""pageviews"": ""13...","{""referralPath"": ""(not set)"", ""campaign"": ""(no...",1.0,1536590000.0


In [20]:
df.shape

(401589, 9)

In [21]:
df["visitStartTime"] = pd.to_datetime(df["visitStartTime"], unit='s')
df.drop(['socialEngagementType'], axis=1, inplace=True)

In [22]:
deviceFlatten = [flatten_json(json.loads(d)) for d in df['device']]
devi = pd.DataFrame(deviceFlatten)

devi.drop(columns=['browserSize', 'browserVersion', 'flashVersion', 'language', 
    'mobileDeviceBranding', 'mobileDeviceInfo', 'mobileDeviceMarketingName',
    'mobileDeviceModel', 'mobileInputSelector', 'operatingSystemVersion',
    'screenColors','screenResolution'], inplace=True)

#we map our data to its corresponding value
isMobile = {False : 0, True : 1}
devi["isMobile"] = devi["isMobile"].map(isMobile)

devi.drop(columns=['isMobile'], inplace=True, axis=1)
df.drop(columns=['device'], inplace=True, axis=1)
df = df.join(devi)

In [23]:
geoNetworkFlatten = [flatten_json(json.loads(d)) for d in df['geoNetwork']]
geo = pd.DataFrame(geoNetworkFlatten)
geo.drop(['latitude','longitude','networkLocation','cityId','continent','metro', 'networkDomain'], axis=1, inplace=True)

geo['region']=geo.apply(region_country,axis=1)

df.drop(columns=['geoNetwork'], inplace=True, axis=1)
df = df.join(geo)

In [24]:
trafficFlatten = [flatten_json(json.loads(d)) for d in df['trafficSource']]
traffic = pd.DataFrame(trafficFlatten)
traffic.drop(['adwordsClickInfo_criteriaParameters', 'adwordsClickInfo_adNetworkType',
    'adwordsClickInfo_gclId', 'adwordsClickInfo_isVideoAd', 'adwordsClickInfo_page',
    'keyword', 'referralPath'], axis=1, inplace=True)

# rename to a simpler name
traffic.rename(columns={"adwordsClickInfo_slot": "adPosition"}, inplace=True)

# possible regex transformation on traffic['source'] for storing the main domain (last whatevs.com)
# traffic['source'] = traffic['source'].str.extract('{"browser": "(.+?)"', expand=True)

df.drop(columns=['trafficSource'], inplace=True, axis=1)
df = df.join(traffic)

In [25]:
#Transform json
totalsList = [flatten_json(json.loads(d)) for d in df['totals']]
totals_df = pd.DataFrame(totalsList)

# Remove visits column as it does not provide any information
# Remove sessionQualityDim: An estimate of how close a particular session was to transacting, ranging
    # from 1 to 100, calculated for each session. A value closer to 1 indicates a low session quality, or
    # far from transacting, while a value closer to 100 indicates a high session quality, or very close to
    # transacting. A value of 0 indicates that Session Quality is not calculated for the selected time range
# Remove transactionRevenue (deprecated) to use totalTransactionRevenue instead
totals_df = totals_df.drop(['visits', 'sessionQualityDim', 'transactionRevenue', 'hits', 'newVisits'], axis=1)

# Change nan in "transactionRevenue" and "bounces" for 0 
totals_df['totalTransactionRevenue'] = totals_df['totalTransactionRevenue'].fillna(0)
totals_df['bounces'] = totals_df['bounces'].fillna(0)

# There are some nan in pageviews but it is not trivial to find a value we can change these nan by logically.
# The corrlation with hits is 0.984, so we can keep hits that does not have any nan value. Keeping both could be redundant
totals_df = totals_df.drop(['pageviews'], axis=1)

# Bounces is mainly 1 when there is only 1 hit by defenition.
# This means that even though the correlation doesn't provide a big value beetween them two, bounces is not giving much 
# additional info
totals_df = totals_df.drop(['bounces'], axis=1)

totals_df['totalTransactionRevenue'] = totals_df['totalTransactionRevenue'].apply(lambda x: float(x)/1000000)

df.drop(['totals'], inplace=True, axis=1)
df = df.join(totals_df)

In [26]:
df = df.sort_values(by=['visitStartTime']).reset_index(drop=True)
df['prevPurchases'] = prev_purchases(df)

In [27]:
print(df.shape)
print(list(df))

(401589, 21)
['channelGrouping', 'fullVisitorId', 'visitNumber', 'visitStartTime', 'browser', 'deviceCategory', 'operatingSystem', 'city', 'country', 'region', 'subContinent', 'adContent', 'adPosition', 'campaign', 'isTrueDirect', 'medium', 'source', 'timeOnSite', 'totalTransactionRevenue', 'transactions', 'prevPurchases']


In [28]:
df.describe(include='all')

Unnamed: 0,channelGrouping,fullVisitorId,visitNumber,visitStartTime,browser,deviceCategory,operatingSystem,city,country,region,subContinent,adContent,adPosition,campaign,isTrueDirect,medium,source,timeOnSite,totalTransactionRevenue,transactions,prevPurchases
count,401589,401589.0,401589.0,401589,401589,401589,401589,401589,401589,401589,401589,401589,10605,401589,148409,401589,401589,217903.0,401589.0,6305.0,401589.0
unique,8,296530.0,,393643,62,3,22,503,208,474,23,28,5,26,1,7,192,3578.0,,7.0,
top,Organic Search,6.501071168742028e+17,,2018-05-18 06:44:23,Chrome,desktop,Windows,not available in demo dataset,United States,United States,Northern America,(not set),Google search: Top,(not set),True,organic,google,10.0,,1.0,
freq,198378,105.0,,9,305526,277648,138005,206434,180794,86895,193790,390841,10099,378933,148409,198378,208597,2242.0,,6151.0,
first,,,,2018-05-01 07:00:07,,,,,,,,,,,,,,,,,
last,,,,2018-10-16 06:59:26,,,,,,,,,,,,,,,,,
mean,,,2.486104,,,,,,,,,,,,,,,,1.534261,,0.001203
std,,,10.699105,,,,,,,,,,,,,,,,66.648229,,0.034659
min,,,1.0,,,,,,,,,,,,,,,,0.0,,0.0
25%,,,1.0,,,,,,,,,,,,,,,,0.0,,0.0


In [29]:
df.head(5)

Unnamed: 0,channelGrouping,fullVisitorId,visitNumber,visitStartTime,browser,deviceCategory,operatingSystem,city,country,region,subContinent,adContent,adPosition,campaign,isTrueDirect,medium,source,timeOnSite,totalTransactionRevenue,transactions,prevPurchases
0,Referral,1778456890803926940,1.0,2018-05-01 07:00:07,Chrome,desktop,Macintosh,not available in demo dataset,United States,United States,Northern America,(not set),,(not set),,(none),(direct),416.0,0.0,,0.0
1,Direct,3835551007029341267,4.0,2018-05-01 07:00:49,Samsung Internet,mobile,Android,not available in demo dataset,United States,United States,Northern America,(not set),,(not set),True,(none),(direct),,0.0,,0.0
2,Referral,9389670963098968254,1.0,2018-05-01 07:01:12,Chrome,desktop,Windows,Mumbai,India,Maharashtra,Southern Asia,(not set),,(not set),,referral,analytics.google.com,,0.0,,0.0
3,Referral,9342773996212644551,3.0,2018-05-01 07:01:14,Chrome,desktop,Macintosh,Sunnyvale,United States,California,Northern America,(not set),,(not set),True,(none),(direct),682.0,0.0,,0.0
4,Social,311936945458161781,1.0,2018-05-01 07:01:43,Opera Mini,mobile,(not set),not available in demo dataset,Nigeria,Nigeria,Western Africa,(not set),,(not set),,referral,youtube.com,,0.0,,0.0


In [30]:
df.to_csv("../data/test_v2_cleaned.csv")
df.to_pickle("../data/test_v2_cleaned.pkl")