In [46]:
import pandas as pd
from pathlib import Path

In [47]:
# Project root: parent of the current working directory.
# strict=True makes resolve() raise if the path does not exist.
ROOT_DIR = Path().resolve(strict=True).parent
DOMAIN = "https://inversionesyfinanzas.xyz"
# Kept as a string (with its trailing slash) so existing f-string uses keep working.
DATA_DIR = f"{ROOT_DIR}/data/"
# Join with pathlib instead of an f-string: DATA_DIR already ends in "/", so
# f"{DATA_DIR}/raw/..." produced a doubled separator ("data//raw").
visiteurs_df = pd.read_csv(Path(DATA_DIR) / "raw" / "visiteurs.csv")

In [48]:
# Preview the first five raw visitor rows.
# NOTE(review): the rendered output includes visitor IP addresses; clear it before sharing.
visiteurs_df.head(n=5)

Unnamed: 0,id,ip,session_id,http_user_agent,country_code,country_name,dma_code,is_in_european_union,latitude,longitude,city,region,time_zone,postal_code,continent_code,continent_name,first_visit_date,is_bot
0,1,200.125.229.58,1w90e381zb92nnwgb9wgqxzg1f6l6ipt,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,EC,Ecuador,,f,-0.2143,-78.5017,,P,America/Guayaquil,,SA,South America,2022-03-09 03:32:07.12474+01,f
1,2,187.162.27.37,e2z5ppwq393jt4hahpj1rrh4zn1iyi61,Mozilla/5.0 (iPhone; CPU iPhone OS 15_3_1 like...,MX,Mexico,,f,25.8243,-100.1909,Monterrey,NLE,America/Monterrey,64637.0,,North America,2022-03-09 04:34:44.016919+01,f
2,3,187.207.19.1,70nuv6qs6ic2exhh4hvf1bzx19ns646z,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,MX,Mexico,,f,19.2928,-99.1612,Tlalpan,CMX,America/Mexico_City,14000.0,,North America,2022-03-09 09:17:12.907113+01,f
3,4,213.195.109.213,mlih0uc2915ntcsedkox5li7fkpc53ed,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6...,ES,Spain,,t,41.387,2.1701,Barcelona,CT,Europe/Madrid,8003.0,EU,Europe,2022-03-09 10:57:21.572871+01,f
4,5,201.190.251.223,nrt2bfcfkqta1qjlifmykofgn3oild1s,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,AR,Argentina,,f,-32.8817,-68.8125,Mendoza,M,America/Argentina/Mendoza,5500.0,SA,South America,2022-03-09 15:06:16.701847+01,f


In [49]:
# Drop columns that are redundant or not needed yet: country_code is a shorter
# stand-in for country_name, and the other location fields (continent_*, dma_code,
# latitude, longitude, ...) are dropped on the same grounds. session_id and
# first_visit_date are not needed for now.
columns_to_discard = [
    # location details
    "country_name",
    "continent_code",
    "continent_name",
    "dma_code",
    "is_in_european_union",
    "latitude",
    "longitude",
    "city",
    "region",
    "time_zone",
    "postal_code",
    # session bookkeeping
    "session_id",
    "first_visit_date",
]
visiteurs_relevant_df = visiteurs_df.drop(columns=columns_to_discard)

In [50]:
# Preview the first five rows after dropping the unused columns.
visiteurs_relevant_df.head(n=5)

Unnamed: 0,id,ip,http_user_agent,country_code,is_bot
0,1,200.125.229.58,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,EC,f
1,2,187.162.27.37,Mozilla/5.0 (iPhone; CPU iPhone OS 15_3_1 like...,MX,f
2,3,187.207.19.1,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,MX,f
3,4,213.195.109.213,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6...,ES,f
4,5,201.190.251.223,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,AR,f


In [51]:
# Flag, per column, whether at least one value is missing.
visiteurs_relevant_df.isnull().any()

id                 False
ip                 False
http_user_agent     True
country_code        True
is_bot             False
dtype: bool

In [52]:
# Replace missing values with explicit placeholders:
# "bot" for a missing user agent, "unknown" for a missing country code.
visiteurs_relevant_df = visiteurs_relevant_df.fillna(
    value={"http_user_agent": "bot", "country_code": "unknown"}
)

In [53]:
# Load the raw visit history. The path is built with pathlib because DATA_DIR
# ends with "/", so f"{DATA_DIR}/raw/..." yielded a doubled separator ("data//raw").
visits_historial_df = pd.read_csv(
    Path(DATA_DIR) / "raw" / "visits_historial_visiteurs.csv"
)

In [54]:
# Preview the first five visit-history rows.
visits_historial_df.head(n=5)

Unnamed: 0,id,date,current_path,comes_from,user_id,parsed
0,1,2022-03-09 03:32:07.129352+01,"('/publicaciones/que-es-el-price-to-book/',)",https://www.google.com/,1,f
1,2,2022-03-09 04:34:44.020298+01,"('/publicaciones/que-es-el-price-to-book/',)",https://www.google.com.mx/,2,f
2,3,2022-03-09 04:35:32.797788+01,"('/publicaciones/que-es-el-price-to-book/',)",https://www.google.com.mx/,2,f
3,4,2022-03-09 09:17:12.909467+01,"('/definicion/cash-on-cash-return-en-español/',)",https://www.google.com/,3,f
4,5,2022-03-09 10:57:21.575207+01,"('/favicon.ico',)",https://lucas.inversionesyfinanzas.xyz/static/...,4,f


In [55]:
# "parsed" is a flag the web app uses to know whether a record has already been
# saved; it is dropped here together with the row "id".
visits_historial_relevant_df = visits_historial_df.drop(columns=["id", "parsed"])

In [56]:
# Expose the visitor "id" under the name "user_id" so the two frames can be
# merged on a shared key. The new column is appended last, then "id" is dropped.
visiteurs_relevant_df = (
    visiteurs_relevant_df
    .assign(user_id=lambda frame: frame["id"])
    .drop(columns=["id"])
)

In [57]:
# Inner join on user_id: keeps only rows whose user_id appears in both frames.
final_df = visiteurs_relevant_df.merge(
    visits_historial_relevant_df,
    how="inner",
    on="user_id",
)

In [58]:
# Inspect the merged frame: column dtypes, non-null counts and memory usage.
final_df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 448603 entries, 0 to 448602
Data columns (total 8 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   ip               448603 non-null  object
 1   http_user_agent  448603 non-null  object
 2   country_code     448603 non-null  object
 3   is_bot           448603 non-null  object
 4   user_id          448603 non-null  int64 
 5   date             448603 non-null  object
 6   current_path     448603 non-null  object
 7   comes_from       448603 non-null  object
dtypes: int64(1), object(7)
memory usage: 30.8+ MB


Feather – for short-term storage (saves time); Feather V2 is the on-disk Apache Arrow IPC file format
Parquet – for long-term storage (saves space)

In [59]:
print('CSV')
print("Writing time:")
%time final_df.to_csv('test.csv')
print("Reading time:")
%time df_csv = pd.read_csv('test.csv')
print("*"*100)
print('Pickle')
print("Writing time:")
%time final_df.to_pickle('test.pickle')
print("Reading time:")
%time df_pickle = pd.read_pickle('test.pickle')
print("*"*100)
print('Parquet')
print("Writing time:")
%time final_df.to_parquet('test.parquet')
print("Reading time:")
%time df_parquet = pd.read_parquet('test.parquet')
print("*"*100)
print('Feather')
print("Writing time:")
%time final_df.to_feather('test.feather')
print("Reading time:")
%time df_feather = pd.read_feather('test.feather')
print("*"*100)

CSV
Writing time:
CPU times: user 2.21 s, sys: 143 ms, total: 2.35 s
Wall time: 2.38 s
Reading time:
CPU times: user 978 ms, sys: 51.8 ms, total: 1.03 s
Wall time: 1.03 s
****************************************************************************************************
Pickle
Writing time:
CPU times: user 288 ms, sys: 36 ms, total: 324 ms
Wall time: 337 ms
Reading time:
CPU times: user 150 ms, sys: 28.4 ms, total: 179 ms
Wall time: 178 ms
****************************************************************************************************
Parquet
Writing time:
CPU times: user 353 ms, sys: 67.5 ms, total: 421 ms
Wall time: 417 ms
Reading time:
CPU times: user 404 ms, sys: 177 ms, total: 581 ms
Wall time: 333 ms
****************************************************************************************************
Feather
Writing time:
CPU times: user 288 ms, sys: 46.5 ms, total: 334 ms
Wall time: 242 ms
Reading time:
CPU times: user 279 ms, sys: 108 ms, total: 387 ms
Wall time: 306 ms
***

CSV
Writing time:
CPU times: user 2.21 s, sys: 143 ms, total: 2.35 s
Wall time: 2.38 s
Reading time:
CPU times: user 978 ms, sys: 51.8 ms, total: 1.03 s
Wall time: 1.03 s
****************************************************************************************************
Pickle
Writing time:
CPU times: user 288 ms, sys: 36 ms, total: 324 ms
Wall time: 337 ms
Reading time:
CPU times: user 150 ms, sys: 28.4 ms, total: 179 ms
Wall time: 178 ms
****************************************************************************************************
Parquet
Writing time:
CPU times: user 353 ms, sys: 67.5 ms, total: 421 ms
Wall time: 417 ms
Reading time:
CPU times: user 404 ms, sys: 177 ms, total: 581 ms
Wall time: 333 ms
****************************************************************************************************
Feather
Writing time:
CPU times: user 288 ms, sys: 46.5 ms, total: 334 ms
Wall time: 242 ms
Reading time:
CPU times: user 279 ms, sys: 108 ms, total: 387 ms
Wall time: 306 ms
****************************************************************************************************


In [60]:
!ls -Flash test.feather test.parquet test.pickle test.csv

103M -rw-rw-r-- 1 lucas lucas 103M nov 27 10:20 test.csv
 26M -rw-rw-r-- 1 lucas lucas  26M nov 27 10:20 test.feather
 15M -rw-rw-r-- 1 lucas lucas  15M nov 27 10:20 test.parquet
 46M -rw-rw-r-- 1 lucas lucas  46M nov 27 10:20 test.pickle
