In [1]:
import pandas as pd
import glob 
import os
import matplotlib.pyplot as plt



In [2]:
# Directory that holds the raw log files to load.
path = './input'
# glob returns matches in arbitrary order, so sort them: the order of the
# files decides the row order (and positional lookups) of the combined frame.
# Sub-directories are skipped because only regular files can be parsed.
all_files = sorted(
    entry
    for entry in glob.glob(os.path.join(path, "*"))
    if os.path.isfile(entry)
)


In [3]:
# Access-log line pattern with 13 named capture groups. It is handed to
# read_csv as the field separator (python engine).
# NOTE(review): presumably the engine splits every line with this pattern so
# the captured groups become the fields -- confirm against the pandas docs.
# The day-of-month part accepts 00-39 so that the 30th and 31st also match
# (the previous [0-2][0-9] could never match them).
# NOTE(review): the character class [A-z] in remote_usr also spans the
# punctuation between 'Z' and 'a'; left untouched here, confirm it is intended.
LOG_LINE_PATTERN = r"(?P<remote_addr>(?:^|\b(?<!\.))(?:1?\d\d?|2[0-4]\d|25[0-5])(?:\.(?:1?\d\d?|2[0-4]\d|25[0-5])){3}(?=$|[^\w.]))\s-\s(?P<remote_usr>-|[A-z_][A-z0-9_]{0,30})\s(?P<date_time>\[(?P<date>[0-3][0-9]\/\w{3}\/[12]\d{3}):(?P<time>\d\d:\d\d:\d\d).*\])\s(?P<request>\"(?P<req_method>GET|POST|HEAD|PUT|DELETE|CONNECT|OPTIONS|TRACE|PATCH)\s(?P<req_uri>\/[^\s]*)\s(?P<http_ver>HTTP/\d\.\d)\")\s(?P<status>\d{3})\s(?P<body_byte_sent>\d+)\s\"(?P<http_referer>[^\s]+)\"\s\"(?P<user_agent>[^\"]+)\""

# 14 column names: one per capture group plus a trailing gzip_ratio column.
LOG_COLUMNS = "remote_addr remote_usr date_time date time request req_method req_uri http_ver status body_bytes_sent http_referer user_agent gzip_ratio".split()

# One parsed frame per input file; malformed lines are skipped by the parser.
li = []

for filename in all_files:
    df_temp = pd.read_csv(filename,
                sep=LOG_LINE_PATTERN,
                names=LOG_COLUMNS,
                header=None,
                on_bad_lines="skip",
                engine='python')
    li.append(df_temp)

# Stack all files into one frame with a fresh 0..n-1 index.
df_og = pd.concat(li, axis=0, ignore_index=True)

df_og.shape

(28920, 14)

In [4]:
# Work on an independent (deep) copy so the frame loaded from disk stays untouched.
df = df_og.copy(deep=True)

In [5]:
# Remove IPv6 entries: rows whose address is longer than 17 characters
# (a dotted-quad IPv4 address is at most 15 characters long).
ipv6_rows = df.index[df["remote_addr"].str.len() > 17]
df = df.drop(index=ipv6_rows)

# Remove the gzip_ratio column, then every record that still has an empty value.
# errors="ignore" keeps this cell re-runnable: a second run no longer raises
# KeyError because the column is already gone.
df = df.drop(columns="gzip_ratio", errors="ignore").dropna()
print(df.isna().sum())

remote_addr        0
remote_usr         0
date_time          0
date               0
time               0
request            0
req_method         0
req_uri            0
http_ver           0
status             0
body_bytes_sent    0
http_referer       0
user_agent         0
dtype: int64


In [6]:
# Rows and columns left after the cleaning cell above.
df.shape

(28878, 13)

In [7]:
# Percent-decode the request URIs,
# e.g. https%3A%2F%2Fmoodle.ucags.edu.mx => https://moodle.ucags.edu.mx

import urllib.parse

# New column holding the decoded form of every req_uri value.
df["dec_req_uri"] = df["req_uri"].apply(urllib.parse.unquote)

In [8]:
# Extract the path (requested resource), the query (extra URL parameters) and
# the netloc (domain the request was made from, taken from the referer).
#
# The raw, still percent-encoded req_uri is parsed here. Parsing the already
# decoded URI would turn encoded delimiters (%3F, %26, %3D, %23) into real
# '?', '&', '=' and '#' and split the URI in the wrong places, and parse_qsl
# would decode the query values a second time.
parsed_uri = df["req_uri"].apply(urllib.parse.urlparse)

# The path is decoded only after it has been separated from the query.
df.loc[:, "clean_path"] = parsed_uri.apply(lambda parts: urllib.parse.unquote(parts.path))
# parse_qsl percent-decodes keys and values itself.
df.loc[:, "clean_query_list"] = parsed_uri.apply(lambda parts: urllib.parse.parse_qsl(parts.query))
df.loc[:, "domain"] = df["http_referer"].apply(lambda x: urllib.parse.urlparse(x).netloc)

In [9]:
# Turn the date and time strings into values that can be treated numerically.
from datetime import datetime
import calendar
import time  # not used in this cell any more; kept in case other cells rely on it

# Calendar date parsed from strings such as 23/Aug/2024.
df.loc[:, "fdate"] = df["date"].apply(lambda x: datetime.strptime(x, "%d/%b/%Y"))
# Seconds since the epoch for midnight of that date. calendar.timegm reads the
# date as UTC, so the value is the same on every machine; time.mktime used the
# local timezone of whichever host ran the notebook.
# NOTE(review): the displayed sample rows carry a +0000 offset -- confirm that
# all logs are in UTC.
df.loc[:, "dateunixtimest"] = df["fdate"].apply(lambda x: float(calendar.timegm(x.timetuple())))
# Time of day as a datetime.time object.
df.loc[:, "ftime"] = df["time"].apply(lambda x: datetime.strptime(x, "%H:%M:%S").time())
# Time of day as fractional hours; seconds are not included.
df.loc[:, "fabstime"] = df["ftime"].apply(lambda x: x.hour+x.minute/60.0)


In [10]:
# Lift the display limits so cell contents and columns are not truncated.
pd.options.display.max_colwidth = None
pd.options.display.max_columns = None

# Show three rows, selected by position.
df.iloc[200:203]

Unnamed: 0,remote_addr,remote_usr,date_time,date,time,request,req_method,req_uri,http_ver,status,body_bytes_sent,http_referer,user_agent,dec_req_uri,clean_path,clean_query_list,domain,fdate,dateunixtimest,ftime,fabstime
200,201.141.19.215,-,[23/Aug/2024:00:02:46 +0000],23/Aug/2024,00:02:46,"""GET /courses/11083/modules/items/962794 HTTP/1.1""",GET,/courses/11083/modules/items/962794,HTTP/1.1,302.0,150.0,https://canvas.ieec.mx/courses/11083/modules,"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",/courses/11083/modules/items/962794,/courses/11083/modules/items/962794,[],canvas.ieec.mx,2024-08-23,1724371000.0,00:02:46,0.033333
201,201.141.19.215,-,[23/Aug/2024:00:02:49 +0000],23/Aug/2024,00:02:49,"""GET /courses/11083/quizzes/331378?module_item_id=962794 HTTP/1.1""",GET,/courses/11083/quizzes/331378?module_item_id=962794,HTTP/1.1,200.0,11368.0,https://canvas.ieec.mx/courses/11083/modules,"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",/courses/11083/quizzes/331378?module_item_id=962794,/courses/11083/quizzes/331378,"[(module_item_id, 962794)]",canvas.ieec.mx,2024-08-23,1724371000.0,00:02:49,0.033333
202,201.141.19.215,-,[23/Aug/2024:00:02:49 +0000],23/Aug/2024,00:02:49,"""GET /api/v1/conversations/unread_count HTTP/1.1""",GET,/api/v1/conversations/unread_count,HTTP/1.1,304.0,0.0,https://canvas.ieec.mx/courses/11083/quizzes/331378?module_item_id=962794,"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",/api/v1/conversations/unread_count,/api/v1/conversations/unread_count,[],canvas.ieec.mx,2024-08-23,1724371000.0,00:02:49,0.033333


In [11]:
# Rows and columns after the derived columns were added.
df.shape

(28878, 21)

In [12]:
import pyarrow.parquet as pq

In [13]:
# Write the processed frame to Parquet. The target directory is created first
# so the write does not fail when ./output does not exist yet.
os.makedirs("./output", exist_ok=True)
df.to_parquet("./output/full.parquet", engine='auto', compression='snappy')

In [14]:
# Sample size via Cochran's formula: n0 = z^2 * p * (1 - p) / e^2
#   z: z-score (1.96 corresponds to a two-sided 95% confidence level)
#   p: estimated population proportion (50%)
#   e: margin of error (5%)
z, p, e = 1.96, 0.5, 0.05

n0 = z**2 * p * (1 - p) / e**2

In [15]:
# Sample size from Cochran's formula, before any finite-population adjustment.
n0

384.1599999999999

In [16]:
# Adjust n0 for a small / finite population: n = n0 / (1 + (n0 - 1) / N),
# where N is the number of rows in df.
population_size = len(df)
sample_size = n0 / (1 + (n0 - 1) / population_size)

In [17]:
# Adjusted sample size (still fractional; it is rounded up when sampling).
sample_size

379.12962028846425

In [18]:
import math

# Draw the sample, rounding the computed size up to a whole number of rows.
# A fixed random_state makes the draw reproducible across kernel restarts;
# without it every run selects different rows.
sample = df.sample(n=math.ceil(sample_size), random_state=42)

In [19]:
# Rows and columns of the drawn sample.
sample.shape

(380, 21)

In [20]:
# Show only the date and time columns of the sampled rows.
sample.loc[:, ["date", "time"]]

Unnamed: 0,date,time
16140,23/Aug/2024,15:20:42
26070,23/Aug/2024,18:59:12
13858,23/Aug/2024,13:35:44
11707,23/Aug/2024,09:30:18
24365,23/Aug/2024,18:27:38
...,...,...
28709,23/Aug/2024,20:52:24
5411,23/Aug/2024,03:09:13
25782,23/Aug/2024,18:50:07
21830,23/Aug/2024,17:27:01
