In [1]:
import json
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
DATA_DIR = Path('../data').resolve()
assert DATA_DIR.exists()

In [3]:
import os


def read_logs(filename, ziped=False) -> pd.DataFrame:
    """Read logs and expand inner JSON values

    Recover valuable info

    Arguments:
        filename String
        ziped bool to unzip file
    Returns:
        expanded_records List of dictionnaries
    """

    def add_if_available(row, field, subfield):
        try:
            return row[field][subfield]
        except Exception:
            return np.nan

    # Either load records directly or do a json.loads and pandas.json_normalize
    df = pd.read_json(filename, compression="gzip" if ziped else None, lines=True)
    df["context.course_id"] = df.apply(
        lambda x: add_if_available(x, "context", "course_id"), axis=1
    )
    df["context.org_id"] = df.apply(
        lambda x: add_if_available(x, "context", "org_id"), axis=1
    )
    df["context.path"] = df.apply(
        lambda x: add_if_available(x, "context", "path"), axis=1
    )
    df["context.user_id"] = df.apply(
        lambda x: add_if_available(x, "context", "user_id"), axis=1
    )
    # Note: extra values to be extracted should be added here
    return df


In [4]:
sample_json.keys()

dict_keys(['context', 'username', 'session', 'agent', 'host', 'referer', 'accept_language', 'event', 'time', 'event_type', 'event_source', 'page'])

In [5]:
sample = pd.DataFrame.from_records(sample_json)

In [6]:
sample.shape

(300, 12)

In [7]:
sample

Unnamed: 0,accept_language,agent,context,event,event_source,event_type,host,page,referer,session,time,username
67243,,,"{'user_id': None, 'path': '/edxucursos/externo...","{""GET"": {""ticket"": [""095b5cbeaf0f5fafbd1a7c67e...",server,/edxucursos/externo,eol.uchile.cl,,,,2023-04-03T15:19:49.724704+00:00,e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b93...
138051,"es-ES,es;q=0.9",Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,"{'user_id': 86459, 'path': '/event', 'course_i...","{""id"": ""1929ac93da0340fb82a1ab1c4ead8b16"", ""co...",browser,load_video,eol.uchile.cl,https://eol.uchile.cl/courses/course-v1:eol+FA...,https://eol.uchile.cl/courses/course-v1:eol+FA...,593c582e46365b3f0131a6ba45c1f020,2023-04-03T23:33:22.505486+00:00,ed31b867c1124ff082e52fbaba03e11865e4da8e38bed1...
91742,,,"{'user_id': None, 'path': '/edxucursos/externo...","{""GET"": {""ticket"": [""83f050b3e7fd35c3f8d5d351c...",server,/edxucursos/externo,eol.uchile.cl,,,,2023-04-03T17:08:17.731079+00:00,e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b93...
59930,"es-419,es;q=0.9",Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6...,"{'user_id': 10664, 'path': '/event', 'course_i...",input_48405adfc393404187ea34ddc4b8cf99_2_1=Falso,browser,problem_check,eol.uchile.cl,https://eol.uchile.cl/courses/course-v1:eol+FC...,https://eol.uchile.cl/courses/course-v1:eol+FC...,3a83e338ad65749b220bc68ff3d6db64,2023-04-03T14:25:58.961664+00:00,ef260ef56ad2a0094843cc758b746a71a531f6f15fabd9...
1071,"es-US,es-419;q=0.9,es;q=0.8,en;q=0.7",Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,"{'user_id': 28583, 'path': '/event', 'course_i...","{""id"": ""eaa4df4f12c8417b9c06305832d1675d"", ""co...",browser,play_video,eol.uchile.cl,https://eol.uchile.cl/courses/course-v1:eol+IN...,https://eol.uchile.cl/courses/course-v1:eol+IN...,5d38ae6950793cc1b737403273088b9b,2023-04-03T00:04:13.823382+00:00,e26d1d67c708b50eaaa6a48cda1e47307d457a5fb591cd...
...,...,...,...,...,...,...,...,...,...,...,...,...
112645,"es-ES,es;q=0.9",Mozilla/5.0 (iPhone; CPU iPhone OS 16_4 like M...,{'course_id': 'course-v1:eol+FACSO-TDA+2022_1'...,"{""GET"": {""fbclid"": [""PAAaYU3gyLJWd3YxFdjJ8MzIC...",server,/courses/course-v1:eol+FACSO-TDA+2022_1/about,eol.uchile.cl,,https://l.instagram.com/,,2023-04-03T20:25:51.455901+00:00,e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b93...
65702,"es-ES,es;q=0.9",Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,"{'course_id': 'course-v1:eol+ENNEG388+2023_1',...","{""GET"": {}, ""POST"": {}}",server,/courses/course-v1:eol+ENNEG388+2023_1/course/,eol.uchile.cl,,https://eol.uchile.cl/courses/course-v1:eol+EN...,d33061f84a2a20221095cb750872bc1c,2023-04-03T15:04:37.771011+00:00,43f74274c312f52f5165bb627a7c85e95fd0c7ae9453aa...
130881,,,"{'course_id': 'course-v1:eol+FEN-MO+2023_1', '...","{'user_id': '10980', 'course_id': 'course-v1:e...",server,edx.grades.subsection.grade_calculated,,,,,2023-04-03T22:38:59.936112+00:00,e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b93...
133574,,,"{'course_id': 'course-v1:eol+IN2201+2023_1', '...","{'user_id': '46651', 'course_id': 'course-v1:e...",server,edx.grades.subsection.grade_calculated,,,,,2023-04-03T23:10:31.504335+00:00,e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b93...


In [24]:
sample.info()

<class 'pandas.core.frame.DataFrame'>
Index: 300 entries, 67243 to 64515
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   accept_language  300 non-null    object
 1   agent            300 non-null    object
 2   context          300 non-null    object
 3   event            300 non-null    object
 4   event_source     300 non-null    object
 5   event_type       300 non-null    object
 6   host             300 non-null    object
 7   page             82 non-null     object
 8   referer          300 non-null    object
 9   session          300 non-null    object
 10  time             300 non-null    object
 11  username         300 non-null    object
dtypes: object(12)
memory usage: 30.5+ KB


In [32]:
json.loads(sample.event.iloc[0])

{'GET': {'ticket': ['095b5cbeaf0f5fafbd1a7c67e53f014aaba73232']}, 'POST': {}}

In [34]:
def parse_event(event: str) -> dict:
    try:
        return json.loads(event)
    except json.JSONDecodeError:
        return {}
    
sample['event'] = sample.event.apply(parse_event)

TypeError: the JSON object must be str, bytes or bytearray, not dict