In [5]:
# Shell escape: recursively delete `sample_data` from the working directory (no error if it is missing).
# NOTE(review): presumably this is the folder a hosted runtime such as Colab creates by default -- confirm before running elsewhere.
!rm -rf sample_data

In [2]:
import json
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

In [6]:
# Absolute path of the directory holding the input files read by this notebook.
DATA_DIR = Path('./data').resolve()
# Fail early, and say which path was tried, when the directory is missing
# (or when `data` exists but is not a directory).
assert DATA_DIR.is_dir(), "Data directory not found: {}".format(DATA_DIR)

back/core/processing.py

In [7]:
# Similar to read_json_course: parse the sample export and build a DataFrame from it.
# The encoding is given explicitly so the result does not depend on the
# platform's default locale encoding.
sample_json = json.loads((DATA_DIR / "sample.json").read_text(encoding="utf-8"))
sample = pd.DataFrame.from_records(sample_json)

In [8]:
# Column overview of the raw sample: dtypes and non-null counts.
sample.info()

<class 'pandas.core.frame.DataFrame'>
Index: 300 entries, 67243 to 64515
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   accept_language  300 non-null    object
 1   agent            300 non-null    object
 2   context          300 non-null    object
 3   event            300 non-null    object
 4   event_source     300 non-null    object
 5   event_type       300 non-null    object
 6   host             300 non-null    object
 7   page             82 non-null     object
 8   referer          300 non-null    object
 9   session          300 non-null    object
 10  time             300 non-null    object
 11  username         300 non-null    object
dtypes: object(12)
memory usage: 38.6+ KB


In [10]:
# Al parecer el compressed sample no trae name ni ip
# (columnas que si estan en el modelo de Django)
def read_logs(df):
    """Flatten selected keys of the nested ``context`` value into columns.

    For every row, ``row["context"][key]`` is looked up for each key listed
    below and stored in a new ``context.<key>`` column. When the lookup fails
    for any reason (missing key, value that cannot be indexed, ...) the cell
    is set to NaN instead.

    The columns are added to ``df`` itself; no copy is made.

    Arguments:
        df: pandas DataFrame with one log record per row
    Returns:
        The same DataFrame object, with the ``context.*`` columns added
    """
    # Note: extra values to be extracted should be added here
    context_keys = ("course_id", "org_id", "path", "user_id")

    def _nested_value(record, field, subfield):
        # Best-effort lookup: any failure is reported as a missing value.
        try:
            value = record[field][subfield]
        except Exception:
            value = np.nan
        return value

    for key in context_keys:
        df["context." + key] = df.apply(
            lambda record, subfield=key: _nested_value(
                record, "context", subfield),
            axis=1)
    return df

In [11]:
# Expand the nested context fields. read_logs adds the columns to `sample`
# in place and returns that same object, so `data_df` and `sample` are one frame.
data_df = read_logs(sample)
# Keep only events that carry a username (vectorised comparison instead of a
# Python-level call per row).
log_df = data_df[data_df["username"] != '']
log_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 300 entries, 67243 to 64515
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   accept_language    300 non-null    object 
 1   agent              300 non-null    object 
 2   context            300 non-null    object 
 3   event              300 non-null    object 
 4   event_source       300 non-null    object 
 5   event_type         300 non-null    object 
 6   host               300 non-null    object 
 7   page               82 non-null     object 
 8   referer            300 non-null    object 
 9   session            300 non-null    object 
 10  time               300 non-null    object 
 11  username           300 non-null    object 
 12  context.course_id  300 non-null    object 
 13  context.org_id     300 non-null    object 
 14  context.path       254 non-null    object 
 15  context.user_id    205 non-null    float64
dtypes: float64(1), object(15)

Stat: Videos (process_log_views)


back/views/tasks.py

In [14]:
# Funciones auxiliares para process_log_views
def get_video_logs(course_data):
    """
    Select the rows of ``course_data`` whose ``event_type`` is a video event.

    Two families of names are accepted: the plain player names
    (``play_video``, ``stop_video``, ...) and the dotted ``edx.video.*``
    names (called "mobile" types in the original code).

    Returns the matching rows, with the columns and index of the input.
    """
    player_event_names = (
        'hide_transcript',
        'load_video',
        'pause_video',
        'play_video',
        'seek_video',
        'show_transcript',
        'speed_change_video',
        'stop_video',
        'video_hide_cc_menu',
        'video_show_cc_menu',
    )
    dotted_event_names = (
        'edx.video.transcript.hidden',
        'edx.video.loaded',
        'edx.video.paused',
        'edx.video.played',
        'edx.video.position.changed',
        'edx.video.transcript.shown',
        'edx.video.stopped',
    )
    accepted_names = list(player_event_names) + list(dotted_event_names)
    is_video_event = course_data["event_type"].isin(accepted_names)
    return course_data[is_video_event]

In [16]:
# Keep only the video-player events of the username-filtered logs.
video_logs_df = get_video_logs(log_df)

In [None]:
def make_segment_dataframe(grouped):
    """
    Creates a dataframe with start-stop segment per user and video

    ``grouped`` is iterated as ``(name, group)`` pairs (e.g. a pandas groupby
    object). Each group's rows are read in order and must expose
    ``event_type``, ``id``, ``username``, ``time``, ``currenttime``, ``old``
    and ``new``.

    A small state machine remembers the last relevant event of the group.
    When a seek, pause or stop follows a ``play_video`` of the same video, a
    segment is recorded:

    * ``seek_forward``  -> (old, new)
    * ``seek_back``     -> (position of the play event, old)
    * ``pause_video`` / ``stop_video`` -> (position of the play event, currenttime)

    A segment whose start is not <= its end is counted as malformed and
    dropped. The check is an explicit comparison rather than an ``assert``,
    so it still applies when Python runs with ``-O``.

    Returns a DataFrame with columns id, username, time, start, end. The
    number of malformed segments is printed.
    """
    df_cols = ['id', 'username', 'time', 'start', 'end']
    # Events that can close a segment opened by play_video.
    closing_events = ('seek_forward', 'seek_back', 'pause_video', 'stop_video')
    all_pairs = []
    malformed_pairs = 0
    for _, group in grouped:
        state = {'id': None, 'init': None, 'action': None}
        for _, row in group.iterrows():
            etype = row.event_type
            if etype in closing_events:
                if state['id'] == row.id and state['action'] == 'play_video':
                    if etype == 'seek_forward':
                        # NOTE(review): this records the span between the seek
                        # origin and target, which looks like the *skipped*
                        # part rather than the watched one (init -> old).
                        # Kept as in the original -- confirm intent.
                        start, end = row.old, row.new
                    elif etype == 'seek_back':
                        start, end = state['init'], row.old
                    else:
                        start, end = state['init'], row.currenttime
                    if start <= end:
                        all_pairs.append(
                            (row.id, row.username, row.time, start, end))
                    else:
                        malformed_pairs += 1
            elif etype != 'play_video':
                # Any other event leaves the state untouched.
                continue
            # play, seek, pause and stop all become the new reference event.
            state = {'id': row.id, 'init': row.currenttime, 'action': etype}
    print("Malformed pairs processed: {}".format(malformed_pairs))
    segments_df = pd.DataFrame(all_pairs, columns=df_cols)
    return segments_df

back/views/processing.py

In [18]:
# Helper functions for process_views
#
# Each pattern below is an alternation of capture groups, one group per field.
# re.findall therefore returns one tuple per matched field, with only that
# field's position filled and the others empty. The positions listed for each
# pattern are the ones the parsers in this file index into after the tuples
# are merged by reduce_to_tuple.
# The lookbehinds require exactly one space after the colon ('"key": value').

# play / pause / stop events -> (duration, code, id, currentTime)
pps_videore = re.compile(
    r'((?<="duration": )[0-9.]+?(?=,|}))|((?<="code": ").*?(?="))|((?<="id": ").*?(?="))|((?<="currentTime": )[0-9.]+?(?=,|}))'
)
# load events -> (duration, code, id)
load_videore = re.compile(
    r'((?<="duration": )[0-9.]+?(?=,|}))|((?<="code": ").*?(?="))|((?<="id": ").*?(?="))'
)
# seek events -> (code, new_time, old_time, duration, type, id)
seek_videore = re.compile(
    r'((?<="code": ").*?(?="))|((?<="new_time": )[0-9.]+?(?=,|}))|((?<="old_time": )[0-9.]+?(?=,|}))|((?<="duration": )[0-9.]+?(?=,|}))|((?<="type": ").*?(?="))|((?<="id": ").*?(?="))'
)
# speed-change events -> (current_time, old_speed, code, new_speed, duration, id)
speedchange_videore = re.compile(
    r'((?<="current_time": )[0-9.]+?(?=,|}))|((?<="old_speed": ")[0-9.]+?(?="))|((?<="code": ").*?(?="))|((?<="new_speed": ")[0-9.]+?(?="))|((?<="duration": )[0-9.]+?(?=,|}))|((?<="id": ").*?(?="))'
)


def reduce_to_tuple(groupTuples):
    """
    Reduce an array of tuples to a single tuple
    Eg: [("","","id"),("duration","",""),("","code","")]
        returns ("duration","code","id")

    The strings found at the same position of every tuple are concatenated,
    so a position that is filled in exactly one tuple keeps that value and a
    position that is never filled stays "".

    This runs once per log row, so it is done with plain string joins instead
    of building a NumPy array and a DataFrame for every call.

    Arguments:
        groupTuples: list of equal-length tuples of strings, as returned by
            ``re.findall`` for a pattern with several groups. A list of
            plain strings (pattern with a single group) is treated as a list
            of one-element tuples.
    Returns:
        tuple of strings, one per position; an empty tuple for empty input
    """
    rows = [(row,) if isinstance(row, str) else row for row in groupTuples]
    return tuple(''.join(column) for column in zip(*rows))

def video_info_parser(row):
    """
    Extract ``(video id, duration)`` from the ``event`` text of a log row.

    Play, pause, stop, load, seek and speed-change events are handled. Each
    kind is scanned with its own module-level pattern, and the position of
    the id and duration inside the merged match tuple depends on that
    pattern's group order.

    Returns a ``(id, duration as float)`` tuple, or None for any other
    event type.
    """
    # event type -> (pattern, position of the id, position of the duration)
    layouts = {
        'play_video': (pps_videore, 2, 0),
        'pause_video': (pps_videore, 2, 0),
        'stop_video': (pps_videore, 2, 0),
        'load_video': (load_videore, 2, 0),
        'seek_video': (seek_videore, 5, 3),
        'speed_change_video': (speedchange_videore, 5, 4),
    }
    layout = layouts.get(row.event_type)
    if layout is None:
        return None
    pattern, id_pos, duration_pos = layout
    fields = reduce_to_tuple(pattern.findall(row.event))
    return fields[id_pos], float(fields[duration_pos])


def generate_video_dataframe(raw_video_data):
    """
    From video type logs, collect the videos that appear in them and create
    a dataframe with their id and duration

    Every row is passed to video_info_parser. That parser returns None for
    event types it does not handle; those rows are skipped, so they neither
    add an all-null video to the result nor break the DataFrame construction
    when the first row happens to be one of them.

    Returns a DataFrame with columns id and duration, without duplicate rows.
    """
    video_cols = ['id', 'duration']
    parsed = (video_info_parser(row) for _, row in raw_video_data.iterrows())
    video_tuples = [info for info in parsed if info is not None]
    all_video_df = pd.DataFrame(video_tuples,
                                columns=video_cols).drop_duplicates()

    return all_video_df

In [20]:
# One row per distinct (video id, duration) pair parsed from the video events.
videos_df = generate_video_dataframe(video_logs_df)

In [None]:
def video_event_expander(row):
    """
    Expand the ``event`` text of a video log row into a flat tuple.

    The tuple is
    ``(username, time, event_type, id, duration, currenttime, old, new)``:

    * play / pause / stop: the event type is kept and currenttime, old and
      new all carry the reported current time.
    * load: currenttime, old and new are -1.
    * seek: the event type becomes seek_forward, seek_back or seek_equal
      depending on how the old and new times compare; currenttime is the
      new time.
    * speed change: the event type becomes speed_change_up,
      speed_change_down or speed_change_equal; old and new are the speeds.

    Returns None for any other event type.
    """
    etype = row.event_type

    if etype in ('play_video', 'pause_video', 'stop_video'):
        fields = reduce_to_tuple(pps_videore.findall(row.event))
        video_id = fields[2]
        duration = float(fields[0])
        position = float(fields[3])
        return (row.username, row.time, row.event_type, video_id,
                duration, position, position, position)

    if etype == 'load_video':
        fields = reduce_to_tuple(load_videore.findall(row.event))
        return (row.username, row.time, row.event_type, fields[2],
                float(fields[0]), -1, -1, -1)

    if etype == 'seek_video':
        fields = reduce_to_tuple(seek_videore.findall(row.event))
        old = float(fields[2])
        new = float(fields[1])
        if old < new:
            label = 'seek_forward'
        elif old > new:
            label = 'seek_back'
        else:
            label = 'seek_equal'
        return (row.username, row.time, label, fields[5],
                float(fields[3]), new, old, new)

    if etype == 'speed_change_video':
        fields = reduce_to_tuple(speedchange_videore.findall(row.event))
        old = float(fields[1])
        new = float(fields[3])
        if old < new:
            label = 'speed_change_up'
        elif old > new:
            label = 'speed_change_down'
        else:
            label = 'speed_change_equal'
        return (row.username, row.time, label, fields[5],
                float(fields[4]), float(fields[0]), old, new)

    return None


def expand_event_info(video_logs):
    """
    From video type logs, creates a dataframe
    with expanded video player info,
    initially contained in the event column

    Every row is passed to video_event_expander. That function returns None
    for event types it does not handle; those rows are skipped, so they
    neither add all-null rows to the result nor break the DataFrame
    construction when the first row happens to be one of them.

    Returns a DataFrame with columns username, time, event_type, id,
    duration, currenttime, old, new.
    """
    dframe_cols = [
        'username', 'time', 'event_type', 'id', 'duration', 'currenttime',
        'old', 'new'
    ]
    expanded = (video_event_expander(row) for _, row in video_logs.iterrows())
    extend_video_tuples = [info for info in expanded if info is not None]
    extend_video_logs = pd.DataFrame(extend_video_tuples, columns=dframe_cols)
    return extend_video_logs

back/views/tasks.py

In [None]:
# Funcion auxiliar para process_log_views
def process_views(dataframe):
    """
    Build the video statistics for a set of logs.

    The original process_views took (dataframe, course_df, date, course_id,
    code) and wrote Django models; this version only returns the dataframes.

    Returns a list ``[videos_in_logs, segments_df, views_df]``:
    the videos found in the logs, the watched segments per user and video,
    and the distinct (id, username) pairs taken from those segments.
    """
    video_logs = get_video_logs(
        dataframe[['username', 'time', 'event_type', 'event']])

    # Videos in the course
    videos_in_logs = generate_video_dataframe(video_logs)

    expanded_logs = expand_event_info(video_logs)
    # load_video rows are left out before the segments are built
    not_load = expanded_logs.event_type != 'load_video'
    playback_logs = expanded_logs[not_load].copy()
    playback_logs['time'] = pd.to_datetime(playback_logs['time'], unit='ns')
    playback_logs = playback_logs.sort_values('time')

    # Watched video segments
    segments_df = make_segment_dataframe(playback_logs.groupby('username'))
    # Visits to the videos
    views_df = segments_df[['id', 'username']].drop_duplicates()
    return [videos_in_logs, segments_df, views_df]

back/core/processing.py

In [31]:
# Funcion auxiliar para process_logs_single_course
def filter_course_team(logs, user_field_name='username', other_people=None):
    """Keeps the users that are not part of the course team

    A user is treated as part of the team when at least one of their events
    has an ``event_type`` containing ``studio`` or ``instructor``. Rows whose
    user field is missing are dropped as well, because such users never
    appear in the per-user grouping.

    Arguments:
        logs {pandas.core.frame.DataFrame} -- DataFrame to filter; needs the
            user column and an ``event_type`` column of strings

    Keyword Arguments:
        user_field_name {str} -- name of the user name field  (default: {'username'})
        other_people {list} -- people known to be part of the course/page team (default: {None})

    Returns:
        pandas.core.frame.DataFrame -- the rows of ``logs`` that belong to
            the remaining users
    """
    # Compiled once per call instead of once per row.
    team_event = re.compile(r'.*(studio|instructor).*')

    users_and_etypes = logs[[user_field_name, 'event_type']].copy()
    # 1 for a team-only event, 0 otherwise, so the per-user sum counts them.
    users_and_etypes['event_type'] = users_and_etypes['event_type'].apply(
        lambda event_type: int(team_event.match(event_type) is not None))
    team_events_per_user = users_and_etypes.groupby(
        user_field_name)['event_type'].sum()
    students = team_events_per_user[team_events_per_user == 0].index

    if other_people is not None:
        students = students[~students.isin(other_people)]

    # Filter on the configured user column, not a hard-coded `username`.
    return logs[logs[user_field_name].isin(students)]

def filter_by_log_qty(logs, min_logs=15, user_field_name='username'):
    """Keeps the users with more than min_logs logs in the course

    Rows whose user field is missing are dropped, since missing values are
    not counted as a user.

    Arguments:
        logs {pandas.core.frame.DataFrame} -- DataFrame to filter

    Keyword Arguments:
        min_logs {int} -- a user must have strictly more rows than this to
            stay in the df (default: {15})
        user_field_name {str} -- name of the user name field (default: {'username'})

    Returns:
        pandas.core.frame.DataFrame -- the rows of ``logs`` that belong to
            the users that passed the threshold
    """
    logs_per_user = logs[user_field_name].value_counts()
    active_users = logs_per_user[logs_per_user > min_logs].index
    # Filter on the configured user column, not a hard-coded `username`.
    return logs[logs[user_field_name].isin(active_users)]

In [None]:
from datetime import timedelta, datetime, date
# Task para crear estadisticas entorno a videos
# Crea videos, cantidad de visitas, segmentos vistos del video
def process_log_views(end_date=None, day_window=None, run_code=None, course=None,
                      course_logs=None):
    """
    Run process_views once per calendar day over a course's logs.

    Notebook port of the task that originally called
    process_logs_single_course(process_views, "views", course, end_date,
    day_window, run_code). That task read the logs and the staff list from
    Django models and localised dates with the project settings; none of
    that is available here, so the logs are passed in explicitly, no staff
    list is applied and dates are used as given.

    Arguments:
        end_date: end of the period to process, or None
        day_window: length of the period, subtracted from end_date.
            NOTE(review): assumed to be a timedelta, as in the original
            `end_date - time_window` expression -- confirm.
        run_code, course: not used in this port; kept so existing calls
            keep working
        course_logs: anything pd.DataFrame accepts (a DataFrame, a list of
            dicts, ...) holding at least the username, time, event_type and
            event fields

    Returns:
        dict mapping each processed date to the list returned by
        process_views for that day. Empty when there are no logs.
        When end_date or day_window is None, every date present in the logs
        is processed.
    """
    results = {}
    if course_logs is None or len(course_logs) == 0:
        return results
    logs_full = pd.DataFrame(course_logs)

    # The original read StaffUserName.objects.all(); no staff list exists here.
    staff_usernames = []
    logs = filter_course_team(logs_full, other_people=staff_usernames)

    # Calendar date of every log row, used to slice the logs day by day.
    log_dates = pd.to_datetime(logs['time']).dt.date

    if end_date is None or day_window is None:
        periods = sorted(log_dates.dropna().unique())
    else:
        periods = [
            stamp.date()
            for stamp in pd.date_range(start=end_date - day_window,
                                       end=end_date,
                                       freq=timedelta(days=1))
        ]

    for period in periods:
        day_logs = logs[log_dates == period]
        # Continue if no logs exist for that day
        if day_logs.shape[0] == 0:
            continue
        results[period] = process_views(day_logs)
    return results
