# Data Preprocessing

### START EDIT

In [1]:
# Input files. Forward slashes are used for every path: they are accepted on
# Windows, macOS and Linux alike, whereas a backslash inside a normal string
# literal starts an escape sequence ('\A' is not a valid one, so newer Python
# versions emit a warning for it).
input_events_file = 'events/ASUx-ENG101x-3T2015_events.csv'
course_structure_file = 'course_structure_files/ASUx-ENG101x-2171A.csv'
username_id = 'username_id.csv'

# Course identifiers used to filter the events log and the course structure.
# NOTE(review): the events file name is tagged 3T2015 while both ids below are
# tagged 2171A -- confirm the events file really contains this course run.
events_course_id = 'course-v1:ASUx+ENG101x+2171A'
course_structure_course_id = 'ASUx-ENG101x-2171A'


# Destination CSV for the joined events / course-structure table.
output_file = 'combined_events/course_structure_events_join_ASUx+ENG101x+2171A.csv'

### END EDIT

In [43]:
# import libraries 
import pandas as pd
import numpy as np
import string
import datetime 
import warnings 
import json
# Silence every warning for the rest of the session (blanket filter, kept as-is).
warnings.filterwarnings("ignore")
# Display options are set through their fully-qualified names; the abbreviated
# form ('max_colwidth') is resolved by pattern matching and stops working if
# the pattern ever matches more than one option.
pd.set_option('display.max_colwidth',1000)
pd.set_option('display.max_columns',100)

In [44]:
# read in events file (path comes from the configuration cell)
events = pd.read_csv(input_events_file)

# read in course structure file 
course_structure = pd.read_csv(course_structure_file)

# read in username_id file -- this will translate the usernames to ids 
# NOTE: this rebinds `username_id` from the CSV path (a str) to the loaded
# DataFrame, so re-running this cell without re-running the configuration cell
# first hands a DataFrame to read_csv instead of a path.
username_id = pd.read_csv(username_id)

In [45]:
# lower-case every column header of the course structure table
course_structure.columns = [str.lower(header) for header in course_structure.columns]

In [46]:
# parse out the user_ids from the events column, if available 

def parse_user_id(x):
    """Extract the ``user_id`` field from a JSON-encoded event payload.

    Parameters
    ----------
    x : object
        Raw value of the 'event' column. Expected to be a JSON string, but
        missing values (NaN / None) and malformed text are tolerated.

    Returns
    -------
    object
        The value stored under ``"user_id"`` when ``x`` decodes to a JSON
        object that has that key; ``np.nan`` in every other case (not a
        string, invalid JSON, JSON that is not an object, or no such key).
    """
    try:
        payload = json.loads(x)
    except (TypeError, ValueError):
        # TypeError: x is not str/bytes (e.g. a NaN float from an empty cell).
        # ValueError: x is text but not valid JSON (JSONDecodeError subclasses it).
        return np.nan

    # Only a JSON object can carry a "user_id" key; lists, numbers and plain
    # strings are treated as "no id available".
    if isinstance(payload, dict) and "user_id" in payload:
        return payload["user_id"]
    return np.nan

# Walk the 'event' column directly instead of calling DataFrame.apply(axis=1):
# the row-wise apply builds a Series per row (slow on large logs) and hands
# back a DataFrame rather than a column when the frame has no rows.
events["user_id"] = [parse_user_id(raw_event) for raw_event in events['event']]


In [47]:
# Cast the id column to str before it is used as the right-hand merge key.
# NOTE(review): astype('str') may turn missing ids into the text 'nan', and the
# user_id values parsed from the event JSON are not cast anywhere in the visible
# code -- confirm both key columns hold comparable values.
username_id['id'] = username_id['id'].astype('str')

In [48]:
# left-join the username_id lookup onto the events: events.user_id <-> username_id.id

events = pd.merge(events, username_id, how='left', left_on='user_id', right_on='id')

In [49]:
def change_username(x,y):
    """Return ``y`` when ``x`` is missing and ``y`` is present; otherwise return ``x``."""
    fallback_applies = pd.isnull(x) and pd.notnull(y)
    return y if fallback_applies else x

# Pair the two username columns directly instead of using a row-wise
# DataFrame.apply(axis=1), which is slow on large logs and returns a DataFrame
# rather than a column when the frame has no rows.
# username_x / username_y are the suffixed 'username' columns left by the merge.
events['username'] = [
    change_username(name_x, name_y)
    for name_x, name_y in zip(events['username_x'], events['username_y'])
]

In [50]:
# the merge helper columns are no longer needed once 'username' is consolidated
events = events.drop(columns=['username_x', 'username_y', 'user_id', 'id'])

In [51]:
# convert time column to datetime (parsed with pd.to_datetime's default settings)
events['time'] = pd.to_datetime(events['time'])

In [52]:
# keep only the events that
#   * have a non-null 'name',
#   * are not 'page_close' events, and
#   * belong to the course selected by events_course_id
events = events[
    events['name'].notnull()
    & (events['name'] != 'page_close')
    & (events['context_course_id'] == events_course_id)
]

# keep only the course structure rows of the selected course
in_selected_course = course_structure['folder_course_id'] == course_structure_course_id
course_structure = course_structure[in_selected_course]

# renumber the rows from 0; the previous row labels are kept in a new 'index' column
events = events.reset_index()

In [53]:
# for event_name = edx.ui.lms.link_clicked, remove current_url so that the only one left is the target_url.
# the current_url is where the user is navigating FROM, which we don't care about.

def remove_current_url(event, event_name):
    """Reduce a link_clicked event payload to its target URL.

    Parameters
    ----------
    event : object
        Raw value of the 'event' column (expected to be a JSON string).
    event_name : object
        Value of the 'name' column for the same row.

    Returns
    -------
    object
        ``event`` unchanged when ``event_name`` is not
        ``'edx.ui.lms.link_clicked'``. For link_clicked events, the payload's
        ``target_url`` value, or ``""`` when the payload cannot be decoded or
        has no ``target_url``.
    """
    if event_name != 'edx.ui.lms.link_clicked':
        return event

    try:
        return json.loads(event)['target_url']
    except (TypeError, ValueError, KeyError):
        # TypeError: payload is not text (e.g. NaN) or the JSON is not an object.
        # ValueError: payload is not valid JSON.
        # KeyError: JSON object without a 'target_url' entry.
        return ""

# keep the untouched payload next to the rewritten one
events['old_event'] = events['event']
# Pair payload and event name directly instead of using a row-wise
# DataFrame.apply(axis=1), which is slow on large logs and returns a DataFrame
# rather than a column when the course filter leaves no rows.
events['event'] = [
    remove_current_url(raw_event, event_name)
    for raw_event, event_name in zip(events['old_event'], events['name'])
]

In [54]:
# columns of the course structure table that the rest of the notebook uses,
# grouped by hierarchy level
_structure_columns = (
    ['chapter_id', 'chapter_order', 'chapter_display_name', 'chapter_start']
    + ['vertical_id', 'vertical_order', 'vertical_display_name']
    + ['sequential_id', 'sequential_order', 'sequential_display_name']
    + ['smallestunit_discussion_id', 'smallestunit_id', 'smallestunit_order',
       'smallestunit_type', 'smallestunit_display_name']
    + ['course_end', 'course_enrollment_end', 'course_start']
)
course_structure_index = course_structure.loc[:, _structure_columns]

## obtain_id Function 

Purpose: Separate all of the ids (chapter_id, sequential_id, vertical_id, smallestunit_discussion_id, smallestunit_id) found in the 'event' column of the tracking logs so that we can match them to the ids in the course_structure file. This function will look for the 5 different id types and create new columns to store them in. 

In [55]:
# The five id columns of the course structure table that are searched for in the events.
# NOTE(review): this list is not referenced by any code visible in this notebook;
# the obtain_id calls spell the column names out one by one.
ids = ['chapter_id','vertical_id','sequential_id','smallestunit_discussion_id','smallestunit_id']

In [56]:
def obtain_id(id_type, output=None):
    """Find which events mention which course-structure ids of one type.

    Every distinct, non-null value of ``course_structure[id_type]`` is searched
    for as a plain substring (no regex) in ``events['event']``.

    Reads the module-level ``events`` and ``course_structure`` frames and, as a
    side effect, (re)writes ``events['index']`` with the current row labels of
    ``events`` cast to int64.

    Parameters
    ----------
    id_type : str
        Name of the id column in ``course_structure`` to search for.
    output : object, optional
        Unused; accepted only so existing two-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        Columns ``[id_type, 'index', 'name']`` with one row per
        (id, matching event) pair, where 'index' is the event's row label and
        'name' the event's name. Empty when nothing matches.
    """
    event_text = events['event']

    # Null ids cannot be searched for as a substring, so they are skipped.
    candidate_ids = course_structure[id_type].dropna().unique()

    # Collect (id, event row label) pairs and build the frame once at the end;
    # growing a DataFrame row by row is quadratic and relies on
    # DataFrame.append, which no longer exists in pandas 2.x.
    matches = []
    for unit_id in candidate_ids:
        # na=False: events without a text payload simply do not match.
        is_hit = event_text.str.contains(unit_id, regex=False, na=False)
        matches.extend((unit_id, row_label) for row_label in event_text.index[is_hit.to_numpy()])

    match_df = pd.DataFrame(matches, columns=[id_type, 'index'])
    match_df['index'] = match_df['index'].astype('int64')

    # Expose the row labels as an int64 'index' column so both sides of the
    # merge below share a key.
    events['index'] = events.index
    events['index'] = events['index'].astype('int64')

    return match_df.merge(events.loc[:, ['index', 'name']], on='index')


# placeholders handed to obtain_id as its second argument
# (each name is bound to the DataFrame class itself, not to an instance)
output1 = output2 = output3 = output4 = output5 = pd.DataFrame


# run obtain_id once for each of the five id columns

chapter_id_output = obtain_id('chapter_id', output1)
vertical_id_output = obtain_id('vertical_id', output2)
sequential_id_output = obtain_id('sequential_id', output3)
smallestunitdiscussion_id_output = obtain_id('smallestunit_discussion_id', output4)
smallestunit_id_output = obtain_id('smallestunit_id', output5)


# join outputs into one table
#
# Every obtain_id output carries its own 'name' column. Outer-merging them
# as-is makes pandas apply the '_x' / '_y' suffixes twice, which yields
# duplicate 'name_x' / 'name_y' columns (a MergeError in pandas >= 2.0).
# The id columns are therefore merged without 'name'.

final_output = chapter_id_output.drop(columns='name')
for _id_output in (vertical_id_output, sequential_id_output,
                   smallestunitdiscussion_id_output, smallestunit_id_output):
    final_output = final_output.merge(
        _id_output.drop(columns='name'), how = 'outer', on = 'index')


# Every 'name' in the outputs was taken from events['name'] for the same
# 'index', so one lookup against the events table replaces the consolidation
# of the separate name columns.
_name_lookup = events.loc[:, ['index', 'name']].rename(columns={'name': 'final_name'})
final_output = final_output.merge(_name_lookup, how = 'left', on = 'index')

## Merge final_output with event log 

In [57]:
# attach the matched ids to their events; events without any matched id are dropped (inner join)
testing_merge = pd.merge(events, final_output, how='inner', on='index')

## Course Structure Re-Structure

In [58]:
# replace null smallestunit_discussion_ids with the smallestunit_id
# this is because we have to standardize the hierarchies. the same thing is done with the course_structure data
#
# The filled column is assigned back explicitly: calling fillna(inplace=True)
# on a column selected from the frame is not guaranteed to write through to
# the frame (it does not under pandas Copy-on-Write).
testing_merge['smallestunit_discussion_id'] = testing_merge['smallestunit_discussion_id'].fillna(
    testing_merge['smallestunit_id'])

# drop column 
testing_merge = testing_merge.drop(['smallestunit_id'], axis = 1)

# rename column 
testing_merge = testing_merge.rename(columns={"smallestunit_discussion_id":"smallestunit_id"})

#### NEED TO TEST THIS LATER TO MAKE SURE IT'S CORRECT ####

In [59]:
# if the smallest unit is a discussion, make the name of the new column the smallestunit_discussion_id. Otherwise, keep it as smallestunit_id

def smallestunit_name(smallestunit_type,smallestunit_id,smallestunit_discussion_id):
    """Return the discussion id for units of type 'discussion', else the smallestunit_id."""
    is_discussion = smallestunit_type == 'discussion'
    return smallestunit_discussion_id if is_discussion else smallestunit_id

# Pair the three source columns directly instead of using a row-wise
# DataFrame.apply(axis=1), which is slow and returns a DataFrame rather than
# a column when the table has no rows.
course_structure_index['smallestunit_incl_dis'] = [
    smallestunit_name(unit_type, unit_id, discussion_id)
    for unit_type, unit_id, discussion_id in zip(
        course_structure_index['smallestunit_type'],
        course_structure_index['smallestunit_id'],
        course_structure_index['smallestunit_discussion_id'])
]


In [60]:
# replace the three smallest-unit source columns by the combined one,
# which takes over the name 'smallestunit_id'
course_structure_index = (
    course_structure_index
    .drop(columns=['smallestunit_id', 'smallestunit_discussion_id', 'smallestunit_type'])
    .rename(columns={"smallestunit_incl_dis": "smallestunit_id"})
)

In [61]:
# create column that has the id type 
def find_id_level(sequential_id,vertical_id,smallestunit_id):
    """Name the deepest hierarchy level for which an id is present.

    Levels are checked from deepest to shallowest (smallest unit, vertical,
    sequential); 'chapter_id' is returned when all three ids are missing.
    """
    if pd.notnull(smallestunit_id):
        return "smallestunit_id"
    if pd.notnull(vertical_id):
        return "vertical_id"
    if pd.notnull(sequential_id):
        return "sequential_id"
    return "chapter_id"

# Pair the three id columns directly instead of using a row-wise
# DataFrame.apply(axis=1), which is slow on large logs and returns a DataFrame
# rather than a column when the table has no rows.
testing_merge['id_level'] = [
    find_id_level(sequential_id, vertical_id, smallestunit_id)
    for sequential_id, vertical_id, smallestunit_id in zip(
        testing_merge['sequential_id'],
        testing_merge['vertical_id'],
        testing_merge['smallestunit_id'])
]

In [62]:
# One lookup table per hierarchy level: group by the ids down to that level,
# take the first non-null value of every column per group, and keep only the
# columns describing that level and the levels above it.
_chapter_cols = ['chapter_id', 'chapter_order', 'chapter_display_name', 'chapter_start']
_sequential_cols = ['sequential_id', 'sequential_order', 'sequential_display_name']
_vertical_cols = ['vertical_id', 'vertical_order', 'vertical_display_name']
_smallestunit_cols = ['smallestunit_id', 'smallestunit_order', 'smallestunit_display_name']
_course_cols = ['course_end', 'course_enrollment_end', 'course_start']

chapter_id_gb = (
    course_structure_index
    .groupby('chapter_id', as_index=False)
    .first()[_chapter_cols + _course_cols]
)

sequential_id_gb = (
    course_structure_index
    .groupby(['chapter_id', 'sequential_id'], as_index=False)
    .first()[_chapter_cols + _sequential_cols + _course_cols]
)

vertical_id_gb = (
    course_structure_index
    .groupby(['chapter_id', 'sequential_id', 'vertical_id'], as_index=False)
    .first()[_chapter_cols + _sequential_cols + _vertical_cols + _course_cols]
)


smallestunit_id_gb = (
    course_structure_index
    .groupby(['chapter_id', 'sequential_id', 'vertical_id', 'smallestunit_id'], as_index=False)
    .first()[_chapter_cols + _sequential_cols + _vertical_cols + _smallestunit_cols + _course_cols]
)

## Final Merge

In [63]:
# split testing_merge into one frame per value of its id_level column
chapter_id_events = testing_merge.loc[testing_merge['id_level'].eq('chapter_id')]
sequential_id_events = testing_merge.loc[testing_merge['id_level'].eq('sequential_id')]
vertical_id_events = testing_merge.loc[testing_merge['id_level'].eq('vertical_id')]
smallestunit_id_events = testing_merge.loc[testing_merge['id_level'].eq('smallestunit_id')]

In [64]:
# Inner-join every per-level lookup table with the events of that level on the
# level's own id. The ids of the levels above are dropped from the events side
# first, because the lookup table already provides them.

concat1 = pd.merge(chapter_id_gb, chapter_id_events, how='inner', on='chapter_id')
concat2 = pd.merge(
    sequential_id_gb,
    sequential_id_events.drop(columns=['chapter_id']),
    how='inner', on='sequential_id')
concat3 = pd.merge(
    vertical_id_gb,
    vertical_id_events.drop(columns=['chapter_id', 'sequential_id']),
    how='inner', on='vertical_id')
concat4 = pd.merge(
    smallestunit_id_gb,
    smallestunit_id_events.drop(columns=['chapter_id', 'sequential_id', 'vertical_id']),
    how='inner', on='smallestunit_id')

In [65]:
# Stack the four per-level merge results into one table and write it to
# output_file as CSV, without the row index.
concat_all = pd.concat([concat1, concat2, concat3, concat4])
concat_all.to_csv(output_file, index = False)

In [66]:
# NOTE(review): this repeats the to_csv call that ends the previous cell, with
# the same arguments -- the file is simply written a second time.
concat_all.to_csv(output_file, index = False)