In [None]:
# Toy stand-ins for two ways tabular data can be held in memory:
# a list of rows (header row first) and a list of per-row dictionaries.
# NOTE(review): `csv` is also the name of the standard-library module that a
# later cell imports; that import rebinds this name.
csv = [
    ['A', 'B', 'C'],
    ['1', '2', '3'],
]
csv1 = [
    {'num1': 1, 'num2': 2, 'num3': 3},
    {'num1': 5, 'num2': 4, 'num3': 3},
]

In [None]:
# Display both toy structures.
print(csv)
print(csv1)

In [None]:
# Read the enrollments file into a list with one dictionary per row.
# NOTE(review): this cell calls `csv.DictReader`, but `import csv` only appears
# in a later cell and an earlier cell binds `csv` to a plain list, so in the
# order shown it would fail on a fresh kernel. Confirm whether it is still needed.
with open('enrollments.csv', 'r') as f:
    reader = csv.DictReader(f)
    # reader = csv.reader(f)
    enrollments = list(reader)
    # enrollments = [x for x in reader]
# Show the first row as a sample.
enrollments[0]

In [None]:
# NOTE(review): `daily_engagement` is loaded in a later cell, and its
# 'account_key' field is only created later still (renamed from 'acct').
daily_engagement[0]['account_key']

# Learning Code start here

In [3]:
import csv

def csv_read(filename):
    """Read a CSV file and return its rows as a list of dictionaries.

    Each dictionary maps the names in the file's header line to that row's
    values (all values are strings).

    The file is opened with newline='' as the csv module documentation
    requires; otherwise line breaks embedded in quoted fields are altered.
    """
    with open(filename, 'r', newline='') as f:
        return list(csv.DictReader(f))

In [4]:
# Load each of the three CSV tables through csv_read.
enrollments = csv_read('enrollments.csv')
daily_engagement = csv_read('daily_engagement.csv')
project_submissions = csv_read('project_submissions.csv')

## Data cleaning

In [5]:
from datetime import datetime as dt

def parse_date(date):
    """Convert a 'YYYY-MM-DD' string into a datetime object.

    Returns None when no date is given (empty string or None).
    Any other value that cannot be parsed, including a value that is already
    a datetime, is returned unchanged, so the cleaning cells can be re-run on
    data that has already been converted.
    """
    if date is None or date == '':
        return None
    try:
        return dt.strptime(date, '%Y-%m-%d')
    except (ValueError, TypeError):
        # Best effort: leave values that are not 'YYYY-MM-DD' text untouched.
        return date
    
def parse_maybe_int(i):
    """Convert `i` to an int, or return None when it is not an integer.

    An empty string or None yields None. Only conversion failures
    (ValueError / TypeError) are treated as "no value"; other exceptions,
    such as KeyboardInterrupt, propagate to the caller.
    """
    try:
        return int(i)
    except (ValueError, TypeError):
        return None

In [6]:
# Clean up the data types in the enrollments table.
# The boolean columns accept both the raw 'True' text and an already-converted
# True, so re-running this cell does not reset every flag to False.
for enrollment in enrollments:
    enrollment['cancel_date'] = parse_date(enrollment['cancel_date'])
    enrollment['days_to_cancel'] = parse_maybe_int(enrollment['days_to_cancel'])
    enrollment['is_canceled'] = enrollment['is_canceled'] in ('True', True)
    enrollment['is_udacity'] = enrollment['is_udacity'] in ('True', True)
    enrollment['join_date'] = parse_date(enrollment['join_date'])

In [7]:
# Convert the engagement table's text fields to numeric / datetime values.
for engagement_record in daily_engagement:
    # Whole-number counters go through float() first, then are cut to int.
    for counter in ('lessons_completed', 'num_courses_visited', 'projects_completed'):
        engagement_record[counter] = int(float(engagement_record[counter]))
    minutes_text = engagement_record['total_minutes_visited']
    engagement_record['total_minutes_visited'] = float(minutes_text)
    engagement_record['utc_date'] = parse_date(engagement_record['utc_date'])

In [8]:
# Convert both date columns of the submissions table with parse_date.
for submission in project_submissions:
    for date_field in ('completion_date', 'creation_date'):
        submission[date_field] = parse_date(submission[date_field])

In [9]:
# Rename the 'acct' key to 'account_key' in every engagement record.
# Records that no longer have 'acct' are skipped, so re-running this cell
# does not raise KeyError.
for x in daily_engagement:
    if 'acct' in x:
        # pop() removes the old key and hands back its value for the new key.
        x['account_key'] = x.pop('acct')

## Investigating

Find the account keys that are enrolled but have no engagement records.

In [10]:
# Report the row count and the number of distinct account keys for each table.
for label, table in (('enrollments with :', enrollments),
                     ('daily_engagement with :', daily_engagement),
                     ('project_submissions with :', project_submissions)):
    unique_accounts = {row['account_key'] for row in table}
    print(label, len(table), 'rows and', len(unique_accounts), 'accounts')

enrollments with : 1640 rows and 1302 accounts
daily_engagement with : 136240 rows and 1237 accounts
project_submissions with : 3642 rows and 743 accounts


In [11]:
# Build the sets of distinct account keys found in enrollments and in engagement.
enroll_acct_unique = {row['account_key'] for row in enrollments}
eng_acct_unique = {row['account_key'] for row in daily_engagement}

In [12]:
# Account keys present in enrollments but absent from engagement,
# followed by the enrollment rows that belong to those keys.
diff_enroll_eng_acct = enroll_acct_unique - eng_acct_unique
enroll_diff_eng = [row for row in enrollments
                   if row['account_key'] in diff_enroll_eng_acct]

In [13]:
# Count how many enrollment rows each unmatched account key has.
all_diff_acct = [row['account_key'] for row in enroll_diff_eng]
for acct in diff_enroll_eng_acct:
    occurrences = all_diff_acct.count(acct)
    print(acct, "count", occurrences)
# The helper list is only needed for this report.
del all_diff_acct

1063 count 1
884 count 1
1190 count 1
1191 count 1
1044 count 1
1241 count 1
799 count 1
654 count 3
1129 count 2
875 count 1
1101 count 1
802 count 1
727 count 1
889 count 1
1010 count 1
1237 count 1
871 count 1
878 count 1
737 count 1
1218 count 1
1304 count 2
1155 count 1
739 count 1
1079 count 1
1219 count 1
914 count 2
1186 count 1
717 count 1
997 count 1
1213 count 1
996 count 1
1284 count 1
817 count 1
819 count 2
749 count 1
1145 count 1
1069 count 1
1291 count 1
1086 count 1
968 count 1
1270 count 1
1222 count 1
728 count 1
707 count 1
1120 count 1
1238 count 1
964 count 1
750 count 1
841 count 1
766 count 1
1171 count 1
1025 count 1
902 count 1
1125 count 1
803 count 1
733 count 1
926 count 1
1273 count 1
789 count 1
981 count 1
870 count 1
1148 count 1
664 count 1
725 count 1
711 count 1


## Remove udacity test account

Exclude Udacity test accounts (accounts with an enrollment row where 'is_udacity' is True).

In [14]:
# Account keys that have at least one enrollment row flagged as a Udacity test account.
udacity_test_acct = {row['account_key'] for row in enrollments if row['is_udacity']}

# Keep only the rows of each table whose account key is not a test account.
non_udacity_enrollments = [
    row for row in enrollments if row['account_key'] not in udacity_test_acct
]
non_udacity_engagement = [
    row for row in daily_engagement if row['account_key'] not in udacity_test_acct
]
non_udacity_submissions = [
    row for row in project_submissions if row['account_key'] not in udacity_test_acct
]

In [15]:
# Sanity check: rows left in each table after dropping the test accounts.
for remaining in (non_udacity_enrollments, non_udacity_engagement, non_udacity_submissions):
    print(len(remaining))

1622
135656
3634


## Exploration

Paid students are those whose 'days_to_cancel' is None (never canceled) or greater than 7 (the first 7 days are the free-trial period).

In [16]:
# Map each paid student's account key to their most recent enrollment date.
paid_students = {}

for enrollment in non_udacity_enrollments:
    days = enrollment['days_to_cancel']

    # Paid students either never canceled or canceled after more than 7 days.
    if not (days is None or days > 7):
        continue

    account_key = enrollment['account_key']
    enrollment_date = enrollment['join_date']

    # Record the date for a new key, or replace the stored one when this
    # enrollment is more recent.
    if account_key not in paid_students or enrollment_date > paid_students[account_key]:
        paid_students[account_key] = enrollment_date

In [37]:
def remove_free_trial_cancels(data):
    """Return the records of `data` whose 'account_key' is in `paid_students`.

    `paid_students` is the module-level mapping built in an earlier cell;
    records for any other account key are dropped. A new list is returned
    and `data` itself is not modified.
    """
    return [record for record in data if record['account_key'] in paid_students]

In [56]:
# Restrict all three tables to paid students, then report how many rows remain.
paid_enrollments = remove_free_trial_cancels(non_udacity_enrollments)
paid_engagement = remove_free_trial_cancels(non_udacity_engagement)
paid_submissions = remove_free_trial_cancels(non_udacity_submissions)

for paid_table in (paid_enrollments, paid_engagement, paid_submissions):
    print(len(paid_table))

1293
134549
3618


In [57]:
# Flag each engagement record with has_visited = 1 when at least one course
# was visited that day, otherwise 0.
for engagement in paid_engagement:
    visited_any = engagement['num_courses_visited'] > 0
    engagement['has_visited'] = int(visited_any)

In [58]:
def within_one_week(join_date, engagement_date):
    """Tell whether an engagement happened in the first week after joining.

    Returns True when `engagement_date` falls on the join day or on one of
    the six following days (a whole-day difference of 0 to 6); engagements
    dated before `join_date` return False.
    """
    days_since_join = (engagement_date - join_date).days
    return 0 <= days_since_join < 7

In [59]:
# Keep only the engagement records that fall within one week of the
# enrollment date stored for that student in paid_students.
paid_engagement_in_first_week = [
    engagement
    for engagement in paid_engagement
    if within_one_week(paid_students[engagement['account_key']], engagement['utc_date'])
]

# check record
len(paid_engagement_in_first_week)

6919

### Exploring student engagement

Calculate the total and average minutes of engagement.

In [60]:
# use defaultdict for handle missing value
from collections import defaultdict

def group_data(data, key_name):
    """Group records by the value they hold under `key_name`.

    Returns a defaultdict(list) mapping each distinct value of
    record[key_name] to the list of records carrying that value, in the
    order they appear in `data`.
    """
    buckets = defaultdict(list)
    for record in data:
        buckets[record[key_name]].append(record)
    return buckets

# First-week engagement records of paid students, grouped by account key.
engagement_by_account = group_data(paid_engagement_in_first_week, 'account_key')

In [61]:
def sum_grouped_items(grouped_data, field_name):
    """Total one field over every group of records.

    `grouped_data` maps a key to a list of records; the result maps the same
    keys to the sum of record[field_name] over that key's records (0 for an
    empty list).
    """
    return {
        group_key: sum(record[field_name] for record in records)
        for group_key, records in grouped_data.items()
    }

# Total first-week minutes per account key.
total_minutes_by_account = sum_grouped_items(engagement_by_account, 'total_minutes_visited')

In [62]:
# Cross-check: average of the per-account totals, computed without numpy.
sum(total_minutes_by_account.values()) / len(total_minutes_by_account)

306.7083267534284

In [63]:
import numpy as np

def describe_data(data):
    """Print the mean, standard deviation, minimum and maximum of `data`.

    Each statistic is computed with numpy and printed on its own line;
    nothing is returned.
    """
    summary = (
        ('Mean :', np.mean),
        ('Std Dev :', np.std),
        ('Min :', np.min),
        ('Max :', np.max),
    )
    for label, statistic in summary:
        print(label, statistic(data))

# Turn the dict's values into a list before handing them to describe_data.
total_minutes = list(total_minutes_by_account.values())
describe_data(total_minutes)

Mean : 306.70832675342825
Std Dev : 412.99693340852957
Min : 0.0
Max : 3564.7332644989997


### Finding error in total minutes

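Check whether any student's first-week total exceeds the number of minutes in one week (7 × 24 × 60 = 10,080), which would indicate an error in the data or the filtering.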

In [None]:
# Find the account keys whose total minutes exceed the minutes in one week.
# The threshold is computed rather than approximated, so it matches 7*24*60 exactly.
MINUTES_PER_WEEK = 7 * 24 * 60
[k for k, v in total_minutes_by_account.items() if v > MINUTES_PER_WEEK]

In [None]:
# Look up the enrollment date stored in 'paid_students' for one target student.
# NOTE(review): '108' is hard-coded; presumably it is one of the keys found by
# the previous cell — confirm.
paid_students['108']

In [None]:
# List every enrollment record in 'non_udacity_enrollments' for account '108'.
[x for x in non_udacity_enrollments if x['account_key'] == '108']

Most of the target students are paid students who re-enrolled.

In [None]:
# List the records in 'paid_engagement_in_first_week' for account '108'.
[x for x in paid_engagement_in_first_week if x['account_key'] == '108']

The 'paid_engagement_in_first_week' list included engagement records from prior enrollments in the calculation.

### Exploring number of lessons completed 

In [64]:
# Total lessons completed in the first week, per account key.
# Uses the shared sum_grouped_items helper instead of repeating its loop.
total_lesson_completed_by_account = sum_grouped_items(engagement_by_account, 'lessons_completed')

In [65]:
# Number of account keys that have a first-week lesson total.
len(total_lesson_completed_by_account)

995

In [66]:
# Turn the per-account lesson totals into a list and summarize them with the
# shared describe_data helper (prints Mean / Std Dev / Min / Max).
total_lesson = list(total_lesson_completed_by_account.values())
describe_data(total_lesson)

Mean : 1.636180904522613
Std Dev : 3.002561299829423
Min : 0
Max : 36


### Exploring number of day with course visited

In [78]:
def describe_grouped_items(data, key, field):
    """Sum `field` within each group of `data` and print summary statistics.

    `data` maps a group key to a list of records. For every group the values
    of record[field] are added up, and the mean, standard deviation, minimum
    and maximum of those per-group sums are printed. Nothing is returned.

    `key` is accepted for interface compatibility; its value is not used in
    the computation.
    """
    import numpy as np

    group_totals = [
        sum(item[field] for item in items)
        for items in data.values()
    ]

    for label, statistic in (('Mean :', np.mean),
                             ('Std Dev :', np.std),
                             ('Min :', np.min),
                             ('Max :', np.max)):
        print(label, statistic(group_totals))

In [79]:
# Summarize, per account, the sum of 'has_visited' over first-week records.
describe_grouped_items(engagement_by_account, 'account_key', 'has_visited')

Mean : 2.8673366834170855
Std Dev : 2.2551980029196814
Min : 0
Max : 7
