In [1]:
import csv

In [2]:
# enrollments = []

# with open('enrollments.csv','rt', encoding="utf8") as yoyo:
#     reader = csv.reader(yoyo)
    
#     for row in reader:
#         print(row)

In [3]:
enrollments = []

with open('enrollments.csv','rt') as yoyo:
    reader = csv.DictReader(yoyo)
    
    for row in reader:
        enrollments.append(row)
#         print(row)

        
# reader.close()

In [4]:
enrollments[0]

{'account_key': '448',
 'status': 'canceled',
 'join_date': '2014-11-10',
 'cancel_date': '2015-01-14',
 'days_to_cancel': '65',
 'is_udacity': 'True',
 'is_canceled': 'True'}

In [5]:
with open('enrollments.csv', 'rt') as f:
    reader = csv.DictReader(f)
    enrollments = list(reader)

In [6]:
enrollments[0]

{'account_key': '448',
 'status': 'canceled',
 'join_date': '2014-11-10',
 'cancel_date': '2015-01-14',
 'days_to_cancel': '65',
 'is_udacity': 'True',
 'is_canceled': 'True'}

## reading csv files

In [7]:
def read_csv(filename):
    with open(filename, 'rt') as f:
        reader = csv.DictReader(f)
        return list(reader)

In [8]:
daily_engagement = read_csv('daily_engagement.csv')

In [9]:
daily_engagement[0]

{'acct': '0',
 'utc_date': '2015-01-09',
 'num_courses_visited': '1.0',
 'total_minutes_visited': '11.6793745',
 'lessons_completed': '0.0',
 'projects_completed': '0.0'}

In [10]:
project_submissions = read_csv('project_submissions.csv')

In [11]:
project_submissions[0]

{'creation_date': '2015-01-14',
 'completion_date': '2015-01-16',
 'assigned_rating': 'UNGRADED',
 'account_key': '256',
 'lesson_key': '3176718735',
 'processing_state': 'EVALUATED'}

## parsing date and integers

In [12]:
from datetime import datetime as dt

# this takes a string as input and returns python datetime object as output.
# if string is empty it returns None.
def parse_date(date):
    if date == '':
        return None
    else:
        return dt.strptime(date, '%Y-%m-%d')
    
#takes a string, returns integer

def parse_maybe_int(n):
    if n == '':
        return None
    else:
        return int(n)


In [13]:
print(parse_date("2015-10-20"))

2015-10-20 00:00:00


In [14]:
parse_maybe_int("13")

13

In [15]:
for enrollment in enrollments:
    enrollment['join_date'] = parse_date(enrollment['join_date'])
    enrollment['cancel_date'] = parse_date(enrollment['cancel_date'])
    enrollment['days_to_cancel'] = parse_maybe_int(enrollment['days_to_cancel'])
    enrollment['is_udacity'] = enrollment['is_udacity'] == 'True'
    enrollment['is_canceled'] = enrollment['is_canceled'] == 'True'

In [16]:
enrollments[0]

{'account_key': '448',
 'status': 'canceled',
 'join_date': datetime.datetime(2014, 11, 10, 0, 0),
 'cancel_date': datetime.datetime(2015, 1, 14, 0, 0),
 'days_to_cancel': 65,
 'is_udacity': True,
 'is_canceled': True}

In [17]:
for x in daily_engagement:
    x['num_courses_visited'] = int(float(x['num_courses_visited']))
    x['total_minutes_visited'] = float(x['total_minutes_visited'])
    x['lessons_completed'] = int(float(x['lessons_completed']))
    x['projects_completed'] = int(float(x['projects_completed']))
    x['utc_date'] = parse_date(x['utc_date'])
    

In [18]:
daily_engagement[0]

{'acct': '0',
 'utc_date': datetime.datetime(2015, 1, 9, 0, 0),
 'num_courses_visited': 1,
 'total_minutes_visited': 11.6793745,
 'lessons_completed': 0,
 'projects_completed': 0}

In [19]:
for y in project_submissions:
    y['creation_date'] = parse_date(y['creation_date'])
    y['completion_date'] = parse_date(y['completion_date'])

In [20]:
project_submissions[0]

{'creation_date': datetime.datetime(2015, 1, 14, 0, 0),
 'completion_date': datetime.datetime(2015, 1, 16, 0, 0),
 'assigned_rating': 'UNGRADED',
 'account_key': '256',
 'lesson_key': '3176718735',
 'processing_state': 'EVALUATED'}

***changing the column name to generalise things***

In [21]:
for a_record in daily_engagement:
    a_record['account_key'] = a_record['acct']
    del[a_record['acct']]
    

***to get unique no of rows***

In [22]:
def get_unique_students(data):
    unique_students= set()
    for record in data:
        unique_students.add(record['account_key'])
    return unique_students
        

In [23]:

enrollment_num_rows = len(enrollments) 
enrollment_num_unique_students = len(get_unique_students(enrollments))


engagement_num_rows = len(daily_engagement)
engagement_num_unique_students = len(get_unique_students(daily_engagement))  


submission_num_rows = len(project_submissions)
submission_num_unique_students = len(get_unique_students(project_submissions))

In [24]:
daily_engagement[0]['account_key']

'0'

In [25]:
for student in enrollments:
    if student['account_key'] not in get_unique_students(daily_engagement):
        print(student)
        break

{'account_key': '1219', 'status': 'canceled', 'join_date': datetime.datetime(2014, 11, 12, 0, 0), 'cancel_date': datetime.datetime(2014, 11, 12, 0, 0), 'days_to_cancel': 0, 'is_udacity': False, 'is_canceled': True}


In [26]:
# for enrollment in enrollments:
#     student = enrollment['account_key']
#     if student not in unique_engagement_students:
#         print(enrollment)
#         break

unique_engagement_students = get_unique_students(daily_engagement)

In [27]:
num_problem_students = 0
for enrollment in enrollments:
    student = enrollment['account_key']
    if (student not in unique_engagement_students and 
            enrollment['join_date'] != enrollment['cancel_date']):
        print(enrollment)
        num_problem_students += 1

num_problem_students

{'account_key': '1304', 'status': 'canceled', 'join_date': datetime.datetime(2015, 1, 10, 0, 0), 'cancel_date': datetime.datetime(2015, 3, 10, 0, 0), 'days_to_cancel': 59, 'is_udacity': True, 'is_canceled': True}
{'account_key': '1304', 'status': 'canceled', 'join_date': datetime.datetime(2015, 3, 10, 0, 0), 'cancel_date': datetime.datetime(2015, 6, 17, 0, 0), 'days_to_cancel': 99, 'is_udacity': True, 'is_canceled': True}
{'account_key': '1101', 'status': 'current', 'join_date': datetime.datetime(2015, 2, 25, 0, 0), 'cancel_date': None, 'days_to_cancel': None, 'is_udacity': True, 'is_canceled': False}


3

In [28]:
udacity_test_accounts = set()

for enrollment in enrollments:
    if enrollment['is_udacity']:
        udacity_test_accounts.add(enrollment['account_key'])
len(udacity_test_accounts)

6

In [29]:
# remove udacity accounts
non_udacity_enrollments = set()
unique_enrollment = get_unique_students(enrollments)

def reomve_udacity_accounts(data):
    non_udacity_data = []
    for data_point in data:
        if data_point['account_key'] not in udacity_test_accounts:
            non_udacity_data.append(data_point)
    return non_udacity_data

In [30]:
non_udacity_enrollments = reomve_udacity_accounts(enrollments)
non_udacity_engagement = reomve_udacity_accounts(daily_engagement)
non_udacity_submissions = reomve_udacity_accounts(project_submissions)

print(len(non_udacity_enrollments))
print(len(non_udacity_engagement))
print(len(non_udacity_submissions))


1622
135656
3634


In [31]:
paid_students = {}

In [32]:
for a_student in enrollments:
    if (not a_student['is_canceled'] and (a_student['days_to_cancel'] == None or a_student['days_to_cancel']>7)):
        paid_students = dict.fromkeys(a_student['account_key'], a_student['join_date'])

In [33]:
paid_students

{'6': datetime.datetime(2015, 8, 23, 0, 0),
 '8': datetime.datetime(2015, 8, 23, 0, 0)}

In [34]:
paid_students = {}
for enrollment in non_udacity_enrollments:
    if (not enrollment['is_canceled'] or
            enrollment['days_to_cancel'] > 7):
        account_key = enrollment['account_key']
        enrollment_date = enrollment['join_date']
        if (account_key not in paid_students or
                enrollment_date > paid_students[account_key]):
            paid_students[account_key] = enrollment_date
len(paid_students)

995

In [35]:
def within_one_week(join_date, engagement_date):
    time_delta = engagement_date - join_date
    return time_delta.days < 7 and time_delta.days >=0

In [36]:
def remove_free_trial_cancels(data):
    new_data = []
    for data_point in data:
        if data_point['account_key'] in paid_students:
            new_data.append(data_point)
    return new_data

In [37]:
paid_enrollments = remove_free_trial_cancels(non_udacity_enrollments)
paid_engagement = remove_free_trial_cancels(non_udacity_engagement)
paid_submissions = remove_free_trial_cancels(non_udacity_submissions)

print(len(paid_enrollments))
print(len(paid_engagement))
print(len(paid_submissions))

1293
134549
3618


In [62]:
for engagement_record in paid_engagement:
    if engagement_record['num_courses_visited'] > 0:
        engagement_record['has_visited'] = 1
    else:
        engagement_record['has_visited'] = 0

In [63]:
paid_engagement_in_first_week = []
for engagement_record in paid_engagement:
    account_key = engagement_record['account_key']
    join_date = paid_students[account_key]
    engagement_record_date = engagement_record['utc_date']

    if within_one_week(join_date, engagement_record_date):
         paid_engagement_in_first_week.append(engagement_record)

len(paid_engagement_in_first_week)

6919

# Group Data

In [64]:
from collections import defaultdict

def group_data(data, key_name):
    grouped_data = defaultdict(list)

    for data_point in data:
        key = data_point[key_name]
        grouped_data[key].append(data_point)
    return grouped_data

engagement_by_account = group_data(paid_engagement_in_first_week,
                                   'account_key')

# len(engagement_by_account)

In [65]:
from collections import defaultdict

engagement_by_account = defaultdict(list)

for engagement_record in paid_engagement_in_first_week:
    account_key = engagement_record['account_key']
    engagement_by_account[account_key].append(engagement_record)
    
# len(engagement_by_account)

# Sum Grouped Items

In [66]:

def sum_grouped_items(data, field_name):
    summed_data = {}
    
    for key, data_point in data.items():
        total = 0
        for data_value in data_point:
            total += data_value[field_name]
            summed_data[key] = total
    return summed_data
    
    
total_minutes_by_account = sum_grouped_items(engagement_by_account,
                                             'total_minutes_visited')   

# len(total_minutes_by_account)

In [67]:
total_minutes_by_account = {}

for account_key, engagement_for_student in engagement_by_account.items():
    total_minutes = 0
    for engagement_record in engagement_for_student:
        total_minutes += engagement_record['total_minutes_visited']
        total_minutes_by_account[account_key] = total_minutes

        
# len(total_minutes_by_account)

In [68]:
total_minutes = list(total_minutes_by_account.values())

# print(total_minutes)


import numpy as np 

np.mean(total_minutes)

306.70832675342825

# Describing Data

In [69]:
import numpy as np

def describe_data(data):
    print('Mean:', np.mean(data))
    print('Standard deviation:', np.std(data))
    print('Minimum:', np.min(data))
    print('Maximum:', np.max(data))

# describe_data(total_minutes)
describe_data(total_minutes)

Mean: 306.70832675342825
Standard deviation: 412.99693340852957
Minimum: 0.0
Maximum: 3564.7332644989997


In [70]:
student_with_max_minutes = None
max_minutes = 0

for student, total_minutes in total_minutes_by_account.items():
    if total_minutes > max_minutes:
        max_minutes = total_minutes
        student_with_max_minutes = student
        
max_minutes

3564.7332644989997

In [71]:
for engagement_record in paid_engagement_in_first_week:
    if engagement_record['account_key'] == student_with_max_minutes:
        print(engagement_record)

{'utc_date': datetime.datetime(2015, 7, 9, 0, 0), 'num_courses_visited': 4, 'total_minutes_visited': 850.519339666, 'lessons_completed': 4, 'projects_completed': 0, 'account_key': '163', 'has_visited': 1}
{'utc_date': datetime.datetime(2015, 7, 10, 0, 0), 'num_courses_visited': 6, 'total_minutes_visited': 872.633923334, 'lessons_completed': 6, 'projects_completed': 0, 'account_key': '163', 'has_visited': 1}
{'utc_date': datetime.datetime(2015, 7, 11, 0, 0), 'num_courses_visited': 2, 'total_minutes_visited': 777.018903666, 'lessons_completed': 6, 'projects_completed': 0, 'account_key': '163', 'has_visited': 1}
{'utc_date': datetime.datetime(2015, 7, 12, 0, 0), 'num_courses_visited': 1, 'total_minutes_visited': 294.568774, 'lessons_completed': 2, 'projects_completed': 0, 'account_key': '163', 'has_visited': 1}
{'utc_date': datetime.datetime(2015, 7, 13, 0, 0), 'num_courses_visited': 3, 'total_minutes_visited': 471.2139785, 'lessons_completed': 1, 'projects_completed': 0, 'account_key': '

In [72]:
total_lessons_completed_by_account = {}

for account_key, engagement_for_student in engagement_by_account.items():
    total_lessons_completed = 0
    for engagement_record in engagement_for_student:
        total_lessons_completed += engagement_record['lessons_completed']
        total_lessons_completed_by_account[account_key] = total_lessons_completed



In [73]:
total_lessons = list(total_lessons_completed_by_account.values())


In [74]:
print('Mean:', np.mean(total_lessons))
print('Standard deviation:', np.std(total_lessons))
print('Minimum:', np.min(total_lessons))
print('Maximum:', np.max(total_lessons))

Mean: 1.636180904522613
Standard deviation: 3.002561299829423
Minimum: 0
Maximum: 36


In [75]:
# data_by_account = group_data(paid_engagement_in_first_week,
#                                                  'account_key')

lessons_completeed_by_account = sum_grouped_items(engagement_by_account, 'lessons_completed')

describe_data(list(lessons_completed_by_account.values()))

Mean: 960.1849246231155
Standard deviation: 496.37035530134074
Minimum: 1
Maximum: 1628


In [76]:
days_visited_by_account = sum_grouped_items(engagement_by_account, 'has_visited')

describe_data(list(days_visited_by_account.values()))

Mean: 2.8673366834170855
Standard deviation: 2.2551980029196814
Minimum: 0
Maximum: 7


In [None]:
# passing_engagement
# non_passing_engagement

In [77]:
paid_submissions[0]

{'creation_date': datetime.datetime(2015, 1, 14, 0, 0),
 'completion_date': datetime.datetime(2015, 1, 16, 0, 0),
 'assigned_rating': 'UNGRADED',
 'account_key': '256',
 'lesson_key': '3176718735',
 'processing_state': 'EVALUATED'}

In [81]:
subway_project_keys = ['746169184','3176718735']

In [83]:
pass_subway_projects = set()

for paid_submission in paid_submissions:
    project = paid_submission['lesson_key']
    rating = paid_submission['assigned_rating']
    if project in subway_project_keys and (rating == 'PASSED' or rating == 'DISTINCTION'):
        pass_subway_projects.add(paid_submission['account_key'])
        
len(pass_subway_projects)

647

In [84]:
passing_engagement = []
non_passing_engagement = []

for engagement_record in paid_engagement_in_first_week:
    if engagement_record['account_key'] in pass_subway_projects:
        passing_engagement.append(engagement_record)
    else:
        non_passing_engagement.append(engagement_record)
        
print(len(passing_engagement))
print(len(non_passing_engagement))

4527
2392


In [91]:
passing_engagement

[{'utc_date': datetime.datetime(2015, 1, 9, 0, 0),
  'num_courses_visited': 1,
  'total_minutes_visited': 11.6793745,
  'lessons_completed': 0,
  'projects_completed': 0,
  'account_key': '0',
  'has_visited': 1},
 {'utc_date': datetime.datetime(2015, 1, 10, 0, 0),
  'num_courses_visited': 2,
  'total_minutes_visited': 37.2848873333,
  'lessons_completed': 0,
  'projects_completed': 0,
  'account_key': '0',
  'has_visited': 1},
 {'utc_date': datetime.datetime(2015, 1, 11, 0, 0),
  'num_courses_visited': 2,
  'total_minutes_visited': 53.6337463333,
  'lessons_completed': 0,
  'projects_completed': 0,
  'account_key': '0',
  'has_visited': 1},
 {'utc_date': datetime.datetime(2015, 1, 12, 0, 0),
  'num_courses_visited': 1,
  'total_minutes_visited': 33.4892696667,
  'lessons_completed': 0,
  'projects_completed': 0,
  'account_key': '0',
  'has_visited': 1},
 {'utc_date': datetime.datetime(2015, 1, 13, 0, 0),
  'num_courses_visited': 1,
  'total_minutes_visited': 64.7796776667,
  'lessons

In [None]:
passing_students_minutes_spent = []
passing_students_lessons_completed = []


for record in passing_engagement_by_account:
    passing_students_minutes_spent.append(record['total_minutes_visited'])
    passing_students_lessons_completed.append(record['lessons_completed'])