# Load Data From CSVs

In [64]:
import unicodecsv

def read_csv(filename):
    """Load the CSV file at *filename* into a list of rows.

    The file is opened in binary mode and handed to
    ``unicodecsv.DictReader``; every row the reader yields is collected
    into a list before the file is closed.
    """
    with open(filename, 'rb') as csv_file:
        rows = list(unicodecsv.DictReader(csv_file))
    return rows

# Load each of the three tables into a list of rows via read_csv.
# The paths are relative, so the CSV files are looked up in the current
# working directory.
enrollments = read_csv('enrollments.csv')
daily_engagement = read_csv('daily_engagement.csv')
project_submissions = read_csv('project_submissions.csv')

#print enrollments[0]
#print daily_engagement[0]
#print project_submissions[0]

## Fixing Data Types

In [65]:
from datetime import datetime as dt

def parse_date(date):
    """Convert a ``YYYY-MM-DD`` string into a ``datetime`` object.

    An empty string means "no date given" and yields ``None``; any other
    value is parsed with ``datetime.strptime`` using ``'%Y-%m-%d'``.
    """
    if date != '':
        return dt.strptime(date, '%Y-%m-%d')
    return None
    
def parse_maybe_int(i):
    """Convert a string holding an integer into an ``int``.

    An empty string stands for a missing value and yields ``None``;
    anything else is passed to ``int()``.
    """
    return None if i == '' else int(i)

# Clean up the data types in the enrollments table.
# Each row dict is converted in place, so the original string values are
# overwritten.
for enrollment in enrollments:
    # Empty strings become None (see parse_date / parse_maybe_int).
    enrollment['cancel_date'] = parse_date(enrollment['cancel_date'])
    enrollment['days_to_cancel'] = parse_maybe_int(enrollment['days_to_cancel'])
    # Only the exact string 'True' maps to True; any other value becomes False.
    enrollment['is_canceled'] = enrollment['is_canceled'] == 'True'
    enrollment['is_udacity'] = enrollment['is_udacity'] == 'True'
    enrollment['join_date'] = parse_date(enrollment['join_date'])
    
#enrollments[0]

In [66]:
# Clean up the data types in the engagement table.
# Each row dict is converted in place.
for engagement_record in daily_engagement:
    # The counts go through float() first, so a string with a decimal point
    # (e.g. '2.0'), which int() alone would reject, is accepted; int() then
    # truncates any fractional part.
    # NOTE(review): presumably the CSV stores these counts as floats - confirm.
    engagement_record['lessons_completed'] = int(float(engagement_record['lessons_completed']))
    engagement_record['num_courses_visited'] = int(float(engagement_record['num_courses_visited']))
    engagement_record['projects_completed'] = int(float(engagement_record['projects_completed']))
    # Minutes are kept as a float.
    engagement_record['total_minutes_visited'] = float(engagement_record['total_minutes_visited'])
    engagement_record['utc_date'] = parse_date(engagement_record['utc_date'])
    
#daily_engagement[0]

In [67]:
# Clean up the data types in the submissions table.
# Both date fields are converted in place; an empty string becomes None
# (see parse_date).
for submission in project_submissions:
    submission['completion_date'] = parse_date(submission['completion_date'])
    submission['creation_date'] = parse_date(submission['creation_date'])

#project_submissions[0]

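Note that the cells above modify the contents of our data variables in place. If you run them more than once in the same session, an error will occur, because the values have already been converted and are no longer the strings the parsing functions expect. To start over, re-run the cell that loads the CSV files first.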

## Investigating the Data

In [68]:
#####################################
#                 2                 #
#####################################

## Find the total number of rows and the number of unique students (account keys)
## in each table.

In [69]:
# daily_engagement is a list of row dicts, so there is no single "column" to
# rename: the 'acct' key has to be replaced by 'account_key' in every row.
# Rows that no longer have an 'acct' key are skipped, so re-running this cell
# in the same session does not raise a KeyError.
for engagement_record in daily_engagement:
    if 'acct' in engagement_record:
        # pop() removes the old key and returns its value in one step.
        engagement_record['account_key'] = engagement_record.pop('acct')

In [70]:
def get_unique_students(data):
    """Return the set of distinct ``'account_key'`` values found in *data*.

    *data* is an iterable of rows that can be indexed by
    ``'account_key'``; a row without that key raises ``KeyError``.
    """
    return {row['account_key'] for row in data}

In [71]:
# Total number of rows in the enrollments table.
len(enrollments)

1640

In [72]:
# A set holds unique, unordered elements, so get_unique_students collapses
# the enrollment rows down to the distinct student account keys.
unique_enrolled_students = get_unique_students(enrollments)

# Number of unique students in the enrollments table.
len(unique_enrolled_students)

1302

In [73]:
# Total number of rows in the daily engagement table.
len(daily_engagement)

136240

In [74]:
# Distinct account keys in the daily engagement table. This relies on the
# 'acct' key already having been renamed to 'account_key' in every row.
unique_engagement_students = get_unique_students(daily_engagement)

# Number of unique students in the daily engagement table.
len(unique_engagement_students)

1237

In [75]:
# Total number of rows in the project submissions table.
len(project_submissions)

3642

In [76]:
# Distinct account keys in the project submissions table.
unique_project_submitters = get_unique_students(project_submissions)

# Number of unique students who submitted at least one project.
len(unique_project_submitters)

743

## Problems in the Data

In [77]:
#####################################
#                 3                 #
#####################################

## Rename the "acct" column in the daily_engagement table to "account_key".

In [78]:
# daily_engagement is a list of row dicts, so there is no single column to
# rename; the 'acct' key has to be replaced in each row individually.
#for engagement_record in daily_engagement:
    #engagement_record['account_key'] = engagement_record['acct']
    #del[engagement_record['acct']]   
    
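# NOTE: this code was moved up to the start of the previous section
# ("Investigating the Data"), because get_unique_students needs the
# 'account_key' key on the daily_engagement rows there.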

## Missing Engagement Records

In [None]:
#####################################
#                 4                 #
#####################################

## Find any one student enrollments where the student is missing from the daily engagement table.
## Output that enrollment.

## Checking for More Problem Records

In [None]:
#####################################
#                 5                 #
#####################################

## Find the number of surprising data points (enrollments missing from
## the engagement table) that remain, if any.