In [1]:
import unicodecsv

In [2]:
def read_csv(filename):
    """Read a CSV file and return all of its rows as a list.

    The file is opened in binary mode and passed to
    ``unicodecsv.DictReader``; every row the reader yields is collected
    while the file is still open.
    """
    with open(filename, 'rb') as csv_file:
        rows = [row for row in unicodecsv.DictReader(csv_file)]
    return rows

### Importing data from enrollments.csv

In [3]:
# Load every row of the enrollments CSV into a list of per-row records.
enrollments = read_csv('./csv/enrollments.csv')

In [4]:
# Check which container type read_csv returned.
type(enrollments)

list

In [5]:
# Inspect the first enrollment record to see which fields it carries.
enrollments[0]

{'account_key': '448',
 'cancel_date': '2015-01-14',
 'days_to_cancel': '65',
 'is_canceled': 'True',
 'is_udacity': 'True',
 'join_date': '2014-11-10',
 'status': 'canceled'}

### Importing the daily engagement data

In [7]:
# Load the daily engagement CSV and inspect its first record.
# NOTE(review): the same file is read again into `daily_engagement` in a later
# cell, and `engagements` is not referenced again in the visible cells.
engagements = read_csv('./csv/daily_engagement.csv')
engagements[0]

{'acct': '0',
 'lessons_completed': '0.0',
 'num_courses_visited': '1.0',
 'projects_completed': '0.0',
 'total_minutes_visited': '11.6793745',
 'utc_date': '2015-01-09'}

In [10]:
# Load the project submissions CSV and inspect its first record.
project_submissions = read_csv('./csv/project_submissions.csv')
project_submissions[0]

OrderedDict([('creation_date', '2015-01-14'),
             ('completion_date', '2015-01-16'),
             ('assigned_rating', 'UNGRADED'),
             ('account_key', '256'),
             ('lesson_key', '3176718735'),
             ('processing_state', 'EVALUATED')])

### datetime library 활용하기

In [13]:
from datetime import datetime as dt

def parse_date(date):
    """Turn a ``YYYY-MM-DD`` string into a ``datetime`` object.

    Returns ``None`` when ``date`` is the empty string; any other value is
    parsed with ``strptime`` using the ``'%Y-%m-%d'`` format.
    """
    return None if date == '' else dt.strptime(date, '%Y-%m-%d')
    
def parse_maybe_int(i):
    """Convert ``i`` to an ``int``, mapping the empty string to ``None``."""
    if i != '':
        return int(i)
    return None

In [14]:
# Replace the raw string fields of every enrollment record with typed values:
# dates become datetime objects (or None when empty), days_to_cancel becomes an
# int (or None when empty), and the two flags become booleans.
# NOTE: parse_date hands its argument to strptime, so this cell expects the
# freshly loaded string values.
for enrollment in enrollments:
    enrollment.update(
        cancel_date=parse_date(enrollment['cancel_date']),
        days_to_cancel=parse_maybe_int(enrollment['days_to_cancel']),
        is_canceled=(enrollment['is_canceled'] == 'True'),
        is_udacity=(enrollment['is_udacity'] == 'True'),
        join_date=parse_date(enrollment['join_date']),
    )

enrollments[0]

OrderedDict([('account_key', '448'),
             ('status', 'canceled'),
             ('join_date', datetime.datetime(2014, 11, 10, 0, 0)),
             ('cancel_date', datetime.datetime(2015, 1, 14, 0, 0)),
             ('days_to_cancel', 65),
             ('is_udacity', True),
             ('is_canceled', True)])

In [16]:
# Load the daily engagement CSV into the list used by the cells that follow.
daily_engagement = read_csv('./csv/daily_engagement.csv')

In [17]:
# Inspect the first daily engagement record.
daily_engagement[0]

{'acct': '0',
 'lessons_completed': '0.0',
 'num_courses_visited': '1.0',
 'projects_completed': '0.0',
 'total_minutes_visited': '11.6793745',
 'utc_date': '2015-01-09'}

In [18]:
# Clean up the data types in the engagement table.
# The counts are stored as float-formatted strings such as '0.0', so each one
# goes through float() before int() (int('0.0') would raise ValueError).
# Assumes every record has non-empty numeric fields, as the sample record does.
for engagement_record in daily_engagement:
    engagement_record['lessons_completed'] = int(float(engagement_record['lessons_completed']))
    engagement_record['num_courses_visited'] = int(float(engagement_record['num_courses_visited']))
    engagement_record['projects_completed'] = int(float(engagement_record['projects_completed']))
    engagement_record['total_minutes_visited'] = float(engagement_record['total_minutes_visited'])
    # Same date handling as the enrollment and submission tables.
    engagement_record['utc_date'] = parse_date(engagement_record['utc_date'])

In [15]:
# Parse both date fields of every project submission with parse_date, which
# returns a datetime object, or None for an empty string.
for submission in project_submissions:
    for date_field in ('creation_date', 'completion_date'):
        submission[date_field] = parse_date(submission[date_field])

project_submissions[0]

OrderedDict([('creation_date', datetime.datetime(2015, 1, 14, 0, 0)),
             ('completion_date', datetime.datetime(2015, 1, 16, 0, 0)),
             ('assigned_rating', 'UNGRADED'),
             ('account_key', '256'),
             ('lesson_key', '3176718735'),
             ('processing_state', 'EVALUATED')])

In [16]:
# Preview only the first few submissions; displaying the whole list would dump
# thousands of records into the notebook output.
project_submissions[:3]

[OrderedDict([('creation_date', datetime.datetime(2015, 1, 14, 0, 0)),
              ('completion_date', datetime.datetime(2015, 1, 16, 0, 0)),
              ('assigned_rating', 'UNGRADED'),
              ('account_key', '256'),
              ('lesson_key', '3176718735'),
              ('processing_state', 'EVALUATED')]),
 OrderedDict([('creation_date', datetime.datetime(2015, 1, 10, 0, 0)),
              ('completion_date', datetime.datetime(2015, 1, 13, 0, 0)),
              ('assigned_rating', 'INCOMPLETE'),
              ('account_key', '256'),
              ('lesson_key', '3176718735'),
              ('processing_state', 'EVALUATED')]),
 OrderedDict([('creation_date', datetime.datetime(2015, 1, 20, 0, 0)),
              ('completion_date', datetime.datetime(2015, 1, 20, 0, 0)),
              ('assigned_rating', 'PASSED'),
              ('account_key', '256'),
              ('lesson_key', '3176718735'),
              ('processing_state', 'EVALUATED')]),
 OrderedDict([('creation_dat

In [17]:
# Total number of enrollment records (rows, not distinct students).
len(enrollments)

1640

In [20]:
# Collect the distinct account keys that appear in the enrollment records.
unique_enrolled_students = {enrollment['account_key'] for enrollment in enrollments}

len(unique_enrolled_students)

1302

In [None]:
# Collect the distinct account identifiers in the daily engagement records;
# this table stores them under the 'acct' key.
unique_engagement_students = {
    engagement_record['acct'] for engagement_record in daily_engagement
}

len(unique_engagement_students)

In [22]:
# Total number of project submission records.
len(project_submissions)

3642

In [23]:
# Collect the distinct account keys that appear in the project submissions.
unique_project_submitters = {
    submission['account_key'] for submission in project_submissions
}
len(unique_project_submitters)

743

### 딕셔너리의 특정 컬럼명 변경하기

In [None]:
# Rename the 'acct' key to 'account_key' so the engagement records use the same
# key name as the enrollment and submission records.
# The membership check keeps this cell safe to re-run: records that were
# already renamed are left untouched instead of raising KeyError.
for engagement_record in daily_engagement:
    if 'acct' in engagement_record:
        engagement_record['account_key'] = engagement_record.pop('acct')

In [24]:
# Small example dictionary used to demonstrate renaming a key.
dict_profile = dict(name='Jason Nam', age='39', gen='male')

In [25]:
# First half of the rename: copy the value to the new 'gender' key
# (the old 'gen' key is still present after this line).
dict_profile['gender'] = dict_profile['gen']

In [26]:
# Display the dictionary after the copy, before the old key is removed.
dict_profile

{'age': '39', 'gen': 'male', 'gender': 'male', 'name': 'Jason Nam'}

In [27]:
# Second half of the rename: drop the old 'gen' key.
del dict_profile['gen']

In [28]:
# Display the dictionary after the old key has been deleted.
dict_profile

{'age': '39', 'gender': 'male', 'name': 'Jason Nam'}

In [None]:
def get_unique_students(data):
    """Return the set of distinct ``'account_key'`` values found in ``data``.

    ``data`` is iterated once and every item is indexed with
    ``'account_key'``; an empty iterable yields an empty set.
    """
    return {entry['account_key'] for entry in data}

In [None]:
# Read 'account_key' from the first engagement record; that key only exists
# once the loop that renames 'acct' has been run over daily_engagement.
daily_engagement[0]['account_key']