## Setup

In [1]:
### load the required packages
# Core numerical / data-handling libraries.
import numpy as np 
import scipy as sp 
import pandas as pd 
# Static plotting; the magic below renders matplotlib figures inline in the notebook.
import matplotlib.pyplot as plt 
%matplotlib inline 
import seaborn as sns

# Plotly in offline mode, used for the interactive charts further down.
import plotly.offline as py
# NOTE(review): connected=True presumably loads plotly.js from a CDN instead of
# embedding it in the notebook - confirm against the plotly documentation.
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

import warnings
# Silences every warning for the rest of the session. Be aware that this also
# hides deprecation notices emitted by the libraries imported above.
warnings.filterwarnings('ignore')

In [2]:
### Load the dataset
# NOTE: this is an absolute, machine-specific location; anyone re-running the
# notebook elsewhere has to point it at their own copy of appendix.csv.
csv_path = 'D:/Dataset/Side_project_edx_analysis/appendix.csv'
df = pd.read_csv(csv_path)

# Quick sanity check: first five rows followed by the (rows, columns) shape.
print(df.head(), df.shape, sep='\n')

  Institution Course Number Launch Date  \
0        MITx        6.002x  09/05/2012   
1        MITx         6.00x  09/26/2012   
2        MITx        3.091x  10/09/2012   
3    HarvardX         CS50x  10/15/2012   
4    HarvardX        PH207x  10/15/2012   

                                        Course Title  \
0                           Circuits and Electronics   
1   Introduction to Computer Science and Programming   
2              Introduction to Solid State Chemistry   
3                   Introduction to Computer Science   
4  Health in Numbers: Quantitative Methods in Cli...   

                                         Instructors  \
0                                     Khurram Afridi   
1            Eric Grimson, John Guttag, Chris Terman   
2                                       Michael Cima   
3  David Malan, Nate Hardison, Rob Bowden, Tommy ...   
4                 Earl Francis Cook, Marcello Pagano   

                                      Course Subject  Year  \
0  Sc

In [3]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 290 entries, 0 to 289
Data columns (total 23 columns):
Institution                                     290 non-null object
Course Number                                   290 non-null object
Launch Date                                     290 non-null object
Course Title                                    290 non-null object
Instructors                                     289 non-null object
Course Subject                                  290 non-null object
Year                                            290 non-null int64
Honor Code Certificates                         290 non-null int64
Participants (Course Content Accessed)          290 non-null int64
Audited (> 50% Course Content Accessed)         290 non-null int64
Certified                                       290 non-null int64
% Audited                                       290 non-null float64
% Certified                                     290 non-null float64
% Certified of

## EDA

#### Convert the Launch Date column to datetime objects

In [4]:
from datetime import datetime

In [5]:
# Parse the 'MM/DD/YYYY' strings in one vectorised call. Unlike a row-wise
# datetime.strptime, pd.to_datetime also accepts values that are already
# datetimes, so this cell can be re-run without raising a TypeError.
df['Launch Date'] = pd.to_datetime(df['Launch Date'], format='%m/%d/%Y')

#### Explore the trend of Computer Science

In [6]:
# Indicator column: 1 when the subject text mentions 'Computer Science', else 0.
df['Computer_science'] = df['Course Subject'].map(
    lambda subject: 1 if 'Computer Science' in subject else 0
)

In [7]:
# Derive the calendar year from the parsed launch date with the vectorised
# .dt accessor instead of a per-row Python lambda.
# Note: this overwrites the 'Year' column that was read from the CSV.
df['Year'] = df['Launch Date'].dt.year

In [8]:
# Group the courses by launch year. The key is passed as a scalar rather than
# a one-element list: on pandas >= 2.0 a list of length one makes iteration
# yield 1-tuple keys such as (2012,) instead of plain years. Aggregations are
# unaffected either way.
df_by_yr = df.groupby('Year')

In [9]:
### Calculate the share of courses related to Computer Science per year
# 'Computer_science' is a 0/1 flag, so its per-year mean is exactly
# (number of flagged courses) / (number of courses) for that year.
com_ratio = df_by_yr['Computer_science'].mean()

In [10]:
### Visualize
# Line chart of the yearly Computer Science ratio computed in `com_ratio`:
# years on the x axis (one tick per year), ratio on the y axis.
trace_1 = go.Scatter(
    x=list(com_ratio.index),
    y=list(com_ratio.values),
    mode='lines',
    name='Ratio',
    line={'color': '#006400'},
)

layout = go.Layout(
    title='Ratio of Computer Science Related Courses',
    xaxis={'dtick': 1, 'title': 'Year'},
    yaxis={'title': 'Ratio'},
)

data = [trace_1]

fig = go.Figure(data=data, layout=layout)
py.offline.iplot(fig)

#### Which courses do men and women take?

Top 10 courses with the highest share of male participants:

In [11]:
# Ten courses with the highest share of male participants.
# `.ix` was removed in pandas 1.0; `.loc` is the label-based equivalent for
# "all rows, these columns".
male = (
    df.sort_values('% Male', ascending=False)
      .loc[:, ['Course Title', 'Course Subject', '% Male']]
      .head(10)
      .copy(deep=True)
)
# Rank from 1. The length comes from the frame itself so the assignment
# cannot fail with a length mismatch if fewer than ten rows are available.
male.index = range(1, len(male) + 1)
male

Unnamed: 0,Course Title,Course Subject,% Male
1,Introduction to State Space Control,"Science, Technology, Engineering, and Mathematics",93.44
2,Introduction to Feedback Control Theory,"Science, Technology, Engineering, and Mathematics",92.54
3,Underactuated Robotics,"Science, Technology, Engineering, and Mathematics",91.97
4,Flight Vehicle Aerodynamics,"Science, Technology, Engineering, and Mathematics",91.48
5,Discrete-Time Signal Processing,"Science, Technology, Engineering, and Mathematics",91.4
6,Introduction to Control System Design,"Science, Technology, Engineering, and Mathematics",91.17
7,Mastering Quantum Mechanics: Entanglement and ...,"Science, Technology, Engineering, and Mathematics",91.1
8,Underactuated Robotics,"Science, Technology, Engineering, and Mathematics",91.04
9,Flight Vehicle Aerodynamics,"Science, Technology, Engineering, and Mathematics",89.61
10,"Circuits and Electronics: Amplification, Speed...","Science, Technology, Engineering, and Mathematics",89.55


Top 10 courses with the highest share of female participants:

In [12]:
# Ten courses with the highest share of female participants.
# `.ix` was removed in pandas 1.0; `.loc` is the label-based equivalent for
# "all rows, these columns".
female = (
    df.sort_values('% Female', ascending=False)
      .loc[:, ['Course Title', 'Course Subject', '% Female']]
      .head(10)
      .copy(deep=True)
)
# Rank from 1. The length comes from the frame itself so the assignment
# cannot fail with a length mismatch if fewer than ten rows are available.
female.index = range(1, len(female) + 1)
female

Unnamed: 0,Course Title,Course Subject,% Female
1,Introduction to Family Engagement in Education,"Humanities, History, Design, Religion, and Edu...",74.76
2,Poetry in America: Emily Dickinson,"Humanities, History, Design, Religion, and Edu...",66.74
3,History of the Book: Book Sleuthing: What 19th...,"Humanities, History, Design, Religion, and Edu...",66.62
4,Tangible Things,"Humanities, History, Design, Religion, and Edu...",65.69
5,Introduction to Data Wise: A Collaborative Pro...,"Humanities, History, Design, Religion, and Edu...",64.69
6,World Religions Through their Scriptures: Reli...,"Humanities, History, Design, Religion, and Edu...",63.04
7,Practical Improvement Science in Health Care: ...,"Government, Health, and Social Science",62.76
8,Tangible Things,"Humanities, History, Design, Religion, and Edu...",62.62
9,History of the Book: The Medieval Book of Hour...,"Humanities, History, Design, Religion, and Edu...",62.09
10,History of the Book: The History of the Book i...,"Humanities, History, Design, Religion, and Edu...",61.96


#### Variation by course subject

In [13]:
# Group the courses by subject. The key is passed as a scalar rather than a
# one-element list: on pandas >= 2.0 a list of length one makes iteration
# yield 1-tuple keys such as ('Computer Science',) instead of plain strings.
df_by_sub = df.groupby('Course Subject')

In [14]:
# Number of courses per subject, most frequent first (there are 4 subjects).
df['Course Subject'].value_counts(sort=True, ascending=False)

Humanities, History, Design, Religion, and Education    94
Science, Technology, Engineering, and Mathematics       91
Government, Health, and Social Science                  75
Computer Science                                        30
Name: Course Subject, dtype: int64

In [15]:
# Show the labels of the subject groups.
# NOTE(review): the order of these keys is not necessarily the order in which
# the groupby yields its groups when iterated - verify before pairing labels
# with per-group data by position.
df_by_sub.groups.keys()

dict_keys(['Computer Science', 'Science, Technology, Engineering, and Mathematics', 'Humanities, History, Design, Religion, and Education', 'Government, Health, and Social Science'])

In [16]:
# Participant counts of the group at position 3 in groupby iteration order.
[counts.values for _, counts in df_by_sub['Participants (Course Content Accessed)']][3]

array([36105, 16663,  8270, 39178, 16752, 28535,  8736, 15350, 11961,
       35903, 24707, 60555, 21387, 18424, 26364,  7905, 11883, 27882,
        5854, 12939,  6140,  5092, 15627, 10755, 19263, 11841,  5245,
        7093, 26086, 11074,  7000,  6980,  4143, 22828, 22537,  7396,
        7008,  5448, 10932,  3661, 13525,  7723, 10626,  8088, 18266,
        6934, 31710, 48922,  5725,  4767,  6577,  6129, 18732, 38119,
        6146,  5566, 10191, 11574, 29176,  3178,  1178,  2860,  2075,
         790,  7260,   658,  8377,  5747,   639,  2456, 17354,  9788,
        3842,  8936,  1862,  3749,  3294,   510,  1236,   362,  8297,
        7898,  4825,  8017,  4014, 12218,  1645,  2335,  3937,  1431,  1294], dtype=int64)

Distribution of the number of participants for each subject

In [17]:
### Visualization
# One box per course subject showing the spread of participant counts.

colors = ['rgb(64, 59, 191)', 'rgb(18, 115, 87)', 'rgb(217, 151, 72)', 'rgb(191, 69, 69)']

# Collect each label together with its own data in a single pass over the
# groupby. Taking labels from `groups.keys()` and data from iteration order
# (as was done before) pairs them by position, and those two orders are not
# guaranteed to agree - which attaches the wrong subject name to a box.
subjects = []
xs = []
for key, counts in df_by_sub['Participants (Course Content Accessed)']:
    # Depending on the pandas version and how the groupby was built, the key
    # is either the subject string or a 1-tuple wrapping it.
    subjects.append(key[0] if isinstance(key, tuple) else key)
    xs.append(counts.values)

# Build one trace per subject actually present (no hard-coded group count);
# the palette is cycled if there are more subjects than colors.
traces = [
    go.Box(
        y=values,
        name=subject,
        marker=dict(color=colors[i % len(colors)]),
    )
    for i, (subject, values) in enumerate(zip(subjects, xs))
]

layout = go.Layout(
    title='Distribution of Number of Participants by Subject',
    margin=dict(
        l=150,
        r=30,
        b=50,
        t=50,
    ),
    xaxis=dict(showticklabels=False))  # hide the x tick labels; the legend names the boxes

fig = go.Figure(data=traces, layout=layout)
py.offline.iplot(fig)