# Python for Data Journalists MOOC participant analysis

By Ben Welsh

## Import Python tools

In [68]:
import bs4
import pandas as pd
from iso3166 import countries as iso3166

In [69]:
# Lift the row limit so DataFrames are displayed in full instead of truncated.
pd.options.display.max_rows = None

### Read in and parse student roster

The source data file has been omitted from this repository to protect the privacy of students.

In [70]:
# Read the saved roster page into memory. The context manager closes the
# file handle as soon as the contents have been read, instead of leaving it
# open until garbage collection.
with open("./input/PDJ0517_ Participants.html", 'r') as roster_file:
    html = roster_file.read()

In [71]:
# Parse the raw HTML using the html5lib parser backend.
soup = bs4.BeautifulSoup(html, "html5lib")

In [72]:
# Locate the <table> element whose id attribute is "participants".
table = soup.find("table", id="participants")

In [73]:
# Collect every <tr> element found inside the participants table.
raw_rows = table.find_all("tr")

In [74]:
def parse_row(row):
    """Convert one roster table row into a dict describing a student.

    ``row`` is a ``<tr>`` element; its ``<td>`` cells are read by position:
    cell 1 holds an ``<img>`` (its ``src`` is kept), cell 2 holds a link
    (its ``href`` and text are kept), and cells 3 and 4 supply the city and
    country text. Cell 0 is not used.

    Returns a dict with the keys ``image``, ``detail_url``, ``name``,
    ``city`` and ``country``. Cell text is stored as-is, without stripping.
    """
    tds = row.find_all("td")
    image_src = tds[1].img['src']
    # The same anchor supplies both the profile URL and the display name.
    profile_link = tds[2].a
    return {
        'image': image_src,
        'detail_url': profile_link['href'],
        'name': profile_link.text,
        'city': tds[3].text,
        'country': tds[4].text,
    }

In [75]:
# Parse every row after the first one (presumably the header row — confirm
# against the source page), leaving out rows whose class list is exactly
# ['emptyrow'].
parsed_rows = [
    parse_row(tr)
    for tr in raw_rows[1:]
    if tr['class'] != [u'emptyrow']
]

### Load the roster into pandas for analysis

In [76]:
# One DataFrame row per parsed student dict; the dict keys become the columns.
df = pd.DataFrame(parsed_rows)

### How many students are there?

In [77]:
# Number of rows in the roster DataFrame, i.e. parsed student entries.
len(df)

2740

### Which countries have sent the most students?

In [78]:
# Count students per country. The count is taken over the detail_url column,
# so at this point the result has the columns "country" and "detail_url".
countries = (
    df.groupby("country")
    .agg({"detail_url": "count"})
    .reset_index()
)

In [79]:
# Summarize the country column: number of entries, number of distinct
# values, the most frequent value and how often it occurs.
df.country.describe()

count              2740
unique              123
top       United States
freq               1064
Name: country, dtype: object

In [80]:
# Export the per-country counts, largest first. This runs before the columns
# are renamed, so the CSV header is "country,detail_url".
countries.sort_values("detail_url", ascending=False).to_csv(
    "output/mooc-top-countries.csv",
    index=False,  # leave the DataFrame index out of the file
    encoding="utf-8"
)

In [81]:
# Give the two columns (country, detail_url count) friendlier names.
countries.columns = ['name', 'total']

In [82]:
# Each country's share of all students, stored as a fraction of the overall
# total (0-1), not multiplied by 100.
countries['percent'] = countries.total / countries.total.sum()

In [83]:
def get_code(name):
    """Return the ISO 3166-1 alpha-2 code for a country name.

    Surrounding whitespace in ``name`` is ignored. The name is looked up in
    the ``iso3166`` country table; 'United Kingdom', which that lookup
    rejects, is mapped to the "GB" entry instead. Any other name the table
    does not recognize yields an empty string rather than raising.
    """
    # Strip once and use the cleaned value everywhere, so the fallback below
    # matches the same text that was just looked up.
    cleaned = name.strip()
    try:
        return iso3166.get(cleaned).alpha2
    except KeyError:
        if cleaned == 'United Kingdom':
            return iso3166.get("GB").alpha2
        return ''

In [84]:
# Look up a two-letter code for every country name; names that get_code
# cannot resolve receive an empty string.
countries['code'] = countries.name.apply(get_code)

In [85]:
# Show the 20 countries with the most students, largest first.
countries.sort_values("total", ascending=False).head(20)

Unnamed: 0,name,total,percent,code
118,United States,1064,0.388321,US
17,Brazil,319,0.116423,BR
101,Spain,104,0.037956,ES
117,United Kingdom,87,0.031752,GB
68,Mexico,81,0.029562,MX
49,India,70,0.025547,IN
40,Germany,63,0.022993,DE
22,Canada,62,0.022628,CA
4,Argentina,52,0.018978,AR
77,Nigeria,39,0.014234,NG


### Which locations within countries have sent the most students?

In [86]:
# Count students per city value. Grouping uses the city text alone, so
# identically named cities in different countries are counted together, and
# rows with a blank city form a group of their own.
locations = (
    df.groupby("city")
    .agg({"detail_url": "count"})
    .reset_index()
)

In [87]:
# Summarize the city column: number of entries, number of distinct values,
# the most frequent value and how often it occurs.
df.city.describe()

count     2740
unique    1032
top           
freq       513
Name: city, dtype: object

In [88]:
# Give the two columns (city, detail_url count) friendlier names.
locations.columns = ["name", "total"]

In [89]:
# Each city's share of all students, stored as a fraction of the overall
# total (0-1), not multiplied by 100.
locations['percent'] = locations.total / locations.total.sum()

In [90]:
# Show the 20 city values with the most students, largest first.
locations.sort_values("total", ascending=False).head(20)

Unnamed: 0,name,total,percent
0,,513,0.187226
817,São Paulo,46,0.016788
467,London,41,0.014964
579,New York,41,0.014964
487,Madrid,33,0.012044
476,Los Angeles,31,0.011314
697,Rio de Janeiro,27,0.009854
56,Austin,27,0.009854
896,Washington,22,0.008029
735,San Francisco,22,0.008029
