# Python for Data Journalists MOOC participant analysis

By Ben Welsh

## Import Python tools

In [21]:
import bs4
import pandas as pd

In [22]:
# Lift the row cap so displayed DataFrames are never truncated.
pd.set_option("display.max_rows", None)

### Read in and parse student roster

The source data file has been omitted from this repository to protect the privacy of students.

In [23]:
# Read the saved roster page; the context manager makes sure the file handle
# is closed as soon as the contents are in memory.
with open("./input/PDJ0517_ Participants.html", 'r') as roster_file:
    html = roster_file.read()

In [24]:
# Parse the page with the html5lib parser.
soup = bs4.BeautifulSoup(markup=html, features="html5lib")

In [25]:
# Locate the roster table. `find` returns None when nothing matches, so fail
# here with a clear message instead of an AttributeError in a later cell.
table = soup.find("table", id="participants")
if table is None:
    raise ValueError('No <table id="participants"> found in the roster HTML')

In [27]:
# Collect every <tr> element inside the roster table.
raw_rows = table.find_all(name="tr")

In [28]:
def parse_row(row):
    """Turn one roster table row into a flat dict.

    `row` must expose `find_all("td")`. The cells are read by position:
    index 1 holds an <img>, index 2 holds an <a> link, and indexes 3 and 4
    are read as plain text.

    Returns a dict with the keys `image`, `detail_url`, `name`, `city`
    and `country`.
    """
    tds = row.find_all("td")
    avatar_src = tds[1].img['src']
    profile_link = tds[2].a
    return {
        'image': avatar_src,
        'detail_url': profile_link['href'],
        'name': profile_link.text,
        'city': tds[3].text,
        'country': tds[4].text,
    }

In [29]:
# Skip the first row (presumably the table header) and any row marked with the
# `emptyrow` class. BeautifulSoup exposes `class` as a list of tokens, so test
# membership rather than list equality; `.get` avoids a KeyError on rows that
# have no class attribute at all.
parsed_rows = [
    parse_row(r)
    for r in raw_rows[1:]
    if u'emptyrow' not in r.get('class', [])
]

### Load the roster into pandas for analysis

In [30]:
# One DataFrame row per parsed student dict.
df = pd.DataFrame(data=parsed_rows)

### How many students are there?

In [32]:
# Row count of the roster frame.
df.shape[0]

2541

### Which countries have sent the most students?

In [33]:
# Count roster rows per country; `detail_url` serves as the column being counted.
countries = (
    df.groupby("country")
    .agg({"detail_url": "count"})
    .reset_index()
)

In [35]:
# Summary statistics for the country column.
df["country"].describe()

count              2541
unique              122
top       United States
freq               1005
Name: country, dtype: object

In [36]:
# Write the per-country counts to disk, largest first.
countries.sort_values(by="detail_url", ascending=False).to_csv(
    "output/mooc-top-countries.csv", encoding="utf-8", index=False
)

In [37]:
# Display countries ordered from most to fewest students.
countries.sort_values(by="detail_url", ascending=False)

Unnamed: 0,country,detail_url
118,United States,1005
17,Brazil,285
101,Spain,99
117,United Kingdom,83
68,Mexico,77
49,India,66
22,Canada,58
40,Germany,52
4,Argentina,49
77,Nigeria,32


### Which locations within countries have sent the most students?

In [38]:
# Count roster rows per city; `detail_url` serves as the column being counted.
# NOTE(review): grouping on city alone merges same-named cities that sit in
# different countries — confirm this is intended.
locations = (
    df.groupby("city")
    .agg({"detail_url": "count"})
    .reset_index()
)

In [39]:
# Summary statistics for the city column.
df["city"].describe()

count     2541
unique     976
top           
freq       489
Name: city, dtype: object

In [40]:
# Display cities ordered from most to fewest students.
locations.sort_values(by="detail_url", ascending=False)

Unnamed: 0,city,detail_url
0,,489
771,São Paulo,43
540,New York,39
432,London,38
452,Madrid,31
441,Los Angeles,29
51,Austin,24
654,Rio de Janeiro,23
843,Washington,21
690,San Francisco,20
