# Python for Data Journalists MOOC participant analysis

By Ben Welsh

## Import Python tools

In [114]:
import bs4
import pandas as pd

In [115]:
# Lift the row cap so displayed frames and series are never truncated.
pd.set_option("display.max_rows", None)

### Read in and parse student roster

The source data file has been omitted from this repository to protect the privacy of students.

In [116]:
# Read the saved roster page into memory. The context manager closes the
# file handle as soon as the contents have been read, instead of leaving
# it open for the lifetime of the kernel.
with open("./input/PDJ0517_ Participants.html", 'r') as roster_file:
    html = roster_file.read()

In [117]:
# Parse the raw markup with the html5lib tree builder.
soup = bs4.BeautifulSoup(html, features="html5lib")

In [118]:
# Locate the first <table> element whose id is "participants".
table = soup.find("table", attrs={"id": "participants"})

In [119]:
# Collect every <tr> nested inside the participants table.
raw_rows = table.find_all(name="tr")

In [120]:
def parse_row(row):
    """Pull one participant's fields out of a roster table row.

    Args:
        row: an element exposing ``find_all("td")``; the returned cells are
            read by position (cell 0 is not used).

    Returns:
        dict with the keys ``image``, ``detail_url``, ``name``, ``city``
        and ``country``.
    """
    tds = row.find_all("td")
    image_src = tds[1].img['src']
    # The same <a> tag supplies both the link target and the display name.
    profile_link = tds[2].a
    return {
        'image': image_src,
        'detail_url': profile_link['href'],
        'name': profile_link.text,
        'city': tds[3].text,
        'country': tds[4].text,
    }

In [121]:
# Parse every row after the first (presumably the header row -- confirm
# against the source page), skipping rows tagged with the "emptyrow" class.
# r.get() avoids a KeyError on rows that have no class attribute at all, and
# the membership test still matches when a row carries additional classes.
parsed_rows = [
    parse_row(r)
    for r in raw_rows[1:]
    if u'emptyrow' not in (r.get('class') or [])
]

### Load the roster into pandas for analysis

In [109]:
# Each parsed dict becomes one row; its keys become the columns.
df = pd.DataFrame(data=parsed_rows)

### How many students are there?

In [122]:
# Row count of the roster frame.
df.shape[0]

1792

### Which countries have sent the most students?

In [110]:
# Count the non-null detail_url values per country, then turn the group
# key back into an ordinary column.
countries = (
    df.groupby("country")
    .agg({"detail_url": "count"})
    .reset_index()
)

In [111]:
# Summary statistics for the country column.
df["country"].describe()

count              1792
unique              111
top       United States
freq                669
Name: country, dtype: object

In [112]:
# Write the per-country tally to CSV, largest count first.
countries.sort_values(by="detail_url", ascending=False).to_csv(
    path_or_buf="output/mooc-top-countries.csv",
    encoding="utf-8",
    index=False,
)

In [113]:
# Display the per-country tally, largest count first.
countries.sort_values(by="detail_url", ascending=False)

Unnamed: 0,country,detail_url
107,United States,669
13,Brazil,235
92,Spain,82
106,United Kingdom,65
61,Mexico,49
17,Canada,48
4,Argentina,43
43,India,34
34,Germany,29
5,Australia,20


### Which locations within countries have sent the most students?

In [123]:
# Count the non-null detail_url values per city. Only `locations` is
# assigned here: also rebinding `countries` would replace the per-country
# table with city data, so re-running the country cells would export and
# display the wrong figures.
locations = (
    df.groupby("city")
    .agg({"detail_url": "count"})
    .reset_index()
)

In [125]:
# Summary statistics for the city column.
df["city"].describe()

count     1792
unique     757
top           
freq       339
Name: city, dtype: object

In [127]:
# Display the per-city tally, largest count first.
locations.sort_values(by="detail_url", ascending=False)

Unnamed: 0,city,detail_url
0,,339
603,São Paulo,35
414,New York,27
337,London,27
353,Madrid,23
510,Rio de Janeiro,23
43,Austin,16
343,Los Angeles,15
537,San Francisco,15
107,Buenos Aires,14
