In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

plt.rc("figure", dpi=150, figsize=(6, 3))

# 13.1 Bitly Data from 1.USA.gov

In [None]:
path = "datasets/bitly_usagov/example.txt"

# json data
with open(path) as f:
    print(f.readline())

In [None]:
import json

# Parse the file as JSON Lines: every non-empty line is one JSON object,
# which becomes one dict in the list.
# The encoding is given explicitly because JSON text is UTF-8, while open()
# would otherwise fall back to the platform's default encoding.
with open(path, encoding="utf-8") as f:
    # blank lines carry no record and would make json.loads raise
    records = [json.loads(line) for line in f if line.strip()]

records[0]

## Counting Time Zones in Pure Python

In [None]:
# extract a list of time zones
time_zones = [rec["tz"] for rec in records if "tz" in rec]
time_zones[:10]

In [None]:
# tally the time zones using nothing but built-in types
ts = {}
for rec in records:
    if "tz" in rec:
        zone = rec["tz"]
        ts[zone] = ts.get(zone, 0) + 1

# turn the tally into (time zone, count) pairs, most frequent first
# (the sort is stable, so equal counts keep their first-seen order)
ts = sorted(ts.items(), key=lambda pair: pair[1], reverse=True)
print(ts)

In [None]:
# pure python for counting time zones
from collections import Counter

c = Counter(time_zones)
len(c), c["America/New_York"], len(time_zones)

In [None]:
c.most_common(10)

## Counting Time Zones with pandas

In [None]:
frame = pd.DataFrame(records)
frame.info()

In [None]:
frame["tz"].head()

In [None]:
# simply get the number of times each timezone occurs
tz_counts = frame["tz"].value_counts()
tz_counts.head(10)

In [None]:
clean_tz = frame["tz"].fillna("Missing")    # Clean na values
clean_tz[clean_tz == ""] = "Unknown"        # get rid of empty lines ("" is not na)

tz_counts = clean_tz.value_counts()
tz_counts.head(10)  # this now includes the "Missing" category, which was left out before due to pandas' way of handling missing data

In [None]:
import seaborn as sns

subset = tz_counts.head()
sns.barplot(y=subset.index, x=subset.to_numpy())

In [None]:
# "a" field contains information about the browser, device or application used to perform the URL shortening
frame["a"][1], frame["a"][50]

In [None]:
# Extract browser information
results = pd.Series([x.split()[0] for x in frame["a"].dropna()])
results.head()

In [None]:
results.value_counts().head(8)

In [None]:
cframe = frame[frame["a"].notna()].copy()   # get all non-null values
cframe["os"] = np.where(cframe["a"].str.contains("Windows"), "Windows", "Not Windows")  # determine whether the os is windows or not
cframe["os"].head()

In [None]:
by_tz_os = cframe.groupby(["tz", "os"]) # group by timezone and operating system
by_tz_os.size()

In [None]:
# size(): get the number of occurrences per category; unstack(): unstack the lowest level index and turn it into columns
# fillna(0): any missing values are filled with 0
# we now have a dataframe that tells us the number of occurrence of each time zone (index) and whether the user used Windows or not (column)
agg_counts = by_tz_os.size().unstack().fillna(0)
print(agg_counts.head())

In [None]:
# - we first compute the sum per index (time zone)
# - argsort() then creates a new series, where indexer[i] = j means original position j would appear at sorted position i
# - so the values are the original position from the previous dataframe
# - and the position tells us where this value is now
# - the timezone index is useless, this does not mean tz[Africa/Cairo] used to be position 7!
indexer = agg_counts.sum("columns").argsort()

In [None]:
indexer[-10:]

In [None]:
# indexer is sorted in increasing fashion -> indexer[-10:] gives us the 10 highest values
count_subset = agg_counts.take(indexer[-10:])
count_subset

In [None]:
# pandas also has a convenient method
agg_counts.sum(axis="columns").nlargest(10)

In [None]:
# count_subset now contains the 10 most common timezones in increasing order
# stack() -> stack the columns above each other into rows
# the product is a series
count_subset = count_subset.stack()

# give the series column a name
count_subset.name = "total"

# reset the index: since count_subset has hierarchical index, a dataframe is created, with the index in newly created columns
count_subset = count_subset.reset_index()

sns.barplot(x="total", y="tz", hue="os", data=count_subset)

In [None]:
count_subset.head()

In [None]:
# normalize to a sum of 1 per group
def norm_total(group):
    """Return a copy of ``group`` with a ``normed_total`` column added.

    ``normed_total`` is each row's ``total`` divided by the sum of ``total``
    over the whole group, i.e. the row's relative frequency within the group.

    The frame passed in is not modified: functions handed to
    ``groupby().apply`` must not mutate the object they receive.
    """
    return group.assign(normed_total=group["total"] / group["total"].sum())

# count_subset holds one row per (tz, os) pair with its count in "total"
# grouping by "tz" therefore gives groups with one row per os
# apply() runs norm_total on every group and concatenates the returned frames,
# so results is a DataFrame whose "normed_total" column is the share of each os within its time zone
results = (
    count_subset
    .groupby("tz")
    .apply(norm_total, include_groups=False)
)
sns.barplot(data=results, x="normed_total", y="tz", hue="os")

In [None]:
# compute normalized sum more efficiently
# same result as above
g = count_subset.groupby("tz")
results2 = count_subset["total"] / g["total"].transform("sum")
results2

# 13.2 MovieLens 1M Dataset

This dataset contains 1 million movie ratings from 6000 users on 4000 movies. The data is split across three tables: ratings, user information, and movie information.

In [None]:
# the three MovieLens files share one layout: "::"-separated fields and no header row
# (a separator longer than one character needs the python parsing engine)
dat_options = {"sep": "::", "header": None, "engine": "python"}

unames = ["user_id", "gender", "age", "occupation", "zip"]
users = pd.read_table("datasets/movielens/users.dat", names=unames, **dat_options)

rnames = ["user_id", "movie_id", "rating", "timestamp"]
ratings = pd.read_table("datasets/movielens/ratings.dat", names=rnames, **dat_options)

mnames = ["movie_id", "title", "genres"]
movies = pd.read_table("datasets/movielens/movies.dat", names=mnames, **dat_options)

In [None]:
users.head()

In [None]:
users.info()

In [None]:
ratings.head()

In [None]:
ratings.info()

In [None]:
movies.head()

In [None]:
movies.info()

In [None]:
# merge data, pandas infers the key to merge based on overlapping names
data = pd.merge(pd.merge(ratings, users), movies)
data

In [None]:
data.iloc[0]

In [None]:
# compute mean movie ratings for each film by gender
# mean is default method
mean_ratings = data.pivot_table("rating", index="title", columns="gender")
mean_ratings.head()

In [None]:
# select movies with at least 250 ratings
ratings_by_title = data.groupby("title").size()
ratings_by_title.head()

In [None]:
active_titles = ratings_by_title.index[ratings_by_title >= 250]
active_titles

In [None]:
# since mean ratings uses the titles as index, we can select the titles now
mean_ratings = mean_ratings.loc[active_titles]

In [None]:
# we can find top-rated movies by female raters:
# by default, the output is sorted in an ascending fashion
top_female_ratings = mean_ratings.sort_values("F", ascending=False)
top_female_ratings.head()

## Measuring Rating Disagreement

Find movies with the highest disagreement between male and female watchers.

In [None]:
mean_ratings["Diff"] = mean_ratings["M"] - mean_ratings["F"]
highest_diff = mean_ratings.sort_values("Diff")
highest_diff.head()

In [None]:
# get movies preferred by men
highest_diff[::-1].head()

In [None]:
# we can compute the highest disagreement using the standard deviation
by_title = data.groupby("title")["rating"].std()

# again filter active titles
by_title = by_title.loc[active_titles]
by_title.sort_values(ascending=False)

In [None]:
# turn the "genre1|genre2" strings into one row per (movie, genre)

# first, replace "genres" by a list-valued "genre" column on a new frame;
# movies itself stays untouched, so this cell can be re-run safely
# (popping the column from movies would fail with a KeyError on a second run)
movies_split = movies.assign(genre=movies["genres"].str.split("|")).drop(columns="genres")

# explode() emits one row per list element and repeats the other columns
movies_exploded = movies_split.explode("genre")
movies_exploded[:10]

In [None]:
# merge back
ratings_with_genre = pd.merge(pd.merge(movies_exploded, ratings), users)
ratings_with_genre.iloc[0]

In [None]:
# group by genre
# unstack pivots a multiindex series into a dataframe with the index being turned into columns
genre_ratings = ratings_with_genre.groupby(["genre", "age"])["rating"].mean().unstack("age")
genre_ratings

# 13.3 US Baby Names 1880 - 2010

In [None]:
!head -n 10 datasets/babynames/yob1880.txt

In [None]:
names1880 = pd.read_csv("datasets/babynames/yob1880.txt", names=["name", "sex", "births"])
names1880.head()

In [None]:
names1880.info()

In [None]:
# births per year per sex
names1880.groupby("sex")["births"].sum()

In [None]:
# one frame per yearly file, each tagged with the year it belongs to
name_columns = ["name", "sex", "births"]
pieces = [
    pd.read_csv(f"datasets/babynames/yob{year}.txt", names=name_columns).assign(year=year)
    for year in range(1880, 2011)
]

# stack the yearly frames into one frame with a fresh 0..n-1 index
names = pd.concat(pieces, ignore_index=True)
names.head()

In [None]:
# get total births by sex and year using groupby
# again unstack pivots the lowest level of the hierarchical index (rows) to columns
# in this case, "sex" is the lowest level index
names.groupby(["year", "sex"])["births"].sum().unstack()

In [None]:
# and using pivot_table
births_per_year = names.pivot_table("births", index="year", columns="sex", aggfunc="sum")
births_per_year.plot(title="Total births by sex and year")

In [None]:
names

In [None]:
def prop(group):
    """Return a copy of ``group`` with a ``prop`` column added.

    ``prop`` is each row's ``births`` divided by the sum of ``births`` over
    the whole group, i.e. the name's share of all births in that group.

    The frame passed in is not modified: functions handed to
    ``groupby().apply`` must not mutate the object they receive.
    """
    return group.assign(prop=group["births"] / group["births"].sum())

# calculate relative frequency of each name per year and sex
names = names.groupby(["year", "sex"]).apply(prop, include_groups=False).reset_index().drop(columns="level_2")

In [None]:
# sanity check to see whether the prop column sums up to 1 within each group
names.groupby(["year", "sex"])["prop"].sum()

In [None]:
# top 1000 names for each sex/year combination
def get_top_1000(group):
    """Return the rows of ``group`` with the most ``births``, largest first, capped at 1000 rows."""
    ranked = group.sort_values("births", ascending=False)
    return ranked.iloc[:1000]

grouped = names.groupby(["year", "sex"])

top1000 = grouped.apply(get_top_1000, include_groups=False).reset_index(level=[0, 1])
top1000

## Analyzing Naming Trends

In [None]:
boys = top1000[top1000["sex"] == "M"]
girls = top1000[top1000["sex"] == "F"]

In [None]:
total_births = top1000.pivot_table("births", index="year", columns="name", aggfunc="sum")

In [None]:
total_births.info()

In [None]:
subset = total_births[["John", "Harry", "Mary", "Marilyn"]]
subset.plot(subplots=True, figsize=(12, 10), title="Number of births per year")

In [None]:
top1000.head()

In [None]:
# measuring the increase in naming diversity
table = top1000.pivot_table("prop", index="year", columns="sex", aggfunc="sum")
table.plot(title="Sum of table1000.prop by year and sex", yticks=np.linspace(0, 1.2, 13))

In [None]:
# now look at number of distinct names taken in order of popularity from highest to lowest
df = boys[boys["year"] == 2010]
df

In [None]:
# sort prop in descending order
prop_cumsum = df["prop"].sort_values(ascending=False).cumsum()
prop_cumsum[:10]

In [None]:
# how many names does it take to reach 50 % of all births
# do not forget that arrays are 0-indexed: the actual value is 1 larger
prop_cumsum.searchsorted(0.5)

In [None]:
df

In [None]:
# now let's check for 1900
boys_1900 = boys[boys["year"] == 1900]

# running total of prop, starting from the most popular name
prop_cumsum1900 = boys_1900["prop"].sort_values(ascending=False).cumsum()

# searchsorted returns a 0-based position; + 1 turns it into a number of names
prop_cumsum1900.searchsorted(0.5) + 1

In [None]:
def get_quantile_count(group, q=0.5):
    """Count how many of the most popular names are needed to cover a share ``q`` of births.

    ``prop`` is accumulated from the largest value downwards; the result is the
    1-based position at which the running total first reaches ``q`` (default 0.5).
    """
    running_share = group["prop"].sort_values(ascending=False).cumsum()
    # searchsorted yields a 0-based position, hence the + 1
    return running_share.searchsorted(q) + 1

diversity = top1000.groupby(["year", "sex"]).apply(get_quantile_count, include_groups=False)
diversity = diversity.unstack()
diversity

In [None]:
diversity.plot(title="Number of popular names in top 50 %")

In [None]:
names.head()

In [None]:
def get_last_letter(x):
    """Return the final character of the string ``x``."""
    final_char = x[-1]
    return final_char

last_letters = names["name"].map(get_last_letter)
last_letters.name = "last_letter"
last_letters.head()

In [None]:
table = names.pivot_table("births", index=last_letters, columns=["sex", "year"], aggfunc="sum")

# select a few representative years
subtable = table.reindex(columns=[1910, 1960, 2010], level="year")
subtable.head()

In [None]:
letter_prop = subtable / subtable.sum()
letter_prop

In [None]:
fig, axes = plt.subplots(2, 1, figsize=(10, 8))
letter_prop["M"].plot(kind="bar", rot=0, ax=axes[0], title="Male")
letter_prop["F"].plot(kind="bar", rot=0, ax=axes[1], title="Female", legend=False)

In [None]:
letter_prop = table / table.sum()
dny_ts = letter_prop.loc[["d", "n", "y"], "M"].T
dny_ts.head()

In [None]:
dny_ts.plot()

In [None]:
all_names = pd.Series(top1000["name"].unique())
lesley_like = all_names[all_names.str.contains("Lesl")]
lesley_like

In [None]:
# choose all rows that have a lesley_like name
filtered = top1000[top1000["name"].isin(lesley_like)]

# group the data by name and calculate the total births of each name
filtered.groupby("name")["births"].sum()

In [None]:
# create a pivot table that contains the total number of births for each year and split across sex
table = filtered.pivot_table("births", index="year", columns="sex", aggfunc="sum")

# divide each element by its row total (the sum over both sex columns for that year)
# so we get the proportion of male and female lesley-like names per year
table = table.div(table.sum(axis="columns"), axis="index")
table.head()

In [None]:
table.plot(style={"M": "k-", "F": "k--"})

# 13.4 USDA Food Database

In [None]:
import json

# load the whole database; the context manager closes the file handle as soon
# as parsing is done instead of leaving it open until garbage collection.
# The encoding is given explicitly because JSON text is UTF-8, while open()
# would otherwise fall back to the platform's default encoding.
with open("datasets/usda_food/database.json", encoding="utf-8") as f:
    db = json.load(f)
len(db)

In [None]:
type(db), type(db[0])

In [None]:
db[0]

In [None]:
nutrients = pd.DataFrame(db[0]["nutrients"])
nutrients.head()

In [None]:
# we can directly convert a list of dictionaries to a dataframe
info_keys = ["description", "group", "id", "manufacturer"]
info = pd.DataFrame(db, columns=info_keys)
info.head()

In [None]:
info.info()

In [None]:
info["group"].value_counts()

In [None]:
# one nutrient table per food record, every row tagged with the id of its food
nutrient_frames = [
    pd.DataFrame(rec["nutrients"]).assign(id=rec["id"])
    for rec in db
]

# stack the per-food tables into one frame with a fresh 0..n-1 index
nutrients = pd.concat(nutrient_frames, ignore_index=True)

In [None]:
nutrients.head()

In [None]:
print(nutrients.duplicated().sum())

# drop any duplicates
nutrients = nutrients.drop_duplicates()

In [None]:
# info and nutrients both have "description" and "group" columns;
# give the food-level ones distinct names before the two frames are merged
col_mapping = {"description": "food", "group": "fgroup"}

# copy=False is omitted on purpose: the keyword is deprecated in recent pandas
# and has no effect on the renamed result
info = info.rename(columns=col_mapping)
info.info()

In [None]:
# rename the nutrient-level "description" and "group" columns
col_mapping = {"description": "nutrient", "group": "nutgroup"}

# copy=False is omitted on purpose: the keyword is deprecated in recent pandas
# and has no effect on the renamed result
nutrients = nutrients.rename(columns=col_mapping)
nutrients

In [None]:
ndata = pd.merge(nutrients, info, on="id")
ndata.info()

In [None]:
ndata.iloc[10000]

In [None]:
# plot median values per nutrient and food group
result = ndata.groupby(["nutrient", "fgroup"])["value"].quantile(0.5)
result["Zinc, Zn"].sort_values().plot(kind="barh")

In [None]:
result["Iron, Fe"].sort_values().plot(kind="barh")

In [None]:
ndata.head()

In [None]:
by_nutrient = ndata.groupby(["nutgroup", "nutrient"])

def get_maximum(x):
    """Return the row of ``x`` whose ``value`` entry is the largest."""
    top_label = x["value"].idxmax()
    return x.loc[top_label]
max_foods = by_nutrient.apply(get_maximum, include_groups=False)[["value", "food"]]
max_foods["food"] = max_foods["food"].str[:50]

In [None]:
max_foods.loc["Amino Acids"]["food"]

# 13.5 2012 Federal Election Commission Database

In [None]:
fec = pd.read_csv("datasets/fec/P00000001-ALL.csv", low_memory=False)
fec.info()

In [None]:
fec.iloc[123456]

In [None]:
unique_cands = fec["cand_nm"].unique()
unique_cands

In [None]:
parties = {
    "Bachmann, Michelle": "Republican",
    "Cain, Herman": "Republican",
    "Gingrich, Newt": "Republican",
    "Huntsman, Jon": "Republican",
    "Johnson, Gary Earl": "Republican",
    "McCotter, Thaddeus G": "Republican",
    "Obama, Barack": "Democrat",
    "Paul, Ron": "Republican",
    "Pawlenty, Timothy": "Republican",
    "Perry, Rick": "Republican",
    "Roemer, Charles E. 'Buddy' III": "Republican",
    "Romney, Mitt": "Republican",
    "Santorum, Rick": "Republican"
}

In [None]:
fec["party"] = fec["cand_nm"].map(parties)
fec["party"].value_counts()

In [None]:
(fec["contb_receipt_amt"] > 0).value_counts()

In [None]:
# keep only contributions with a positive amount
# .copy() makes fec an independent frame, so the column assignments made on it
# further down do not write into a slice of the unfiltered data
# (which would trigger a SettingWithCopyWarning)
fec = fec[fec["contb_receipt_amt"] > 0].copy()

In [None]:
# pick the 2 main candidates
fec_mrbo = fec[fec["cand_nm"].isin(["Obama, Barack", "Romney, Mitt"])]

## Donation Statistics by Occupation and Employer

In [None]:
fec["contbr_occupation"].value_counts()[:50]

In [None]:
# same profession is referred to in different terms
# spellings that are collapsed into one canonical occupation label
occ_mapping = dict.fromkeys(
    [
        "INFORMATION REQUESTED PER BEST EFFORTS",
        "INFORMATION REQUESTED",
        "INFORMATION REQUESTED (BEST EFFORTS)",
    ],
    "NOT PROVIDED",
)
occ_mapping["C.E.O."] = "CEO"


def get_occ(x):
    """Look ``x`` up in ``occ_mapping``; a value without an entry is returned unchanged."""
    if x in occ_mapping:
        return occ_mapping[x]
    return x

fec["contbr_occupation"] = fec["contbr_occupation"].map(get_occ)

In [None]:
# same for employers
# spellings that are collapsed into one canonical employer label
emp_mapping = {}
for raw_label in ("INFORMATION REQUESTED PER BEST EFFORTS", "INFORMATION REQUESTED"):
    emp_mapping[raw_label] = "NOT PROVIDED"
for raw_label in ("SELF", "SELF EMPLOYED"):
    emp_mapping[raw_label] = "SELF-EMPLOYED"


def get_emp(x):
    """Look ``x`` up in ``emp_mapping``; a value without an entry is returned unchanged."""
    if x in emp_mapping:
        return emp_mapping[x]
    return x

fec["contbr_employer"] = fec["contbr_employer"].map(get_emp)

In [None]:
fec.head()

In [None]:
by_occupation = fec.pivot_table("contb_receipt_amt", index="contbr_occupation", columns="party", aggfunc="sum")
over_2mm = by_occupation[by_occupation.sum(axis="columns") > 2000000]
over_2mm

In [None]:
over_2mm.plot(kind="barh")

In [None]:
def get_top_amounts(group, key, n=5):
    """Sum ``contb_receipt_amt`` per distinct value of ``key`` and return the ``n`` largest sums.

    The result is a Series indexed by the ``key`` values, largest total first.
    """
    amounts_by_key = group.groupby(key)["contb_receipt_amt"]
    return amounts_by_key.sum().nlargest(n)

# fec_mrbo was sliced out of fec before the occupation and employer labels
# were normalized, so it still carries the raw spellings; take the subset
# again so the rankings below are computed on the cleaned labels
fec_mrbo = fec[fec["cand_nm"].isin(["Obama, Barack", "Romney, Mitt"])]

# one group per candidate
grouped = fec_mrbo.groupby("cand_nm")

# within each candidate, get_top_amounts groups again by the given column,
# sums the donations of every sub-group and keeps the n largest totals
grouped.apply(get_top_amounts, "contbr_occupation", n=7, include_groups=False)

In [None]:
grouped.apply(get_top_amounts, "contbr_employer", n=10, include_groups=False)

## Bucketing Donation Amounts

In [None]:
## Bucketing Donation Amounts
bins = np.array([0, 1, 10, 100, 1000, 10000, 100_000, 1_000_000, 10_000_000])
labels = pd.cut(fec_mrbo["contb_receipt_amt"], bins)
labels

In [None]:
# group by candidate name and bin label
grouped = fec_mrbo.groupby(["cand_nm", labels], observed=False)
grouped.size().unstack(level=0)

In [None]:
bucket_sums = grouped["contb_receipt_amt"].sum().unstack(level=0)
normed_sums = bucket_sums.div(bucket_sums.sum(axis="columns"), axis="index")
normed_sums

In [None]:
normed_sums[:-2].plot(kind="barh")

## Donation Statistics by State

In [None]:
# we group by candidate and state
grouped = fec_mrbo.groupby(["cand_nm", "contbr_st"])

# compute the sum of the amount and unstack the candidate
totals = grouped["contb_receipt_amt"].sum().unstack(level=0).fillna(0)

# pick states with more than 100_000 total donation amount
totals = totals[totals.sum(axis="columns") > 100_000]
totals

In [None]:
# normalize per state: get the percentage of donations for each candidate per state
percent = totals.div(totals.sum(axis="columns"), axis="index")
percent