# GDPR Cases - Python analysis

| [screencast](https://youtu.be/z6xNKZZMWgU) | [notebook](https://github.com/machow/tidytuesday-py/blob/master/2020-04-21-gdpr.ipynb) |

In [1]:
import pandas as pd
from siuba import _, mutate, count, filter, group_by, ungroup, head
from siuba import add_count
from siuba.dply.forcats import fct_reorder, fct_lump
from siuba.experimental.datetime import floor_date
from plotnine import *

# Both tables are tab-separated files from the TidyTuesday 2020-04-21 release.
violations_url = 'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-04-21/gdpr_violations.tsv'
text_url = 'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-04-21/gdpr_text.tsv'

# Violations table; `date` is parsed up front so it can be compared,
# floored to weeks and plotted on a time axis in later cells.
gdpr_violations = pd.read_csv(violations_url, sep="\t", parse_dates=["date"])

# Regulation text table; later cells use its `article` / `article_title`
# columns to label the violated articles.
gdpr_text = pd.read_csv(text_url, sep="\t")

NameError: name 'pd' is not defined

## Countries with most fines

In [None]:
# Tally rows per `name` (the country, per this section's heading), then
# reorder the `name` factor by descending count so the x axis runs from the
# most-fined country to the least.
fines_per_country = count(gdpr_violations, _.name, sort=True)
fines_per_country = mutate(fines_per_country, name=fct_reorder(_.name, -_.n))

(
    fines_per_country
    # group=1 puts every point in one group so geom_line connects them all.
    >> ggplot(aes("name", "n", group=1))
    + geom_point()
    + geom_line()
    + theme(axis_text_x=element_text(angle=45, hjust=1))
)

In [None]:
# Row counts for every (name, controller) pair, most frequent first.
count(gdpr_violations, _.name, _.controller, sort=True)

# Rows whose controller matches "Google" or "google" (regex on the first letter).
filter(gdpr_violations, _.controller.str.contains("[Gg]oogle"))

## GDPR cases over time

In [None]:
(
    gdpr_violations
    # Drop the 1970-01-01 dates (presumably a missing-date placeholder) BEFORE
    # flooring. 1970-01-01 is a Thursday, so once floored to the start of its
    # week the value is no longer 1970-01-01 and a comparison on the floored
    # column would let those rows through. Filtering on `date` also matches
    # the "Size of fines" cell.
    >> filter(_.date != "1970-01-01")
    # Bucket each case into its calendar week.
    >> mutate(week = floor_date(_.date, "W"))
    # Cases per week, then a running total across weeks.
    >> count(_.week)
    >> mutate(ttl = _.n.cumsum())
    >> ggplot(aes("week", "ttl", group = 1)) + geom_line()
    # Anchor the y axis at zero so the growth is not visually exaggerated.
    + expand_limits(y = 0)
    + theme(axis_text_x = element_text(angle = 45, hjust = 1))
    + labs(title = "GDPR cases over time", y = "cumulative cases")
    # Optional alternative view (left disabled):
#    + scale_y_log10()
)

## Size of fines

In [None]:
from siuba import arrange

# Keep only rows whose date is not the 1970-01-01 value, so those rows do not
# stretch the time axis.
dated_fines = filter(gdpr_violations, _.date != "1970-01-01")

# One point per remaining row: fine amount (`price`) against `date`.
(
    dated_fines
    >> ggplot(aes("date", "price"))
    + geom_point()
    + theme(axis_text_x = element_text(angle = 45, hjust = 1))
    + labs(title = "GDPR fine size over time")
)

## Type x Price

In [None]:
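# Open questions for this section:
# * a single fine can cite multiple violated articles
# * what does the `type` column represent?
# * how to connect the violated articles to the other data (gdpr_text)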

#gdpr_violations

In [None]:
# Number of rows for each value of `type`, most frequent first.
type_counts = count(gdpr_violations, _.type, sort=True)

In [None]:
from siuba.dply.forcats import fct_lump

# Attach the per-type row count as column `n`, then collapse `type` with
# fct_lump(n = 6) so the x axis only shows a handful of categories.
typed_fines = add_count(gdpr_violations, _.type)
typed_fines = mutate(typed_fines, type=fct_lump(_.type, n=6))

# Distribution of fine amounts per (lumped) type, on a log scale with
# comma-separated tick labels.
(
    typed_fines
    >> ggplot(aes("type", "price", group="type"))
    + geom_boxplot()
    + scale_y_log10(labels=lambda arr: list(map("{:,}".format, arr)))
    + theme(axis_text_x=element_text(angle=45, hjust=1))
)

## Article violated x price

In [None]:
from siuba import unnest, rename

# One row per violation, with the "|"-separated `article_violated` string
# split into a list (`articles`) and the list length stored as `n_violated`.
gdpr_articles_nested = gdpr_violations >> mutate(
    articles=_.article_violated.str.split("|"),
    n_violated=_.articles.apply(len)
)

# example ---
# unnest(example.loc[[1], :], "articles")

# One row per (violation, article): explode the `articles` lists, then pull
# the number that follows "Art" out of each entry.
gdpr_articles = (
    gdpr_articles_nested
    >> unnest("articles")
    >> mutate(
        # expand=False makes str.extract return a Series for the single
        # capture group, instead of a one-column DataFrame, so the result is
        # unambiguously a single column. Cast to float so entries without a
        # match can be NaN.
        article_number = _.articles.str
            .extract(r"Art. *([0-9]+)", expand=False)
            .astype(float)
    )
)

In [None]:
def _with_thousands_sep(breaks):
    """Format each axis break with a comma as the thousands separator."""
    return list(map("{:,}".format, breaks))


# Fine amount versus number of articles violated: one violin per count, with
# the raw points overlaid and jittered horizontally only.
(
    gdpr_articles_nested
    >> ggplot(aes("n_violated", "price", group="n_violated"))
    + geom_violin()
    + scale_y_log10(labels=_with_thousands_sep)
    + geom_point(position=position_jitter(width=.25, height=0))
)

In [None]:
# Most frequently cited article numbers (first rows of the sorted counts).
article_counts = count(gdpr_articles, _.article_number, sort=True)
head(article_counts)

In [None]:
from siuba import inner_join, distinct

# Unique (article, article_title) pairs from the regulation text.
article_titles = distinct(gdpr_text, _.article, _.article_title)

# Attach a title to every violated article by matching `article` in the text
# table to `article_number` in gdpr_articles; unmatched rows are dropped.
joined_articles = (
    article_titles
    >> inner_join(_, gdpr_articles, {"article": "article_number"})
)

In [None]:
# sanity check that number of rows is close to gdpr_articles
#joined_articles

In [None]:
# Distribution of fine amounts per violated article title, on a log scale.
(
    joined_articles
    # Collapse article titles with fct_lump(n=6) to keep the x axis readable.
    >> mutate(article_title=fct_lump(_.article_title, n=6))
    # `group` must be passed by keyword: aes() only takes x and y positionally,
    # so a third positional argument is either ignored or rejected depending on
    # the plotnine version. This matches the other boxplot/violin cells.
    >> ggplot(aes("article_title", "price", group="article_title"))
    + geom_boxplot()
    # Comma-separated tick labels for the log-scaled fine amounts.
    + scale_y_log10(labels = lambda arr: ["{:,}".format(x) for x in arr])
    + theme(axis_text_x=element_text(angle=45, hjust=1))
)