Data cleaning for the GDPR violations dataset, to be used with the interactive Vega-Lite visualization. Source: https://github.com/rfordatascience/tidytuesday/blob/master/data/2020/2020-04-21/readme.md

In [113]:
import pandas as pd
import numpy as np
!pip install geopandas
import geopandas as gpd
from shapely.wkt import loads

#read the GDPR violations table straight from the TidyTuesday repository so the
#notebook runs on any machine (an absolute path into one user's Downloads folder
#is not reproducible); point GDPR_TSV at a local copy to work offline
GDPR_TSV = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-04-21/gdpr_violations.tsv"
df = pd.read_table(GDPR_TSV)

#basic info on dataframe columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                250 non-null    int64 
 1   picture           250 non-null    object
 2   name              250 non-null    object
 3   price             250 non-null    int64 
 4   authority         250 non-null    object
 5   date              250 non-null    object
 6   controller        250 non-null    object
 7   article_violated  250 non-null    object
 8   type              250 non-null    object
 9   source            250 non-null    object
 10  summary           250 non-null    object
dtypes: int64(2), object(9)
memory usage: 21.6+ KB


In [115]:
#count rows that are exact repeats of an earlier row
duplicate_mask = df.duplicated()
duplicate_mask.sum()

0

In [116]:
#the id is a label, not a quantity: store the column as object instead of int
df = df.astype({'id': object})

In [117]:
#picture, source and summary play no part in the visualization, so discard them
unused_columns = ['picture', 'source', 'summary']
df = df.drop(unused_columns, axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                250 non-null    object
 1   name              250 non-null    object
 2   price             250 non-null    int64 
 3   authority         250 non-null    object
 4   date              250 non-null    object
 5   controller        250 non-null    object
 6   article_violated  250 non-null    object
 7   type              250 non-null    object
dtypes: int64(1), object(7)
memory usage: 15.8+ KB


In [118]:
#checking for missing data: percentage of null values in every column
#(the columns are taken from df itself, so the check cannot drift from the
#frame's actual schema the way a hand-written column list can)
pct_missing = df.isnull().sum() * 100 / len(df)
for item, pct in pct_missing.items():
    print(f"{item}: {pct}")

#no missing data

id: 0.0
name: 0.0
price: 0.0
authority: 0.0
date: 0.0
controller: 0.0
article_violated: 0.0
type: 0.0


In [120]:
#total fines per country: one row per name holding the summed price
df = df.groupby('name', as_index=False)['price'].sum()

print(df.head())

       name     price
0   Austria  18070100
1   Belgium     39000
2  Bulgaria   3226620
3   Croatia         0
4    Cyprus    121000


In [121]:
#adding in geospatial data
#gpd.datasets is deprecated and was removed in GeoPandas 1.0 (using it raises
#AttributeError there), so fall back to the Natural Earth 110m countries
#download that the deprecation notice points to
NATURAL_EARTH_URL = "https://naciscdn.org/naturalearth/110m/cultural/ne_110m_admin_0_countries.zip"
try:
    world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
except AttributeError:
    #assumes the raw download labels its country-name column NAME -- TODO confirm
    world = gpd.read_file(NATURAL_EARTH_URL).rename(columns={'NAME': 'name'})

df = pd.merge(df, world[['name', 'geometry']], how='left', on='name')

#a left merge keeps rows whose name has no match in `world` and leaves their
#geometry empty; report them so they are not silently missing from the map
unmatched = df.loc[df['geometry'].isna(), 'name'].tolist()
if unmatched:
    print(f"WARNING: no geometry found for: {unmatched}")

print(df.head())

  world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))


       name     price                                           geometry
0   Austria  18070100  POLYGON ((16.97967 48.12350, 16.90375 47.71487...
1   Belgium     39000  POLYGON ((6.15666 50.80372, 6.04307 50.12805, ...
2  Bulgaria   3226620  POLYGON ((22.65715 44.23492, 22.94483 43.82379...
3   Croatia         0  POLYGON ((16.56481 46.50375, 16.88252 46.38063...
4    Cyprus    121000  POLYGON ((32.73178 35.14003, 32.91957 35.08783...


In [111]:
#write the merged table to a GeoJSON file, using its 'geometry' column as the shapes
gdf = gpd.GeoDataFrame(data=df, geometry="geometry")
gdf.to_file(filename='cleaned_gdpr_data_map_price.geojson', driver="GeoJSON")