Data cleaning for the gdpr violations dataset. To be used with the interactive Vega Lite visualization. Source: https://github.com/rfordatascience/tidytuesday/blob/master/data/2020/2020-04-21/readme.md 

In [57]:
import pandas as pd
import numpy as np
!pip install geopandas
import geopandas as gpd
from shapely.wkt import loads

#read dataframe from tsv
df = pd.read_table("C:/Users/mattx/Downloads/gdpr_violations.tsv")

#basic info on dataframe columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                250 non-null    int64 
 1   picture           250 non-null    object
 2   name              250 non-null    object
 3   price             250 non-null    int64 
 4   authority         250 non-null    object
 5   date              250 non-null    object
 6   controller        250 non-null    object
 7   article_violated  250 non-null    object
 8   type              250 non-null    object
 9   source            250 non-null    object
 10  summary           250 non-null    object
dtypes: int64(2), object(9)
memory usage: 21.6+ KB


In [58]:
#checking for duplicate values
df.duplicated().sum()

0

In [59]:
#changing ID to object to reflect categorical nature
df['id'] = df['id'].astype(object)

In [60]:
#removing source, summary, and picture, as they are not relevant to data visualization
df = df.drop(columns=['picture', 'source', 'summary'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                250 non-null    object
 1   name              250 non-null    object
 2   price             250 non-null    int64 
 3   authority         250 non-null    object
 4   date              250 non-null    object
 5   controller        250 non-null    object
 6   article_violated  250 non-null    object
 7   type              250 non-null    object
dtypes: int64(1), object(7)
memory usage: 15.8+ KB


In [61]:
#checking for missing data
for item in ['id', 'name', 'price', 'authority', 'date', 'controller', 'article_violated', 'type']:
    pct_missing = df[item].isnull().sum() * 100 /len(df)
    print(f"{item}: {pct_missing}")
    
#no missing data

id: 0.0
name: 0.0
price: 0.0
authority: 0.0
date: 0.0
controller: 0.0
article_violated: 0.0
type: 0.0


In [62]:
#adding in geospatial data
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))

df = pd.merge(df, world[['name', 'geometry']], how='left', on='name')

print(df.head())

  id     name   price                                          authority  \
0  1   Poland    9380  Polish National Personal Data Protection Offic...   
1  2  Romania    2500  Romanian National Supervisory Authority for Pe...   
2  3    Spain   60000           Spanish Data Protection Authority (AEPD)   
3  4    Spain    8000           Spanish Data Protection Authority (AEPD)   
4  5  Romania  150000  Romanian National Supervisory Authority for Pe...   

         date          controller  \
0  10/18/2019        Polish Mayor   
1  10/17/2019    UTTIS INDUSTRIES   
2  10/16/2019  Xfera Moviles S.A.   
3  10/16/2019  Iberdrola Clientes   
4  10/09/2019  Raiffeisen Bank SA   

                                    article_violated  \
0                                       Art. 28 GDPR   
1  Art. 12 GDPR|Art. 13 GDPR|Art. 5 (1) c) GDPR|A...   
2                            Art. 5 GDPR|Art. 6 GDPR   
3                                       Art. 31 GDPR   
4                                       

  world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))


In [55]:
#splitting multiple gdpr violations into rows and making a new dataframe with countrys/violations

new = []

for index, row in df.iterrows():
    violations = row['article_violated'].split('|')
    
    for violation in violations:
        uniquerow = {
          'name':row['name'],
            'geometry':row['geometry'],
            'article_violated':violation.strip()
        }
        new.append(uniquerow)

newdf = pd.DataFrame(new)

print(newdf.head())

      name                                           geometry  \
0   Poland  POLYGON ((23.48412763844985 53.91249766704114,...   
1  Romania  POLYGON ((28.23355350109904 45.48828318946829,...   
2  Romania  POLYGON ((28.23355350109904 45.48828318946829,...   
3  Romania  POLYGON ((28.23355350109904 45.48828318946829,...   
4  Romania  POLYGON ((28.23355350109904 45.48828318946829,...   

     article_violated  
0        Art. 28 GDPR  
1        Art. 12 GDPR  
2        Art. 13 GDPR  
3  Art. 5 (1) c) GDPR  
4         Art. 6 GDPR  


In [56]:
#exporting data to geojson
gdf = gpd.GeoDataFrame(df, geometry="geometry")
newdf.to_file('cleaned_gdpr_data_map.geojson',driver="GeoJSON" )

TypeError: Expected bytes or string, got Polygon