Data cleaning for the GDPR violations dataset, to be used with the interactive Vega-Lite visualization. Source: https://github.com/rfordatascience/tidytuesday/blob/master/data/2020/2020-04-21/readme.md

In [1]:
import os

import numpy as np
import pandas as pd

#read dataframe from tsv
#the file location can be overridden with the GDPR_VIOLATIONS_TSV environment variable so the
#notebook can run on other machines; when the variable is unset, the original local path is used
DATA_PATH = os.environ.get("GDPR_VIOLATIONS_TSV", "C:/Users/mattx/Downloads/gdpr_violations.tsv")
df = pd.read_table(DATA_PATH)

#basic info on dataframe columns (names, non-null counts, dtypes)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                250 non-null    int64 
 1   picture           250 non-null    object
 2   name              250 non-null    object
 3   price             250 non-null    int64 
 4   authority         250 non-null    object
 5   date              250 non-null    object
 6   controller        250 non-null    object
 7   article_violated  250 non-null    object
 8   type              250 non-null    object
 9   source            250 non-null    object
 10  summary           250 non-null    object
dtypes: int64(2), object(9)
memory usage: 21.6+ KB


In [2]:
#count rows that are exact duplicates of an earlier row (all columns equal)
duplicate_mask = df.duplicated()
duplicate_mask.sum()

0

In [3]:
#id is an identifier (categorical), not a quantity, so store it as object instead of int64
df = df.astype({'id': object})

In [4]:
#picture, source and summary are not relevant to the data visualization, so they are dropped
irrelevant_columns = ['picture', 'source', 'summary']
df = df.drop(irrelevant_columns, axis='columns')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                250 non-null    object
 1   name              250 non-null    object
 2   price             250 non-null    int64 
 3   authority         250 non-null    object
 4   date              250 non-null    object
 5   controller        250 non-null    object
 6   article_violated  250 non-null    object
 7   type              250 non-null    object
dtypes: int64(1), object(7)
memory usage: 15.8+ KB


In [5]:
#percentage of missing (null) values for each remaining column
columns_to_check = ['id', 'name', 'price', 'authority', 'date', 'controller', 'article_violated', 'type']
pct_missing_by_column = df[columns_to_check].isnull().sum() * 100 / len(df)
for column_name, pct in pct_missing_by_column.items():
    print(f"{column_name}: {pct}")

#no missing data

id: 0.0
name: 0.0
price: 0.0
authority: 0.0
date: 0.0
controller: 0.0
article_violated: 0.0
type: 0.0


In [7]:
#splitting multiple gdpr violations into rows and making a new dataframe with countries/violations
#article_violated can hold several articles separated by '|'; every article gets its own row,
#paired with that row's name value. Done with vectorized string ops instead of iterrows(), which
#also keeps the two output columns when the input frame is empty.

newdf = (
    df[['name', 'article_violated']]
    #turn the '|'-separated string into a list of articles per row
    .assign(article_violated=lambda frame: frame['article_violated'].str.split('|'))
    #one output row per list element, in the original row order
    .explode('article_violated')
    #remove whitespace left around each article by the split
    .assign(article_violated=lambda frame: frame['article_violated'].str.strip())
    #explode repeats the source row's index label, so renumber rows from 0
    .reset_index(drop=True)
)

print(newdf.head())

      name    article_violated
0   Poland        Art. 28 GDPR
1  Romania        Art. 12 GDPR
2  Romania        Art. 13 GDPR
3  Romania  Art. 5 (1) c) GDPR
4  Romania         Art. 6 GDPR


In [10]:
#write the name/article_violated table to csv without the index column
output_path = 'cleaned_gdpr_data_map.csv'
newdf.to_csv(output_path, index=False)