Skip to content
This repository has been archived by the owner on Jan 19, 2021. It is now read-only.

[Fix Bug 699530] add mysql anonymization script and configuration files #695

Merged
merged 2 commits into from Nov 6, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/index.rst
Expand Up @@ -23,6 +23,7 @@ Contents:
invites
contribute
api
mysql-anonymize

Indices and tables
------------------
Expand Down
19 changes: 19 additions & 0 deletions docs/mysql-anonymize.rst
@@ -0,0 +1,19 @@
======================
MySQL DB Anonymization
======================

Mozillians uses the production database for testing on stage and dev.
We provide a script to anonymize a database to remove some
personal information for stage, and all personal information for dev.

#. Using the script::

$ cd scripts/mysql-anonymize
$ python anonymize.py anonymize_dev.yml > anon.sql
$ mysql < anon.sql

.. note::

Make sure your database is named ``mozillians``. If it isn't, change the
name in the .yml file you are using; the database name is on the second line
of each of the .yml configuration files.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the docs!

132 changes: 132 additions & 0 deletions scripts/mysql-anonymize/anonymize.py
@@ -0,0 +1,132 @@
#!/usr/bin/env python
"""
Mozillians database anonymization script

This assumes an id field in each table.
Forked from: https://github.com/davedash/mysql-anonymous
"""
import logging
import hashlib
import random


# Module-level logger for this script.
log = logging.getLogger('anonymize')
# Random per-run value rendered as a fixed-width, zero-padded 32-digit hex
# string (128 bits need 32 hex digits, not 16).
# NOTE(review): the SQL built in this file never interpolates this Python
# value; the statements refer to the MySQL user variable @common_hash_secret,
# which the __main__ block sets with "SET @common_hash_secret=rand();".
common_hash_secret = '%032x' % random.getrandbits(128)


def get_truncates(config):
    """Build the ``TRUNCATE`` statements requested by the configuration.

    :param config: mapping with an optional ``database`` mapping whose
        optional ``truncate`` entry is a table name or a list of table names.
    :returns: list of SQL strings (no trailing semicolon), one per table,
        in configuration order.
    """
    database = config.get('database') or {}
    truncates = database.get('truncate') or []
    # Accept a single table name as well as a list, like the update
    # operations do. Iterating a bare string would otherwise emit one bogus
    # statement per character.
    if not isinstance(truncates, list):
        truncates = [truncates]
    return ['TRUNCATE %s' % table for table in truncates]


def get_deletes(config):
    """Build the ``DELETE`` statements requested by the configuration.

    Every table whose settings contain a non-empty ``delete`` mapping of
    ``{column: value}`` yields one statement of the form
    ``DELETE FROM <table> WHERE <column> = "<value>" AND ...``.

    :param config: mapping with an optional ``database`` mapping whose
        optional ``tables`` entry maps table names to their settings.
    :returns: list of SQL strings (no trailing semicolon).
    """
    database = config.get('database') or {}
    # ``tables`` is a mapping, so the fallback must be a dict: a list has no
    # ``items()`` and would raise AttributeError when no tables are given.
    tables = database.get('tables') or {}
    sql = []
    # ``items()`` works on both Python 2 and Python 3.
    for table, data in tables.items():
        conditions = (data or {}).get('delete')
        if not conditions:
            # No conditions would produce "DELETE FROM t WHERE " with
            # nothing after it, which is not valid SQL; emit nothing instead.
            continue
        # Values are taken from the configuration file and interpolated
        # verbatim (no escaping is performed).
        where = ' AND '.join(
            '%s = "%s"' % (column, value)
            for column, value in conditions.items())
        sql.append('DELETE FROM %s WHERE %s' % (table, where))
    return sql

def listify(x):
    """Return *x* unchanged if it is a list, otherwise wrap it in a list."""
    if isinstance(x, list):
        return x
    return [x]

# SQL assignment template for every operation that contributes to a table's
# single combined UPDATE statement. ``%(field)s`` is the column name.
_UPDATE_TEMPLATES = {
    'nullify': '%(field)s = NULL',
    'random_int': '%(field)s = ROUND(RAND()*1000000)',
    'random_ip': '%(field)s = INET_NTOA(RAND()*1000000000)',
    'random_email': '%(field)s = CONCAT(id, "@mozilla.com")',
    'random_username': '%(field)s = CONCAT("_user_", id)',
    'hash_value': '%(field)s = MD5(CONCAT(@common_hash_secret, %(field)s))',
    'hash_email': ('%(field)s = CONCAT(MD5(CONCAT(@common_hash_secret, '
                   '%(field)s)), "@mozilla.com")'),
    'empty_string': '%(field)s = ""',
}


def get_updates(config):
    """Build the ``UPDATE`` statements requested by the configuration.

    For each table, every configured operation (``nullify``, ``random_int``,
    ``random_ip``, ``random_email``, ``random_username``, ``hash_value``,
    ``hash_email``, ``empty_string``) adds one assignment per listed field to
    a single combined ``UPDATE <table> SET ...`` statement.
    ``random_email_noadmin`` instead emits its own statement per field
    because it needs a ``WHERE is_staff=0`` clause. ``delete`` entries are
    skipped here, and any other operation name is logged as a warning.

    :param config: mapping with an optional ``database`` mapping whose
        optional ``tables`` entry maps table names to
        ``{operation: field-or-list-of-fields}``.
    :returns: list of SQL strings (no trailing semicolon).
    """
    database = config.get('database') or {}
    # ``tables`` is a mapping, so fall back to a dict: a list default would
    # raise AttributeError as soon as its items are requested.
    tables = database.get('tables') or {}
    sql = []
    # ``items()`` works on both Python 2 and Python 3.
    for table, data in tables.items():
        updates = []
        for operation, details in (data or {}).items():
            if operation == 'delete':
                # Not an update; DELETE statements are built by get_deletes().
                continue
            if operation == 'random_email_noadmin':
                for field in listify(details):
                    # Must be a separate statement due to WHERE clause.
                    sql.append('UPDATE %s SET %s = CONCAT(id, "@mozilla.com")'
                               ' WHERE is_staff=0' % (table, field))
                continue
            template = _UPDATE_TEMPLATES.get(operation)
            if template is None:
                # Name the offending entry so a typo in the config is easy
                # to locate.
                log.warning('Unknown operation %r for table %s.',
                            operation, table)
                continue
            for field in listify(details):
                updates.append(template % {'field': field})
        if updates:
            sql.append('UPDATE %s SET %s' % (table, ', '.join(updates)))
    return sql


def anonymize(config):
    """Print the anonymization SQL script for one database to stdout.

    Output order: an optional ``USE <name>;`` (when the database settings
    contain ``name``), ``SET FOREIGN_KEY_CHECKS=0;``, the truncate, delete
    and update statements (each terminated with ``;``),
    ``SET FOREIGN_KEY_CHECKS=1;`` and a final blank line.

    :param config: mapping with an optional ``database`` mapping, passed on
        unchanged to get_truncates(), get_deletes() and get_updates().
    :returns: None.
    """
    database = config.get('database') or {}

    # Single-argument print(...) calls behave identically on Python 2 and 3.
    if 'name' in database:
        print('USE %s;' % database['name'])

    print('SET FOREIGN_KEY_CHECKS=0;')

    statements = []
    for build in (get_truncates, get_deletes, get_updates):
        statements.extend(build(config))
    for stmt in statements:
        print(stmt + ';')

    print('SET FOREIGN_KEY_CHECKS=1;')
    print('')

if __name__ == '__main__':
    # Usage: python anonymize.py [CONFIG.yml ...] > anon.sql
    # With no arguments, 'anonymize.yml' in the current directory is used.

    import sys

    import yaml

    files = sys.argv[1:] or ['anonymize.yml']

    for f in files:
        # SQL comment header naming the configuration file being processed.
        print('--')
        print('-- %s' % f)
        print('--')
        # MySQL user variable referenced by the hash_value / hash_email
        # assignments.
        print('SET @common_hash_secret=rand();')
        print('')
        # safe_load only builds plain data types, which is all this
        # configuration needs; plain yaml.load() can construct arbitrary
        # objects and requires an explicit Loader on current PyYAML.
        # The with-block closes the file handle.
        with open(f) as config_file:
            cfg = yaml.safe_load(config_file) or {}
        if 'databases' not in cfg:
            # Single-database layout: top-level ``database`` key.
            anonymize(cfg)
        else:
            # Multi-database layout: ``databases`` maps name -> settings.
            for name, sub_cfg in (cfg.get('databases') or {}).items():
                print('USE %s;' % name)
                anonymize({'database': sub_cfg})
31 changes: 31 additions & 0 deletions scripts/mysql-anonymize/anonymize_dev.yml
@@ -0,0 +1,31 @@
databases:
mozillians:

tables:
auth_user:
nullify:
- first_name
- last_name
random_email_noadmin:
- email
random_username:
- username

profile:
hash_value:
- full_name
- title
empty_string:
- bio
- photo
- region
- city
- basket_token
nullify:
- tshirt
random_username:
- ircname

users_externalaccount:
random_username:
- username
7 changes: 7 additions & 0 deletions scripts/mysql-anonymize/anonymize_stage.yml
@@ -0,0 +1,7 @@
databases:
mozillians:

tables:
auth_user:
random_email_noadmin:
- email