In [1]:
from website.app import init_app

# Boot the OSF application once for this kernel. The captured output below shows
# it configures the storage backends and returns the Flask app; the MODM queries
# in later cells presumably depend on this having run first.
init_app()

[root]  DEBUG: Setting storage backends
[root]  INFO: Sentry disabled; Flask's debug mode enabled


<Flask 'framework.flask'>

In [None]:
# LOAD BLACKLIST GUIDS
from django.db import transaction
from framework.guid.model import BlacklistGuid
from osf_models.models import BlackListGuid
import gevent

# Source rows: every MODM BlacklistGuid record.
odm_blacklist = BlacklistGuid.find()
# Stop condition for the paging loop below.
total = len(odm_blacklist)
# Number of source rows handed to a greenlet so far.
count = 0
# Rows migrated inside one transaction.atomic() block.
page_size = 1000


def migrate_blacklist_item(guid):
    """Copy one MODM blacklist entry into the Postgres ``BlackListGuid`` table.

    :param guid: MODM ``BlacklistGuid`` record; only its ``_id`` is read.
    :return: the ``BlackListGuid`` row that was created (mirrors ``migrate_guid``,
        so the value is available through the greenlet if a caller wants it).
    """
    return BlackListGuid.objects.create(guid=guid._id)

# Migrate the blacklist one page per transaction.
while count < total:
    # Remember where this page begins so the progress line stays accurate even
    # when the final page holds fewer than `page_size` rows.
    page_start = count
    with transaction.atomic():
        page = odm_blacklist[page_start:page_start + page_size]
        threads = [gevent.spawn(migrate_blacklist_item, guid) for guid in page]
        count += len(threads)
        # raise_error=True re-raises a failed greenlet's exception here, so the
        # surrounding transaction rolls back instead of silently committing a
        # page with missing rows.
        gevent.joinall(threads, raise_error=True)
        print('Committing {} through {}'.format(page_start, count))
    if count == page_start:
        # An empty page means the source shrank after `total` was computed;
        # stop rather than loop forever.
        break

print(count)
print(total)

In [None]:
# LOAD GUIDS
from django.db import transaction
from framework.guid.model import Guid as MODMGuid
from osf_models.models import Guid
import gevent


# Source rows: every MODM Guid record.
guids = MODMGuid.find()
# Stop condition for the paging loop below.
total = len(guids)
# Number of guid keys handed to a greenlet so far.
count = 0
# Keys migrated inside one transaction.atomic() block.
page_size = 30000

def migrate_guid(guid):
    """Insert ``guid`` into the Postgres ``Guid`` table and hand back the new row.

    :param guid: the guid key, stored in the row's ``guid`` column.
    :return: the created ``Guid`` instance.
    """
    created = Guid.objects.create(guid=guid)
    return created

# Migrate guid keys one page per transaction.
while count < total:
    # Track the page start explicitly: `count - page_size` is wrong for a
    # partial final page.
    page_start = count
    with transaction.atomic():
        threads = []
        for guid in guids[page_start:page_start + page_size].get_keys():
            threads.append(gevent.spawn(migrate_guid, guid))
            count += 1
            if count % page_size == 0:
                print(count)
        # raise_error=True surfaces a failed insert here so the transaction is
        # rolled back rather than committed with rows missing.
        gevent.joinall(threads, raise_error=True)
        print('Committing {} through {}'.format(page_start, count))
    if count == page_start:
        # Nothing came back for this page; bail out instead of spinning.
        break

print(total)
print(count)

In [None]:
# VERIFY GUIDS
from framework.guid.model import Guid as MODMGuid
from osf_models.models import Guid
import gevent
import copy

# Compare the MODM guid set with what landed in Postgres.
modm_guids = MODMGuid.find()
modm_count = len(modm_guids)
print('MODM Guids: {}'.format(modm_count))

guids = Guid.objects.filter(guid__in=modm_guids.get_keys())
# .count() lets the database do the counting; len() on the queryset would
# fetch and instantiate every matching row just to count them.
filtered_count = guids.count()
total_count = Guid.objects.count()

# All three must agree: every MODM guid is present and nothing extra exists.
if modm_count == filtered_count == total_count:
    print('WINNING')
else:
    print('LOSING')

print('Postgres Guids: {}'.format(total_count))

In [8]:
# MIGRATE NODES

from website.models import Node as MODMNode
from osf_models.models import Node
from modularodm import Q as MODMQ
from django.core.serializers.json import DjangoJSONEncoder
import json
from django.db import IntegrityError
import pytz
from datetime import datetime

# Only the slice [1000:2000] of the MODM node query is migrated by this cell.
modm_nodes = MODMNode.find()[1000:2000]

# Size of the slice, reported as "MODM" in the summary at the end of the cell.
total = len(modm_nodes)
# Incremented once per node by the driver loop at the bottom of the cell.
count = 0
print total

# Attribute names read by process_node_fk_fields(); each value is resolved via
# get_or_create_node().
fk_node_fields = [
    'forked_from',
    'registered_from',
    'root',
    'parent_node',
    'template_node'
]
# Attribute names read by process_node_m2m_fields().
m2m_node_fields = [
    'nodes',
]
# Attribute names read by process_user_fk_fields(); each value is resolved via
# get_or_create_user().
fk_user_fields = [
    'registered_user',
    'creator',
    'merged_by'
]
# Attribute names read by process_user_m2m_fields(); only list values are considered there.
m2m_user_fields = [
    'permissions',
    'recently_added',
    'users_watching_node',
    'contributors'
]
# Attribute names read by process_tag_m2m_fields().
m2m_tag_fields = [
    'tags',
    'system_tags'
]

# Keyed by the MODM node _id. The driver loop stores the 'modm' entry and
# get_or_create_node() stores the 'django' entry.
node_cache = {
    # 'xyz12': {
    #     'modm': modm_object,
    #     'django': django_object,
    # }
}
# Appended to on every get_or_create_user() / get_or_create_tag() call, so these
# lists can hold the same object more than once.
user_cache = []
tag_cache  = []

# Keys removed from a node's to_storage() dict before Node.objects.create().
# The many-to-many field names are appended because those are applied afterwards
# through set_m2m_fields().
node_key_blacklist = [
    '__backrefs', 
    '_version',
    'expanded',
    # foreign keys not yet implemented
    'logs', 
    'primary_institution', 
    'registration_approval',
    'alternative_citations',
    'registered_schema',
    'affiliated_institutions',
    'retraction',
    'embargo',
    'node_license',
] + m2m_node_fields + m2m_user_fields + m2m_tag_fields
# Same idea for users: keys removed before User.objects.create().
user_key_blacklist = ['__backrefs', '_version', 'affiliated_institutions', 'watched', 'external_accounts',] + m2m_node_fields + m2m_user_fields + m2m_tag_fields

def process_node_fk_fields(modm_object):
    """Resolve the node foreign keys of ``modm_object`` to Django nodes.

    Walks ``fk_node_fields``. Attributes that are missing or ``None`` are left
    out of the result. The ``'root'`` field, and any field whose value does not
    compare unequal to ``modm_object`` itself, is recorded as ``None``. Every
    other value goes through ``get_or_create_node`` and is recorded only when
    that call returns something.

    :return: dict mapping field name to a Django node or ``None``.
    """
    resolved = {}
    for field_name in fk_node_fields:
        target = getattr(modm_object, field_name, None)
        if target is None:
            continue
        if field_name == 'root' or not (target != modm_object):
            # Explicit None so the caller's later update() overrides the raw
            # storage value for this key.
            resolved[field_name] = None
            continue
        django_node = get_or_create_node(target)
        if django_node is not None:
            resolved[field_name] = django_node
    return resolved

def process_node_m2m_fields(modm_object):
    """Collect Django nodes for each field named in ``m2m_node_fields``.

    NOTE(review): as written this always returns ``{}``. ``m2m_nodes`` starts
    empty and the only writes to it sit under ``if m2m_node_field in m2m_nodes``,
    which therefore never holds. The ``else`` that would create the list is
    paired with ``if node is not None`` instead of with the membership test.

    NOTE(review): re-pairing the ``else`` is not enough on its own.
    ``get_or_create_node`` resolves a node's foreign keys (including
    ``parent_node``) and its m2m fields before the row is created, so a parent
    whose ``nodes`` contains a child pointing back at it would recurse without
    bound. Linking children likely needs a second pass after all rows exist.

    :return: dict mapping field name to a list of Django nodes (currently always empty).
    """
    m2m_nodes = {}
    for m2m_node_field in m2m_node_fields:
        value = getattr(modm_object, m2m_node_field, None)
        if value is not None:
            if isinstance(value, list):
                for nv in value:
                    # Never true: nothing has been stored under this key yet.
                    if m2m_node_field in m2m_nodes:
                        node = get_or_create_node(nv)
                        if node is not None:
                            m2m_nodes[m2m_node_field].append(node)
                        else:
                            m2m_nodes[m2m_node_field] = [node, ]
            else:
                # Single (non-list) value: same unreachable pattern as above.
                if m2m_node_field in m2m_nodes:
                    node = get_or_create_node(value)
                    if node is not None:
                        m2m_nodes[m2m_node_field].append(node)
                    else:
                        m2m_nodes[m2m_node_field] = [node, ]
    return m2m_nodes

def process_user_fk_fields(modm_object):
    """Resolve the user foreign keys of ``modm_object`` to Django users.

    For every name in ``fk_user_fields`` whose attribute is present and not
    ``None``, look the user up (or create it) with ``get_or_create_user`` and
    keep the result when it is not ``None``.

    :return: dict mapping field name to a Django user.
    """
    resolved = {}
    for field_name in fk_user_fields:
        modm_user = getattr(modm_object, field_name, None)
        if modm_user is None:
            continue
        django_user = get_or_create_user(modm_user, field_name, modm_object)
        if django_user is not None:
            resolved[field_name] = django_user
    return resolved

def process_user_m2m_fields(modm_object):
    """Collect Django users for each field named in ``m2m_user_fields``.

    Only attributes whose value is a ``list`` are considered.

    NOTE(review): as written this always returns ``{}``. ``m2m_users`` starts
    empty and every write is guarded by ``if m2m_user_field in m2m_users``, so
    the branch is never entered. The list-creating ``else`` is attached to
    ``if user is not None`` rather than to the membership test.

    NOTE(review): if the branch is made reachable, two further things need
    attention. ``get_or_create_user(uv)`` omits ``field_name``, which that
    function declares without a default. ``get_or_create_user`` also calls back
    into this function before the user row is created, so users that reference
    each other through these fields would recurse without bound.

    :return: dict mapping field name to a list of Django users (currently always empty).
    """
    m2m_users = {}
    for m2m_user_field in m2m_user_fields:
        value = getattr(modm_object, m2m_user_field, None)
        if isinstance(value, list):
            for uv in value:
                # Never true: nothing has been stored under this key yet.
                if m2m_user_field in m2m_users:
                    user = get_or_create_user(uv)
                    if user is not None:
                        m2m_users[m2m_user_field].append(user)
                    else:
                         m2m_users[m2m_user_field] = [user, ]

    return m2m_users

def process_tag_m2m_fields(modm_object):
    """Collect Django tags for each field named in ``m2m_tag_fields``.

    Only attributes whose value is a ``list`` are considered.

    NOTE(review): as written this always returns ``{}``. ``m2m_tags`` starts
    empty and every write is guarded by ``if m2m_tag_field in m2m_tags``, so the
    branch is never entered.

    NOTE(review): the unreachable branch also has two latent errors.
    ``append(user)`` refers to a name that is not defined in this function
    (presumably ``tag`` was meant). ``get_or_create_tag(tv)`` omits
    ``field_name``, which that function declares without a default.

    :return: dict mapping field name to a list of Django tags (currently always empty).
    """
    m2m_tags = {}
    for m2m_tag_field in m2m_tag_fields:
        value = getattr(modm_object, m2m_tag_field, None)
        if isinstance(value, list):
            for tv in value:
                # Never true: nothing has been stored under this key yet.
                if m2m_tag_field in m2m_tags:
                    tag = get_or_create_tag(tv)
                    if tag is not None:
                        m2m_tags[m2m_tag_field].append(user)
                    else:
                        m2m_tags[m2m_tag_field] = [tag, ]

    return m2m_tags

def set_m2m_fields(object, fields):
    """Add related objects to the named relation attributes, then save once.

    :param object: instance whose attributes named by the keys of ``fields``
        expose an ``add(*objs)`` method; it must also provide ``save()``.
    :param fields: mapping of attribute name to an iterable of related objects.
    """
    for relation_name, related_objects in fields.iteritems():
        getattr(object, relation_name).add(*related_objects)
    # A single save after all relations were touched, even if `fields` was empty.
    object.save()

def get_or_create_user(modm_user, field_name=None, modm_node=None):
    """Return the Django ``User`` for ``modm_user``, creating it when absent.

    Lookup is by ``guid__guid == modm_user._id``. On a miss, the user's storage
    dict is overlaid with resolved node/user foreign keys. ``None`` values and
    keys in ``user_key_blacklist`` are dropped, naive datetimes are made
    UTC-aware, and the row is created. Many-to-many relations are then applied
    with ``set_m2m_fields``.

    :param modm_user: MODM user record; needs ``_id`` and ``to_storage()``.
    :param field_name: name of the field the user was reached through. It is
        not used here and is optional, so callers that pass only the user
        (as ``process_user_m2m_fields`` does) do not raise ``TypeError``.
    :param modm_node: owning MODM object, unused.
    :return: the Django ``User`` instance.

    NOTE(review): ``User`` is not imported in any cell visible here; confirm it
    is in the kernel namespace before running on a fresh kernel.
    NOTE(review): ``user_cache`` is appended to on every call, so its length
    counts lookups rather than distinct users.
    """
    try:
        user = User.objects.get(guid__guid=modm_user._id)
    except User.DoesNotExist:
        user_fk_nodes = process_node_fk_fields(modm_user)
        user_m2m_nodes = process_node_m2m_fields(modm_user)
        user_fk_users = process_user_fk_fields(modm_user)
        user_m2m_users = process_user_m2m_fields(modm_user)
        user_m2m_tags = process_tag_m2m_fields(modm_user)

        user_fields = {}
        user_fields.update(modm_user.to_storage())
        user_fields.update(user_fk_nodes)
        user_fields.update(user_fk_users)

        create_kwargs = {}
        for key, value in user_fields.iteritems():
            if value is None or key in user_key_blacklist:
                continue
            # pytz's localize() raises ValueError on datetimes that already
            # carry a tzinfo, so only naive values are converted.
            if isinstance(value, datetime) and value.tzinfo is None:
                value = pytz.utc.localize(value)
            create_kwargs[key] = value

        user = User.objects.create(**create_kwargs)
        set_m2m_fields(user, user_m2m_nodes)
        set_m2m_fields(user, user_m2m_users)
        set_m2m_fields(user, user_m2m_tags)
    user_cache.append(user)
    return user
        

def get_or_create_tag(modm_tag, field_name=None, modm_node=None):
    """Return the Django ``Tag`` matching ``modm_tag``, creating it when absent.

    Lookup is by ``_id``; on a miss the tag is created from
    ``modm_tag.to_storage()``. The result is appended to ``tag_cache`` on every
    call.

    :param modm_tag: MODM tag record; needs ``_id`` and ``to_storage()``.
    :param field_name: name of the field the tag was reached through. It is not
        used here and is optional, so the one-argument call in
        ``process_tag_m2m_fields`` does not raise ``TypeError``.
    :param modm_node: owning MODM object, unused.
    :return: the Django ``Tag`` instance.

    NOTE(review): ``Tag`` is not imported in any cell visible here; confirm it
    is in the kernel namespace.
    """
    try:
        tag = Tag.objects.get(_id=modm_tag._id)
    except Tag.DoesNotExist:
        tag = Tag.objects.create(**modm_tag.to_storage())
    tag_cache.append(tag)
    return tag

def get_or_create_node(modm_node):
    """Return the Django ``Node`` for ``modm_node``, creating it when absent.

    Lookup is by ``guid__guid == modm_node._id``. On a miss the guid itself
    must already exist in Postgres. If it does not, a message is printed and
    ``None`` is returned; callers such as ``process_node_fk_fields`` already
    check for ``None``. Otherwise the node's storage dict is overlaid with the
    resolved foreign keys and stripped of ``node_key_blacklist`` keys and
    ``None`` values, then the row is created. Many-to-many relations are
    applied afterwards.

    ``Guid`` is not imported by this cell; an earlier cell imports it from
    ``osf_models.models``.

    :param modm_node: MODM node record; needs ``_id`` and ``to_storage()``.
    :return: the Django ``Node`` instance, or ``None`` when its guid is missing.
    """
    try:
        # try and get the node
        node = Node.objects.get(guid__guid=modm_node._id)
    except Node.DoesNotExist:
        # if it doesn't exist, check to see if the guid does
        try:
            Guid.objects.get(guid=modm_node._id)
        except Guid.DoesNotExist:
            # Report the id we actually looked up and stop. There is no node
            # to cache or return in this case.
            print('GUID {} DoesNotExist'.format(modm_node._id))
            return None

        fk_nodes = process_node_fk_fields(modm_node)
        m2m_nodes = process_node_m2m_fields(modm_node)
        fk_users = process_user_fk_fields(modm_node)
        m2m_users = process_user_m2m_fields(modm_node)
        m2m_tags = process_tag_m2m_fields(modm_node)

        node_fields = {}
        node_fields.update(modm_node.to_storage())
        node_fields.update(fk_nodes)
        node_fields.update(fk_users)
        cleaned_node = {
            key: value for key, value in node_fields.iteritems()
            if key not in node_key_blacklist
        }
        for key, value in cleaned_node.iteritems():
            # localize() raises ValueError on already-aware datetimes, so only
            # naive values are converted to UTC.
            if isinstance(value, datetime) and value.tzinfo is None:
                cleaned_node[key] = pytz.utc.localize(value)
        # this shouldn't need to be here, not sure why it has to be
        cleaned_node['is_collection'] = cleaned_node.pop('is_folder')
        cleaned_node['is_bookmark_collection'] = cleaned_node.pop('is_dashboard')
        cleaned_node = {k: v for k, v in cleaned_node.iteritems() if v is not None}
        node = Node.objects.create(**cleaned_node)
        set_m2m_fields(node, m2m_nodes)
        set_m2m_fields(node, m2m_users)
        set_m2m_fields(node, m2m_tags)
    node_cache.setdefault(modm_node._id, dict())['django'] = node
    return node

# Drive the migration: register each MODM node in the cache, migrate it, and
# report progress every 1000 nodes.
for modm_node in modm_nodes:
    node_cache[modm_node._id] = dict(modm=modm_node)
    noooood = get_or_create_node(modm_node)
    count = count + 1
    if not count % 1000:
        print(count)

# Cache sizes after the run.
print('Nodes: {}'.format(len(node_cache)))
print('Users: {}'.format(len(user_cache)))
print('Tags: {}'.format(len(tag_cache)))

# Source size versus number of nodes processed by the loop above.
print('MODM: {}'.format(total))
print('PG: {}'.format(count))

1000
1000
Nodes: 2095
Users: 3862
Tags: 0
MODM: 1000
PG: 1000


In [None]:
# MIGRATE REGISTRATIONS

from website.models import Node as MODMNode
from osf_models.models import Registration
from modularodm import Q as MODMQ
from django.core.serializers.json import DjangoJSONEncoder
import json
from django.db import IntegrityError

# Source rows: registrations that are not folders.
modm_nodes = MODMNode.find(
    MODMQ('is_registration', 'eq', True) &
    MODMQ('is_folder', 'ne', True)
)

total = len(modm_nodes)
count = 0
print(total)

# `Guid` is not imported by this cell; an earlier cell imports it from
# osf_models.models.
for modm_node in modm_nodes:
    try:
        guid = Guid.objects.get(guid=modm_node._id)
    except Guid.DoesNotExist:
        # Report the id that was looked up (the previous message referenced an
        # undefined name and raised NameError instead of logging).
        print('Guid {} DoesNotExist'.format(modm_node._id))
        continue

    try:
        Registration.objects.create(guid=guid, data=modm_node.to_storage())
    except IntegrityError as ex:
        # Creation failed on a constraint; the message assumes the row already
        # exists, so it is still counted as present.
        print('Registration existed: {}'.format(ex))
    count += 1
    if count % 100 == 0:
        print(count)

print('MODM: {}'.format(total))
print('PG: {}'.format(count))

In [None]:
# MIGRATE COLLECTIONS

from website.models import Node as MODMNode
from osf_models.models import Collection
from modularodm import Q as MODMQ
from django.core.serializers.json import DjangoJSONEncoder
import json
from django.db import IntegrityError

# Source rows: MODM nodes flagged as folders.
modm_nodes = MODMNode.find(
    MODMQ('is_folder', 'eq', True)
)

total = len(modm_nodes)
count = 0
print(total)

# `Guid` is not imported by this cell; an earlier cell imports it from
# osf_models.models.
for modm_node in modm_nodes:
    try:
        guid = Guid.objects.get(guid=modm_node._id)
    except Guid.DoesNotExist:
        # Report the id that was looked up (the previous message referenced an
        # undefined name and raised NameError instead of logging).
        print('Guid {} DoesNotExist'.format(modm_node._id))
        continue

    try:
        Collection.objects.create(guid=guid, data=modm_node.to_storage())
    except IntegrityError as ex:
        # Creation failed on a constraint; the message assumes the row already
        # exists, so it is still counted as present.
        print('Collection existed: {}'.format(ex))
    count += 1
    if count % 100 == 0:
        print(count)

print('MODM: {}'.format(total))
print('PG: {}'.format(count))

In [None]:
# BENCHMARK /users/me/nodes/

import timeit

# Statement timed against MongoDB through modularodm.
modm = """
from website.models import Node
from modularodm import Q
nodes = Node.find(Q('contributors','eq','cdi38') & Q('is_deleted','eq', False) & Q('is_folder','eq', False))[:60]
for node in nodes:
    id = node._id
"""
# Statement timed against Postgres through the Django ORM.
django = """
from osf_models.models import Node
nodes = Node.objects.filter(data__contributors__contains='"cdi38"', data__contains='{"is_deleted": false, "is_folder": false}')[:60]
for node in nodes:
    id = node.guid_id
    id = node.guid.id
"""
# 20 rounds; each round runs both statements 50 times and prints the two totals
# side by side (Django first, then MODM).
print("django, mongo")
for i in range(20):
    d = timeit.Timer(stmt=django).timeit(number=50)
    m = timeit.Timer(stmt=modm).timeit(number=50)
    print('{} {}'.format(d, m))

In [None]:
from osf_models.models import Node
# Show the SQL generated for the contributor lookup and how many rows it matches.
nodes = Node.objects.filter(data__contributors__contains='"cdi38"')
print(nodes.query)
print(nodes.count())

In [None]:
# A guid is "missing in action" when none of the three referent relations is set.
no_referent = dict(referent_node=None, referent_registration=None, referent_collection=None)
mia_guids = Guid.objects.filter(**no_referent).count()
# exclude() negates the whole AND-ed condition, i.e. at least one referent is set.
accounted_for_guids = Guid.objects.exclude(**no_referent).count()
total_guids = Guid.objects.all().count()
# Sanity check shown as the cell output: the two groups must cover every guid.
total_guids == mia_guids + accounted_for_guids