In [7]:
import certifi
import re
from datetime import datetime
from pymongo import MongoClient, UpdateOne

In [8]:
# Connect to the MongoDB Atlas cluster.
# certifi.where() gives the location of certifi's CA bundle, which is handed
# to MongoClient below as the TLS CA file.
ca = certifi.where()

# Replace XYZ with your connection URI from the Atlas UI
url = 'XYZ'

# Client used by the rest of this script for all database access.
client = MongoClient(url, tlsCAFile=ca)

In [9]:
# Data Processing

# Regex for runtime strings such as "90 min"; group 1 captures the minutes.
runtime_pat = re.compile(r'([0-9]+) min')

# Number of update operations sent per bulk_write call.
batch_size = 500

# Comma-separated string fields that become lists under a new (plural) name.
list_field_renames = {
    'director': 'directors',
    'writer': 'writers',
    'genre': 'genres',
    'language': 'languages',
    'country': 'countries',
}

# Fields whose value is copied unchanged to a new name.
plain_field_renames = {
    'fullplot': 'fullPlot',
    'rating': 'rated',
}

# Top-level IMDb fields that are gathered into the embedded "imdb" document.
imdb_field_map = {
    'imdbID': 'id',
    'imdbRating': 'rating',
    'imdbVotes': 'votes',
}


def build_update(movie):
    """Build the update document that reshapes one movie document.

    The input document is not modified.

    Args:
        movie: A document (mapping) read from the movies collection.

    Returns:
        A dict with a '$set' and/or '$unset' entry, or an empty dict when
        the document needs no change.

    Raises:
        ValueError: If 'released' or 'lastupdated' is a string that does not
            match the expected date format.
    """
    fields_to_set = {}
    fields_to_unset = {}

    # Remove null fields from the document: empty values are unset and
    # ignored by every transformation below.
    present = {}
    for key, value in movie.items():
        if value == "" or value == [""]:
            fields_to_unset[key] = ""
        else:
            present[key] = value

    for old_name, new_name in list_field_renames.items():
        if old_name in present:
            fields_to_set[new_name] = present[old_name].split(", ")
            fields_to_unset[old_name] = ""

    # 'cast', 'released' and 'runtime' keep their names, so a document that
    # was already converted still has them. Convert only string values so
    # the script can be re-run after a partial run without crashing.
    cast = present.get('cast')
    if isinstance(cast, str):
        fields_to_set['cast'] = cast.split(", ")

    for old_name, new_name in plain_field_renames.items():
        if old_name in present:
            fields_to_set[new_name] = present[old_name]
            fields_to_unset[old_name] = ""

    imdb = {}
    for old_name, sub_key in imdb_field_map.items():
        if old_name in present:
            imdb[sub_key] = present[old_name]
            fields_to_unset[old_name] = ""
    if imdb:
        fields_to_set['imdb'] = imdb

    released = present.get('released')
    if isinstance(released, str):
        fields_to_set['released'] = datetime.strptime(released, "%Y-%m-%d")

    if 'lastupdated' in present:
        fields_to_unset['lastupdated'] = ''
        # Drop everything after the first "." (fractional seconds) before
        # parsing.
        timestamp = present['lastupdated'].split(".")[0]
        fields_to_set['lastUpdated'] = datetime.strptime(
            timestamp, "%Y-%m-%d %H:%M:%S")

    runtime = present.get('runtime')
    if isinstance(runtime, str):
        m = runtime_pat.match(runtime)
        if m:
            fields_to_set['runtime'] = int(m.group(1))

    # A key must not appear under both operators (e.g. an empty 'rated'
    # next to a populated 'rating'): the server rejects such an update as
    # conflicting. The new value wins.
    for key in fields_to_set:
        fields_to_unset.pop(key, None)

    update_doc = {}
    if fields_to_set:
        update_doc['$set'] = fields_to_set
    if fields_to_unset:
        update_doc['$unset'] = fields_to_unset
    return update_doc


movies = client.mflix.movies_scratch2
count = 0
updates = []

# loop: iterating each document of a collection
for movie in movies.find({}):
    update_doc = build_update(movie)
    if not update_doc:
        # Nothing to change; an empty update document is rejected by
        # PyMongo, so do not queue one.
        continue

    updates.append(UpdateOne({'_id': movie['_id']}, update_doc))

    count += 1
    if count == batch_size:
        movies.bulk_write(updates)
        updates = []
        count = 0

# Flush the final, partially filled batch.
if updates:
    movies.bulk_write(updates)