# Migration: remove Brazilian-Portuguese accounts

In [None]:
# required imports to access api_db, misc, misc.CONFIG, ...
import sys
sys.path = ['.', '..', '../..'] + sys.path
from collection import *

<hr>
<h1 align="center">driver code</h1>

Run all the instructions needed to properly set up the database (idempotent operations)

In [None]:
from pymongo import DeleteMany, DeleteOne
class BatchRemover:
    """Context manager that queues user/tweet deletions and sends them in bulk.

    For every user given to ``delete_user`` two operations are queued: a
    ``DeleteOne`` for the user document and a ``DeleteMany`` for every tweet
    whose ``user`` field equals the user's ``_id``. A queue is sent with an
    unordered ``bulk_write`` as soon as it holds ``batch_size`` operations;
    whatever is still pending is sent when the ``with`` block exits.

    Tweets are always flushed before users, so an interruption between the
    two writes cannot leave tweets behind whose user is already gone.

    Args:
        batch_size: number of queued operations that triggers a flush.
        col_users: object exposing ``bulk_write`` for the users collection.
            When ``None`` (default) ``api_db.col_users`` is used.
        col_tweets: object exposing ``bulk_write`` for the tweets collection.
            When ``None`` (default) ``api_db.col_tweets`` is used.
    """

    def __init__(self, batch_size=500, col_users=None, col_tweets=None):
        self.batch_size = batch_size
        self.col_users = col_users
        self.col_tweets = col_tweets
        # Created here as well so the queues exist even outside a ``with``.
        self.batch_users = []
        self.batch_tweets = []

    def __enter__(self):
        """Start with empty queues and return the remover itself."""
        self.batch_users = []
        self.batch_tweets = []
        return self

    def __exit__(self, _type, _value, _traceback):
        """Send whatever is still queued; exceptions from the body propagate."""
        print("Batch removing ended, looking for missing batches to send")
        if self.batch_tweets:
            self.perform_delete_tweets()
        if self.batch_users:
            self.perform_delete_users()

    def delete_user(self, user):
        """Queue the removal of ``user`` (a mapping with ``_id``) and its tweets."""
        self.batch_users.append(DeleteOne({"_id": user["_id"]}))
        self.batch_tweets.append(DeleteMany({"user": user["_id"]}))
        self.check_pending_deletes()  # flush if a queue reached batch_size

    def check_pending_deletes(self):
        """Flush every queue that holds at least ``batch_size`` operations."""
        # ``>=`` so a batch contains exactly batch_size operations; the former
        # ``>`` comparison only flushed once batch_size + 1 were queued.
        if len(self.batch_tweets) >= self.batch_size:
            self.perform_delete_tweets()
        if len(self.batch_users) >= self.batch_size:
            self.perform_delete_users()

    def perform_delete_users(self):
        """Send the queued user deletions to the database and clear the queue."""
        if not self.batch_users:
            return  # bulk_write rejects an empty list of operations
        collection = self.col_users if self.col_users is not None else api_db.col_users
        collection.bulk_write(self.batch_users, ordered=False)
        # Report what was actually sent; the last batch is usually smaller.
        print("Deleted another batch of %d users" % len(self.batch_users))
        self.batch_users = []

    def perform_delete_tweets(self):
        """Send the queued tweet deletions to the database and clear the queue."""
        if not self.batch_tweets:
            return  # bulk_write rejects an empty list of operations
        collection = self.col_tweets if self.col_tweets is not None else api_db.col_tweets
        collection.bulk_write(self.batch_tweets, ordered=False)
        # Report what was actually sent; the last batch is usually smaller.
        print("Deleted another batch of %d user tweets" % len(self.batch_tweets))
        self.batch_tweets = []

In [None]:
# Lower-case substrings that mark a profile *location* as foreign.
# NOTE(review): the first letter of "aθήνα" / "eλλάδα" looks like a Latin
# character, which a lower-cased Greek location would not contain - confirm.
foreign_locations = [
    "rio de janeiro", "salvador", "itabuna", "bahia", "recife", "copa",
    "fortaleza", "manaus", "brazil", "sao paulo", "são paulo", "maracanã",
    "maracana", "belo horizonte", "🇧🇷", "mato grosso", "mato g.", "curitiba",
    "alagoinhas", "rj", "brasilia", "brasília", "porto alegre",
    # These two used to be fused into "brazuca-br" by a missing comma.
    "brazuca", "-br",
    "🇪🇸", "spain", "madrid", "🇺🇸", "espana", "españa",
    "italia", "italy", "rome",
    "greece", "athens", "aθήνα", "eλλάδα",
    "🇫🇷", "france", "paris",
]
# Lower-case substrings that mark a profile *description* as foreign.
foreign_descriptions = [
    "brasil", "brazil", "bolsonaro", "lula", "🇧🇷", "brazuca", " br ",
    "🇪🇸", "🇺🇸", "🇫🇷",
]


def is_foreign(user):
    """Return True when a user's profile text matches a foreign keyword.

    Args:
        user: mapping that may hold ``location`` and ``description`` strings;
            a missing key or a ``None`` value is treated as an empty string.

    Returns:
        ``False`` whenever the location mentions "portugal"; otherwise
        ``True`` if the lower-cased location contains any entry of
        ``foreign_locations`` or the lower-cased description contains any
        entry of ``foreign_descriptions``, else ``False``.
    """
    # ``or ""`` also covers fields that are present but stored as None.
    loc = (user.get("location") or "").lower()
    desc = (user.get("description") or "").lower()

    # A location mentioning Portugal wins over every keyword match.
    if "portugal" in loc:
        return False

    return (
        any(keyword in loc for keyword in foreign_locations)
        or any(keyword in desc for keyword in foreign_descriptions)
    )

In [None]:
# Pass 1: walk users with depth > 0 (the filter is further adjusted by
# find_exclude_invalid) and queue every profile that is_foreign() flags.
find_params = find_exclude_invalid({"depth": {"$gt": 0}})
counter = 0
with BatchRemover(500) as remover:
    # Only the two text fields inspected by is_foreign() are fetched.
    profile_fields = {"location": True, "description": True}
    for user in api_db.col_users.find(find_params, profile_fields):
        if not is_foreign(user):
            continue
        # Queues the user document together with that user's tweets.
        counter += 1
        remover.delete_user(user)

In [None]:
# Summary of the location/description pass.
print("Removed %d foreign users with location" % (counter,))

In [None]:
# remove accounts that are not tweeting in portuguese with more than 50 tweets and less than 10% of pt content
foreign_users = api_db.col_users.find({
    "count_parsed_tweets": {"$gte": 50},
    "most_common_language": {"$not": {"$in": ["pt", "und"]}},
    "$or": [
        {"tweeted_languages.pt": {"$exists": False}},
        {"tweeted_languages.pt": {"$lte": 5}}
    ]
}, {})
counter = 0
with BatchRemover(500) as remover:
    for user in foreign_users:
        counter+=1
        remover.delete_user(user)


In [None]:
# Summary of the first language-based pass.
print("Removed %d foreign users with most_common_language 1" % (counter,))

In [None]:
# remove accounts that are not tweeting in portuguese with more than 25 tweets and less than 10% of pt content
# Pass 3: a lower tweet threshold, limited to weakly political followers.
# A user is selected when
#   * count_parsed_tweets is at least 25,
#   * most_common_language is neither "pt" nor "und",
#   * follows_political is missing or at most 2, and
#   * tweeted_languages.pt is missing or at most 2.
few_political_follows = {"$or": [
    {"follows_political": {"$lte": 2}},
    {"follows_political": {"$exists": False}},
]}
hardly_any_pt = {"$or": [
    {"tweeted_languages.pt": {"$exists": False}},
    {"tweeted_languages.pt": {"$lte": 2}},
]}
foreign_users = api_db.col_users.find(
    {
        "count_parsed_tweets": {"$gte": 25},
        "most_common_language": {"$not": {"$in": ["pt", "und"]}},
        "$and": [few_political_follows, hardly_any_pt],
    },
    {},  # empty projection: delete_user only reads "_id"
)
counter = 0
with BatchRemover(500) as remover:
    for counter, user in enumerate(foreign_users, start=1):
        remover.delete_user(user)


In [None]:
# Summary of the second language-based pass.
print("Removed %d foreign users with most_common_language 2" % (counter,))

In [None]:
# Last cell: printed once execution has reached the end of the notebook.
print("DONE")