# In [1]:
import csv
import os
import random

from psycopg2 import connect, extras

from libs.connectors.postgres_connector import PostgresConnector
from libs.feeds.postgres_feed import PostgresFeed

from manglers.mangle_org_name import mangle_org_name
from manglers.mangle_url import mangle_url
from manglers.tld_swap_prob_dict import tld_swap_prob_dict

# SECURITY: database credentials used to be hard-coded here. Every connection
# setting can now be supplied through an environment variable; the literals
# below are kept only as backward-compatible defaults.
# TODO(review): rotate these passwords and remove the literal defaults.

# connection details for source data table
source_host = os.environ.get(
    'SOURCE_DB_HOST',
    'silobuster-db-do-user-12298230-0.b.db.ondigitalocean.com',
)
source_user = os.environ.get('SOURCE_DB_USER', 'jameyc')
source_passwd = os.environ.get('SOURCE_DB_PASSWORD', 'UXZSXXXSFZeU8XKw')
source_db = os.environ.get('SOURCE_DB_NAME', 'defaultdb')
source_port = int(os.environ.get('SOURCE_DB_PORT', '25060'))

# connection details for training data table
training_host = os.environ.get(
    'TRAINING_DB_HOST',
    'silobuster-db-do-user-12298230-0.b.db.ondigitalocean.com',
)
training_user = os.environ.get('TRAINING_DB_USER', 'jameyc')
training_passwd = os.environ.get('TRAINING_DB_PASSWORD', 'UXZSXXXSFZeU8XKw')
training_db = os.environ.get('TRAINING_DB_NAME', 'jameycdb')
training_port = int(os.environ.get('TRAINING_DB_PORT', '25060'))
# Kept as a literal on purpose: the table name is interpolated into the SQL
# text, so it must not come from an external source.
training_table = 'organizations_normalized'

# CSV file that receives one line per original row and per generated duplicate
csv_output = 'training_set1.csv'

# Probabilities handed to mangle_url() for each kind of URL corruption.
# Several values are rough estimates taken from the organization table
# (e.g. about 3% of its non-blank URLs have no scheme); the ones marked
# "arbitrary" were simply picked by hand.
# The 'change_domain_ext_prob' entry (.05, arbitrary) is intentionally left
# out; NOTE(review): TLD swapping presumably relies on tld_swap_prob_dict
# instead - confirm.
url_mangling_probs_dict = dict(
    www_remove_prob=0.22,
    scheme_remove_prob=0.03,
    remove_s_from_https_prob=0.68,
    append_extra_slash_prob=0.35,
    mispell_remove_char_prob=0.02,   # arbitrary
    mispell_replace_char_prob=0.02,  # arbitrary
    mispell_null_url_prob=0.02,      # arbitrary
)

def _open_connection(database, user, password, host, port):
    """Open a psycopg2 connection from the individual connection settings."""
    return connect(
        database=database,
        user=user,
        password=password,
        host=host,
        port=port,
    )


print ('Starting connections...')

# Connection to the database holding the organization/location/address data.
source_conn = _open_connection(
    source_db, source_user, source_passwd, source_host, source_port
)

# Connection to the database that receives the normalized training rows.
training_conn = _open_connection(
    training_db, training_user, training_passwd, training_host, training_port
)

# Reads every organization together with the address of each of its
# locations; the left joins keep organizations that have no location/address.
select_qry = (
    "select t1.name, t1.description, t1.url, "
    "t3.address_1, t3.address_2, t3.city, t3.region, "
    "t3.state_province, t3.postal_code, t3.country, t3.type "
    "from organization t1 "
    "left join location t2 on t1.id = t2.organization_id "
    "left join address t3 on t3.location_id = t2.id"
)

# Inserts an original (normalized) row and returns its generated id.
insert_qry = (
    f"INSERT INTO {training_table} "
    "(name, description, url, address_1, address_2, city, "
    "state_province, postal_code, country, type, region) "
    "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) "
    "RETURNING id"
)

# Inserts a mangled duplicate that points back at its original row.
insert_dup_qry = (
    f"INSERT INTO {training_table} "
    "(name, description, url, address_1, address_2, city, "
    "state_province, postal_code, country, type, region, "
    "duplicate_id, duplicate_type, training_set) "
    "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
)

print ("Connected?")
dups = []

# Lookup data used when mangling addresses.
# Each CSV line is one group of interchangeable street suffixes; empty cells
# are dropped so every group only contains real spellings.
with open('./helper_data/street_suffixes.csv', 'r') as suffix_file:
    address_keys = [
        [spelling for spelling in line if spelling]
        for line in csv.reader(suffix_file)
    ]

# Interchangeable spellings of a state name.
state_keys = [['wa', 'wash', 'washington']]

def _clean_field(value):
    """Return *value* stripped and lower-cased, or '' when it is not a string.

    Non-string values (e.g. ``None`` for a NULL column) have no ``strip``
    method; they are mapped to the empty string, which is what the previous
    bare ``except:`` clauses did, without hiding unrelated errors.
    """
    try:
        return value.strip().lower()
    except AttributeError:
        return ''


def _swap_street_suffix(address, suffix_groups):
    """Return *address* with its last word replaced by an equivalent suffix.

    The last space-separated word is looked up in *suffix_groups* (a list of
    lists of interchangeable spellings). On the first group that contains it,
    the word is replaced by a random member of that group, which may be the
    same spelling. The address is returned unchanged when no group matches.
    """
    words = address.split(' ')
    for group in suffix_groups:
        if words[-1] in group:
            words[-1] = random.choice(group)
            break
    return ' '.join(words)


# Load every source row up front; rows are dict-like (RealDictCursor).
with source_conn.cursor(cursor_factory=extras.RealDictCursor) as source_cur:
    source_cur.execute(select_qry)
    data = source_cur.fetchall()
    print ("Retrieved data...")

# Index of the next row that gets an address duplicate; re-drawn after each hit.
dup_address = random.randint(1, 3)
# Address duplicates whose row index is a multiple of this also get blanked fields.
dup_blank_stuff = random.randint(1, 2)

# The CSV is opened once in append mode; newline='' is what the csv module
# requires so that no extra blank lines are written on some platforms.
with open(csv_output, 'a', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',', quoting=csv.QUOTE_MINIMAL)

    for count, row in enumerate(data):

        print (f"Insert row count: {count}")

        # name is deliberately not guarded: a missing name still raises.
        name = row['name'].strip().lower()
        desc = _clean_field(row['description'])
        url = _clean_field(row['url'])
        address_1 = _clean_field(row['address_1'])
        address_2 = _clean_field(row['address_2'])
        city = _clean_field(row['city'])
        region = _clean_field(row['region'])
        state = _clean_field(row['state_province'])
        postal = _clean_field(row['postal_code'])
        country = _clean_field(row['country'])
        type_row = _clean_field(row['type'])

        # Column order matches insert_qry:
        # name, description, url, address_1, address_2, city,
        # state_province, postal_code, country, type, region
        original_values = [
            name, desc, url, address_1, address_2, city,
            state, postal, country, type_row, region,
        ]

        # Write the normalized row to the training table and keep its id.
        with training_conn.cursor() as training_cur:
            training_cur.execute(insert_qry, original_values)
            insert_id = training_cur.fetchone()[0]
            training_conn.commit()

        # Originals carry no duplicate_id / duplicate_type and training_set 0.
        writer.writerow(original_values + ['', '', 0])

        # Labels describing which manglings were actually applied.
        duplicate_type = []

        # create a duplicate/mangled name
        mangled_name = mangle_org_name(
            name,
            remove_prob=0.02,
            replace_prob=0.02,
            null_prob=0.02,
        )
        if mangled_name != name:
            duplicate_type.append('mangled_name')

        mangled_url = mangle_url(url, url_mangling_probs_dict, tld_swap_prob_dict)
        if mangled_url != url:
            duplicate_type.append('mangled_url')

        # Start from the original values so that "was this field mangled?"
        # is a plain comparison further down.
        mangled_address_1 = address_1
        mangled_desc = desc
        mangled_region = region
        mangled_country = country

        # create a duplicate address
        if dup_address == count:
            dup_address = random.randint(1, 3) + count
            mangled_address_1 = _swap_street_suffix(address_1, address_keys)
            # Only label what really changed, like the name/url labels above.
            if mangled_address_1 != address_1:
                duplicate_type.append('street_dup')

            # Was `count // dup_blank_stuff == 0`, which is only true for
            # count == 1 with dup_blank_stuff == 2; modulo is the intended
            # "every dup_blank_stuff-th row" test.
            if count % dup_blank_stuff == 0:
                if desc or region or country:
                    duplicate_type.append('blanked')
                mangled_desc = ''
                mangled_region = ''
                mangled_country = ''

        mangled_values = [
            mangled_name, mangled_desc, mangled_url, mangled_address_1,
            address_2, city, state, postal, mangled_country, type_row,
            mangled_region,
        ]

        # If any field was mangled, store a duplicate row in the database
        # and in the training set CSV file.
        if mangled_values != original_values:
            # Extra columns: duplicate_id, duplicate_type, training_set
            dup_values = mangled_values + [
                insert_id,
                '|'.join(duplicate_type),
                1,
            ]

            with training_conn.cursor() as dup1_cur:
                dup1_cur.execute(insert_dup_qry, dup_values)
                # Commit here; previously the duplicate was only committed by
                # the next row's commit, so the last one was never saved.
                training_conn.commit()

            writer.writerow(dup_values)

print ('finished')
            

# Notebook cell output: ModuleNotFoundError: No module named 'tld'