# Owner Analysis

Most of these functions are for the paper: 'Hiding in Housing'

In [4]:
from tqdm import tqdm
import json
import csv
import re
from fuzzywuzzy import fuzz

### Find One Property Owners
This function is used to find owners that own only one property.

In [None]:
def find_one_prop_owners(source):
    data = json.load(open(source))
    count = 0
    for owner in tqdm(data, total=len(data)):
        if owner[1] == 1:
            count += 1
    percentage = (count / len(data)) * 100
    print('There are ', count, 'one property owners in this dataset.')
    print("That's ", percentage, '% of total owners.')

In [None]:
find_one_prop_owners('./../../data_sets/sorted_landlords.json')

### Find owner_2
This function is used to find and count properties with 2 owners.

In [103]:
def get_owner_2_count(source, output):
    owners = []
    owner2_count = 0
    with open(source, mode="r") as csv_file:
        csv_reader = csv.DictReader(csv_file)
        line_count = 0
        for row in tqdm(csv_reader, total=581456):
            if line_count == 0:
                line_count += 1
            else:
                try:
                    owners.append([row["owner_1"].strip(), row["owner_2"].strip()])
                    if (row["owner_2"].strip() == ""):
                        owner2_count += 1
                except:
                     print(row["owner_1"].strip(), "is missing a count.")
    with open(output, 'w') as file:
        file.write(json.dumps(owners))
    print('There are ', owner2_count, 'owner_2s in this dataset.')

In [104]:
get_owner_2_count('./../../data_sets/opa_properties_public.csv', './data/owner1_owner2.json')

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 581456/581456 [00:07<00:00, 74223.28it/s]


There are  376528 owner_2s in this dataset.


### Get Owners and Mailing Addresses
This function will create a list that holds both owners and the mailing address.

In [105]:
def get_owners_and_mailing_address(source, output):
    owners_with_mailing_address = []
    mailing_address_count = 0
    with open(source, mode="r") as csv_file:
        csv_reader = csv.DictReader(csv_file)
        line_count = 0
        owner_1_count = 0
        owner_2_count = 0
        mailing_address_1_count = 0
        mailing_address_2_count = 0
        mailing_care_of_count = 0
        mailing_city_state_count = 0
        mailing_street_count = 0
        mailing_zip_count = 0
        for row in tqdm(csv_reader, total=581456):
            if line_count == 0:
                line_count += 1
            else:
                line_count += 1
            
                owner_1 = row["owner_1"].strip()
                owner_2 = row["owner_2"].strip()
                mailing_address_1 = row["mailing_address_1"].strip()
                mailing_address_2 = row["mailing_address_2"].strip()
                mailing_care_of = row["mailing_care_of"].strip()
                mailing_city_state = row["mailing_city_state"].strip()
                mailing_street = row["mailing_street"].strip()
                mailing_zip = row["mailing_zip"].strip()
                mailing_info = [
                    mailing_address_1,
                    mailing_address_2,
                    mailing_care_of,
                    mailing_city_state,
                    mailing_street,
                    mailing_zip
                ]
                owners_with_mailing_address.append([owner_1, owner_2, mailing_info])
                if (owner_1 != ""):
                    owner_1_count += 1
                if (owner_2 != ""):
                    owner_2_count += 1
                if (mailing_address_1 != ""):
                    mailing_address_1_count += 1
                if (mailing_address_2 != ""):
                    mailing_address_2_count += 1
                if (mailing_care_of != ""):
                    mailing_care_of_count += 1
                if (mailing_city_state != ""):
                    mailing_city_state_count += 1
                if (mailing_street != ""):
                    mailing_street_count += 1
                if (mailing_zip != ""):
                    mailing_zip_count += 1

    with open(output, 'w') as file:
        file.write(json.dumps(owners_with_mailing_address))
    line_count -= 1 # need to account for the first line in the csv
    print('There are ', line_count, 'properties in this dataset.')
    print('There are ', owner_1_count, 'owner_1s in this dataset or ', (owner_1_count/line_count)*100, ' %.')
    print('There are ', owner_2_count, 'owner_2s in this dataset or ', (owner_2_count/line_count)*100, ' %.')
    print('There are ', mailing_address_1_count, 'mailing_address_1s in this dataset or ', (mailing_address_1_count/line_count)*100, ' %.')
    print('There are ', mailing_address_2_count, 'mailing_address_2s in this dataset or ', (mailing_address_2_count/line_count)*100, ' %.')
    print('There are ', mailing_care_of_count, 'mailing_care_ofs in this dataset or ', (mailing_care_of_count/line_count)*100, ' %.')
    print('There are ', mailing_city_state_count, 'mailing_city_states in this dataset or ', (mailing_city_state_count/line_count)*100, ' %.')
    print('There are ', mailing_street_count, 'mailing_streets in this dataset or ', (mailing_street_count/line_count)*100, ' %.')
    print('There are ', mailing_zip_count, 'mailing_zips in this dataset or ', (mailing_zip_count/line_count)*100, ' %.')
    
    

In [106]:
get_owners_and_mailing_address('./../../data_sets/opa_properties_public.csv', './data/owners_mailing_address.json')

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 581456/581456 [00:08<00:00, 69535.78it/s]


There are  581455 properties in this dataset.
There are  581455 owner_1s in this dataset or  100.0  %.
There are  204927 owner_2s in this dataset or  35.24382798324892  %.
There are  32299 mailing_address_1s in this dataset or  5.554858071561858  %.
There are  11185 mailing_address_2s in this dataset or  1.9236226363175137  %.
There are  23098 mailing_care_ofs in this dataset or  3.972448426791411  %.
There are  220035 mailing_city_states in this dataset or  37.842137396703095  %.
There are  220037 mailing_streets in this dataset or  37.84248136141232  %.
There are  219783 mailing_zips in this dataset or  37.798797843341276  %.


### Combine Owner Truncation LLC findings
This combines the possible 'L's that are now LLCs

In [8]:
def combine(source, l_source, output):
    data = json.load(open(source))
    l_data = json.load(open(l_source))
    out = []
    for owner in tqdm(data, total=len(data)):
        owner_name = owner[0]
        prop_count = owner[1]
        found_flag = False
        for trunc_owner in l_data:
            if owner_name in trunc_owner.keys():
                correct_owner_name = trunc_owner[owner_name]
                out.append([correct_owner_name, prop_count])
                found_flag = True
                break
        if found_flag == False:
            out.append([owner_name, prop_count])
    json.dump(out, open(output, 'w+'))

In [9]:
combine('./../../data_sets/sorted_landlords.json', './data/annotated_fuzzy_L_results.JSON', './data/llc_owner_combined.json')

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 429983/429983 [00:22<00:00, 19354.29it/s]


429983


####  Modified Combine to keep names
Needed for relating back to `landlords_and_properties.json'

In [2]:
def modified_combine(source, l_source, output):
    data = json.load(open(source))
    l_data = json.load(open(l_source))
    out = []
    for owner in tqdm(data, total=len(data)):
        owner_name = owner[0]
        prop_count = owner[1]
        found_flag = False
        for trunc_owner in l_data:
            if owner_name in trunc_owner.keys():
                correct_owner_name = trunc_owner[owner_name]
                out.append([correct_owner_name, owner_name, prop_count])
                found_flag = True
                break
        if found_flag == False:
            out.append([owner_name, prop_count])
    json.dump(out, open(output, 'w+'))

In [3]:
modified_combine('./../../data_sets/sorted_landlords.json', './data/annotated_fuzzy_L_results.JSON', './data/llc_owner_modified_combined.json')

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 429983/429983 [00:22<00:00, 19162.35it/s]


### Find LLC Owners
This function will find LLC owners and count them.
It will take 'L' data from the LLCs that were truncated and found by hand, rather than the possibly incorrect 'L' assuming happening before this change.

In [12]:
def find_llc_owners(source, output):
    data = json.load(open(source))
    out = []
    count = 0
    property_count = 0
    LL_count = 0
    LLC_count = 0
    
    raw_search_string_LL = r"\b" + "LL" + r"\b$"
    raw_search_string_LLC = r"\b" + "LLC" + r"\b"
    
    for owner in tqdm(data, total=len(data)):
        match_output_LLC = re.search(raw_search_string_LLC, owner[0])
        match_output_LL = re.search(raw_search_string_LL, owner[0])
        
        if match_output_LLC is not None or match_output_LL is not None:
            out.append(owner)
            count += 1
            property_count += owner[1]
            
            if match_output_LLC is not None:
                LLC_count += 1
            if match_output_LL is not None:
                LL_count += 1

    json.dump(out, open(output, 'w+'))
    
    llc_owner_count_percentage = (count/len(data))*100
    property_count_percentage = (property_count / 581455)*100
    average_properties_owned = property_count / count
    
    possible_LLC_total = LLC_count + LL_count
    percentage_of_LLC = (LLC_count/possible_LLC_total) * 100
    percentage_of_LL = (LL_count/possible_LLC_total) * 100
    
    print('There are ', count, 'LLCs in this dataset or ', llc_owner_count_percentage, '% of owners.')
    print('LLCs own ', property_count, 'properties or ', property_count_percentage, '% of properties.') # hard number sourced from previous function
    print('On Average, LLCs own ', average_properties_owned, 'properties.')
    print('\n')
    print('Using Regex I found ', percentage_of_LLC, '% of the findings have "LLC" in their owner_1.')
    print('Using Regex I found ', percentage_of_LL, '% of the findings have "LL" at the end of their owner_1.')

In [13]:
find_llc_owners('./../../data_sets/sorted_landlords.json', './data/llc_owner.json')

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 429983/429983 [00:00<00:00, 563813.20it/s]


There are  17489 LLCs in this dataset or  4.067370105329745 % of owners.
LLCs own  19768 properties or  3.3997471859387227 % of properties.
On Average, LLCs own  1.1303104808736921 properties.


Using Regex I found  95.64297558465321 % of the findings have "LLC" in their owner_1.
Using Regex I found  4.357024415346789 % of the findings have "LL" at the end of their owner_1.


### Generate JSON that Contains LLC Owners
Just trying to get an object that has every LLC I'm sure about.

In [13]:
def get_llc_and_props(source, llc_source, output):
    out = {}
    data = json.load(open(source))
    fixed_trunc_count = 0
    trunc_count = 0
    llc_data = json.load(open(llc_source))
    
    raw_search_string_LL = r"\b" + "LL" + r"\b$"
    raw_search_string_LLC = r"\b" + "LLC" + r"\b"
    
    for entry in tqdm(llc_data, total=len(llc_data)):
        llc_flag = False
        full_flag = False
        if len(entry) == 3:
            trunc_owner = entry[1]
            full_owner = entry[0]
            mo_LLC_trunc = re.search(raw_search_string_LLC, trunc_owner)
            mo_LL_trunc = re.search(raw_search_string_LL, trunc_owner)
            mo_LLC_full = re.search(raw_search_string_LLC, full_owner)
            mo_LL_full = re.search(raw_search_string_LL, full_owner)
            if mo_LLC_trunc is not None or mo_LL_trunc is not None or mo_LLC_full is not None or mo_LL_full is not None:
                llc_flag = True
                full_flag = True
        else:
            trunc_owner = entry[0]
            mo_LLC_trunc = re.search(raw_search_string_LLC, trunc_owner)
            mo_LL_trunc = re.search(raw_search_string_LL, trunc_owner)
            if mo_LLC_trunc is not None or mo_LL_trunc is not None:
                llc_flag = True
        
        if llc_flag is True:
            try:
                if full_flag is True:
                    data[trunc_owner]['has_full_name'] = True
                    data[trunc_owner]['full_name'] = full_owner
                    out[trunc_owner] = data[trunc_owner]
                    fixed_trunc_count += 1
                else:
                    data[trunc_owner]['has_full_name'] = False
                    out[trunc_owner] = data[trunc_owner]
                    trunc_count += 1
            except:
                pass
        else:
            pass
    json.dump(out, open(output, 'w+'))
    print("There were ", trunc_count + fixed_trunc_count, "entries processed.")
    print(trunc_count, "entries still truncated.")
    print(fixed_trunc_count, "entries with untruncated names.")

In [14]:
get_llc_and_props('./../../data_sets/landlords_and_properties.json', './data/llc_owner_modified_combined.json', './data/llc_data_set.json')

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 429983/429983 [00:00<00:00, 527817.77it/s]


There were  17682 entries processed.
17489 entries still truncated.
193 entries with untruncated names.


## Looking for 'address' LLCs
Think like `3614 Spring Garden LLC`. These functions will be attempting to find them.

In [92]:
def narrow_to_address_llc(source, mail_source, output, prop_fuzz_ratio_threshold, mail_fuzz_ratio_threshold):
    out = {}
    address_out = {}
    data = json.load(open(source))
    mail_data = json.load(open(mail_source))
    numbered_count = 0
    fuzz_count = 0
    total_count = 0
    both_count = 0
    total_props_owned = 0
    fuzz_mail_count = 0
    
    raw_search_string_numbers = r"\d"
    
    for entry in tqdm(data, total=len(data)):
        numbered_flag = False
        prop_fuzz_flag = False
        
        numbered_LLC = re.search(raw_search_string_numbers, entry)
        if numbered_LLC is not None:
            numbered_flag = True
            numbered_count += 1
            
        property_fuzz_ratios = {}
        for prop in data[entry]['properties']:
            fuzz_ratio = fuzz.ratio(prop, entry)
            if fuzz_ratio >= prop_fuzz_ratio_threshold:
                property_fuzz_ratios[prop] = fuzz_ratio
                fuzz_count += 1
                prop_fuzz_flag = True
        
        if numbered_LLC is True or prop_fuzz_flag is True:
            total_count += 1
            out[entry] = data[entry]
            out[entry]['had_fuzz_props'] = False
            out[entry]['had_fuzz_mail'] = False
            if prop_fuzz_flag is True:
                out[entry]['had_fuzz_props'] = True
                out[entry]['fuzz_props'] = property_fuzz_ratios
                both_count += 1
            total_props_owned += data[entry]['total_properties']
            for mail_entry in mail_data:
                owner_1 = mail_entry[0]
                owner_2 = mail_entry[1]
                address_info = mail_entry[2]
                if entry == owner_1 or entry == owner_2:
                    street_addr = address_info[4]
                    fuzz_r = fuzz.ratio(entry, street_addr)
                    if fuzz_r >= mail_fuzz_ratio_threshold:
                        out[entry]['had_fuzz_mail'] = True
                        out[entry]['fuzz_mail'] = street_addr
                        out[entry]['fuzz_mail_ratio'] = fuzz_r
                        address_out[entry] = data[entry]
                        address_out[entry]['fuzz_mail'] = street_addr
                        address_out[entry]['fuzz_mail_ratio'] = fuzz_r
                        fuzz_mail_count += 1
            
    
    json.dump(out, open(output, 'w+'))
    json.dump(address_out, open('./data/fuzz_mailing_addressed_LLCs.json', 'w+'))
    print('There are ', numbered_count, 'numbered entries in the LLC data.')
    print('There are ', fuzz_count, 'fuzzy possible entries in the LLC data.')
    print('There are ', both_count, 'both possible entries in the LLC data.')
    print('There are ', fuzz_mail_count, 'with fuzzy similar mailing addresses.')
    print('There are ', total_count, 'possible address LLCs.')
    print('Thats ', total_props_owned, 'possible properties owned by address LLCs.')

In [96]:
narrow_to_address_llc('./data/llc_data_set.json', './../../data_sets/owners_mailing_address.json', './data/addressed_LLCs.json', 70, 60)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17682/17682 [02:28<00:00, 119.18it/s]


There are  5062 numbered entries in the LLC data.
There are  2654 fuzzy possible entries in the LLC data.
There are  2477 both possible entries in the LLC data.
There are  134 with fuzzy similar mailing addresses.
There are  2477 possible address LLCs.
Thats  3201 possible properties owned by address LLCs.


### n Letter Owners
A function to find owners  with n number of characters in their name.

In [9]:
def find_n_char_owners(source, output, n):
    out = {}
    data = json.load(open(source))
    count = 0
    for entry in tqdm(data, total=len(data)):
        owner_1_len = len(entry)
        if owner_1_len == n:
            out[entry] = data[entry]
            count += 1
    json.dump(out, open(output, 'w+'))
    print('There are ', count, 'owners with ', n, 'chars in their name.')

In [11]:
find_n_char_owners('./../../data_sets/landlords_and_properties.json', './data/n_char_owners/n_char_owners_3.json', 3)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 429982/429982 [00:00<00:00, 2975138.20it/s]


There are  6 owners with  3 chars in their name.
