# Data Cleaning Tools
These tools are used to clean up the data set.

### Imports
`import json` is used to work with JSON files and objects.

`from tqdm import tqdm` is used to show progress bars for lengthy operations.

`from fuzzywuzzy import fuzz` is used to get the distance between strings for fuzzy string matching.

`from fuzzywuzzy import process` is not in use currently but may be used alongside `fuzz`.

In [1]:
import json
from tqdm import tqdm
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

### Fuzzy String Match on owner_1
This function is used to see whether an owner_1 has any potential duplicates in the data, as some entries are misspelled or truncated differently. To do this efficiently, I used it in tandem with `print_property_addresses_by_owner` (in data tools) and looked a few addresses up in Philly Atlas.

In [2]:
def fuzzy_string_match_on_owner_1(source, string_to_match, threshold=80):
    """Write the owner_1 values that closely resemble ``string_to_match``.

    Loads the JSON at ``source``, takes element ``[0]`` of every item yielded
    by iterating it, scores each one against ``string_to_match`` with
    ``fuzz.ratio`` and keeps those scoring strictly above ``threshold``.
    The surviving ``[owner, score]`` pairs are sorted best-first and dumped to
    ``./data_sets/fuzzy_string_ratios/<string_to_match>.json``.

    Args:
        source: Path of the JSON file to read.
        string_to_match: Owner name to compare every owner_1 against; also
            used verbatim as the output file name.
        threshold: Minimum score (exclusive) an owner must exceed to be
            reported. Defaults to 80, the value that used to be hard-coded.

    Returns:
        None. The result is only written to disk.
    """
    # Context managers make sure the handles are closed (and the output is
    # flushed) even if an exception is raised part-way through.
    with open(source) as source_file:
        data = json.load(source_file)

    print('Getting Owners...')
    # NOTE(review): assumes each item is a sequence whose first element is the
    # owner_1 name -- confirm against the layout of the source file.
    owner_1s = [entry[0] for entry in tqdm(data, total=len(data))]

    print('uwu fuzzy wuzzy at werk uwu....')
    ratios = []
    for owner in tqdm(owner_1s, total=len(owner_1s)):
        score = fuzz.ratio(string_to_match, owner)
        if score > threshold:
            ratios.append([owner, score])
    # Highest similarity first so the likeliest duplicates are at the top.
    ratios.sort(key=lambda pair: pair[1], reverse=True)

    output = './data_sets/fuzzy_string_ratios/' + string_to_match + '.json'
    with open(output, 'w') as output_file:
        json.dump(ratios, output_file)

In [None]:
# Score every owner_1 in unique_landlords.json against 'PHILADELPHIA HOUSING AUTH';
# the matches are written under ./data_sets/fuzzy_string_ratios/.
fuzzy_string_match_on_owner_1('./data_sets/unique_landlords.json', 'PHILADELPHIA HOUSING AUTH')

### Merge Same Owners
Using the results from `fuzzy_string_match_on_owner_1`, you can merge two entries together. Only the property count and the object containing the properties are updated.

In [3]:
def merge_same_owners(source, correct_entry, entry_to_merge):
    """Fold one owner entry into another and rewrite ``source`` in place.

    The JSON at ``source`` is loaded as a mapping of owner name to owner
    object. The ``entry_to_merge`` object is removed, its
    ``"total_properties"`` is added to that of ``correct_entry`` and its
    ``"properties"`` are copied into the ``correct_entry`` object (an existing
    key is overwritten). The merged object is re-inserted under
    ``correct_entry`` and the whole mapping is dumped back to ``source``.

    Args:
        source: Path of the JSON file to read and then overwrite.
        correct_entry: Owner key that survives the merge.
        entry_to_merge: Owner key that is absorbed and removed.

    Returns:
        None. Three property counts are printed and the file is rewritten.

    Raises:
        KeyError: If either key is missing from the data, or both arguments
            name the same key. The file is not modified in that case.
    """
    with open(source) as source_file:
        data = json.load(source_file)

    print('Finding correct owner objects...')
    # Direct key lookups replace a scan over every owner. Both entries are
    # removed here; the merged one is re-inserted below, so it ends up last in
    # the mapping exactly as before.
    correct_owner_object = data.pop(correct_entry)
    to_merge_owner_object = data.pop(entry_to_merge)

    original_count_property = correct_owner_object["total_properties"]
    expected_count_property = (
        original_count_property + to_merge_owner_object["total_properties"]
    )
    correct_owner_object["total_properties"] = expected_count_property

    print('Merging properties...')
    # On a key collision the value from the merged-in entry wins.
    correct_owner_object["properties"].update(to_merge_owner_object["properties"])
    data[correct_entry] = correct_owner_object

    print('Original Property Count:',  original_count_property)
    print('Expected Property Count:', expected_count_property)
    # NOTE(review): this value is the sum computed above, so it always equals
    # the expected count. If "total_properties" is meant to mirror the number
    # of keys in "properties", printing len(...) of that mapping would be a
    # real check -- confirm before changing.
    print('Actual Property Count:', correct_owner_object["total_properties"])

    # Close the handle explicitly so the rewritten data is flushed to disk.
    with open(source, 'w') as source_file:
        json.dump(data, source_file)

In [None]:
# Fold the 'JAMISON ROAD ASSOC' entry into 'JAMISON ROAD ASSOCIATES L';
# landlords_and_properties.json is overwritten with the merged data.
merge_same_owners('./data_sets/landlords_and_properties.json', 'JAMISON ROAD ASSOCIATES L','JAMISON ROAD ASSOC')

### Truncate to Top n
This is used to truncate how many owner_1s are displayed in the results of `get_single_zip_counts()`.

In [4]:
def truncate_to_top_n(source, top_n):
    """Keep only the leading entries of every zip code's counts.

    Loads the JSON at ``source`` as a mapping of zip code to counts, keeps the
    leading entries of each counts object in the order they are stored, and
    writes the result to ``top_<top_n>_in_all_zips.json`` in the current
    working directory.

    Args:
        source: Path of the JSON file to read.
        top_n: Cut-off used for the slice and for the output file name.

    Returns:
        None. The result is only written to disk.
    """
    # Context managers make sure both handles are closed and the output is
    # flushed even if an exception is raised.
    with open(source) as source_file:
        data = json.load(source_file)

    zip_dict = {}
    for zip_code in tqdm(data, total=len(data)):
        counts = data[zip_code]
        # NOTE(review): this slice keeps top_n + 1 entries, not top_n. Kept
        # as-is because the input layout is not visible here (the first entry
        # may be deliberate) -- confirm, and drop the "+ 1" if it is an
        # off-by-one.
        kept = list(counts)[0:top_n + 1]
        zip_dict[zip_code] = {owner: counts[owner] for owner in kept}

    output = 'top_' + str(top_n) + '_in_all_zips.json'
    with open(output, 'w') as output_file:
        json.dump(zip_dict, output_file)

In [None]:
# Reads top_in_zip_full.json and writes the truncated result to top_20_in_all_zips.json.
truncate_to_top_n('top_in_zip_full.json', 20)