# Quiz 1

In [15]:
"""
In this problem set you work with cities infobox data, audit it, come up with a
cleaning idea and then clean it up. In the first exercise we want you to audit
the datatypes that can be found in some particular fields in the dataset.
The possible types of values can be:
- NoneType if the value is a string "NULL" or an empty string ""
- list, if the value starts with "{"
- int, if the value can be cast to int
- float, if the value can be cast to float, but CANNOT be cast to int.
   For example, '3.23e+07' should be considered a float because it can be cast
   as float but int('3.23e+07') will throw a ValueError
- 'str', for all other values

The audit_file function should return a dictionary containing fieldnames and a 
SET of the types that can be found in the field. e.g.
{"field1": set([type(float()), type(int()), type(str())]),
 "field2": set([type(str())]),
  ....
}
The type() function returns a type object describing the argument given to the 
function. You can also use examples of objects to create type objects, e.g.
type(1.1) for a float: see the test function below for examples.

Note that the first three rows (after the header row) in the cities.csv file
are not actual data points. The contents of these rows should not be included
when processing data types. Be sure to include functionality in your code to
skip over or detect these rows.
"""
import codecs
import csv
import json
import pprint
from collections import defaultdict

CITIES = 'data/cities.csv'

FIELDS = ["name", "timeZone_label", "utcOffset", "homepage", "governmentType_label",
          "isPartOf_label", "areaCode", "populationTotal", "elevation",
          "maximumElevation", "minimumElevation", "populationDensity",
          "wgs84_pos#lat", "wgs84_pos#long", "areaLand", "areaMetro", "areaUrban"]

In [33]:
# Preview the header and the next nine raw rows, each followed by a 100-dash
# rule and a blank line, to see what the file looks like before auditing it.
with open(CITIES, 'r') as file:
    reader = csv.reader(file)
    for i in range(10):
        row = next(reader)
        print(row, '-' * 100, '', sep='\n')

['URI', 'rdf-schema#label', 'rdf-schema#comment', 'administrativeDistrict_label', 'administrativeDistrict', 'anthem_label', 'anthem', 'area', 'areaCode', 'areaLand', 'areaMetro', 'areaRural', 'areaTotal', 'areaUrban', 'areaWater', 'city_label', 'city', 'code', 'country_label', 'country', 'daylightSavingTimeZone_label', 'daylightSavingTimeZone', 'district_label', 'district', 'division_label', 'division', 'elevation', 'federalState_label', 'federalState', 'foundingDate', 'foundingPerson_label', 'foundingPerson', 'foundingYear', 'governingBody_label', 'governingBody', 'government_label', 'government', 'governmentType_label', 'governmentType', 'isPartOf_label', 'isPartOf', 'isoCodeRegion_label', 'isoCodeRegion', 'leader_label', 'leader', 'leaderName_label', 'leaderName', 'leaderParty_label', 'leaderParty', 'leaderTitle', 'location_label', 'location', 'maximumElevation', 'mayor_label', 'mayor', 'minimumElevation', 'motto', 'municipality_label', 'municipality', 'part_label', 'part', 'percent

In [40]:
# Audit which Python type each raw value of the FIELDS columns maps to.
# fieldtypes ends up as {field name: set of type objects seen in that column}.
fieldtypes = defaultdict(set)

# newline='' is what the csv module asks for, so that line breaks inside
# quoted fields reach the parser unmodified.
with open(CITIES, 'r', newline='') as file:
    reader = csv.DictReader(file)
    # The three rows after the header are not data points; discard them.
    for _ in range(3):
        next(reader)
    for line in reader:
        for f in FIELDS:
            value = line[f]
            if value == "NULL" or value == "":
                fieldtype = type(None)
            elif value.startswith('{'):
                fieldtype = list
            else:
                # int is tried first, so float is only recorded for values
                # that cannot be parsed as int (e.g. '3.23e+07').
                try:
                    int(value)
                    fieldtype = int
                except ValueError:
                    try:
                        float(value)
                        fieldtype = float
                    except ValueError:
                        fieldtype = str
            fieldtypes[f].add(fieldtype)
# Convert to a plain dict so that looking up an unknown field raises KeyError
# instead of silently creating an empty entry.
fieldtypes = dict(fieldtypes)

In [26]:
# Display the mapping of field name to the set of types found.
fieldtypes

{'name': {NoneType, list, str},
 'timeZone_label': {NoneType, list, str},
 'utcOffset': {NoneType, float, int, list, str},
 'homepage': {NoneType, list, str},
 'governmentType_label': {NoneType, list, str},
 'isPartOf_label': {NoneType, list, str},
 'areaCode': {NoneType, int, list, str},
 'populationTotal': {NoneType, int, list, str},
 'elevation': {NoneType, float, list, str},
 'maximumElevation': {NoneType, float, list, str},
 'minimumElevation': {NoneType, float, str},
 'populationDensity': {NoneType, float, list, str},
 'wgs84_pos#lat': {NoneType, float, list, str},
 'wgs84_pos#long': {NoneType, float, list, str},
 'areaLand': {NoneType, float, list, str},
 'areaMetro': {NoneType, float, list, str},
 'areaUrban': {NoneType, float, list, str}}

In [80]:
def _value_type(value):
    """Return the type object that one raw CSV value is audited as."""
    # DictReader fills missing trailing columns with None; treat that like
    # the explicit "NULL" / "" markers.
    if value is None or value == "NULL" or value == "":
        return type(None)
    if value.startswith('{'):
        return list
    # int is tried before float, so float is only reported for values that
    # cannot be parsed as int (e.g. '3.23e+07').
    for numeric_type in (int, float):
        try:
            numeric_type(value)
        except ValueError:
            continue
        return numeric_type
    return str


def audit_file(filename, fields):
    """Collect the set of value types found in selected columns of a CSV file.

    The first three rows after the header are skipped because they are not
    data points. Each remaining value is classified as:

    - NoneType if it is "NULL", an empty string, or missing from a short row
    - list if it starts with "{"
    - int if it can be cast to int
    - float if it can be cast to float but not to int
    - str otherwise

    Args:
        filename: Path of the CSV file to audit.
        fields: Iterable of column names to inspect.

    Returns:
        A dict mapping each field name to the set of type objects found in
        that column. The dict is empty when the file has no data rows.

    Raises:
        KeyError: If a requested field is not a column of the file.
    """
    fieldtypes = defaultdict(set)

    # newline='' is what the csv module asks for, so that line breaks inside
    # quoted fields reach the parser unmodified.
    with open(filename, 'r', newline='') as file:
        reader = csv.DictReader(file)
        # Skip the three non-data rows; the default keeps a file that is
        # shorter than that from raising StopIteration.
        for _ in range(3):
            next(reader, None)

        for line in reader:
            for f in fields:
                fieldtypes[f].add(_value_type(line[f]))

    return dict(fieldtypes)

In [81]:
def test():
    """Audit the cities file, print the result and spot-check two area fields."""
    fieldtypes = audit_file(CITIES, FIELDS)

    pprint.pprint(fieldtypes)

    none_type = type(None)
    assert fieldtypes["areaLand"] == {float, list, none_type}
    assert fieldtypes['areaMetro'] == {float, none_type}
    
test()

{'areaCode': {<class 'NoneType'>, <class 'str'>, <class 'int'>},
 'areaLand': {<class 'NoneType'>, <class 'float'>, <class 'list'>},
 'areaMetro': {<class 'NoneType'>, <class 'float'>},
 'areaUrban': {<class 'NoneType'>, <class 'float'>},
 'elevation': {<class 'NoneType'>, <class 'list'>, <class 'float'>},
 'governmentType_label': {<class 'NoneType'>, <class 'str'>},
 'homepage': {<class 'NoneType'>, <class 'str'>},
 'isPartOf_label': {<class 'NoneType'>, <class 'str'>, <class 'list'>},
 'maximumElevation': {<class 'NoneType'>},
 'minimumElevation': {<class 'NoneType'>},
 'name': {<class 'NoneType'>, <class 'str'>, <class 'list'>},
 'populationDensity': {<class 'NoneType'>, <class 'float'>, <class 'list'>},
 'populationTotal': {<class 'NoneType'>, <class 'int'>},
 'timeZone_label': {<class 'NoneType'>, <class 'str'>},
 'utcOffset': {<class 'list'>,
               <class 'int'>,
               <class 'NoneType'>,
               <class 'str'>},
 'wgs84_pos#lat': {<class 'float'>},
 'wgs8

# Quiz 2

In [82]:
# Repeat the type audit of the FIELDS columns, this time also printing every
# brace-delimited ("list") value of 'areaLand' so the alternatives it holds can
# be inspected.
fieldtypes = defaultdict(set)

# newline='' is what the csv module asks for, so that line breaks inside
# quoted fields reach the parser unmodified.
with open(CITIES, 'r', newline='') as file:
    reader = csv.DictReader(file)
    # The three rows after the header are not data points; discard them.
    for _ in range(3):
        next(reader)
    for line in reader:
        for f in FIELDS:
            value = line[f]
            if value == "NULL" or value == "":
                fieldtype = type(None)
            elif value.startswith('{'):
                if f == 'areaLand':
                    print(value)
                fieldtype = list
            else:
                # int is tried first, so float is only recorded for values
                # that cannot be parsed as int.
                try:
                    int(value)
                    fieldtype = int
                except ValueError:
                    try:
                        float(value)
                        fieldtype = float
                    except ValueError:
                        fieldtype = str
            fieldtypes[f].add(fieldtype)
# Convert to a plain dict so that looking up an unknown field raises KeyError.
fieldtypes = dict(fieldtypes)

{1.01787e+08|1.019e+08}
{3.15979e+07|3.17e+07}
{5.51667e+07|5.53e+07}
{6.36e+07|6.37137e+07}
{3.78138e+07|3.79e+07}
{2.02e+07|2.02019e+07}
{2.87489e+08|2.875e+08}
{2.5355e+07|2.5356e+07}
{2.512e+08|2.51229e+08}
{8.25e+07|8.26206e+07}
{4.48e+06|4.48068e+06}
{1.458e+07|1.45816e+07}
{1.71198e+07|1.712e+07}
{2.07e+06|2.07199e+06}
{4.61e+06|4.61018e+06}
{3.26e+06|3.26339e+06}
{9.057e+07|9.05719e+07}
{1.274e+07|1.27427e+07}
{3.136e+07|3.13648e+07}
{5.31466e+07|5.315e+07}
{3.43173e+07|3.432e+07}
{5.33538e+06|5.34e+06}
{8.184e+07|8.18436e+07}
{1.13959e+07|1.14e+07}
{2.056e+07|2.05645e+07}
{9.782e+07|9.78239e+07}
{4.94688e+06|4.95e+06}
{1.20175e+07|1.202e+07}


It would be reasonable to keep the value with the most significant digits.

# Quiz 3

In [83]:
"""
In this problem set you work with cities infobox data, audit it, come up with a
cleaning idea and then clean it up.

Since in the previous quiz you made a decision on which value to keep for the
"areaLand" field, you now know what has to be done.

Finish the function fix_area(). It will receive a string as an input, and it
has to return a float representing the value of the area or None.
You have to change the function fix_area. You can use extra functions if you
like, but changes to process_file will not be taken into account.
The rest of the code is just an example on how this function can be used.
"""
import codecs
import csv
import json
import pprint

CITIES = 'data/cities.csv'

In [84]:
# Sample brace-delimited areaLand value holding two '|'-separated alternatives.
area = '{1.20175e+07|1.202e+07}'

In [85]:
# Drop the surrounding braces and split the alternatives on '|'.
area_list = area.strip('{}').split('|')
area_list

['1.20175e+07', '1.202e+07']

In [86]:
# Take the first alternative as a working example.
area_str = area_list[0]
area_str

'1.20175e+07'

In [87]:
# Mantissa digits: the text before 'e', minus trailing zeros and the '.'.
area_str.split('e')[0].rstrip('0').replace('.', '')

'120175'

In [88]:
# Number of mantissa digits left after dropping trailing zeros and the '.'.
len(area_str.split('e')[0].rstrip('0').replace('.', ''))

6

In [89]:
def num_significant(area_str):
    """Count the significant digits in the mantissa of a numeric string.

    The exponent part (after 'e' or 'E'), a leading sign, the decimal point,
    leading zeros and trailing zeros are not counted. Trailing zeros are
    dropped even when there is no decimal point, so '100' counts as one digit.
    A mantissa that consists only of zeros counts as one digit.

    Args:
        area_str: Text form of a number, e.g. '1.20175e+07'.

    Returns:
        The number of significant digits as an int; 0 when the mantissa
        contains no characters at all.
    """
    # Lower-casing first makes the split work for an upper-case exponent too.
    mantissa = area_str.strip().lower().split('e')[0].lstrip('+-')
    # Trailing zeros go before the '.' is removed, so '10.0' keeps its '10';
    # leading zeros go last, so '0.005' is left with just '5'.
    digits = mantissa.rstrip('0').replace('.', '').lstrip('0')
    if not digits and any(ch.isdigit() for ch in mantissa):
        # The value is zero ('0', '0.0', ...): report one digit so that it
        # can still rank above "no candidate seen yet".
        return 1
    return len(digits)

In [90]:
def fix_area(area):
    """Convert a raw "areaLand" string into a float, or None.

    A plain value is parsed with float(); anything float() rejects (such as
    "NULL" or an empty string) gives None. A value that starts with '{' is
    treated as '|'-separated alternatives: the alternative with the most
    significant digits, as counted by num_significant, is returned. On a tie
    the earlier alternative wins, and alternatives that are not valid floats
    are ignored.

    Args:
        area: The raw string from the data file.

    Returns:
        The area as a float, or None when no usable number is found.
    """
    if area.startswith('{'):
        best_value = None
        best_digits = 0
        for token in area.strip('{}').split('|'):
            try:
                value = float(token)
                digits = num_significant(token)
            except ValueError:
                # Not a number: keep whatever has been selected so far.
                continue
            if digits > best_digits:
                best_value, best_digits = value, digits
        return best_value

    try:
        return float(area)
    except ValueError:
        return None

In [91]:
# Try fix_area on the current value of `area`.
fix_area(area)

12017500.0

In [92]:
def process_file(filename):
    """Read the CSV file and run fix_area over every row's "areaLand" value.

    The first three rows after the header are skipped. Rows without an
    "areaLand" key are kept unchanged.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one dict-like row per data line, in file order.
    """
    # CHANGES TO THIS FUNCTION WILL BE IGNORED WHEN YOU SUBMIT THE EXERCISE
    records = []

    with open(filename, "r") as source:
        rows = csv.DictReader(source)

        # skipping the extra metadata
        for _ in range(3):
            next(rows)

        # processing file
        for record in rows:
            # calling your function to fix the area value
            if "areaLand" in record:
                record["areaLand"] = fix_area(record["areaLand"])
            records.append(record)

    return records


def test():
    """Process the cities file, print three sample areas and check known rows."""
    data = process_file(CITIES)

    print("Printing three example results:")
    for n in range(5, 8):
        pprint.pprint(data[n]["areaLand"])

    # None is a singleton, so identity is the right comparison.
    assert data[3]["areaLand"] is None
    assert data[8]["areaLand"] == 55166700.0
    assert data[20]["areaLand"] == 14581600.0
    assert data[33]["areaLand"] == 20564500.0


if __name__ == "__main__":
    test()

Printing three example results:
None
101787000.0
31597900.0


# Quiz 4

In [93]:
# Field names whose audited types are displayed in this quiz.
interesting = ['name', 'populationTotal', 'areaMetro', 'postalCode']

In [94]:
# Re-run the audit with 'postalCode' added to the field list, then show only
# the entries named in `interesting`.
fieldtypes = audit_file(CITIES, [*FIELDS, 'postalCode'])

pprint.pprint({
    field: types
    for field, types in fieldtypes.items()
    if field in interesting
})

{'areaMetro': {<class 'NoneType'>, <class 'float'>},
 'name': {<class 'NoneType'>, <class 'str'>, <class 'list'>},
 'populationTotal': {<class 'NoneType'>, <class 'int'>},
 'postalCode': {<class 'NoneType'>, <class 'str'>, <class 'int'>}}


# Quiz 5

In [96]:
"""
In this problem set you work with cities infobox data, audit it, come up with a
cleaning idea and then clean it up.

In the previous quiz you recognized that the "name" value can be an array (or
list in Python terms). It would make it easier to process and query the data
later if all values for the name are in a Python list, instead of being
just a string separated with special characters, like now.

Finish the function fix_name(). It will receive a string as an input, and it
will return a list of all the names. If there is only one name, the list will
have only one item in it; if the name is "NULL", the list should be empty.
The rest of the code is just an example on how this function can be used.
"""
import codecs
import csv
import pprint

CITIES = 'data/cities.csv'

In [102]:
def fix_name(name):
    """Turn a raw "name" value into a list of names.

    Args:
        name: The raw string from the data file. A value that starts with
            '{' holds several names separated by '|'.

    Returns:
        A list of names: empty for "NULL" or an empty string, one item for a
        plain name, and one item per non-empty alternative for a
        brace-delimited value (so "{}" also gives an empty list).
    """
    # The empty string is handled like "NULL" so that a missing name never
    # turns into [''].
    if name in ('NULL', ''):
        return []
    if name.startswith('{'):
        # Drop empty pieces, which would otherwise appear for "{}" or "{a||b}".
        return [part for part in name.strip('{}').split('|') if part]
    return [name]

In [103]:
def process_file(filename):
    """Read the CSV file and run fix_name over every row's "name" value.

    The first three rows after the header are skipped. Rows without a "name"
    key are kept unchanged.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one dict-like row per data line, in file order.
    """
    records = []
    with open(filename, "r") as source:
        rows = csv.DictReader(source)
        # skipping the extra metadata
        for _ in range(3):
            next(rows)
        # processing file
        for record in rows:
            # calling fix_name to turn the name value into a list
            if "name" in record:
                record["name"] = fix_name(record["name"])
            records.append(record)
    return records


def test():
    """Process the cities file, print the first 20 names and check known rows."""
    data = process_file(CITIES)

    print("Printing 20 results:")
    for n in range(20):
        pprint.pprint(data[n]["name"])

    # Row index -> expected list of names.
    expected_names = {
        14: ['Negtemiut', 'Nightmute'],
        9: ['Pell City Alabama'],
        3: ['Kumhari'],
    }
    for index, names in expected_names.items():
        assert data[index]["name"] == names


if __name__ == "__main__":
    test()

Printing 20 results:
['Kud']
['Kuju']
['Kumbhraj']
['Kumhari']
['Kunigal']
['Kurgunta']
['Athens']
['Demopolis']
['Chelsea Alabama']
['Pell City Alabama']
['City of Northport']
['Sand Point']
['Unalaska Alaska']
['City of Menlo Park']
['Negtemiut', 'Nightmute']
['Fairbanks Alaska']
['Homer']
['Ketchikan Alaska']
['Nuniaq', 'Old Harbor']
['Rainier Washington']


# Quiz 6

In [104]:
"""
In this problem set you work with cities infobox data, audit it, come up with a
cleaning idea and then clean it up.

If you look at the full city data, you will notice that there are a couple of
values that seem to provide the same information in different formats: "point"
seems to be the combination of "wgs84_pos#lat" and "wgs84_pos#long". However,
we do not know if that is the case and should check if they are equivalent.

Finish the function check_loc(). It will receive 3 strings: first, the combined
value of "point" followed by the separate "wgs84_pos#" values. You have to
extract the lat and long values from the "point" argument and compare them to
the "wgs84_pos#" values, returning True or False.

Note that you do not have to fix the values, only determine if they are
consistent. To fix them in this case you would need more information. Feel free
to discuss possible strategies for fixing this on the discussion forum.

The rest of the code is just an example on how this function can be used.
Changes to "process_file" function will not be taken into account for grading.
"""
import csv
import pprint

CITIES = 'data/cities.csv'

In [128]:
def check_loc(point, lat, longi):
    """Check whether a combined "point" value matches separate lat/long values.

    The first two whitespace-separated tokens of `point` are parsed as floats
    and compared for exact equality with float(lat) and float(longi). No
    tolerance is applied, so a "point" that carries more decimals than the
    separate columns is reported as inconsistent.

    Args:
        point: Latitude and longitude in one string, separated by whitespace.
        lat: Latitude as a string.
        longi: Longitude as a string.

    Returns:
        True when both coordinates are equal; False when they differ or when
        any of the inputs is missing or cannot be parsed as a number.
    """
    try:
        # Unpacking raises ValueError when `point` has fewer than two tokens,
        # which the handler below treats the same as an unparseable number.
        point_lat, point_long = (float(part) for part in point.split()[:2])
        return point_lat == float(lat) and point_long == float(longi)
    except (AttributeError, TypeError, ValueError):
        # None (AttributeError / TypeError) or non-numeric text such as
        # "NULL" (ValueError): the values cannot be shown to be consistent.
        return False

In [129]:
# For every data row, show the combined "point" value next to the separate
# latitude/longitude columns together with the check_loc verdict.
with open(CITIES, 'r') as file:
    reader = csv.DictReader(file)
    for i in range(3):
        next(reader)
    template = 'point: {} - lat: {} - longi: {} \n Result: {}\n'
    for row in reader:
        point = row['point']
        lat = row['wgs84_pos#lat']
        longi = row['wgs84_pos#long']
        print(template.format(point, lat, longi, check_loc(point, lat, longi)))

point: 33.08 75.28 - lat: 33.08 - longi: 75.28 
 Result: True

point: 23.72 85.5 - lat: 23.72 - longi: 85.5 
 Result: True

point: 24.37 77.05 - lat: 24.37 - longi: 77.05 
 Result: True

point: 21.27 81.52 - lat: 21.27 - longi: 81.52 
 Result: True

point: 13.02 77.03 - lat: 13.02 - longi: 77.03 
 Result: True

point: 17.2 77.35 - lat: 17.2 - longi: 77.35 
 Result: True

point: 34.789722222222224 -86.96944444444445 - lat: 34.7897 - longi: -86.9694 
 Result: False

point: 32.50944444444445 -87.83722222222222 - lat: 32.5094 - longi: -87.8372 
 Result: False

point: 33.329166666666666 -86.65083333333334 - lat: 33.3292 - longi: -86.6508 
 Result: False

point: 33.57083333333333 -86.27388888888889 - lat: 33.5708 - longi: -86.2739 
 Result: False

point: 33.25388888888889 -87.59222222222222 - lat: 33.2539 - longi: -87.5922 
 Result: False

point: 55.336666666666666 -160.49333333333334 - lat: 55.3367 - longi: -160.493 
 Result: False

point: 53.888888888888886 -166.52722222222224 - lat: 53.88

In [130]:
def process_file(filename):
    """Read the CSV file and report rows whose location values disagree.

    The first three rows after the header are skipped. For every remaining
    row, check_loc compares "point" with "wgs84_pos#lat" / "wgs84_pos#long";
    a line is printed for each row where it returns a falsy result.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one dict-like row per data line, in file order.
    """
    data = []
    # newline='' is what the csv module asks for when opening a file for it.
    with open(filename, "r", newline="") as f:
        reader = csv.DictReader(f)
        # skipping the extra metadata
        for _ in range(3):
            # Python 3 readers have no .next() method; use the built-in.
            next(reader)
        # processing file
        for line in reader:
            # calling your function to check the location
            result = check_loc(line["point"], line["wgs84_pos#lat"], line["wgs84_pos#long"])
            if not result:
                print("{}: {} != {} {}".format(line["name"], line["point"], line["wgs84_pos#lat"], line["wgs84_pos#long"]))
            data.append(line)

    return data


def test():
    """Spot-check check_loc on one matching and one non-matching location."""
    # (arguments for check_loc, expected result)
    cases = [
        (("33.08 75.28", "33.08", "75.28"), True),
        (("44.57833333333333 -91.21833333333333", "44.5783", "-91.2183"), False),
    ]
    for args, expected in cases:
        assert check_loc(*args) == expected


if __name__ == "__main__":
    test()