TODO: write markdown

In [1]:
import csv, itertools, os, json, traceback
import datetime
import dotenv
import numpy as np 
import pandas as pd
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

In [2]:
# Dotenv
dotenv.load_dotenv()
PROJECT_DIR = os.getenv('PROJECT_DIR')

# Go to proper dir
os.chdir(PROJECT_DIR)

In [35]:
def get_gps_coords_one_line(json_entry):
    # Use GPS co-ords to process the postal address later
    latitude = json_entry['Latitude']
    longitude = json_entry['Longitude']

    return {
        'latitude': latitude, 
        'longitude': longitude
    }


def get_datetime_info_one_line(json_entry):
    # Process the datetimes of the image
    # Not all will have values for either value
    # Timezone causes failure and dont have time to process in multiple formats to replace before strip
    try:
        dto = datetime.datetime.strptime(json_entry['EXIF:DateTimeOriginal'], '%Y:%m:%d %H:%M:%S')
    except KeyError as e:
        dto = None
    except ValueError as e:
        dto = None

    try:
        mod_date = datetime.datetime.strptime(json_entry['EXIF:ModifyDate'], '%Y:%m:%d %H:%M:%S')
    except KeyError as e:
        mod_date = None
    except ValueError as e:
        mod_date = None

    # Process the time between when image was captured and when image was last modified
    # Might need to test this
    print(dto)
    if dto is not None and mod_date is not None:
        time_delta = mod_date - dto
    else:
        time_delta = None
    
    return {
        'date_time_original': dto, 
        'modified_date': mod_date, 
        'time_delta': time_delta
    }


def get_camera_info_one_line(json_entry):
    #
    try:
        camera_make = json_entry['EXIF:Make']
    except KeyError as e:
        camera_make = None
    try:
        camera_model = json_entry['EXIF:Model']
    except KeyError as e:
        camera_model = None
    try:
        software = json_entry['EXIF:Software']
    except KeyError as e:
        software = None
    try:
        shutter_speed = json_entry['EXIF:ShutterSpeedValue']
    except KeyError as e:
        shutter_speed = None
    try:
        flash = json_entry['EXIF:Flash']
    except KeyError as e:
        flash = None
    try:
        focal_length = json_entry['EXIF:FocalLength']
    except KeyError as e:
        focal_length = None
    try:
        lens_model = json_entry['EXIF:LensModel']
    except KeyError as e:
        lens_model = None
    
    return {
        'camera_make': camera_make,
        'camera_model': camera_model, 
        'software': software,
        'shutter_speed': shutter_speed, 
        'flash': flash, 
        'focal_length': focal_length, 
        'lens_model': lens_model
    }


def get_data_one_line(json_entry):
    # Identifies an image more accurately than the filename across different files
    data = {'image_id': json_entry['Image_ID']}
    # img_id = json_entry['Image_ID']
    # latitude, longitude = get_gps_coords_one_line(json_entry)
    data.update(get_gps_coords_one_line(json_entry))
    data.update(get_datetime_info_one_line(json_entry))
    # date_time_origial, mod_date, time_delta = get_datetime_info_one_line(json_entry)
    # This is against PEP8 but oh well!
    data.update(get_camera_info_one_line(json_entry))
    return data


def get_data(INFILE):
    data = []
    with open(INFILE, 'r') as in_file:
        rows = json.load(in_file)
        for row in rows:
            data.append(get_data_one_line(row))
    return data
    

In [4]:
# Geolocator stuff
geolocator = Nominatim(user_agent="test_app")
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1) # Delay needed or else will run into problems

In [14]:
INFILE = f'{PROJECT_DIR}/exif_data/exif_data_11_29_first_1000.json'
OUTFILE_ADDRESSES = './output_addresses.csv'

In [44]:
OUTFILE_CAMERA_INFO = f'{PROJECT_DIR}/output_camera_info.csv'
OUTFILE_DATETIME_INFO = f'{PROJECT_DIR}/output_datetime_info.csv'
camera_info_headers = ['image_id', 'camera_make', 'camera_model', 'software', 'shutter_speed', 'flash', 
                       'focal_length', 'lens_model']
datetime_info_headers = ['image_id', 'date_time_original', 'modified_date', 'time_delta']

def write_camera_data(data):
    if os.path.isfile(OUTFILE_CAMERA_INFO):
        read_mode = 'a'
    else:
        read_mode = 'w'

    with open(OUTFILE_CAMERA_INFO, read_mode) as out_file:
        writer = csv.writer(out_file)
        if read_mode == 'w':
            writer.writerow(camera_info_headers)
            
        for entry in data:
            values = [entry.get(header) for header in camera_info_headers]
            writer.writerow(values)


def write_datetime_data(data):
    if os.path.isfile(OUTFILE_DATETIME_INFO):
        read_mode = 'a'
    else:
        read_mode = 'w'

    with open(OUTFILE_DATETIME_INFO, read_mode) as out_file:
        writer = csv.writer(out_file)
        if read_mode == 'w':
            writer.writerow(datetime_info_headers)
            
        for entry in data:
            values = [entry.get(header) for header in datetime_info_headers]
            writer.writerow(values)


In [45]:
data = get_data(INFILE)
write_camera_data(data)
write_datetime_data(data)

2016-05-24 08:43:59
2015-10-28 06:58:48
None
2013-07-13 01:12:58
2013-01-26 07:59:13
2014-10-25 21:10:16
2013-08-24 05:50:51
2016-09-03 03:59:47
2014-05-25 04:59:00
None
2012-04-19 05:58:36
2015-10-28 06:54:01
2013-08-17 23:39:55
2014-05-25 04:58:45
None
2016-06-06 01:57:13
2013-10-19 02:46:01
2014-06-01 05:15:25
2013-08-24 06:06:08
2014-08-24 07:04:43
None
2009-05-16 16:54:36
2013-06-01 00:18:40
2014-05-30 23:42:16
2014-10-25 20:21:43
None
2015-05-31 15:46:34
2016-07-07 07:18:30
2016-07-07 06:53:25
None
2018-07-31 13:13:19
2013-08-24 06:13:13
2014-03-04 00:22:31
2010-09-22 00:46:34
None
None
2014-10-25 05:38:48
2016-04-23 23:03:03
2012-06-22 10:01:00
2014-05-30 23:23:33
2016-07-14 18:01:57
2013-12-31 04:59:01
None
2013-06-22 23:36:52
None
2016-07-14 19:05:03
2013-08-24 06:03:56
2013-01-26 07:29:34
2013-08-24 05:50:51
2014-03-03 06:13:45
2014-12-21 08:17:38
None
None
2016-05-29 01:42:39
2011-09-26 13:54:59
None
2017-05-01 15:56:23
None
2015-08-08 12:39:39
2015-06-15 13:09:53
2014-02-07

In [46]:
dates_data = pd.read_csv(OUTFILE_DATETIME_INFO)
print(dates_data)

        image_id   date_time_original        modified_date          time_delta
0    27653319031  2016-05-24 08:43:59  2016-06-17 08:30:16   23 days, 23:46:17
1    22697429926  2015-10-28 06:58:48  2015-11-03 10:05:15     6 days, 3:06:27
2     6650348617                  NaN                  NaN                 NaN
3    12337695235  2013-07-13 01:12:58  2014-02-06 18:52:24  208 days, 17:39:26
4     8432503596  2013-01-26 07:59:13  2013-01-31 21:08:24    5 days, 13:09:11
..           ...                  ...                  ...                 ...
994   7725570652                  NaN                  NaN                 NaN
995   8371944547  2011-07-24 12:29:18  2013-01-12 11:15:54  537 days, 22:46:36
996  31368823864  2016-06-21 14:25:21  2016-06-21 14:25:21             0:00:00
997  24603827750                  NaN                  NaN                 NaN
998  16031695953  2014-04-09 08:18:32  2014-04-09 08:18:32             0:00:00

[999 rows x 4 columns]


In [48]:
camera_data = pd.read_csv(OUTFILE_CAMERA_INFO)
print(camera_data)

        image_id        camera_make           camera_model  \
0    27653319031              Canon  Canon EOS 5D Mark III   
1    22697429926  NIKON CORPORATION             NIKON D800   
2     6650348617                NaN                    NaN   
3    12337695235  NIKON CORPORATION             NIKON D800   
4     8432503596  NIKON CORPORATION             NIKON D800   
..           ...                ...                    ...   
994   7725570652                NaN                    NaN   
995   8371944547  NIKON CORPORATION              NIKON D90   
996  31368823864  NIKON CORPORATION            NIKON D3300   
997  24603827750                NaN                    NaN   
998  16031695953  NIKON CORPORATION            NIKON D5100   

                                        software  shutter_speed  flash  \
0    Adobe Photoshop Lightroom 6.5.1 (Macintosh)       0.001000   16.0   
1    Adobe Photoshop Lightroom 6.2.1 (Macintosh)       0.002500   16.0   
2                                

In [6]:
csv_headers = ['amenity', 'building', 'tourism', 'place', 'house_number', 'emergency', 'leisure', 'quarter',
              'highway', 'historic', 'man_made', 'natural', 'road', 'farm', 'isolated_dwelling', 'neighbourhood', 
              'residential', 'suburb', 'city_district', 'town', 'locality', 'city', 'hamlet', 'village', 
              'municipality', 'county', 'state_district', 'district', 'province', 'state', 'region', 'postcode', 'country', 
              'country_code']
empty_address_dict = {}
for key in csv_headers:
    empty_address_dict[key] = None

illegal_country_codes = ['eg'] # Arabic scripts not supported
# missing_from_illegal_country_codes = []

# TODO: test this still works with a SMALLER file. the 1000 lie one takes about 10 mins
def write_address_data(data):
    ctr = 0
    if os.path.isfile(OUTFILE_ADDRESSES):
        read_mode = 'w'
    else:
        read_mode = 'a'

    with open(OUTFILE_ADDRESSES, read_mode) as out_file:
        writer = csv.writer(out_file)
        if read_mode == 'w':
            header_row = itertools.chain(['image_id'], csv_headers)
            writer.writerow(header_row)

        for row in data:
            img_id = row['image_id']
            try: 
                # This is where most of the time running this will be taken
                address_data = geolocator.reverse((row['latitude'], row['longitude'])).raw['address']
                this_address_dict = empty_address_dict.copy()
                # Some images don't have country codes
                try:
                    country_code = address_data['country_code']
                except KeyError as key_exc:
                    country_code = None

                if country_code not in illegal_country_codes:
                    # There's probably a better way to do this, but I don't care
                    for key, value in address_data.items():
                        this_address_dict[key] = value
                    out_row = itertools.chain(img_id, this_address_dict.values())
                    print(f"Wrote row img {img_id} to #{ctr}")
                    writer.writerow(out_row)
                else:
                    print(f'Skipping write for image with id,country code {img_id},{country_code}')
                    # missing_from_illegal_country_codes.append(img_id)
            except Exception as e:
                print(f'Write failed for image with id {img_id}')

            ctr+=1

In [None]:
# TODO: test this, see above
write_address_data(data)

In [6]:
# Here are all the unique keys from geolocator.reverse(s).raw:
#
# place_id
# licence
# osm_type
# osm_id
# lat
# lon
# display_name
# address
# boundingbox

In [49]:
# TODO: once the write_address_data func is tested, this cell can be deleted

# illegal_country_codes = ['eg']
# missing_from_illegal_country_codes = []
# # TODO make this conditional on file existing
# if True:
#     mode = 'w'

# EXIF_DATA_FILE = f'{PROJECT_DIR}/exif_data/exif_data_11_29_first_1000.json'
# OUTPUT_FILE = f'{PROJECT_DIR}/output.csv'
# ctr = 0
# with open(EXIF_DATA_FILE, 'r') as in_file:
#     with open(OUTPUT_FILE, mode) as out_file:
#         writer = csv.writer(out_file)
#         if mode == 'w':
#                 header_row = itertools.chain(['image_id'], csv_headers)
#                 writer.writerow(header_row)  

#         rows = json.load(in_file)
#         for row in rows:
            
#             img_id = row['Image_ID']
#             latitude = row['Latitude']
#             longitude = row['Longitude']
              
#             try:
#                 address_data = geolocator.reverse((latitude, longitude)).raw['address']
#                 this_address_dict = address_dict.copy()
#                 # Some images don't have country codes
#                 try:
#                     country_code = address_data['country_code']
#                 except KeyError as key_exc:
#                     country_code = None

#                 if country_code not in illegal_country_codes:
#                     # There's probably a better way to do this, but I don't care
#                     for key, value in address_data.items():
#                         this_address_dict[key] = value
#                     out_row = itertools.chain([img_id], this_address_dict.values())
#                     print(f"Wrote row img {img_id} to #{ctr}")
#                     writer.writerow(out_row)
#                 else:
#                     print(f'Skipping write for image with id,country code {img_id},{country_code}')
#                     missing_from_illegal_country_codes.append(img_id)
#             except Exception as e:
#                 print(f'Write failed for image with id {img_id}')
#                 # traceback.print_exc()
#                 # exit()

#             ctr+=1

In [9]:
# DO NOT DELETE THIS

# amenity
# building
# tourism
# place (before house_number)
# house_number
# emergency, leisure
# quarter
# man_made, highway, historic, natural (only indication is before road)
# highway (only indicatio is before road)
# natural (only indicator is before road)
# historic comes before road
# road
# farm (after road, before city_district)
# isolated_dwelling (between road and city)
# neighbourhood
# residential (before suburb)
# suburb
# city_district (before town, after suburb)
# town
# locality
# city
# hamlet
# village (sometimes before city, sometimes after)
# municipality
# county
# state_district
# province (before state or region)
# state
# region (after state_district, state - before postcode)
# postcode
# country
# # country_code

# csv_headers = ['amenity', 'building', 'tourism', 'place', 'house_number', 'emergency', 'leisure', 'quarter',
#               'highway', 'historic', 'man_made', 'natural', 'road', 'farm', 'isolated_dwelling', 'neighbourhood', 
#               'residential', 'suburb', 'city_district', 'town', 'locality', 'city', 'hamlet', 'village', 
#               'municipality', 'county', 'state_district', 'district', 'province', 'state', 'region', 'postcode', 'country', 
#               'country_code']