TODO: write markdown

In [1]:
import csv, itertools, os, json, traceback
import datetime
import dotenv
import numpy as np
import pandas as pd
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

In [2]:
# Dotenv: load settings from a local .env file into the process environment
dotenv.load_dotenv()
PROJECT_DIR = os.getenv('PROJECT_DIR')

# Fail early with an actionable message; without this check an unset variable
# reaches os.chdir(None) and surfaces as an opaque TypeError.
if not PROJECT_DIR:
    raise RuntimeError(
        'PROJECT_DIR is not set; define it in the environment or in a .env file'
    )

# Go to proper dir so that relative paths used later resolve against the project
os.chdir(PROJECT_DIR)

In [3]:
def get_gps_coords_one_line(json_entry):
    """Pull the GPS coordinates out of one EXIF record.

    The coordinates are used later to look up the postal address.

    Returns a dict with the keys 'latitude' and 'longitude'.
    Raises KeyError when the record lacks 'Latitude' or 'Longitude'.
    """
    coords = {}
    for out_key, exif_key in (('latitude', 'Latitude'), ('longitude', 'Longitude')):
        coords[out_key] = json_entry[exif_key]
    return coords


def _parse_exif_datetime(json_entry, exif_key, fmt='%Y:%m:%d %H:%M:%S'):
    """Return the timestamp stored under *exif_key* as a naive datetime, or None.

    None is returned when the tag is absent (KeyError), when the text does not
    match *fmt* exactly (ValueError) or when the value is not a string at all,
    e.g. a JSON null (TypeError).
    """
    try:
        return datetime.datetime.strptime(json_entry[exif_key], fmt)
    except (KeyError, ValueError, TypeError):
        return None


def get_datetime_info_one_line(json_entry):
    """Extract the capture and modification timestamps of one EXIF record.

    Not every record has both tags. Values that do not match the plain
    'YYYY:MM:DD HH:MM:SS' layout (for example ones carrying a timezone
    suffix) are not parsed and are reported as None.

    Returns a dict with:
        'date_time_original': datetime parsed from 'EXIF:DateTimeOriginal', or None
        'modified_date': datetime parsed from 'EXIF:ModifyDate', or None
        'time_delta': modified_date - date_time_original, or None unless
            both timestamps were parsed
    """
    dto = _parse_exif_datetime(json_entry, 'EXIF:DateTimeOriginal')
    mod_date = _parse_exif_datetime(json_entry, 'EXIF:ModifyDate')

    # Time between when the image was captured and when it was last modified;
    # only meaningful when both ends are known.
    if dto is not None and mod_date is not None:
        time_delta = mod_date - dto
    else:
        time_delta = None

    return {
        'date_time_original': dto,
        'modified_date': mod_date,
        'time_delta': time_delta,
    }


def get_camera_info_one_line(json_entry):
    """Collect the camera-related EXIF tags of one record.

    Every tag is optional: a tag that is missing from the record is reported
    as None under its output key.
    """
    # (output key, EXIF tag) pairs, in the order the output dict is built
    exif_tag_by_field = (
        ('camera_make', 'EXIF:Make'),
        ('camera_model', 'EXIF:Model'),
        ('software', 'EXIF:Software'),
        ('shutter_speed', 'EXIF:ShutterSpeedValue'),
        ('flash', 'EXIF:Flash'),
        ('focal_length', 'EXIF:FocalLength'),
        ('lens_model', 'EXIF:LensModel'),
    )
    return {field: json_entry.get(tag) for field, tag in exif_tag_by_field}


def get_data_one_line(json_entry):
    """Flatten one EXIF record into a single dict.

    The result holds the image id followed by the GPS, datetime and camera
    fields produced by the three per-record extractors.
    Raises KeyError when the record has no 'Image_ID' (or no GPS coordinates).
    """
    return {
        # Identifies an image more accurately than the filename across different files
        'image_id': json_entry['Image_ID'],
        **get_gps_coords_one_line(json_entry),
        **get_datetime_info_one_line(json_entry),
        **get_camera_info_one_line(json_entry),
    }


def get_data(INFILE):
    """Load the JSON file at INFILE and return one flattened dict per record in it."""
    with open(INFILE, 'r') as exif_file:
        return [get_data_one_line(record) for record in json.load(exif_file)]
    

In [4]:
# Geolocator stuff
# Nominatim client; the address lookup further down goes through this object.
geolocator = Nominatim(user_agent="test_app")
# Forward-geocoding callable wrapped in a rate limiter (min_delay_seconds=1).
# NOTE(review): `geocode` is not referenced anywhere else in the visible code; the address step uses reverse geocoding.
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1) # Delay needed or else will run into problems

In [5]:
# EXIF JSON input file, located under the project directory.
INFILE = f'{PROJECT_DIR}/exif_data/exif_data.json'
# Address CSV output; the path is relative to the current working directory,
# which the setup cell changes to PROJECT_DIR.
OUTFILE_ADDRESSES = './output_addresses.csv'

In [6]:
# CSV files the camera and datetime columns are written to.
OUTFILE_CAMERA_INFO = f'{PROJECT_DIR}/output_camera_info.csv'
OUTFILE_DATETIME_INFO = f'{PROJECT_DIR}/output_datetime_info.csv'
# Column names (and column order) of the two CSV files; each name is also the
# key looked up in every data entry when a row is written.
camera_info_headers = ['image_id', 'camera_make', 'camera_model', 'software', 'shutter_speed', 'flash', 
                       'focal_length', 'lens_model']
datetime_info_headers = ['image_id', 'date_time_original', 'modified_date', 'time_delta']

def write_camera_data(data):
    """Write the camera columns of every entry in *data* to OUTFILE_CAMERA_INFO.

    When the file already exists the rows are appended to it; otherwise the
    file is created and starts with the camera_info_headers row. A key missing
    from an entry is written as an empty cell.

    data: iterable of dicts keyed by the names in camera_info_headers.
    """
    # Only a brand-new file needs the header row.
    file_mode = 'a' if os.path.isfile(OUTFILE_CAMERA_INFO) else 'w'

    # newline='' leaves line endings to the csv module (no blank rows on
    # Windows); UTF-8 matches the default encoding pandas uses to read it back.
    with open(OUTFILE_CAMERA_INFO, file_mode, newline='', encoding='utf-8') as out_file:
        writer = csv.writer(out_file)
        if file_mode == 'w':
            writer.writerow(camera_info_headers)

        writer.writerows(
            [entry.get(header) for header in camera_info_headers]
            for entry in data
        )


def write_datetime_data(data):
    """Write the datetime columns of every entry in *data* to OUTFILE_DATETIME_INFO.

    When the file already exists the rows are appended to it; otherwise the
    file is created and starts with the datetime_info_headers row. A key
    missing from an entry is written as an empty cell.

    data: iterable of dicts keyed by the names in datetime_info_headers.
    """
    # Only a brand-new file needs the header row.
    file_mode = 'a' if os.path.isfile(OUTFILE_DATETIME_INFO) else 'w'

    # newline='' leaves line endings to the csv module (no blank rows on
    # Windows); UTF-8 matches the default encoding pandas uses to read it back.
    with open(OUTFILE_DATETIME_INFO, file_mode, newline='', encoding='utf-8') as out_file:
        writer = csv.writer(out_file)
        if file_mode == 'w':
            writer.writerow(datetime_info_headers)

        writer.writerows(
            [entry.get(header) for header in datetime_info_headers]
            for entry in data
        )


In [7]:
# Parse the EXIF JSON once, then persist the camera and datetime columns.
# Re-running this cell adds the same rows again, because both writers append
# when their output file already exists.
data = get_data(INFILE)
write_camera_data(data)
write_datetime_data(data)

In [8]:
# Read the datetime CSV written above back in as a DataFrame.
dates_data = pd.read_csv(OUTFILE_DATETIME_INFO)
# print(dates_data)

In [9]:
# Read the camera CSV written above back in as a DataFrame.
camera_data = pd.read_csv(OUTFILE_CAMERA_INFO)
# print(camera_data)

In [10]:
# Address components written to the address CSV, in column order.
csv_headers = [
    'amenity', 'building', 'tourism', 'place', 'house_number', 'emergency',
    'leisure', 'quarter', 'highway', 'historic', 'man_made', 'natural',
    'road', 'farm', 'isolated_dwelling', 'neighbourhood', 'residential',
    'suburb', 'city_district', 'town', 'locality', 'city', 'hamlet',
    'village', 'municipality', 'county', 'state_district', 'district',
    'province', 'state', 'region', 'postcode', 'country', 'country_code',
]
# Template address row: every known component present, all values None.
empty_address_dict = dict.fromkeys(csv_headers)

illegal_country_codes = ['eg'] # Arabic scripts not supported
# missing_from_illegal_country_codes = []

# TODO: test that this still works with a SMALLER file. The 1000-line one takes about 10 mins
def write_address_data(data):
    """Reverse-geocode every entry in *data* and write its address to OUTFILE_ADDRESSES.

    When the file already exists the rows are appended to it; otherwise the
    file is created and starts with a header row ('image_id' followed by
    csv_headers). Each row holds the image id and one cell per csv_headers
    column; address components outside csv_headers are not written, so every
    row has exactly as many cells as the header.

    Entries whose country code is listed in illegal_country_codes are skipped.
    A lookup or write that fails for one entry is reported and does not stop
    the remaining entries.

    data: iterable of dicts with the keys 'image_id', 'latitude' and 'longitude'.
    """
    # Only a brand-new file needs the header row; an existing file is appended to.
    file_mode = 'a' if os.path.isfile(OUTFILE_ADDRESSES) else 'w'

    # Keep at least one second between requests to the geocoding service.
    reverse = RateLimiter(geolocator.reverse, min_delay_seconds=1)

    # newline='' leaves line endings to the csv module; UTF-8 so that address
    # text in any script can be encoded.
    with open(OUTFILE_ADDRESSES, file_mode, newline='', encoding='utf-8') as out_file:
        writer = csv.writer(out_file)
        if file_mode == 'w':
            writer.writerow(['image_id', *csv_headers])

        for ctr, row in enumerate(data):
            img_id = row['image_id']
            try:
                # This is where most of the time running this will be taken
                location = reverse((row['latitude'], row['longitude']))
                if location is None:
                    print(f'Write failed for image with id {img_id}: no geocoding result')
                    continue
                address_data = location.raw['address']

                # Some images don't have country codes
                country_code = address_data.get('country_code')
                if country_code in illegal_country_codes:
                    print(f'Skipping write for image with id,country code {img_id},{country_code}')
                    continue

                # The id goes in as a single cell; the remaining cells follow
                # the header order, empty where the component is absent.
                writer.writerow(
                    [img_id, *(address_data.get(header) for header in csv_headers)]
                )
                print(f"Wrote row img {img_id} to #{ctr}")
            except Exception as e:
                # Best effort: report the failure with its cause and move on.
                print(f'Write failed for image with id {img_id}: {e!r}')

In [11]:
# TODO: test this, see above
# Looks up and writes the address of every parsed entry; this is the slow step.
write_address_data(data)

Wrote row img 27653319031 to #0
Wrote row img 22697429926 to #1
Wrote row img 6650348617 to #2
Wrote row img 12337695235 to #3
Wrote row img 8432503596 to #4
Wrote row img 15629040035 to #5
Wrote row img 20610126982 to #6
Wrote row img 29359326241 to #7
Wrote row img 14095127199 to #8
Wrote row img 6219236142 to #9
Wrote row img 8310392852 to #10
Wrote row img 23313968663 to #11
Wrote row img 9534473273 to #12
Wrote row img 14086415848 to #13
Wrote row img 5669373138 to #14
Wrote row img 27534225480 to #15
Wrote row img 10486187245 to #16
Wrote row img 14191645299 to #17
Wrote row img 9587382078 to #18
Wrote row img 15035657071 to #19
Wrote row img 4335488443 to #20
Wrote row img 3888632330 to #21
Wrote row img 13844211075 to #22
Wrote row img 14124294348 to #23
Wrote row img 15720949851 to #24
Wrote row img 6132942800 to #25
Wrote row img 20067543468 to #26
Wrote row img 28069344081 to #27
Wrote row img 28168408525 to #28
Wrote row img 6357276861 to #29
Wrote row img 44645439685 to #3

In [None]:
# Here are all the unique keys from geolocator.reverse(s).raw:
#
# place_id
# licence
# osm_type
# osm_id
# lat
# lon
# display_name
# address
# boundingbox

In [None]:
# TODO: once the write_address_data func is tested, this cell can be deleted

# illegal_country_codes = ['eg']
# missing_from_illegal_country_codes = []
# # TODO make this conditional on file existing
# if True:
#     mode = 'w'

# EXIF_DATA_FILE = f'{PROJECT_DIR}/exif_data/exif_data_11_29_first_1000.json'
# OUTPUT_FILE = f'{PROJECT_DIR}/output.csv'
# ctr = 0
# with open(EXIF_DATA_FILE, 'r') as in_file:
#     with open(OUTPUT_FILE, mode) as out_file:
#         writer = csv.writer(out_file)
#         if mode == 'w':
#                 header_row = itertools.chain(['image_id'], csv_headers)
#                 writer.writerow(header_row)  

#         rows = json.load(in_file)
#         for row in rows:
            
#             img_id = row['Image_ID']
#             latitude = row['Latitude']
#             longitude = row['Longitude']
              
#             try:
#                 address_data = geolocator.reverse((latitude, longitude)).raw['address']
#                 this_address_dict = address_dict.copy()
#                 # Some images don't have country codes
#                 try:
#                     country_code = address_data['country_code']
#                 except KeyError as key_exc:
#                     country_code = None

#                 if country_code not in illegal_country_codes:
#                     # There's probably a better way to do this, but I don't care
#                     for key, value in address_data.items():
#                         this_address_dict[key] = value
#                     out_row = itertools.chain([img_id], this_address_dict.values())
#                     print(f"Wrote row img {img_id} to #{ctr}")
#                     writer.writerow(out_row)
#                 else:
#                     print(f'Skipping write for image with id,country code {img_id},{country_code}')
#                     missing_from_illegal_country_codes.append(img_id)
#             except Exception as e:
#                 print(f'Write failed for image with id {img_id}')
#                 # traceback.print_exc()
#                 # exit()

#             ctr+=1

In [None]:
# DO NOT DELETE THIS

# amenity
# building
# tourism
# place (before house_number)
# house_number
# emergency, leisure
# quarter
# man_made, highway, historic, natural (only indication is before road)
# highway (only indication is before road)
# natural (only indicator is before road)
# historic comes before road
# road
# farm (after road, before city_district)
# isolated_dwelling (between road and city)
# neighbourhood
# residential (before suburb)
# suburb
# city_district (before town, after suburb)
# town
# locality
# city
# hamlet
# village (sometimes before city, sometimes after)
# municipality
# county
# state_district
# province (before state or region)
# state
# region (after state_district, state - before postcode)
# postcode
# country
# # country_code

# csv_headers = ['amenity', 'building', 'tourism', 'place', 'house_number', 'emergency', 'leisure', 'quarter',
#               'highway', 'historic', 'man_made', 'natural', 'road', 'farm', 'isolated_dwelling', 'neighbourhood', 
#               'residential', 'suburb', 'city_district', 'town', 'locality', 'city', 'hamlet', 'village', 
#               'municipality', 'county', 'state_district', 'district', 'province', 'state', 'region', 'postcode', 'country', 
#               'country_code']