In [91]:
import pandas as pd
import numpy as np
import datetime

# Input: one epoch timestamp (seconds) per line, one line per captured image.
ALL_TIMESTAMPS_FILE = '../data/raw/image-timestamps.txt'
# Output: the timestamps selected by this notebook (one per day, near noon).
SELECTED_TIMESTAMPS_FILE = '../data/processed/selected-image-timestamps-noon.txt'


# Load every capture timestamp into a single-column frame.
# The final line of the file is erroneous and is dropped via `skipfooter`;
# `skipfooter` is only supported by the slower 'python' parser engine,
# which is why the default 'c' engine is not used here.
timestamps = pd.read_csv(
    ALL_TIMESTAMPS_FILE,
    engine='python',
    skipfooter=1,
    names=['timestamp'],
    dtype={'timestamp': np.int64},
)

In [None]:
# Select, for every calendar day in the capture period, the image timestamp
# closest to local noon (if any image was captured close enough to noon).
#
# Reads:    `timestamps` (single 'timestamp' column of epoch seconds)
# Produces: `timestamps_noon`, a DataFrame with one int64 'timestamp' column

# first and last capture (the file is assumed to be in chronological order)
if timestamps.empty:
    raise ValueError("No timestamps loaded from {}".format(ALL_TIMESTAMPS_FILE))
first_timestamp = int(timestamps['timestamp'].iloc[0])
last_timestamp = int(timestamps['timestamp'].iloc[-1])
print("First timestamp: {}".format(first_timestamp))
print("Last timestamp: {}".format(last_timestamp))

# convert to naive datetimes in the local timezone of this machine
first_day = datetime.datetime.fromtimestamp(first_timestamp)
last_day = datetime.datetime.fromtimestamp(last_timestamp)
print("First day: {}".format(first_day))
print("Last day: {}".format(last_day))

# elapsed time between first and last capture, in whole (floored) days
timedelta = last_day - first_day
total_number_of_days = timedelta.days
print("Total number of days: {}".format(total_number_of_days))

# We want one image per day at noon. Capturing was not scheduled exactly at
# noon (images were taken about every other minute), so for each day we look
# for images within +/- 5 minutes around noon and keep the closest one.
NOON_WINDOW_SECONDS = 5 * 60

# noon on the first and on the last capture day
first_day_noon = first_day.replace(hour=12, minute=0, second=0, microsecond=0)
last_day_noon = last_day.replace(hour=12, minute=0, second=0, microsecond=0)

# Number of calendar days to inspect, INCLUDING the last capture day.
# This is deliberately not `total_number_of_days`: that value is floored
# elapsed time and can be smaller than the number of calendar days touched,
# which previously under-sized the result buffer and skipped the last day.
number_of_calendar_days = (last_day_noon - first_day_noon).days + 1

all_timestamps = timestamps['timestamp']
selected_timestamps = []
for day_offset in range(number_of_calendar_days):
    # Step by calendar day on the naive local datetime so the target stays at
    # 12:00 local time even across a DST change (a fixed 86400 s step drifts).
    noon = first_day_noon + datetime.timedelta(days=day_offset)
    timestamp = int(noon.timestamp())

    # candidate images inside the window around noon (bounds inclusive)
    start = timestamp - NOON_WINDOW_SECONDS
    end = timestamp + NOON_WINDOW_SECONDS
    time_frame = all_timestamps[all_timestamps.between(start, end)]

    # no image within this window: continue with the next day
    if time_frame.empty:
        continue

    # keep the candidate closest to noon (the first one wins on a tie)
    distance_to_noon = time_frame.sub(timestamp).abs().to_numpy()
    selected_timestamps.append(int(time_frame.iloc[distance_to_noon.argmin()]))

# wrap in a data frame for convenient csv export
timestamps_noon = pd.DataFrame(
    {'timestamp': pd.Series(selected_timestamps, dtype=np.int64)})
print("Could find images at noon for {} days.".format(len(timestamps_noon)))
display(timestamps_noon)

In [92]:
# Export the selected timestamps as plain text: one value per line,
# without a header row and without the data frame index.
timestamps_noon.to_csv(
    SELECTED_TIMESTAMPS_FILE,
    index=False,
    header=False,
)

# Select Images

Find the selected images in the filesystem and copy them to the *selected-images-noon* directory:

```
$ xargs -a ~/selected-image-timestamps-noon.txt -L 1 -I# find . -name "pic_#.jpg" -exec cp {} selected-images-noon \;
```

If your `find` does not support the `-exec` option, do this in two steps:

```
$ xargs -a ~/selected-image-timestamps-noon.txt -L 1 -I# find . -name "pic_#.jpg" > selected-images-path-noon.txt
$ xargs -a selected-images-path-noon.txt cp -t selected-images-noon
```

The selected images are now saved in the *selected-images-noon* directory and can be used for further processing.