In [None]:
#This notebook tags videos based on the JSON files returned for them by the Amazon Rekognition
#image processing API

#Please refer to the README before attempting to run or alter the notebook

In [None]:
#1.- INITIALISATION KERNEL:

#Import for visualization
%matplotlib inline 

#Essential imports
import pandas as pd
import sys 
import os
import platform
from pathlib import *
import json 
import random
from datetime import datetime
import ntpath
import numpy as np

#--------------------------FUNCTIONS----------------------------------------------------------------
#kernel holding a function to convert json date into a date/time object

def ConvertToDate(strDate):
    """Parse a date header string into a ``datetime`` object.

    Parameters
    ----------
    strDate : str
        Timestamp laid out as abbreviated weekday, day, abbreviated month,
        year, ``HH:MM:SS`` and a timezone name, for example
        ``'Mon, 01 Jan 2018 12:00:00 GMT'``.

    Returns
    -------
    datetime.datetime
        The parsed timestamp.

    Raises
    ------
    ValueError
        If ``strDate`` does not match the expected layout.
    """
    header_layout = '%a, %d %b %Y %H:%M:%S %Z'
    return datetime.strptime(strDate, header_layout)


#kernel that will hold the function that chooses tags from labels (the least frequent label of each equal-confidence group)

#Function that will tag videos

def GetVideoTag(labels, freqdict):
    """Build a tag string from groups of equally confident labels.

    For every group the label with the lowest frequency in ``freqdict`` is
    chosen; on a tie the label that appears first in the group wins. Labels
    that are not in ``freqdict`` are ignored, and a group without any known
    label contributes nothing to the tag.

    Parameters
    ----------
    labels : list of list of str
        Groups of labels that share the same confidence level.
    freqdict : dict
        Maps a label to how often it occurs in the dataset.

    Returns
    -------
    str
        The chosen labels, each followed by one space. Every chosen label is
        put in front of the ones chosen before it, so the groups appear in
        reverse order. Empty string when no label could be chosen.
    """
    tag = "" #tag that will be returned
    for poss_lbl in labels: # loop through the list of lists
        #only labels with a known frequency can compete
        known_lbls = [lbl for lbl in poss_lbl if lbl in freqdict]
        if not known_lbls:
            #nothing to choose from: do not add a stray separator to the tag
            continue
        #min() keeps the first of several equally rare labels and needs no
        #artificial upper bound on the frequency
        chosen_lbl = min(known_lbls, key=freqdict.__getitem__)
        tag = chosen_lbl+" "+tag # put the new label in front of the tag
    return tag


#Function to get directory

def GetValidDirectoryPath():
    """Ask the user for a directory path until an existing one is entered.

    Prints an error message after every rejected attempt and a confirmation
    once a path is accepted.

    Returns
    -------
    str
        The entered path, exactly as typed, of an existing directory.
    """
    while True:
        candidate = input("Please enter the path of your directory containing videos (with forward slashes):")
        if os.path.isdir(candidate):
            print("Successful Path entered.")
            return candidate
        #Not a directory: report it and ask again
        print("Invalid directory path, please try again.")

#Label processing functions 

#Count the labels of a list and return a dictionary mapping each label to its frequency
def wordListToFreqDict(wordlist):
    """Count how often each entry of ``wordlist`` occurs.

    Parameters
    ----------
    wordlist : iterable of hashable
        The labels to count, one entry per occurrence.

    Returns
    -------
    dict
        Maps every distinct label to its number of occurrences. Keys are in
        order of first appearance.
    """
    wordfreq = {}
    #single pass instead of calling list.count() once per element
    for word in wordlist:
        wordfreq[word] = wordfreq.get(word, 0) + 1
    return wordfreq

#Make a function which makes a list sorted by higher frequency labels for visual purposes
def sortFreqDict(freqdict):
    """Return the entries of ``freqdict`` ordered from most to least frequent.

    Parameters
    ----------
    freqdict : dict
        Maps a label to its frequency.

    Returns
    -------
    list of tuple
        ``(frequency, label)`` pairs in descending order; pairs with the same
        frequency are ordered by label, also descending.
    """
    pairs = ((count, label) for label, count in freqdict.items())
    return sorted(pairs, reverse=True)


#-------------------------ESSENTIAL VARIABLES-------------------------------------------------------

#Main data frame columns and lists
#Column names of the final data frame, in output order
list_labels = ["Date", "Description", "FileName", "RequestID"]

#Accumulators the main kernel appends to, one entry per processed file
date_list, tag_list, filename_list, requestID_list = [], [], [], []

#Names of the files the main kernel did not process
file_errors = []

#Name of the operating system the notebook is running on
current_platform = platform.system()

In [None]:
#2.- CUSTOMISATION KERNEL
#------------------------CUSTOMISABLE VARIABLES--------------------------------------------------------

#LABEL SETTINGS:
#====================================================================================================
#Minimum confidence percentage a label needs in order to be considered
confidence_threshold = 75.0 #75% confidence and up

#Number of decimal places confidences are rounded to before they are compared
#(aws default is 14, making confidences less accurate may increase tagging performance)
conf_decimal_points = 5


#INPUT AND OUTPUT SETTINGS:
#=====================================================================================================
#Directory containing the video descriptions.
#The call below prompts the user until a valid path is given. If the path is already
#known, comment the call out and assign the variable directly instead:
#directory_path = "YOUR PATH HERE" (Forward slashes preferable)
directory_path = GetValidDirectoryPath()

#Output file, written inside the chosen directory (please use forward slashes)
output_file_name = '{}/final_tags.csv'.format(directory_path)



In [None]:
#3.- LABEL PROCESSING KERNEL

#Kernel for obtaining the label frequency dictionary (my_dict) from the dataset.
#Label names are collected straight into a list. Joining them into one '-'-delimited
#string and splitting it again would cut labels that themselves contain '-' into
#pieces and would add an empty-string entry to the dictionary.

#Every label name found in the directory, one entry per occurrence
listed_strs = []

#Go through each file in directory
for curr_file in os.listdir(directory_path):

    #check for valid extensions
    if not curr_file.endswith((".res", ".json")):
        continue

    file_path = directory_path+"/"+curr_file
    try:
        #JSON is read as UTF-8 regardless of the platform's default encoding
        with open(file_path, encoding="utf-8") as j:
            first_data = json.load(j)

        #obtain label portion of json: a list with one dictionary per detected label
        data_lab = first_data['Labels']
    except (OSError, ValueError, KeyError) as err:
        #A file that cannot be read, is not valid JSON or has no 'Labels' section
        #must not abort the whole scan; report it and carry on with the next file
        print("Skipping " + curr_file + ": " + repr(err))
        continue

    for indiv_labels in data_lab:
        if 'Name' in indiv_labels:
            listed_strs.append(indiv_labels['Name'])

#Make a dictionary mapping each label to its frequency
my_dict = wordListToFreqDict(listed_strs)

#get a sorted list for visualisation of most frequent labels in descending order
sorted_labels = sortFreqDict(my_dict)

#print sorted list
print(sorted_labels)

In [None]:
#4.- MAIN KERNEL

#Reads every Rekognition result file of the directory, generates its tag and collects
#date, tag, file name and request id in the result lists.

#Start from empty lists: this cell only appends, so running it a second time would
#otherwise add every row again to the lists created by the initialisation kernel
date_list = []
tag_list = []
filename_list = []
requestID_list = []
file_errors = []

#Go through each file in directory
for current_file in os.listdir(directory_path):

    #getting filename to append
    name_ = os.path.basename(current_file)

    #check for valid extensions, everything else goes to the error log
    if not current_file.endswith((".res", ".json")):
        file_errors.append(name_)
        continue

    str_file_path = directory_path+"/"+current_file
    try:
        #JSON is read as UTF-8 regardless of the platform's default encoding
        with open(str_file_path, encoding="utf-8") as f:
            data = json.load(f)

        http_headers = data['ResponseMetadata']['HTTPHeaders']

        #obtain date portion of json and convert it into a datetime object
        date = ConvertToDate(http_headers['date'])

        #obtain request id of json
        request_id = http_headers['x-amzn-requestid']

        #Cluster labels of equal (rounded) confidence in a single pass.
        #confidence_values remembers the distinct confidences, grouped_names the
        #label names belonging to each of them.
        confidence_values = set()
        grouped_names = {}
        for individual_labels in data['Labels']:
            value = individual_labels.get('Confidence')
            #Labels below the threshold never join a group, not even when they
            #round to the same value as a label that passed it
            if value is None or value < confidence_threshold:
                continue
            #round to control confidence similarity
            rounded_value = round(value, conf_decimal_points)
            confidence_values.add(rounded_value)
            grouped_names.setdefault(rounded_value, []).append(individual_labels['Name'])
    except (OSError, ValueError, KeyError):
        #Unreadable file, invalid JSON, unexpected date format or missing field:
        #log the file instead of aborting the whole run
        file_errors.append(name_)
        continue

    #make a list of lists of labels; groups follow the iteration order of the set
    label_groups = [grouped_names[different_values] for different_values in confidence_values]

    generated_tag = GetVideoTag(label_groups, my_dict) # get a generated tag

    #check to see if there are any labels (a tag made of blanks only counts as none)
    if not generated_tag.strip():
        generated_tag = "---NO LABELS DETECTED---"

    #Add particular json data into lists
    date_list.append(date)
    tag_list.append(generated_tag)
    filename_list.append(name_)
    requestID_list.append(request_id)

In [None]:
#5.- DATA PROCESSING AND SUBMISSION KERNEL

#Kernel that wraps up the entire info into a data frame and writes it to a csv file

#One column per entry of list_labels, one row per processed file
gen_list = [date_list, tag_list, filename_list, requestID_list]
final_df = pd.DataFrame(dict(zip(list_labels, gen_list)))

#Sort by date, which is the main parameter, newest first because descending order is
#more convenient. The stable sort keeps rows with the same date in their existing order.
#Date is kept as a regular column: the file is written with index=False, so a column
#moved into the index would be missing from the csv.
final_df = final_df.sort_values('Date', ascending=False, kind='mergesort').reset_index(drop=True)

#Output error files inside directory
print("FILES UNABLE TO BE PROCESSED DUE TO ERROR OR BECAUSE NOT IN JSON FORMAT: ")
print(file_errors)

#Output sorted dataframe into a csv file into the selected path
final_df.to_csv(output_file_name, index= False)