# DANAM Image Metadata Statistics
This notebook is used along with the scripts clean_json and caption_processing and to give statistics about the current DANAM image metadata.
Queries is done via Pandas Dataframe.

In [1]:
import pandas as pd
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 100)

from datetime import datetime
from IPython.display import Markdown, display

from scripts.clean_json import clean_json


In [2]:
# read DANAM json export, always replace with the latest 
danam_export = "json\DANAM\Monument_2022-10-24_02-12-03.json"
danam_images = clean_json(danam_export)
danam_df = pd.DataFrame(danam_images)

### Data Output

In [4]:
## Time Period ##
## format: year, month, date
start = datetime(2022, 4, 1)
end = datetime(2022, 9, 30)
display(Markdown(
    "### Activities between {} and {}"
    .format(start.strftime("%Y-%m-%d"), end.strftime("%Y-%m-%d"))
    ))

# Filter out danam image entries by the given time period above.
query = danam_df
query = query.loc[query['lastModified'] > start]
query = query.loc[query['lastModified'] < end]

total_monuments = set(query['mon_id'])
inscriptions = query.loc[query['filename'].str.contains("_I_")]
photos = query.loc[query['filename'].str.contains("_P_")]
drawings = query.loc[query['filename'].str.contains("_D_")]
historical = query.loc[query['filename'].str.contains("_H_")]

display(Markdown(
"""
|   |   |
|---|---|
| Monuments updated in this time period  | {0}  |
| Inscriptions updated in this time period  | {1}  |
| Photographs updated in this time period  | {2}  |
| Drawings updated in this time period  | {3}  |
| Historical images updated in this time period  | {5}  |

""".format(len(total_monuments),
            inscriptions.shape[0],
            photos.shape[0],
            drawings.shape[0],
            query.shape[0] - inscriptions.shape[0] - photos.shape[0] - drawings.shape[0],
            historical.shape[0]
)
))

### Activities between 2022-04-01 and 2022-09-30


|   |   |
|---|---|
| Monuments updated in this time period  | 202  |
| Inscriptions updated in this time period  | 556  |
| Photographs updated in this time period  | 2371  |
| Drawings updated in this time period  | 463  |
| Historical images updated in this time period  | 268  |



In [6]:
valid_captions = danam_df.loc[danam_df['validCaption']].shape[0]
all_images = danam_df.shape[0]

print("Percentage of images with valid captions: {}".format(valid_captions/all_images))
print("Percentage of images with invalid captions: {}".format(1-(valid_captions/all_images)))


Percentage of images with valid captions: 0.8059075071270193
Percentage of images with invalid captions: 0.1940924928729807
