In [120]:
from pprint import *
import json
import pandas as pd
import requests
import datetime

In [121]:
# Show wide/long frames in full: up to 500 rows and columns, and no limit on
# the overall display width or on the width of a single column
DISPLAY_OPTIONS = {
    'display.max_rows': 500,
    'display.max_columns': 500,
    'display.width': None,
    'display.max_colwidth': None,
}
for option_name, option_value in DISPLAY_OPTIONS.items():
    pd.set_option(option_name, option_value)

### Get list of images and builds

**Functions**

In [122]:
# List the MCR repositories whose name contains a given piece of text
def filter_images(filter, timeout=30):
   """Return the MCR repository names that contain ``filter``.

   Args:
      filter: Text to look for in each repository name (plain, case-sensitive
         ``in`` match). The name shadows the ``filter`` builtin inside this
         function; it is kept so existing callers continue to work.
      timeout: Seconds to wait for the registry before giving up.

   Returns:
      list[str]: Matching repository names, in the order the catalog lists them.

   Raises:
      requests.HTTPError: If the catalog endpoint answers with an error status.
   """
   url = "https://mcr.microsoft.com/v2/_catalog"
   # A timeout keeps a stalled connection from hanging the notebook forever
   response = requests.get(url, timeout=timeout)
   # Fail on the HTTP status itself rather than on a KeyError further down
   response.raise_for_status()
   all_images = json.loads(response.text)['repositories']

   return [image for image in all_images if filter in image]

# Turn MCR repository names into a short label plus a registry API base URL
def parse_images(images):
   """Build a two-column frame describing each repository name in ``images``.

   Columns:
      image: The second '/'-separated segment of the repository name.
      image_url: The repository name prefixed with 'mcr.microsoft.com/v2/'.
         The '/v2' part must be removed before using the value for image pulls.
   """
   short_names = []
   registry_paths = []
   for repository in images:
      short_names.append(repository.split('/')[1])
   for repository in images:
      registry_paths.append('mcr.microsoft.com/v2/' + repository)

   return pd.DataFrame({'image': short_names, 'image_url': registry_paths})

# Interrogate MCR for all available tags
def get_tags(image_df, timeout=30):
   """Add a ``tags`` column holding the list of tag names of each repository.

   Args:
      image_df: Frame with an ``image_url`` column containing the registry API
         base path of each repository (without the 'https://' scheme).
      timeout: Seconds to wait for each registry request before giving up.

   Returns:
      The same frame (modified in place) with the ``tags`` column set. A
      missing or null ``tags`` entry in the response is stored as an empty list.

   Raises:
      requests.HTTPError: If a tags endpoint answers with an error status.
   """
   all_tags = []
   for image_url in image_df['image_url']:
      url = 'https://' + image_url + '/tags/list'
      # A timeout keeps one stalled request from hanging the whole run
      response = requests.get(url, timeout=timeout)
      response.raise_for_status()
      all_tags.append(json.loads(response.text).get('tags') or [])

   # Assign the column in one step as an object Series. Writing a list into a
   # single cell with .loc asks pandas to align an iterable against the
   # selection, which is not a dependable way to store a list value.
   image_df['tags'] = pd.Series(all_tags, index=image_df.index, dtype=object)

   return image_df

# Get build dates per tag
def get_build_dates(tags_df):
   """Add ``latest_build_date`` and ``latest_tag`` columns to ``tags_df``.

   For every row, the build dates of the tags containing 'latest' and of the
   tags at the end of the tag list are fetched with ``get_build_date``; the
   newest dated candidate is recorded. Dates are compared as strings, exactly
   as they are returned.

   Args:
      tags_df: Frame with ``image``, ``image_url`` and ``tags`` columns, where
         ``tags`` holds the list of tag names of each repository.

   Returns:
      The same frame (modified in place) with the two columns added. Both
      values are None for a repository without tags; the date is None when
      none of the inspected trailing tags exposes one.
   """
   total = len(tags_df)
   latest_tags = []
   latest_build_dates = []

   # Count positions instead of using the index so the progress line is right
   # even when the frame does not carry a 0..n-1 integer index
   for position, (_, row) in enumerate(tags_df.iterrows(), start=1):
      print("{}/{} | Getting build dates for: {}".format(position, total, row['image']))
      tag, build_date = _select_latest_build(row['image_url'], row['tags'])
      latest_tags.append(tag)
      latest_build_dates.append(build_date)

   tags_df['latest_build_date'] = pd.Series(latest_build_dates, index=tags_df.index, dtype=object)
   tags_df['latest_tag'] = pd.Series(latest_tags, index=tags_df.index, dtype=object)

   return tags_df


# Pick the (tag, build_date) pair to report for a single repository
def _select_latest_build(image_url, tags):
   tags = list(tags) if tags is not None else []
   if not tags:
      # Nothing published: there is no candidate to report
      return None, None

   # Several tags can contain 'latest'. Keep only those that expose a date,
   # because a missing date (None) cannot be ordered against a timestamp.
   floating = []
   for tag in tags:
      if 'latest' in tag:
         build_date = get_build_date(image_url, tag)
         if build_date is not None:
            floating.append((tag, build_date))

   # Inspect the last three tags, or only the last one when fewer than three exist
   recent_tags = tags[-3:] if len(tags) >= 3 else tags[-1:]
   recent = [(tag, get_build_date(image_url, tag)) for tag in recent_tags]
   dated_recent = [pair for pair in recent if pair[1] is not None]

   if not dated_recent:
      # No recent tag has a date: report the last listed tag without a date
      return recent[-1][0], None

   newest_recent = max(dated_recent, key=lambda pair: pair[1])
   if floating:
      newest_floating = max(floating, key=lambda pair: pair[1])
      # A 'latest' tag wins unless one of the recent tags is strictly newer
      if newest_recent[1] <= newest_floating[1]:
         return newest_floating

   return newest_recent

# Get build date for a given tag
def get_build_date(image_url, tag, timeout=30):
   """Return the creation timestamp recorded in a tag's manifest, if any.

   Args:
      image_url: Registry API base path of the repository (without 'https://').
      tag: Tag whose manifest is requested.
      timeout: Seconds to wait for the registry before giving up.

   Returns:
      The ``created`` value found in the first history entry of a
      schemaVersion 1 manifest, or None when the manifest has any other
      schema version, has no history, or carries no ``created`` field.

   Raises:
      requests.HTTPError: If the manifest endpoint answers with an error status.
   """
   url = 'https://' + image_url + '/manifests/' + tag
   # NOTE(review): this asks for an OCI manifest, yet only schemaVersion 1
   # responses carry a date - confirm the header is the intended one.
   headers = {"Accept": "application/vnd.oci.image.manifest.v1+json"}
   response = requests.get(url, headers=headers, timeout=timeout)
   response.raise_for_status()
   manifest = json.loads(response.text)

   # The date is not available in schemaVersion 2 manifests:
   # https://stackoverflow.com/questions/32605556/how-to-find-the-creation-date-of-an-image-in-a-private-docker-registry-api-v2
   # Any version other than 1 is treated the same way instead of leaving the
   # result undefined.
   if manifest.get('schemaVersion') != 1:
      return None

   history = manifest.get('history') or []
   if not history:
      return None

   return json.loads(history[0]['v1Compatibility']).get('created')

# Data Pipeline
def processing_pipeline(tag):
   """Build the latest-build table for the repositories matching ``tag``.

   Args:
      tag: Text that a repository name must contain to be included; it is
         passed unchanged to ``filter_images``.

   Returns:
      A frame with the columns produced by ``get_build_dates`` minus ``tags``,
      sorted by ``latest_build_date`` in descending order. The original row
      labels are kept.
   """
   # Repository names containing the requested text
   images = filter_images(tag)
   # Store images in a dataframe
   image_df = parse_images(images)
   # Get list of tags from MCR
   tags_df = get_tags(image_df)
   # Determine the newest tag and its build date per repository
   builds_df = get_build_dates(tags_df)

   # Work on the frame returned by the last step rather than on an earlier
   # alias, so the result is right whether or not the helpers mutate in place
   return (
      builds_df
      .drop(columns=['tags'])
      .sort_values(by=['latest_build_date'], ascending=False)
   )

### `arcdata`

In [123]:
# Run the pipeline for every MCR repository whose name contains 'arcdata'
# (queries the registry for each repository and for each inspected tag)
tags_df_arcdata = processing_pipeline('arcdata')

1/23 | Getting build dates for: arc-bootstrapper
2/23 | Getting build dates for: arc-dns
3/23 | Getting build dates for: arc-postgres-11
4/23 | Getting build dates for: arc-postgres-12
5/23 | Getting build dates for: arc-control-watchdog
6/23 | Getting build dates for: arc-controller
7/23 | Getting build dates for: arc-controller-db
8/23 | Getting build dates for: arc-sqlmi
9/23 | Getting build dates for: arc-monitor-collectd
10/23 | Getting build dates for: arc-monitor-elasticsearch
11/23 | Getting build dates for: arc-monitor-fluentbit
12/23 | Getting build dates for: arc-kafka
13/23 | Getting build dates for: arc-monitor-grafana
14/23 | Getting build dates for: arc-monitor-influxdb
15/23 | Getting build dates for: arc-monitor-kibana
16/23 | Getting build dates for: arc-monitor-telegraf
17/23 | Getting build dates for: arc-server-controller
18/23 | Getting build dates for: arc-service-proxy
19/23 | Getting build dates for: arcdataservices-extension
20/23 | Getting build dates for: ms

In [124]:
# Images whose latest build is not newer than this timestamp are set aside.
# The cutoff was determined empirically - images built after it seem to be
# kept up to date.
ARCDATA_BUILD_DATE_CUTOFF = '2021-07-30T17:30:00.75187855Z'

arcdata_build_dates = tags_df_arcdata['latest_build_date']

# Useful: built after the cutoff, or no build date available at all
useful_images_arcdata = tags_df_arcdata[
    (arcdata_build_dates > ARCDATA_BUILD_DATE_CUTOFF) | (arcdata_build_dates.isnull())
].reset_index(drop=True)

# Useless: built at or before the cutoff
useless_images_arcdata = tags_df_arcdata[
    arcdata_build_dates <= ARCDATA_BUILD_DATE_CUTOFF
].reset_index(drop=True)

In [125]:
# arcdata images built after the cutoff (plus any without a build date)
display(useful_images_arcdata)

Unnamed: 0,image,image_url,latest_build_date,latest_tag
0,arc-sqlmi,mcr.microsoft.com/v2/arcdata/arc-sqlmi,2022-02-25T04:07:26.378287312Z,v1.4.0_2022-02-25
1,arc-postgres-12,mcr.microsoft.com/v2/arcdata/arc-postgres-12,2022-02-25T04:06:45.679788739Z,v1.4.0_2022-02-25
2,arc-postgres-11,mcr.microsoft.com/v2/arcdata/arc-postgres-11,2022-02-25T04:05:44.388850447Z,v1.4.0_2022-02-25
3,arc-kafka,mcr.microsoft.com/v2/arcdata/arc-kafka,2022-02-25T04:03:46.431175098Z,v1.4.0_2022-02-25
4,arc-controller-db,mcr.microsoft.com/v2/arcdata/arc-controller-db,2022-02-25T04:01:48.830315511Z,v1.4.0_2022-02-25
5,arc-controller,mcr.microsoft.com/v2/arcdata/arc-controller,2022-02-25T04:01:02.166837292Z,v1.4.0_2022-02-25
6,arc-bootstrapper,mcr.microsoft.com/v2/arcdata/arc-bootstrapper,2022-02-25T03:59:43.949204601Z,v1.4.0_2022-02-25
7,arc-ha-supervisor,mcr.microsoft.com/v2/arcdata/arc-ha-supervisor,2022-02-25T03:58:48.583299816Z,v1.4.0_2022-02-25
8,arc-ha-orchestrator,mcr.microsoft.com/v2/arcdata/arc-ha-orchestrator,2022-02-25T03:58:41.097594263Z,v1.4.0_2022-02-25
9,arc-dns,mcr.microsoft.com/v2/arcdata/arc-dns,2022-02-25T03:58:12.84054861Z,v1.4.0_2022-02-25


In [126]:
# arcdata images whose latest build is at or before the cutoff
display(useless_images_arcdata)

Unnamed: 0,image,image_url,latest_build_date,latest_tag
0,arc-ha-operator,mcr.microsoft.com/v2/arcdata/arc-ha-operator,2021-07-29T17:30:48.75187855Z,v1.0.0_2021-07-30
1,arc-server-controller,mcr.microsoft.com/v2/arcdata/arc-server-controller,2021-05-28T01:36:29.11422795Z,latest
2,arc-control-watchdog,mcr.microsoft.com/v2/arcdata/arc-control-watchdog,2021-04-26T21:41:27.245453254Z,latest
3,mssql-ha-operator,mcr.microsoft.com/v2/arcdata/mssql-ha-operator,2021-02-26T23:03:10.355773453Z,latest


### `azurearck8s`

In [127]:
# Run the pipeline for every MCR repository whose name contains 'azurearck8s'
# (queries the registry for each repository and for each inspected tag)
tags_df_azurearck8s = processing_pipeline('azurearck8s')

1/42 | Getting build dates for: batch1
2/42 | Getting build dates for: batch1
3/42 | Getting build dates for: batch1
4/42 | Getting build dates for: batch2
5/42 | Getting build dates for: batch2
6/42 | Getting build dates for: westcentralus
7/42 | Getting build dates for: canary
8/42 | Getting build dates for: canary
9/42 | Getting build dates for: fairfax
10/42 | Getting build dates for: batch1
11/42 | Getting build dates for: batch1
12/42 | Getting build dates for: batch1
13/42 | Getting build dates for: batch2
14/42 | Getting build dates for: batch2
15/42 | Getting build dates for: westcentralus
16/42 | Getting build dates for: canary
17/42 | Getting build dates for: canary
18/42 | Getting build dates for: fairfax
19/42 | Getting build dates for: agent-update-job
20/42 | Getting build dates for: cluster-metadata-operator
21/42 | Getting build dates for: cluster-identity
22/42 | Getting build dates for: config-agent
23/42 | Getting build dates for: kube-aad-proxy
24/42 | Getting buil

In [128]:
# Full azurearck8s result, newest build first; row labels are the pre-sort positions
display(tags_df_azurearck8s)

Unnamed: 0,image,image_url,latest_build_date,latest_tag
39,guardcontroller,mcr.microsoft.com/v2/azurearck8s/guardcontroller,2022-02-19T00:14:51.641241689Z,1.6.7-preview
40,guardinit-container,mcr.microsoft.com/v2/azurearck8s/guardinit-container,2022-02-19T00:14:08.287930665Z,1.6.7-preview
41,clusterconnectservice-operator,mcr.microsoft.com/v2/azurearck8s/clusterconnectservice-operator,2022-02-19T00:13:21.383104492Z,1.6.7-preview
31,extensionoperator,mcr.microsoft.com/v2/azurearck8s/extensionoperator,2022-02-19T00:12:35.975289248Z,1.6.7-preview
28,flux-logs-agent,mcr.microsoft.com/v2/azurearck8s/flux-logs-agent,2022-02-19T00:11:47.636965214Z,1.6.7-preview
19,cluster-metadata-operator,mcr.microsoft.com/v2/azurearck8s/cluster-metadata-operator,2022-02-19T00:10:58.346762646Z,1.6.7-preview
30,resource-sync,mcr.microsoft.com/v2/azurearck8s/resource-sync,2022-02-19T00:10:10.357184805Z,1.6.7-preview
20,cluster-identity,mcr.microsoft.com/v2/azurearck8s/cluster-identity,2022-02-19T00:09:18.23035242Z,1.6.7-preview
23,configoperator,mcr.microsoft.com/v2/azurearck8s/configoperator,2022-02-19T00:08:20.106075907Z,1.6.7-preview
24,connect-agent,mcr.microsoft.com/v2/azurearck8s/connect-agent,2022-02-19T00:07:31.228704394Z,1.6.7-preview


In [129]:
# Split tags_df_azurearck8s on the `image` column: rows whose value is one of
# the names below are set aside as useless, every other row is kept as useful
is_useless_azurearck8s = tags_df_azurearck8s['image'].isin(
    ['arc-preview', 'aks', 'batch1', 'batch2', 'westcentralus', 'canary', 'fairfax']
)

# A single boolean mask gives an exact partition without relying on
# drop_duplicates, which would also discard rows that merely happen to repeat
useless_images_azurearck8s = tags_df_azurearck8s[is_useless_azurearck8s].reset_index(drop=True)
useful_images_azurearck8s = tags_df_azurearck8s[~is_useless_azurearck8s].reset_index(drop=True)

In [130]:
# azurearck8s rows whose `image` value is in the excluded list
display(useless_images_azurearck8s)

Unnamed: 0,image,image_url,latest_build_date,latest_tag
0,aks,mcr.microsoft.com/v2/azurearck8s/aks/stable/extensionoperator,2022-02-18T22:22:31.882096229Z,1.6.6
1,aks,mcr.microsoft.com/v2/azurearck8s/aks/stable/configoperator,2022-02-18T22:19:21.91695387Z,1.6.6
2,aks,mcr.microsoft.com/v2/azurearck8s/aks/stable/config-agent,2022-02-18T22:18:05.920831492Z,1.6.6
3,aks,mcr.microsoft.com/v2/azurearck8s/aks/stable/fluent-bit,2022-02-18T22:08:30.43140359Z,1.6.6
4,arc-preview,mcr.microsoft.com/v2/azurearck8s/arc-preview/flux-init-container,2020-09-30T00:53:51.125512705Z,0.0.1
5,batch1,mcr.microsoft.com/v2/azurearck8s/batch1/stable/azure-arc-k8sagents,,1.6.6
6,batch1,mcr.microsoft.com/v2/azurearck8s/batch1/stable/appliance/azure-arc-k8sagents,,0.4.0-appliance
7,batch1,mcr.microsoft.com/v2/azurearck8s/batch1/preview/azure-arc-k8sagents,,1.6.7-preview
8,batch2,mcr.microsoft.com/v2/azurearck8s/batch2/stable/azure-arc-k8sagents,,1.6.6
9,batch2,mcr.microsoft.com/v2/azurearck8s/batch2/preview/azure-arc-k8sagents,,1.6.7-preview


In [131]:
# azurearck8s rows that remain after removing the excluded `image` values
display(useful_images_azurearck8s)

Unnamed: 0,image,image_url,latest_build_date,latest_tag
0,guardcontroller,mcr.microsoft.com/v2/azurearck8s/guardcontroller,2022-02-19T00:14:51.641241689Z,1.6.7-preview
1,guardinit-container,mcr.microsoft.com/v2/azurearck8s/guardinit-container,2022-02-19T00:14:08.287930665Z,1.6.7-preview
2,clusterconnectservice-operator,mcr.microsoft.com/v2/azurearck8s/clusterconnectservice-operator,2022-02-19T00:13:21.383104492Z,1.6.7-preview
3,extensionoperator,mcr.microsoft.com/v2/azurearck8s/extensionoperator,2022-02-19T00:12:35.975289248Z,1.6.7-preview
4,flux-logs-agent,mcr.microsoft.com/v2/azurearck8s/flux-logs-agent,2022-02-19T00:11:47.636965214Z,1.6.7-preview
5,cluster-metadata-operator,mcr.microsoft.com/v2/azurearck8s/cluster-metadata-operator,2022-02-19T00:10:58.346762646Z,1.6.7-preview
6,resource-sync,mcr.microsoft.com/v2/azurearck8s/resource-sync,2022-02-19T00:10:10.357184805Z,1.6.7-preview
7,cluster-identity,mcr.microsoft.com/v2/azurearck8s/cluster-identity,2022-02-19T00:09:18.23035242Z,1.6.7-preview
8,configoperator,mcr.microsoft.com/v2/azurearck8s/configoperator,2022-02-19T00:08:20.106075907Z,1.6.7-preview
9,connect-agent,mcr.microsoft.com/v2/azurearck8s/connect-agent,2022-02-19T00:07:31.228704394Z,1.6.7-preview


### Generate documentation

In [132]:
# Render the four result tables as Markdown text: the two useful tables first,
# then the two useless ones (arcdata before azurearck8s in each pair)
(markdown_arcdata_useful,
 markdown_azurearck8s_useful,
 markdown_arcdata_useless,
 markdown_azurearck8s_useless) = [
    frame.to_markdown()
    for frame in (
        useful_images_arcdata,
        useful_images_azurearck8s,
        useless_images_arcdata,
        useless_images_azurearck8s,
    )
]

In [133]:
# Write the Markdown tables to README.md; the generation time is recorded
# inside the document (the file name itself is fixed)
timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
filename = "README.md"

readme_parts = [
    "# Arc Container Image Refresher\n",
    "**Image dump generated at: `{}`**\n".format(timestamp),
    "## Useful images\n",
    "### Arc Data Services\n",
    markdown_arcdata_useful,
    "\n---\n",
    "### Arc Kubernetes - AKA \"HAIKU\"\n",
    markdown_azurearck8s_useful,
    "\n---\n",
    "## Use*less* images\n",
    "There are a bunch of useless images in MCR - these seem to be suspect:\n",
    "### Arc Data Services\n",
    markdown_arcdata_useless,
    "\n---\n",
    "### Arc Kubernetes - AKA \"HAIKU\"\n",
    markdown_azurearck8s_useless,
]

# Explicit UTF-8 so the output does not depend on the platform's default
# encoding; the `with` block closes the file, so no separate close() is needed
with open(filename, "w", encoding="utf-8") as f:
    f.write("".join(readme_parts))