# Get data from the NFDI Text+ Registry

## Setup (execute only once)

In [None]:
# install all (external) packages:
#%%capture
!pip install jsonpath_ng

# import packages:
from jsonpath_ng import jsonpath
from jsonpath_ng.ext import parse
import requests

from IPython.display import display
from PIL import Image
from io import BytesIO
from IPython.display import display, SVG

import ipywidgets as widgets

import matplotlib.pyplot as plt

import pandas as pd

import plotly.express as px

# some generic functions:

def resolve_jsonpath(self, find_jsonpath, in_source):
    """Return the value of the first JSONPath match, or None without a match.

    Args:
        self: Not used by the body; kept so existing call sites keep working.
        find_jsonpath: JSONPath expression string handed to ``parse``.
        in_source: Parsed JSON data (dict/list) that is searched.

    Returns:
        The ``value`` of the first match, or ``None`` if nothing matched.
    """
    matches = parse(find_jsonpath).find(in_source)
    if not matches:
        return None
    return matches[0].value

def get_property_value_by_path(data_json, path):
    """Look up ``path`` (a JSONPath expression) in ``data_json``.

    Returns the value of the first match; ``None`` when the expression
    matches nothing.
    """
    found = parse(path).find(data_json)
    return found[0].value if found else None

def send_post_request(url, data, headers=None, timeout=10.0):
    """POST ``data`` as JSON to ``url`` and return the decoded JSON response.

    Args:
        url: Target URL.
        data: JSON-serialisable payload, sent through the ``json=`` argument.
        headers: Optional dict of additional HTTP headers.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely; the default matches
            the value used by ``send_get_request``.

    Returns:
        The parsed JSON body, or ``None`` when the request failed, the server
        answered with an HTTP error status, or the body was not valid JSON.
        Failures are reported on stdout instead of being raised.
    """
    try:
        response = requests.post(url, json=data, headers=headers, timeout=timeout)
        print(f"HTTP Status Code: {response.status_code}")  # Print response code

        response.raise_for_status()  # Turn HTTP error statuses (4xx/5xx) into exceptions
        return response.json()  # Parse JSON response
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers JSON decoding errors on requests versions whose
        # decode error is not a RequestException subclass.
        print(f"HTTP Request failed: {e}")
        return None


def send_get_request(url, headers=None, timeout=10.0):
    """GET ``url`` and return the decoded JSON response.

    Args:
        url: URL to fetch.
        headers: Optional dict of HTTP headers; forwarded to ``requests.get``
            (previously this argument was accepted but silently ignored).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The parsed JSON body, or ``None`` when the request failed, the server
        answered with an HTTP error status, or the body was not valid JSON.
        Failures are reported on stdout instead of being raised.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        print(f"HTTP Status Code: {response.status_code}")  # Print response code

        response.raise_for_status()  # Turn HTTP error statuses (4xx/5xx) into exceptions
        return response.json()  # Parse JSON response
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers JSON decoding errors on requests versions whose
        # decode error is not a RequestException subclass.
        print(f"HTTP Request failed: {e}")
        return None

def input_widget():
    """Render an empty text box together with a "Submit" button.

    The widgets are only displayed: no click handler is attached and nothing
    is returned.
    """
    input_box = widgets.Text(value='', placeholder='Type something...', description='Input:')
    submit_button = widgets.Button(description="Submit")
    display(input_box, submit_button)


def url_is_textplus_institution(url):
    """Return the Text+ partner institution domain contained in ``url``.

    The check is a plain substring test against a fixed list of partner
    domains, evaluated in list order; the first domain found wins.

    Args:
        url: URL (or any string) to inspect.

    Returns:
        The matching domain string, or ``""`` when no partner domain occurs
        in ``url``.
    """
    textplus_institutions = (
        "awhamburg.de",
        "h-da.de",
        "uni-wuerzburg.de",  # fixed typo: was "uni-uwerzburg.de" and could never match
        "leopoldina.org",
        "maxweberstiftung.de",
        "saw-leipzig.de",
        "adw-goe.de",
        "tu-dresden.de",
        "uni-due.de",
        "uni-paderborn.de",
        "uni-koeln.de",
        "hs-wismar.de",
        "uni-wuppertal.de",
        "fu-berlin.de",
        "dla-marbach.de",
        "gwdg.de",
        "hab.de",
        "adwmainz.de",
        "fz-juelich.de",
        "klassik-stiftung.de",
        "lmu.de",
        "uni-bamberg.de",
        "steinheim-institut.de",
        "tu-darmstadt.de",
        "uni-saarland.de",
        "uni-hamburg.de",
        "uni-trier.de",
        # NOTE(review): never returned, because "tu-darmstadt.de" above
        # already matches every URL containing this domain.
        "ulb.tu-darmstadt.de",
        "uni-freiburg.de",
        "uni-marburg.de",
        "badw.de",
        "uni-tuebingen.de",
        "hadw-bw.de",
        "bbaw.de",
        "awk.nrw.de",
        "haw.uni-heidelberg.de",
        "awk.nrw",
        "uni-muenchen.de",
    )

    for tpi in textplus_institutions:
        if tpi in url:
            return tpi  # return first institution found

    return ""  # if no institution matches => return empty string

def links2institutions(links):
    """Map each link to its Text+ partner institution domain.

    Links that belong to no partner institution are dropped. The result keeps
    the order of ``links`` and may contain duplicates.
    """
    matched = (url_is_textplus_institution(link) for link in links)
    return [domain for domain in matched if domain != ""]



class InputWidget:
    """A text field plus a button, displayed as soon as the object is created.

    Clicking the button prints the current content of the text field.
    """

    def __init__(self, value, placeholder, description, button_description):
        """Create both widgets, display them and register the click handler.

        Args:
            value: Initial content of the text field.
            placeholder: Placeholder text of the text field.
            description: Label of the text field.
            button_description: Caption of the button.
        """
        # Keep the constructor arguments available on the instance.
        self.value, self.placeholder = value, placeholder
        self.description, self.button_description = description, button_description

        self.text_field = widgets.Text(value=value, placeholder=placeholder, description=description)
        self.button = widgets.Button(description=button_description)

        display(self.text_field, self.button)

        # Only the button is wired up; changes of the text field are not observed.
        self.button.on_click(self.on_button_click)

    def on_button_click(self, b):
        """Click callback: print the current text field value (``b`` is unused)."""
        print("Final input:", self.text_field.value)


## Request all editions

### GET all editions

In [None]:
# URL that is queried for the list of editions.
api_url = "https://registry.text-plus.org/api/v1/e/edition/"

# NOTE(review): headers is defined here but not passed to send_get_request below.
headers = {"Content-Type": "application/json"}  # Adjust headers as needed


# send_get_request returns the parsed JSON body, or None when the request failed.
response_data = send_get_request(api_url)


if response_data is not None:
  # Number of entries in the "items" list of the response.
  print(len(response_data["items"]))

### Put into array / simplify response

In [None]:
# Flatten every item of the response into a small dict with the fields
# that the cells below work with.
editions = []

# A failed request leaves response_data as None; treat that as "no items"
# instead of crashing on the subscript.
items = response_data["items"] if response_data is not None else []

for item in items:
    properties = item["properties"]

    # Prefer the German main title, otherwise fall back to the first title.
    title = get_property_value_by_path(item, "$.properties.main_title[?(@['@lang']=='deu')]['@value']")
    if title is None:
        title = get_property_value_by_path(item, "$.properties.main_title[0]['@value']")
    # Collapse runs of whitespace; items without any main title get "".
    title_pretty = " ".join(title.split()) if title is not None else ""

    # Named edition_id so the builtin id() is not shadowed.
    edition_id = item["entityId"]

    valid = item["valid"]

    # Only the first value of each link entry is used.
    links = [link["link_value"][0] for link in properties.get("links", [])]

    institutions = links2institutions(links)

    if len(institutions) == 0:
        institutions = ""  # empty string if none was found
    else:
        institutions = list(dict.fromkeys(institutions))  # deduplicate list & keep order

    properties_count = len(properties)

    editions.append({
        "id": edition_id,
        "url": "https://registry.text-plus.org/doc/registry_edition/" + edition_id,
        "admin": "https://registry.text-plus.org/admin/entity/" + edition_id + "/",
        "title": title_pretty,
        "properties_count": properties_count,
        "valid": valid,
        "links": links,
        "institutions": institutions,
    })


print("Anzahl Editionen:")
print(len(editions))

## Display editions

In [None]:

# Map title -> number of properties for every edition that is NOT valid.
# Editions sharing the same title overwrite each other in this dict.
json_counts = {}

for edition in editions:
    if edition["valid"] == False:
        json_counts[edition["title"]] = edition["properties_count"]


# Prepare data for the plot
entries = list(json_counts.keys())
counts = list(json_counts.values())

# Create a larger figure to accommodate many entries
plt.figure(figsize=(20, 10))
plt.bar(entries, counts, color='skyblue')
plt.xlabel("Editions")
plt.ylabel("Number of Properties")
plt.title("Comparison of Properties Counts in Editions")
# default=0 keeps the cell working when there is no non-valid edition
# (max() of an empty list would raise ValueError).
plt.ylim(0, max(counts, default=0) + 1)
# Rotate x-axis labels by 90 degrees for readability
plt.xticks(rotation=90)
plt.tight_layout()  # Adjust layout so labels are not cut off
plt.show()


print(len(json_counts))


## DataFrame + Some Statistics

### Put it into a DataFrame

In [None]:


# Build a DataFrame from the list of edition dicts (one row per edition)
df = pd.DataFrame(editions)


### Choose filters etc.

In [None]:
# Keep only the rows whose "valid" column equals True (boolean mask).
# To look at the other rows instead, use: df[df["valid"].eq(False)]
filtered_df = df[df["valid"].eq(True)]

# Name of the column analysed by the statistics and histogram cells
attribute_name = "properties_count"

### Some basic statistics

In [None]:
# Column of the filtered DataFrame that is being summarised
col = filtered_df[attribute_name]

# Basic descriptive statistics of that column
minimum, maximum = col.min(), col.max()
average, median = col.mean(), col.median()

print("Minimum:", minimum)
print("Maximum:", maximum)
print("Average:", average)
print("Median:", median)

### Unique/distinct values (i.e. number of properties), sorted by frequency (descending)

In [None]:
# value_counts() counts how often each distinct value occurs; its index holds
# the distinct values themselves, which are converted to a plain list here.
unique_values_list = filtered_df[attribute_name].value_counts().index.tolist()
print(unique_values_list)


### Histogram Visualisation

In [None]:


# Histogram of the selected attribute over the filtered editions (30 bins requested)
fig = px.histogram(filtered_df, x=attribute_name, nbins=30, title='Interactive Histogram of Attribute Values')
fig.show()

## Show Editions with fewer than 10 properties

In [None]:
# Select the editions that have fewer than 10 properties
show_df = df[df["properties_count"] < 10]

In [None]:
# number of editions selected in show_df

num_rows = len(show_df.index)
print("Number of rows:", num_rows)

In [None]:
# print the editions in show_df with the link to their admin entry
# (only the first 10 and the last 10 rows, each row at most once)

shown_total = len(show_df)
for position, (_, row) in enumerate(show_df.iterrows()):
    # Use the running position instead of the index label: after filtering,
    # the labels of show_df are no longer 0..n-1.
    if position < 10 or position >= shown_total - 10:
        print(row["title"])
        print(row["admin"])
        print("---")


## Editions: not valid

In [None]:
# Rows whose "valid" column equals False
nonvalid_df = df[df["valid"].eq(False)]

In [None]:
# number of editions that are not valid
num_rows = len(nonvalid_df.index)
print("Number of rows:", num_rows)

In [None]:
# print the non-valid editions with the link to their admin entry
# (only the first 10 and the last 10 rows, each row at most once)

nonvalid_total = len(nonvalid_df)
for position, (_, row) in enumerate(nonvalid_df.iterrows()):
    # Use the running position instead of the index label: nonvalid_df keeps
    # the labels of df, so they are not 0..n-1.
    if position < 10 or position >= nonvalid_total - 10:
        print(row["title"])
        print(row["admin"])
        print("---")

## Download

In [None]:
# TODO: implement download / save to storage