In [None]:
import dhlab as dh
import requests
import geopy
import folium
import pandas as pd
from geopy.geocoders import Nominatim
import math

In [None]:
# extract geographic elements
def get_geographic(r, geocoder=None):
    """Extract title, city and county from a catalogue response and geocode the place.

    Parameters
    ----------
    r : response-like object
        Anything with a ``json()`` method returning the decoded payload. The
        fields are read from ``payload["metadata"]`` (``title``) and
        ``payload["metadata"]["geographic"]`` (``city``, ``county``).
    geocoder : object with a ``geocode(query)`` method, optional
        Geocoder to use. Defaults to the module-level ``geolocator``.

    Returns
    -------
    tuple
        ``(title, city, county, loc)``. Missing or non-string fields become
        ``""``; ``loc`` is the geocoder result's ``raw`` dict, or ``{}`` when
        there is nothing to look up or the lookup fails.
    """
    # Decode the payload once; any failure here simply means "no metadata".
    try:
        metadata = r.json()["metadata"]
    except Exception:
        metadata = {}
    if not isinstance(metadata, dict):
        metadata = {}

    geographic = metadata.get("geographic")
    if not isinstance(geographic, dict):
        geographic = {}

    def _text(value):
        # Only strings can be stripped; everything else counts as missing.
        return value.strip() if isinstance(value, str) else ""

    title = _text(metadata.get("title"))
    city = _text(geographic.get("city"))
    county = _text(geographic.get("county"))

    loc = {}
    # Build the query from the parts that are actually known, and skip the
    # remote lookup entirely when neither city nor county is available.
    query = ", ".join(part for part in (city, county) if part)
    if query:
        if geocoder is None:
            # Resolved outside the try block on purpose: a missing global
            # geolocator should surface as an error, not as empty locations.
            geocoder = geolocator
        try:
            location = geocoder.geocode(query)
        except Exception:
            # Best effort: a geocoding failure leaves the location empty.
            location = None
        if location is not None:
            loc = getattr(location, "raw", None) or {}

    return (title, city, county, loc)

In [None]:
def create_record(id, metadata):
    """Flatten a newspaper id and its metadata tuple into one table row.

    ``metadata`` is indexed as ``(title, city, county, loc)``. The row holds
    the id, the first three metadata items, and then ``place_id``, ``lat``,
    ``lon`` and ``display_name`` taken from ``loc`` — or four ``None`` values
    when ``loc`` is empty.
    """
    geo_keys = ("place_id", "lat", "lon", "display_name")
    leading = metadata[:3]
    location = metadata[3]
    if location:
        geo_fields = [location[key] for key in geo_keys]
    else:
        geo_fields = [None] * len(geo_keys)
    return [id, *leading, *geo_fields]

In [None]:
# read newspaper frame
# Load the workbook of newspapers that are already known; its column labels
# are kept so that newly looked-up rows can be built with the same layout.
df = pd.read_excel("newspapers.xlsx")
columns = df.columns.tolist()

In [None]:
# create locator
# get_geographic() reads this module-level `geolocator`, so this cell has to
# run before any newspaper metadata is looked up.
geolocator = Nominatim(user_agent="newspaper")

# Create corpus, get metadata

In [None]:
# Search term: used for the full-text corpus query, for the per-document
# counts, and as the name of the count column in the later aggregations.
word = "trurøken"

In [None]:
# Build a corpus of at most 1000 "digavis" documents whose full text matches `word`.
# NOTE(review): order_by="random" presumably yields a different sample on each
# run, so the numbers below may not be reproducible — confirm.
corpus = dh.Corpus(doctype="digavis", fulltext=word, order_by="random", limit=1000)

In [None]:
# Show the corpus object as the cell output for a quick visual check.
corpus

In [None]:
# check for newspapers
# A newspaper is identified by the first three "_"-separated segments of a URN.
# Every newspaper not yet present in `df` is looked up once in the catalogue
# API, geocoded, and appended to `df`.
known_ids = set(df["id"].values)  # O(1) membership instead of scanning the column per URN
new_records = []
for urn in list(corpus.frame.urn):
    prefix = '_'.join(urn.split('_')[0:3])
    if prefix in known_ids:
        continue

    manifest_url = "https://api.nb.no/catalog/v1/items/%s" % (urn)
    try:
        # Without a timeout a stalled connection would hang the notebook forever.
        r = requests.get(manifest_url, timeout=30)
        r.raise_for_status()
    except requests.RequestException as error:
        # Do not record a blank row for a failed request: it would be written
        # to the workbook and the newspaper would never be looked up again.
        print("skipping %s: %s" % (manifest_url, error))
        continue

    new_records.append(create_record(id=prefix, metadata=get_geographic(r)))
    known_ids.add(prefix)
    print(manifest_url)

# Append all new rows in one go; concatenating inside the loop copies the
# whole frame on every iteration and leaves duplicate index labels.
if new_records:
    df = pd.concat([df, pd.DataFrame(new_records, columns=columns)], ignore_index=True)

In [None]:
# export back to Excel
# Overwrites the workbook that was read earlier, so newspapers looked up in
# this run are already known next time; index=False keeps the DataFrame index
# out of the sheet.
df.to_excel("newspapers.xlsx", index=False)

# Count the number of occurrences

In [None]:
# Count `word` in the corpus documents. The result object exposes its table as
# `.frame`, which a later cell reshapes.
counts = corpus.count(words=[word])

In [None]:
# Derive the newspaper identifier (first three "_"-separated URN segments) so
# the corpus rows can be joined to the newspaper table.
corpus.frame["split_urn"] = corpus.frame["urn"].map(lambda urn: "_".join(urn.split("_")[:3]))

In [None]:
# merge with metadata
# merge() defaults to an inner join, so corpus rows whose newspaper id is not
# present in `df` are dropped here.
corpus_metadata = corpus.frame.merge(df, left_on="split_urn", right_on="id")

In [None]:
# merge with counts
# Reshape the counts table: transpose it and turn the resulting index into a
# regular column so it can be used as a merge key.
# This cell rebinds `counts` from the count result object to a DataFrame, so
# the conversion is guarded; without the guard, re-running the cell would fail
# on the already-converted frame.
if not isinstance(counts, pd.DataFrame):
    counts = counts.frame.transpose().reset_index()

In [None]:
# Attach the per-document counts: the "dhlabid" column of the corpus side is
# matched against the "urn" column of the reshaped counts (inner join by default).
corpus_count_metadata = corpus_metadata.merge(counts, left_on="dhlabid", right_on="urn")

In [None]:
# create a decade column
# Round each year down to the start of its decade (e.g. 1947 -> 1940), then
# total the counts of `word` per decade.
corpus_count_metadata["decade"] = corpus_count_metadata["year"].map(
    lambda year: 10 * math.floor(year / 10)
)
decade_sum = (
    corpus_count_metadata
    .groupby(by=["decade"])[[word]]
    .sum()
)

In [None]:
# Bar chart of the summed counts per decade; rot=0 keeps the x tick labels horizontal.
decade_sum.plot(kind="bar", rot=0)

In [None]:
# Total count of `word` per county, largest first (shown as the cell output).
(
    corpus_count_metadata
    .groupby(by=["county"])[[word]]
    .sum()
    .sort_values(by=word, ascending=False)
)

# Draw map

In [None]:
# Sum the counts per geocoded place. groupby() drops rows with a missing key
# by default, so newspapers without a geocoding result do not reach the map.
# NOTE(review): the column names used here ("place", "long", ...) come from the
# header of newspapers.xlsx — confirm they line up with the field order that
# create_record() produces.
map_data = corpus_count_metadata.groupby(by=["place_id", "place", "county", "long", "lat"])[[word]].sum().reset_index()

In [None]:
# Make sure the coordinate columns ('lat', 'long') and the count column are
# numeric; values that cannot be parsed become NaN and those rows are dropped.
for numeric_column in ('lat', 'long', word):
    map_data[numeric_column] = pd.to_numeric(map_data[numeric_column], errors='coerce')
map_data = map_data.dropna(subset=['lat', 'long', word])

In [None]:
# Centre the map on the mean coordinate of all places.
# NOTE(review): if map_data is empty the means are NaN — confirm how folium
# behaves in that case.
m = folium.Map(location=[map_data['lat'].mean(), map_data['long'].mean()], zoom_start=6)

In [None]:
# Add markers to the map
# One blue circle per place, with a radius proportional to the summed count.
for place in map_data.to_dict("records"):
    marker = folium.CircleMarker(
        location=[place['lat'], place['long']],
        radius=place[word] / 2,  # Adjust the divisor to control the size
        color='blue',
        fill=True,
        fill_color='blue',
    )
    marker.add_to(m)

In [None]:
# Render the map as the cell output.
m