In [36]:
import dhlab as dh
import requests
import geopy
import folium
import pandas as pd
from geopy.geocoders import Nominatim
import math

In [2]:
# extract geographic elements
def get_geographic(r, locator=None):
    """Extract title, city and county from a catalog response and geocode the place.

    Parameters
    ----------
    r : response-like
        Object whose ``json()`` method returns the catalog item. The fields are
        read from ``["metadata"]["title"]`` and ``["metadata"]["geographic"]``
        (keys ``"city"`` and ``"county"``).
    locator : object, optional
        Anything with a ``geocode(query)`` method. When omitted, the
        module-level ``geolocator`` is used.

    Returns
    -------
    tuple
        ``(title, city, county, loc)``. Text fields that are missing or not
        strings come back as ``""``. ``loc`` is the ``raw`` attribute of the
        geocoding result, or ``{}`` when there was nothing to look up, nothing
        was found, or the lookup failed.
    """
    def _text(container, key):
        # Best effort: a missing key or a non-string value becomes "".
        try:
            return container[key].strip()
        except (KeyError, IndexError, TypeError, AttributeError):
            return ""

    # Decode the response body once instead of once per field.
    try:
        metadata = r.json()["metadata"]
    except (ValueError, KeyError, IndexError, TypeError, AttributeError):
        metadata = {}

    title = _text(metadata, "title")
    try:
        geographic = metadata["geographic"]
    except (KeyError, IndexError, TypeError):
        geographic = {}
    city = _text(geographic, "city")
    county = _text(geographic, "county")

    # Only non-empty parts go into the query; with no place information at all
    # there is nothing to geocode, so the network round trip is skipped.
    query = ", ".join(part for part in (city, county) if part)
    if not query:
        return (title, city, county, {})

    if locator is None:
        locator = geolocator

    try:
        location = locator.geocode(query)
    except Exception:
        # Deliberately broad (but not bare, so Ctrl-C still works): a failed
        # lookup must not abort the surrounding loop over many newspapers.
        location = None

    # geocode() yields None when the place is unknown.
    loc = getattr(location, "raw", {})
    return (title, city, county, loc)

In [3]:
def create_record(id, metadata):
    """Flatten a newspaper id and its metadata tuple into one table row.

    ``metadata`` is a 4-item sequence: three leading values followed by a
    geocoding dict. The row is ``[id, <first three values>, place_id, lat,
    lon, display_name]``; when the geocoding dict is empty the last four
    entries are ``None``.
    """
    leading = metadata[:3]
    geo = metadata[3]
    if geo:
        geo_fields = [geo[key] for key in ("place_id", "lat", "lon", "display_name")]
    else:
        geo_fields = [None] * 4
    return [id, *leading, *geo_fields]

In [4]:
# Load the table of already-geocoded newspapers and keep its column order,
# which is reused when new rows are appended further down.
df = pd.read_excel("newspapers.xlsx")
columns = df.columns.tolist()

In [5]:
# create locator
# Module-level geocoding client; get_geographic() finds it through the global
# name `geolocator`, so this cell must run before any lookup is made.
# NOTE(review): the public Nominatim service presumably expects a descriptive,
# application-specific user agent and a low request rate — confirm against its
# usage policy before running large corpora.
geolocator = Nominatim(user_agent="newspaper")

# Create corpus, get metadata

In [15]:
# Search term: the corpus query, the counts, the aggregations and the map
# marker sizes below are all keyed on this value.
word = "trurøken"

In [16]:
# Build a corpus of at most 1000 "digavis" documents whose full text matches `word`.
# NOTE(review): order_by="random" presumably draws a different sample on each
# run, so the tables below may not reproduce exactly — confirm.
corpus = dh.Corpus(doctype="digavis", fulltext=word, order_by="random", limit=1000)

In [17]:
# Show the corpus as the cell output (one row of metadata per document).
corpus

Unnamed: 0,dhlabid,urn,title,authors,oaiid,sesamid,isbn10,city,timestamp,year,publisher,langs,subjects,ddc,genres,literaryform,doctype
0,200408431,URN:NBN:no-nb_digavis_den17demai_null_null_193...,den17demai,,,,,Oslo,19310702,1931,,,,,,,digavis
1,200409245,URN:NBN:no-nb_digavis_den17demai_null_null_193...,den17demai,,,,,Oslo,19340825,1934,,,,,,,digavis
2,201846617,URN:NBN:no-nb_digavis_valdres_null_null_199104...,valdres,,,,,Nord-Aurdal,19910419,1991,,,,,,,digavis
3,201838504,URN:NBN:no-nb_digavis_valdres_null_null_194104...,valdres,,,,,Nord-Aurdal,19410405,1941,,,,,,,digavis
4,201965835,URN:NBN:no-nb_digavis_agdertidend_null_null_19...,agdertidend,,,,,Kristiansand,19350606,1935,,,,,,,digavis
5,202879685,URN:NBN:no-nb_digavis_vesttelemarkblad_null_nu...,vesttelemarkblad,,,,,Kviteseid,20180518,2018,,,,,,,digavis
6,203913754,URN:NBN:no-nb_digavis_boblad_null_null_2005050...,boblad,,,,,,20050506,2005,,,,,,,digavis
7,203910700,URN:NBN:no-nb_digavis_firdaposten_null_null_19...,firdaposten,,,,,,19610920,1961,,,,,,,digavis
8,204177900,URN:NBN:no-nb_digavis_grenda_null_null_1961012...,grenda,,,,,,19610123,1961,,,,,,,digavis
9,204177821,URN:NBN:no-nb_digavis_grenda_null_null_1960031...,grenda,,,,,,19600310,1960,,,,,,,digavis


In [18]:
# check for newspapers
# Fetch and geocode every newspaper in the corpus that is not yet listed in `df`.
# New rows are collected in a list and appended with a single concat, rather
# than rebuilding `df` once per row inside the loop.
known_ids = set(df["id"])
new_rows = []
for urn in corpus.frame.urn:
    # The first three "_"-separated URN parts identify the newspaper.
    prefix = '_'.join(urn.split('_')[0:3])
    if prefix in known_ids:
        continue
    manifest_url = "https://api.nb.no/catalog/v1/items/%s" % (urn)
    # A timeout stops one stalled request from hanging the whole cell.
    r = requests.get(manifest_url, timeout=30)
    new_rows.append(create_record(id=prefix, metadata=get_geographic(r)))
    # Remember the id so later issues of the same paper are not fetched again.
    known_ids.add(prefix)
    print(manifest_url)

if new_rows:
    # ignore_index avoids the repeated 0 labels produced by per-row appends.
    df = pd.concat([df, pd.DataFrame(new_rows, columns=columns)], ignore_index=True)

In [19]:
# export back to Excel
# Overwrites the same "newspapers.xlsx" that was read above, so newspapers
# looked up in this run are already present the next time the notebook runs.
df.to_excel("newspapers.xlsx", index=False)

# Count the number of occurrences

In [20]:
# Count `word` across the corpus; the result exposes its table as `.frame`,
# which a later cell reshapes for merging.
counts = corpus.count(words=[word])

In [21]:
# Newspaper key: the first three "_"-separated parts of each document URN,
# the same prefix that is stored in the `id` column of the newspaper table.
corpus.frame["split_urn"] = corpus.frame["urn"].map(lambda urn: '_'.join(urn.split("_")[:3]))

In [22]:
# Attach the place metadata: each document's URN prefix is matched against
# the `id` column of the newspaper table.
corpus_metadata = pd.merge(corpus.frame, df, left_on="split_urn", right_on="id")

In [23]:
# Prepare the counts for merging: transpose the count table and turn its
# index into an ordinary column.
# NOTE: `counts` is rebound to the reshaped frame, so this cell cannot be
# re-run on its own; re-run the counting cell first.
counts = counts.frame.T.reset_index()

In [24]:
# Join the per-document counts onto the metadata (document id `dhlabid`
# matched against the `urn` column of the reshaped counts).
corpus_count_metadata = pd.merge(corpus_metadata, counts, left_on="dhlabid", right_on="urn")

In [39]:
# create a decade column
# Vectorised floor division gives the same decade as floor(year / 10) * 10
# without a Python call per row, and leaves a missing year as NaN instead of
# raising.
corpus_count_metadata["decade"] = corpus_count_metadata["year"] // 10 * 10

# Total occurrences of `word` per decade, largest first.
(
    corpus_count_metadata
    .groupby(by=["decade"])[[word]]
    .sum()
    .sort_values(by=word, ascending=False)
)

Unnamed: 0_level_0,tøysut
decade,Unnamed: 1_level_1
1930,3.0
1960,3.0
1940,1.0
1990,1.0
2000,1.0
2010,1.0


In [25]:
# Total occurrences of `word` per county, largest first.
(
    corpus_count_metadata
    .groupby("county")[[word]]
    .sum()
    .sort_values(word, ascending=False)
)

Unnamed: 0_level_0,tøysut
county,Unnamed: 1_level_1
Vestland,3.0
Oslo,2.0
Innlandet,2.0
Telemark,2.0
Agder,1.0


# Draw map

In [None]:
# One row per geocoded place, with the summed count of `word` for that place.
place_keys = ["place_id", "place", "county", "long", "lat"]
map_data = corpus_count_metadata.groupby(place_keys)[[word]].sum().reset_index()

In [None]:
# Coerce the 'lat', 'long' and count columns to numbers (unparseable values
# become NaN), then drop every row that is missing any of them.
numeric_columns = ['lat', 'long', word]
for column in numeric_columns:
    map_data[column] = pd.to_numeric(map_data[column], errors='coerce')
map_data.dropna(subset=numeric_columns, inplace=True)

In [None]:
# Centre the map on the mean coordinate of all plotted places.
map_center = [map_data['lat'].mean(), map_data['long'].mean()]
m = folium.Map(location=map_center, zoom_start=6)

In [None]:
# Draw one blue circle per place; the radius is half the number of hits
# (change the divisor to rescale the circles).
for lat, lon, hits in zip(map_data['lat'], map_data['long'], map_data[word]):
    marker = folium.CircleMarker(
        location=[lat, lon],
        radius=hits / 2,
        color='blue',
        fill=True,
        fill_color='blue',
    )
    marker.add_to(m)

In [None]:
# Render the map as the cell output.
m