## From cluster extraction to cartography

In [1]:
import pydeck as pdk
import pandas as pd
from shapely.wkt import loads
import numpy as np

### Load the cluster information

In [12]:
# NOTE(review): absolute, machine-specific path; point CORPUS_CSV at your own
# copy of the corpus export before running the notebook.
CORPUS_CSV = '/Users/carboni/Downloads/corpus-112_images_2023-09-21_1139.csv'
df = pd.read_csv(CORPUS_CSV, low_memory=False)

### Remove rows with missing values in the `wkt` and `City` columns

In [13]:
# Keep only the rows that have a value in every required column.
required_columns = ['City']
df = df.dropna(subset=required_columns)

In [14]:
# Cast the WKT column to text so that string operations can be applied to it.
wkt_as_text = df['wkt'].astype(str)
df['wkt'] = wkt_as_text

In [15]:
# Boolean mask: True for rows whose WKT text has no 'POINT(' substring.
not_contains_point = df['wkt'].map(lambda wkt_text: wkt_text.find('POINT(') == -1)

In [16]:
# Rows selected by the mask, i.e. those without a 'POINT(' WKT value.
result = df.loc[not_contains_point]

In [17]:
# Write the selected rows to disk for manual inspection (row index omitted).
result.to_csv(path_or_buf='nan_wkt.csv', index=False)

In [18]:
# Missing WKT values became the literal text 'nan' when the column was cast to
# str, so drop every row whose WKT contains that text.
# regex=False makes this a plain substring match.
# .copy() returns an independent frame, so the column assignments made further
# down do not operate on a slice of the previous frame (SettingWithCopyWarning).
df = df[~df['wkt'].str.contains('nan', regex=False)].copy()

### Count the number of images per city

In [9]:
#df = df.drop(columns=['manifest_url', 'canvas_number','notice'])

In [10]:
#df = df.drop(columns=['image_url'])

In [19]:
# Number of rows (images) for each (City, Country) pair, broadcast back onto
# every row of that pair.
# dropna=False keeps rows with a missing Country in a group of their own;
# by default groupby discards NaN keys and those rows would receive no count.
df['city_number'] = (
    df.groupby(['City', 'Country'], dropna=False)['City'].transform('count')
)

In [None]:
# too heavy
#df['Titles_in_City'] = df.groupby('City')['Title'].transform(lambda x: ','.join(x.unique()))

In [20]:
# Preview the first five rows, now including the per-city image count.
df.head(n=5)

Unnamed: 0,manifest_url,canvas_number,image_url,City,Country,Title,wkt,Date,Journal Type,notice,city_number
1,https://iiif.unige.ch/dhportal/ug8096214/manifest,0.0,https://iiif.unige.ch/iiif/2/fedora_ug8064419/...,Tokyo,Japan,FLIX,POINT(139.69222222222 35.689722222222),2010-06-01,Cinema,,5866
2,https://iiif.unige.ch/dhportal/ug8054362/manifest,0.0,https://iiif.unige.ch/iiif/2/fedora_ug8059238/...,Tokyo,Japan,FLIX,POINT(139.69222222222 35.689722222222),2013-08-01,Cinema,,5866
3,https://iiif.unige.ch/dhportal/ug8079451/manifest,0.0,https://iiif.unige.ch/iiif/2/fedora_ug8102867/...,Kyiv,Ukraine,KINO,POINT(30.523611111111 50.45),1933-05-10,Cinema,,404
4,https://iiif.unige.ch/dhportal/ug8043206/manifest,0.0,https://iiif.unige.ch/iiif/2/fedora_ug8068266/...,Moscow,Russia,Brigada khudozhnikov,POINT(37.617777777778 55.755833333333),,,,1359
5,https://iiif.unige.ch/dhportal/ug410612/manifest,0.0,https://iiif.unige.ch/iiif/2/visualcontagions/...,Hamburg,Germany,Der Spiegel,POINT(10 53.55),2020-10-26,,,6840


### Check whether some coordinates are given as decimal values instead of WKT

In [21]:
# Rows whose WKT text does not mention 'POINT' at all.
has_point = df['wkt'].str.contains('POINT')
df1 = df[~has_point]

In [22]:
# Show up to five of the non-POINT rows; an empty table means there are none.
df1.head(n=5)

Unnamed: 0,manifest_url,canvas_number,image_url,City,Country,Title,wkt,Date,Journal Type,notice,city_number


In [None]:
# Print a summary of df1 (the rows whose WKT text does not contain 'POINT').
df1.info()

#### If some coordinates are in decimal form, run the two cells below

In [None]:
def convert_decimal_to_wkt(value):
    """Convert a ``"lat,lon"`` decimal pair into a WKT ``POINT`` string.

    The input is read as latitude first, longitude second. WKT points are
    written as ``POINT(x y)`` and this notebook reads longitude from
    ``geom.x`` and latitude from ``geom.y``, so the result is emitted as
    ``POINT(lon lat)``.

    Parameters
    ----------
    value : object
        Candidate coordinate text such as ``"48.85, 2.35"``.

    Returns
    -------
    object
        ``"POINT(lon lat)"`` when ``value`` is a string made of exactly two
        comma-separated numbers within valid latitude/longitude ranges.
        Anything else is returned unchanged: existing WKT, other geometries,
        non-strings, NaN or out-of-range numbers.
    """
    if not isinstance(value, str):
        return value

    parts = value.split(',')
    if len(parts) != 2:
        return value

    lat_text, lon_text = (part.strip() for part in parts)
    try:
        lat, lon = float(lat_text), float(lon_text)
    except ValueError:
        # Not a pair of numbers (e.g. a two-vertex LINESTRING): leave as is.
        return value

    # Comparisons with NaN are False, so NaN and infinities are rejected too.
    if not (-90 <= lat <= 90 and -180 <= lon <= 180):
        return value

    # Reuse the original number text so no precision is lost to float repr.
    return f'POINT({lon_text} {lat_text})'

In [None]:
# Run the converter over every WKT value; it returns unconvertible values as is.
df['wkt'] = df['wkt'].map(convert_decimal_to_wkt)

### Transform WKT into geometry and extract coordinates

In [23]:
# Parse each WKT string into a shapely geometry object.
df['geometry'] = df['wkt'].map(loads)

In [24]:
# Split each point into its coordinates: y is the latitude, x the longitude.
points = df['geometry']
df['latitude'] = points.map(lambda point: point.y)
df['longitude'] = points.map(lambda point: point.x)

In [25]:
# Preview the first five rows with the parsed geometry and coordinates.
df.head(n=5)

Unnamed: 0,manifest_url,canvas_number,image_url,City,Country,Title,wkt,Date,Journal Type,notice,city_number,geometry,latitude,longitude
1,https://iiif.unige.ch/dhportal/ug8096214/manifest,0.0,https://iiif.unige.ch/iiif/2/fedora_ug8064419/...,Tokyo,Japan,FLIX,POINT(139.69222222222 35.689722222222),2010-06-01,Cinema,,5866,POINT (139.69222222222 35.689722222222),35.689722,139.692222
2,https://iiif.unige.ch/dhportal/ug8054362/manifest,0.0,https://iiif.unige.ch/iiif/2/fedora_ug8059238/...,Tokyo,Japan,FLIX,POINT(139.69222222222 35.689722222222),2013-08-01,Cinema,,5866,POINT (139.69222222222 35.689722222222),35.689722,139.692222
3,https://iiif.unige.ch/dhportal/ug8079451/manifest,0.0,https://iiif.unige.ch/iiif/2/fedora_ug8102867/...,Kyiv,Ukraine,KINO,POINT(30.523611111111 50.45),1933-05-10,Cinema,,404,POINT (30.523611111111 50.45),50.45,30.523611
4,https://iiif.unige.ch/dhportal/ug8043206/manifest,0.0,https://iiif.unige.ch/iiif/2/fedora_ug8068266/...,Moscow,Russia,Brigada khudozhnikov,POINT(37.617777777778 55.755833333333),,,,1359,POINT (37.617777777778 55.755833333333),55.755833,37.617778
5,https://iiif.unige.ch/dhportal/ug410612/manifest,0.0,https://iiif.unige.ch/iiif/2/visualcontagions/...,Hamburg,Germany,Der Spiegel,POINT(10 53.55),2020-10-26,,,6840,POINT (10 53.55),53.55,10.0


In [26]:
# One row per (City, Country) pair: keep the first occurrence of each.
city_keys = ['City', 'Country']
df_no_duplicates = df.drop_duplicates(subset=city_keys, keep='first')

In [28]:
# Remove the per-image columns, which are not used at city level.
per_image_columns = ['geometry', 'Title', 'wkt', 'Journal Type', 'Date']
df_no_duplicates = df_no_duplicates.drop(labels=per_image_columns, axis='columns')

In [30]:
# Replace the index left over from filtering with a fresh 0..n-1 index;
# drop=True discards the old index instead of keeping it as a column.
df_no_duplicates = df_no_duplicates.reset_index(drop=True)

In [31]:
# Preview the first five rows of the one-row-per-city table.
df_no_duplicates.head(n=5)

Unnamed: 0,manifest_url,canvas_number,image_url,City,Country,notice,city_number,latitude,longitude
0,https://iiif.unige.ch/dhportal/ug8096214/manifest,0.0,https://iiif.unige.ch/iiif/2/fedora_ug8064419/...,Tokyo,Japan,,5866,35.689722,139.692222
1,https://iiif.unige.ch/dhportal/ug8079451/manifest,0.0,https://iiif.unige.ch/iiif/2/fedora_ug8102867/...,Kyiv,Ukraine,,404,50.45,30.523611
2,https://iiif.unige.ch/dhportal/ug8043206/manifest,0.0,https://iiif.unige.ch/iiif/2/fedora_ug8068266/...,Moscow,Russia,,1359,55.755833,37.617778
3,https://iiif.unige.ch/dhportal/ug410612/manifest,0.0,https://iiif.unige.ch/iiif/2/visualcontagions/...,Hamburg,Germany,,6840,53.55,10.0
4,https://iiif.archivelab.org/iiif/MovieStarsPar...,,https://iiif.archivelab.org/iiif/MovieStarsPar...,New York City,United States of America,,405445,40.7,-74.0


In [33]:
# Natural log of the per-city image count, to compress the very wide range of
# counts. Note that a count of 1 maps to 0.
image_counts = df_no_duplicates['city_number']
df_no_duplicates['normalized_counts'] = np.log(image_counts)

In [34]:
# Preview the first five rows, now including the log-scaled counts.
df_no_duplicates.head(n=5)

Unnamed: 0,manifest_url,canvas_number,image_url,City,Country,notice,city_number,latitude,longitude,normalized_counts
0,https://iiif.unige.ch/dhportal/ug8096214/manifest,0.0,https://iiif.unige.ch/iiif/2/fedora_ug8064419/...,Tokyo,Japan,,5866,35.689722,139.692222,8.676928
1,https://iiif.unige.ch/dhportal/ug8079451/manifest,0.0,https://iiif.unige.ch/iiif/2/fedora_ug8102867/...,Kyiv,Ukraine,,404,50.45,30.523611,6.001415
2,https://iiif.unige.ch/dhportal/ug8043206/manifest,0.0,https://iiif.unige.ch/iiif/2/fedora_ug8068266/...,Moscow,Russia,,1359,55.755833,37.617778,7.214504
3,https://iiif.unige.ch/dhportal/ug410612/manifest,0.0,https://iiif.unige.ch/iiif/2/visualcontagions/...,Hamburg,Germany,,6840,53.55,10.0,8.830543
4,https://iiif.archivelab.org/iiif/MovieStarsPar...,,https://iiif.archivelab.org/iiif/MovieStarsPar...,New York City,United States of America,,405445,40.7,-74.0,12.912741


### Create and save the map

In [35]:
# One circle per city, sized by the log-scaled image count.
scatter_layer = pdk.Layer(
    'ScatterplotLayer',
    df_no_duplicates,
    opacity=0.6,
    get_position='[longitude, latitude]',
    get_radius='normalized_counts * 5000',
    # normalized_counts is log(count), which is 0 for a city with one image.
    # Without a minimum pixel radius those circles have radius 0 and can be
    # neither seen nor hovered.
    radius_min_pixels=2,
    get_fill_color=[255, 0, 0],  # Red
    pickable=True,  # required for the hover tooltip
    stroked=True,
    get_line_color=[255, 255, 255],  # White outline
)

In [36]:
# Centre the initial view on the mean latitude and longitude of the cities.
center_latitude = df_no_duplicates['latitude'].mean()
center_longitude = df_no_duplicates['longitude'].mean()
view_state = pdk.ViewState(
    latitude=center_latitude,
    longitude=center_longitude,
    zoom=3,
)

#### Here you can modify the content of the tooltip for the HTML map

In [37]:
# CSS applied to the tooltip box.
tooltip_style = {
    "background": "grey",
    "color": "white",
    "font-family": '"Helvetica Neue", Arial',
    "z-index": "10000",
}

# {city_number} and {City} are filled in from the columns of the hovered row.
tooltip = {
    "html": "<b>{city_number}</b> of images published in <b>{City}</b>",
    "style": tooltip_style,
}

In [38]:
# map_style can be set to 'light', 'dark', 'road', 'satellite',
# 'dark_no_labels' or 'light_no_labels'. Mapbox can also be used as provider.
# When changing the style, adjust the scatter_layer parameters to match
# (e.g. opacity).
deck = pdk.Deck(
    map_provider="carto",
    map_style="light",
    layers=[scatter_layer],
    initial_view_state=view_state,
    tooltip=tooltip,
)

In [41]:
# Export the map to map.html without opening a browser or rendering it inline.
deck.to_html(
    filename='map.html',
    open_browser=False,
    notebook_display=False,
    offline=True,
)

### Save the dataframe used for the map

In [40]:
# NOTE(review): this writes the full per-image `df`, not the de-duplicated
# `df_no_duplicates` that is passed to the map layer. Confirm this is intended.
df.to_csv(path_or_buf='df.csv', index=False)