Funkce, ktera ulozi hodnoty z MARC zaznamu do slovniku (dictionary)

In [1]:
def save_to_dict(record, dict, field_list):
    """Append the values selected by ``field_list`` from one record to ``dict``.

    Parameters
    ----------
    record
        Object exposing ``get_fields(tag)`` and ``leader`` (the notebook
        passes records produced by ``MARCReader``), or ``None``.
    dict
        Mapping from key name to a list. For every entry of ``field_list``
        one list of strings is appended under the entry's key. (The name
        shadows the builtin; it is kept so keyword callers keep working.)
    field_list
        Iterable of ``(dict_key_name, tag, subfield_tag)`` tuples, where
        ``subfield_tag`` is ``None`` (store the whole field as text), a
        ``slice`` (store that part of the field's text) or a subfield code.

    Returns
    -------
    The same ``dict`` object that was passed in.

    Notes
    -----
    Values for a record are collected first and appended only after every
    entry of ``field_list`` has been processed. If anything fails partway
    through, nothing is appended for that record, so all lists in ``dict``
    keep the same length. Errors are reported with ``print`` and swallowed.
    """
    if record is None:
        return dict

    try:
        # (target list, values) pairs waiting to be committed for this record
        pending = []

        for field_tags in field_list:
            # Indexing via a slice keeps tuples longer than three items working
            dict_key_name, tag, subfield_tag = field_tags[:3]

            # Look the target up now so a missing key aborts before any append
            target = dict[dict_key_name]

            dict_add_list = []
            for field in record.get_fields(tag):
                field_text = str(field)

                if subfield_tag is None:
                    # No subfield requested: keep the whole field
                    dict_add_list.append(field_text)
                elif isinstance(subfield_tag, slice):
                    # Only a positional part of the field text is wanted
                    dict_add_list.append(field_text[subfield_tag])
                elif '$' + subfield_tag in field_text:
                    # The subfield is present in the field: keep its value
                    dict_add_list.append(str(field[subfield_tag]))

            pending.append((target, dict_add_list))

        # Commit: every key receives exactly one list for this record
        for target, values in pending:
            target.append(values)
    except Exception as error:
        print("Exception: " + type(error).__name__)
        print("964 Field: " + str(record.get_fields('964')))
        print("LDR: " + str(record.leader))
    return dict

## Ulozeni dat

In [2]:
import pandas as pd
from pymarc import MARCReader

# 'data/csv/ucla_B.csv'
# 'data/csv/ucla_ret.csv'
# 'data/csv/ucla_smz.csv'
# 'data/csv/ucla_int.csv'
# 'data/csv/ucla_cle.csv'
# 'data/csv/ucla_trl.csv'

# Path to the MARC file to process
database = 'data/ucla/ucla_cle.mrc'

# Destination of the CSV export.
# NOTE(review): a later cell reads 'data/csv/out_cle.csv' - confirm which path is intended.
out = 'data/out.csv'

# Fields to store: (dictionary key, field tag, subfield code or slice)
field_list = [('title', '245', 'a'),
              ('author', '100', 'a'),
              ('author code', '100', '7'),
              # The year sits at positions 13-16 of the text of field 008,
              # so it is taken with a slice instead of a subfield code
              ('year', '008', slice(13, 17, None)),
              ('figures', '600', 'a'),
              ('description', '650', 'a'),
              ('genre', '655', 'a'),
              ('magazine', '773', 't')]


def _strip_trailing_comma(value):
    """Return ``value`` without one trailing comma; anything else passes through unchanged."""
    if isinstance(value, str) and value.endswith(','):
        return value[:-1]
    return value


# One empty list per key; save_to_dict appends one list of values per record.
# (Deliberately not called `dict`, which would shadow the builtin in the kernel.)
field_values = {field_tags[0]: [] for field_tags in field_list}

with open(database, 'rb') as data:
    reader = MARCReader(data)
    for record in reader:
        field_values = save_to_dict(record, field_values, field_list)

df = pd.DataFrame.from_dict(field_values)

# Names are stored without the trailing comma. Only a comma at the very end is
# removed: slicing at rfind(',') would drop the last character of names that
# contain no comma and cut names at an inner comma.
for name_column in ('figures', 'author'):
    df[name_column] = df[name_column].apply(
        lambda names: [_strip_trailing_comma(name) for name in names])

# Lists are awkward to store in CSV, so each one is joined into a single
# ';'-separated string
for column in df.columns:
    df[column] = df[column].apply(';'.join)
df.to_csv(out, encoding='utf8', sep=",")

In [2]:
import pandas as pd

# Location of the CSV export to analyse
out = 'data/csv/out_cle.csv'
csv_data = out

# Load the export and discard the 'Unnamed: 0' column
# (presumably the index written by an earlier to_csv call - confirm)
df = pd.read_csv(csv_data, delimiter=',').drop(['Unnamed: 0'], axis=1)

# Turn the ';'-joined strings back into lists; int64 columns are left as they
# are, and any non-string cell in the remaining columns becomes an empty list
for column in df.columns:
    if df[column].dtype == 'int64':
        continue
    df[column] = df[column].apply(
        lambda cell: cell.split(';') if isinstance(cell, str) else [])


In [3]:
def flatten_list(strings):
    """Flatten arbitrarily nested iterables of strings into one flat list.

    Strings are kept whole; every other element is treated as an iterable
    and walked recursively, preserving the original left-to-right order.
    """
    def _walk(items):
        for element in items:
            if isinstance(element, str):
                # A string is a leaf, not something to iterate over
                yield element
            else:
                yield from _walk(element)

    return list(_walk(strings))

## Cisteni dat

In [4]:
import re
from collections import Counter
import numpy as np
 
# Regex capturing the text between the first pair of parentheses
pattern_cities = r"\((.*?)\)"

# Clean-up rules as (substring to look for, replacement). They are checked in
# this order and the first match wins. A list replacement splits one entry
# into several cities.
# NOTE(review): the recorded output lists both 'London' and 'Londýn' as
# separate entries - confirm whether they should be merged.
city_rules = [
    # e.g. "London, Index on Censorship" -> "London"
    ('London', 'London'),
    # drop the 'Obstladen' part of the Winterthur entries
    ('Obstladen', 'Winterthur'),
    ('New York-Paříž', ['New York', 'Paříž']),
    ('Ženeva-Middlesex-Mnichov', ['Ženeva', 'Middlesex', 'Mnichov']),
    ('Köln-Ehrenfeld', 'Kolín nad Rýnem'),
]


def _normalize_city(raw_city):
    """Return the replacement of the first matching rule, or the input unchanged."""
    for needle, replacement in city_rules:
        if needle in raw_city:
            return replacement
    return raw_city


# One regex search per magazine title, walked in a single pass; titles without
# a parenthesised part are skipped
city_matches = (
    re.search(pattern_cities, title)
    for titles in df['magazine']
    for title in titles
)
cities = flatten_list(
    [_normalize_city(match.group(1)) for match in city_matches if match is not None]
)

print(Counter(cities))

Counter({'Mnichov': 2197, 'New York': 2090, 'Řím': 1484, 'Paříž': 928, 'Winterthur': 511, 'Hamburg': 428, 'Curych': 273, 'Toronto': 196, 'Rotterdam': 185, 'Wuppertal': 135, 'Kolín nad Rýnem': 120, 'Vídeň': 68, 'St. Gallen': 65, 'Ženeva': 53, 'Middlesex': 53, 'Londýn': 48, 'Edmonton': 39, 'Scheinfeld': 36, 'Norman, Oklahoma': 22, 'London': 17})


In [7]:
# Count the occurrences of every city and turn the counts into a table with
# one row per city
cities_number_of_records = Counter(cities)
counts_by_city = pd.DataFrame.from_dict(cities_number_of_records, orient='index')
cities_df = counts_by_city.reset_index()
cities_df.columns = ['city', 'number of records']

print(cities_df)

                city  number of records
0             London                 17
1         Scheinfeld                 36
2            Mnichov               2197
3    Kolín nad Rýnem                120
4           New York               2090
5                Řím               1484
6             Curych                273
7         St. Gallen                 65
8              Vídeň                 68
9          Wuppertal                135
10            Londýn                 48
11           Hamburg                428
12  Norman, Oklahoma                 22
13            Ženeva                 53
14         Middlesex                 53
15             Paříž                928
16          Edmonton                 39
17           Toronto                196
18        Winterthur                511
19         Rotterdam                185


In [7]:
import requests

def get_city_coordinates(city):
    """Look up the coordinates of ``city`` with the OpenCage geocoding API.

    Parameters
    ----------
    city : str
        Free-text place name used as the query.

    Returns
    -------
    tuple or None
        ``(lat, lon)`` of the first result, or ``None`` when the request
        fails, the response is not valid JSON, or no result is found. A
        message is printed in each of those cases.
    """
    import os

    # Prefer a key from the environment so the secret does not live in the
    # notebook; the placeholder keeps the previous behaviour when it is unset
    api_key = os.environ.get("OPENCAGE_API_KEY", "MY KEY")
    url = "https://api.opencagedata.com/geocode/v1/json"

    try:
        # `params` URL-encodes the query (spaces, commas, diacritics);
        # the timeout keeps a stalled connection from hanging the notebook
        response = requests.get(url, params={"q": city, "key": api_key}, timeout=10)
    except requests.RequestException:
        print("Error occurred while fetching data.")
        return None

    # Check the status first: an error response may not carry a JSON body
    if response.status_code != 200:
        print("Error occurred while fetching data.")
        return None

    try:
        data = response.json()
    except ValueError:
        print("Error occurred while fetching data.")
        return None

    results = data.get("results") or []
    if data.get("total_results", 0) > 0 and results:
        geometry = results[0]["geometry"]
        return geometry["lat"], geometry["lng"]

    print("No results found for the city.")
    return None

In [8]:
# Geocode every distinct city once
unique_cities = set(cities)
coordinates = {}
for city in unique_cities:
    try:
        result = get_city_coordinates(city)
    except Exception:
        # Best effort: one failing lookup must not abort the whole loop.
        # Unlike a bare `except`, KeyboardInterrupt still stops the cell.
        result = None

    if result is None:
        print(f"City {city} not found.")
        continue

    latitude, longitude = result
    print(f"Coordinates of {city}: Latitude={latitude}, Longitude={longitude}")
    coordinates[city] = (latitude, longitude)
    cities_df.loc[cities_df['city'] == city, 'latitude'] = latitude
    cities_df.loc[cities_df['city'] == city, 'longitude'] = longitude

# Stored with one column per city; row 0 holds the latitude, row 1 the longitude
df_coordinates = pd.DataFrame.from_dict(coordinates)
df_coordinates.to_csv('data/coordinates.csv')

Coordinates of Kolín nad Rýnem: Latitude=50.938361, Longitude=6.959974
Coordinates of Ženeva: Latitude=46.2017559, Longitude=6.1466014
Coordinates of Mnichov: Latitude=48.1371079, Longitude=11.5753822
Coordinates of Rotterdam: Latitude=51.9244424, Longitude=4.47775
Coordinates of St. Gallen: Latitude=47.425618, Longitude=9.3762397
Coordinates of Middlesex: Latitude=51.5460558, Longitude=-0.2537791
Coordinates of Toronto: Latitude=43.6534817, Longitude=-79.3839347
Coordinates of Londýn: Latitude=51.5073359, Longitude=-0.12765
Coordinates of Edmonton: Latitude=53.5462055, Longitude=-113.491241
Coordinates of Winterthur: Latitude=47.4991723, Longitude=8.7291498
Coordinates of Curych: Latitude=47.3744489, Longitude=8.5410422
Coordinates of London: Latitude=51.5073359, Longitude=-0.12765
Coordinates of Paříž: Latitude=48.8588897, Longitude=2.320041
Coordinates of Wuppertal: Latitude=51.264018, Longitude=7.1780374
Coordinates of Hamburg: Latitude=53.550341, Longitude=10.000654
Coordinates of

In [13]:
# The file holds one column per city with the latitude in row 0 and the
# longitude in row 1. The first CSV column is the saved row index; reading it
# as the index keeps it from turning into a bogus 'Unnamed: 0' row after the
# transpose.
coordinates = pd.read_csv('data/coordinates.csv', index_col=0)
coordinates = coordinates.T
coordinates.columns = ['latitude', 'longitude']
coordinates['city'] = coordinates.index

# Join on the city name only. Any latitude/longitude columns already present
# in cities_df are dropped first, so the join neither compares floats for
# equality nor produces suffixed duplicate columns.
city_counts = cities_df.drop(columns=['latitude', 'longitude'], errors='ignore')
df = pd.merge(city_counts, coordinates, on='city')

In [21]:
%pip install altair
%pip install vl-convert-python
%pip install altair_saver
import altair as alt
alt.data_transformers.disable_max_rows()

url = "https://raw.githubusercontent.com/deldersveld/topojson/master/world-continents.json"
source = alt.topo_feature(url, "continent")


base = alt.Chart(source).mark_geoshape(
    fill='lightgray',
    stroke='white'
).project('mercator').properties(
    width=800,
    height=600
)

points = alt.Chart(df).mark_circle().encode(
    longitude='longitude:Q',
    latitude='latitude:Q',
    size=alt.Size('number of records:Q', title='pocet clanku'),
    tooltip='city:N',
    color=alt.value('red')
)
base+points
# to_save = base+points
# to_save.save('plots/map.png')

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1.2 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1.2 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1.2 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip
