In [1]:
def save_to_dict(record, dict, field_list):
    if not record is None:
        try:
            # Iterace skrz tuples v seznamu field_list
            for field_tags in field_list:
                # Nazev klice ve slovniku
                dict_key_name =  field_tags[0]

                # Tag pole
                tag =  field_tags[1]

                # Tag podpole
                subfield_tag =  field_tags[2]
                
                # Seznam do ktereho pridame hodnoty a nasledne pridame do slovniku
                dict_add_list = []
                
                # Iterace pres vsechna pole s tagem 'tag'
                for field in record.get_fields(tag):
                    
                    # Pokud pole nema zadna podpole, pridame cele pole do listu dict_add_list
                    if subfield_tag is None:
                        dict_add_list.append(str(field))
                    
                    # Pokud subtag je instance slice, tedy to znamena, ze chceme jen nejakou cast pole, ktera neni definovana subpolem,
                    # pridame cast pole do slovniku dict_add_list    
                    elif isinstance(subfield_tag, slice):
                        dict_add_list.append(str(field) [subfield_tag])     
                    
                    # Pokud pole obsahuje podpole, pridame do slovniku dict_add_list jen podpole
                    elif '$'+subfield_tag in str(field):  
                        dict_add_list.append(str(field[subfield_tag]))

                # Do klice z tuplu pridame cely seznam dict_add_list         
                dict[dict_key_name].append(dict_add_list)
        except Exception as error:
            print("Exception: " + type(error).__name__)  
            print("964 Field: " + str(record.get_fields('964')))  
            print("LDR: " + str(record.leader))   
    return dict 

In [2]:
import pandas as pd
import re 
from pymarc import MARCReader

# 'data/csv/ucla_B.csv'
# 'data/csv/ucla_ret.csv'
# 'data/csv/ucla_smz.csv'
# 'data/csv/ucla_int.csv'
# 'data/csv/ucla_cle.csv'
# 'data/csv/ucla_trl.csv'

# Cesta k marcovemu dokumentu
database = 'data/ucla/ucla_B.mrc'

# Z cesty vytahneme typ databaze
pattern = r"data/ucla/ucla_(.*?)\.mrc"

# Find the substring using regex
database_type = re.search(pattern, database).group(1)

out = 'data/csv/out_{}.csv'.format(database_type)

with open(database, 'rb') as data:
    reader = MARCReader(data)
    # Seznam poli, ktere si chceme ulozit
    field_list = [('title', '245', 'a'),
                ('author', '100', 'a'),
                ('author code', '100', '7'),
                # Rok je schovany v poli 008 na 13. az 16. miste, 
                # proto vyuzijeme funkci slice
                ('year', '008', slice(13,17, None)),
                ('figures', '600', 'a'),
                ('description', '650', 'a'),
                ('genre', '655', 'a'),
                ('magazine', '773', 't')]
    dict = {}
    for t in field_list:
        dict_key_name = t[0]
        dict[dict_key_name] = []
    for record in reader:
        dict = save_to_dict(record, dict, field_list)
    df = pd.DataFrame.from_dict(dict)

    # U jmen si chceme ulozit jmeno a prijmeni bez koncove carky ',', ktera je na konci stringu
    df['figures'] = df['figures'].apply(lambda x: [y[:y.rfind(',')] if isinstance(y, str) and len(y) > 0 else y for y in x]) 
    df['author'] = df['author'].apply(lambda x: [y[:y.rfind(',')] if isinstance(y, str) and len(y) > 0 else y for y in x])  

    # Aby se nam list hodnot lepe ukladal, vytvorime z listu jeden string a jednotlive elementy spojime strednikem ';' 
    for column in df.columns:
        df[column] = df[column].apply(lambda x: ';'.join(x))
    df.to_csv(out, encoding = 'utf8', sep = ",") 

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np

out = 'data/csv/out_{}.csv'.format(database_type)

# Cesta k nasim datum
csv_data = out

# Nacteni dat
df = pd.read_csv(csv_data, delimiter=',')
# Odstraneni zbytecneho sloupce
df = df.drop(['Unnamed: 0'], axis = 1)

for column in df.columns:
    if df[column].dtype != 'int64':   
        # Hodnoty spojene v jeden string zpatky rozdelime do listu, aby se nam s nim lepe pracovalo
        df[column] = df[column].apply(lambda x: x.split(';') if isinstance(x, str)  else [])


  df = pd.read_csv(csv_data, delimiter=',')


In [7]:
ind_Kundera = [True if any([True if author == 'Kundera, Milan' else False for author in author_list]) else False for author_list in df['figures']]
df_Kundera = df[ind_Kundera]


In [8]:
import re
from collections import Counter

def flatten_list(strings):
    flattened_list = []
    for item in strings:
        if isinstance(item, str):  # If item is a string, add it directly to the flattened list
            flattened_list.append(item)
        else:  # If item is a list, recursively flatten it
            flattened_list.extend(flatten_list(item))
    return flattened_list


In [14]:
all_genre = flatten_list(df_Kundera.genre)
counted_genre = Counter(all_genre)

ten_most_common_genre = [item[0] for item in counted_genre.most_common(10)]

# Count records per genre
genre_counts = {}

# Count total articles per year
total_counts = {}

# Iterate over each row in the DataFrame
for _, row in df_Kundera.iterrows():
    for year in row['year']: 
        print(year)
        genre_list = row['genre']

        # Increment the total count for the current year
        total_counts[year] = total_counts.get(year, 0) + 1
    
        # Iterate over each genre in the genres list for the current row
        for genre in genre_list:
            # Increment the count for the current genre and year
            genre_year = (genre, year)
            genre_counts[genre_year] = genre_counts.get(genre_year, 0) + 1

# Extract unique genres and years
genres = list(set([topic for topic, _ in genre_counts.keys()]))
years = list(set([year for _, year in genre_counts.keys()]))

# Sort the genres and years in ascending order
genres.sort()
years.sort()

2012
1957
2012
1957
1960
1959
2012
1953
2012
2012
1957
1960
2012
2012
1955
2012
1960
1957
2012
1957
2011
1957
1957
2012
2012
2012
2012
2012
2012
1957
2012
2012
2010
1992
2012
2009
2012
1953
1960
2012
1960
2012
1956
1959
1960
2012
1956
1956
1957
1955
1955
2012
2007
1958
1958
2013
2013
1953
2013
1960
1953
1953
1960
1953
2012
2013
1947
1956
1960
2013
1954
1957
2013
1954
2013
2013
2013
2013
1957
2012
1955
1960
2013
1957
1957
1960
1955
2011
2013
1955
1955
2013
2013
2013
1955
2013
1955
2013
2013
2013
2013
1953
2013
2013
2013
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1998
1998
1998
1998
1998
1998
1998
1997
1998
1998
1998
1998
1998
1998
1998
1998
1998
1998
1997
1998
1997
1998
1998
1998
1998
1998
1998
1998
1998
1998
1998
1998
1997


In [None]:
import matplotlib.pyplot as plt

# Create a line chart
plt1 = plt.figure(figsize=(10, 6))  # Set the figure size (optional)

# Plot the lines for each topic
for row in df_Kundera.iterrows():
    for figure in row['figure']:
        counts = [genre_counts.get((figure, year), 0) for year in years]
        plt1 = plt.plot(years, counts, label=figure)

# Add labels and title
plt1 = plt.xlabel('Year')
plt1 = plt.ylabel('Article Count')
plt1 = plt.title('Genre Popularity Over the Years')

# Add a legend
plt1 = plt.legend()  

# Display the chart
plt.show()


TypeError: unhashable type: 'Series'