# Text analysis on book titles of the Italian-published ancient books colleted in Leuven

Since the books in the dataset are of different languages, this analysis covers book titles of the two majority of book languages, Latin (over 76%) and Italian (over 20%). 

There are two outputs generated for each language:
1. Frequency list of book titles (visualised in Tableau Public)
2. Collocation bigram list of book titles

## 0. Common preparatory work

In [1]:
# impotrt pandas for reading the tabular data
import pandas as pd 

# import language processing modules for tokenization
# NLTK module for processing Italian texts
from nltk.tokenize import sent_tokenize, word_tokenize
# CLTK(The Classical Language Toolkit)) for processing Latin texts
from cltk.tokenize.word import WordTokenizer
word_tokenizerlat = WordTokenizer('latin')

In [2]:
# generate a data frame of the dataset
df = pd.read_csv(
    "C:/Users/dawn/Desktop/update_final-version-Italy-1501-1600-xml-csv-csv.csv")

In [3]:
# exclude the records wiht "NaN" value in "Title" column
mask = [isinstance(item, (str, bytes)) for item in df['245 $a - Title (NR)']]
df = df.loc[mask]

## 1. Text analysis of book titles in Latin

In [4]:
# extract records with Latin as book language
dflat = df.loc[df['041 $a - Language code of text/sound track or separate title (R)'] == 'Latin' ]

# extract the values of book titles in Latin from the table
titletextlat = dflat['245 $a - Title (NR)']
titletextlat

3       Pavli Iovii ... De vita Leonis Decimi Pont. Ma...
15                             De Christi passione oratio
16      Psalmorvm omnivm ivxta Hebraicam ueritatem par...
17      Svccinctissima & quantum phrasis Hebraica perm...
18      Nicolai Clenardi Institvtiones in Græcam lingv...
                              ...                        
2803    Volvmen praeclarissimvm ac in primis omnibvs i...
2805    Secunda secunde Sancti Thome : perdoctis ... T...
2807    Tractatvs de testibvs probandis vel reprobandi...
2809    Volvmen praeclarissimvm ac in primis omnibvs i...
2810    Andreas Tiraqvellvs De poenis legvm, ac consve...
Name: 245 $a - Title (NR), Length: 894, dtype: object

In [5]:
# apply word tokenization function of CLTK to the Latin book titles 
titletextlat.apply(word_tokenizerlat.tokenize)

# exclude the punctuations in the list of Latin titles
import string
wordlat = [line.translate(str.maketrans('', '', string.punctuation)).split() for line in titletextlat]

# set list of Latin stop words to be excluded from the tokenlist
# Latin stop list referring to: https://www.perseus.tufts.edu/hopper/stopwords
stop_words_string = '''ab, ac, ad, adhic, aliqui, aliquis, an, ante, apud, at, atque, aut, autem, cum, cur, de, deinde, dum, ego, enim, ergo, es, est, et, etiam, etsi, ex, fio, haud, hic, iam, idem, igitur, ille, in, infra, inter, interim, ipse, is, ita, magis, modo, mox, nam, ne, nec, necque, neque, nisi, non, nos, o, ob, per, possum, post, pro, quae, quam, quare, qui, quia, quicumque, quidem, quilibet, quis, quisnam, quisquam, quisque, quisquis, quo, quoniam, sed, si, sic, sive, sub, sui, sum, super, suus, tam, tamen, trans, tu, tum, ubi, uel, uero, unus, ut'''
stop_words = stop_words_string.split(', ')

# append words of book titles into a tokenlist
tokenlistlat = []
for group in wordlat:
    for token in group:
        #set the condition excluding words with one or two characters, which are to short for analysation
        if not token.lower() in stop_words and len(token.lower()) > 2 :
            tokenlistlat.append(token.lower())

tokenlistlat

['pavli',
 'iovii',
 'vita',
 'leonis',
 'decimi',
 'pont',
 'max',
 'libri',
 'qvatvor',
 'christi',
 'passione',
 'oratio',
 'psalmorvm',
 'omnivm',
 'ivxta',
 'hebraicam',
 'ueritatem',
 'paraphrastica',
 'interpretatio',
 'svccinctissima',
 'quantum',
 'phrasis',
 'hebraica',
 'permittit',
 'litteram',
 'proxime',
 'accedens',
 'paraphrasis',
 'concionem',
 'salomonis',
 'ecclesiastæ',
 'nicolai',
 'clenardi',
 'institvtiones',
 'græcam',
 'lingvam',
 'eiusdem',
 'meditationes',
 'græcanicæ',
 'artem',
 'grammaticam',
 'admiranda',
 'vere',
 'admiranda',
 'magnitvdine',
 'vrbis',
 'ecclesiae',
 'romanae',
 'andreae',
 'vesalii',
 'bruxellensis',
 'epistola',
 'rationem',
 'modumque',
 'propinandi',
 'radicis',
 'chymæ',
 'decocti',
 'pertractans',
 'praeter',
 'alia',
 'qvaedam',
 'epistolæ',
 'cuiusdam',
 'jacobum',
 'syluium',
 'sententiam',
 'recensens',
 'epistole',
 'caroli',
 'arte',
 'epistolandi',
 'vite',
 'leon',
 'decimo',
 'dadriano',
 'sesto',
 'del',
 'cardinal',
 'po

### 1-1. Generate frequency list of book titles in Latin

In [6]:
# append the value of token frequency into the frequency list 
freqlistlat = {}
for token in tokenlistlat:
    if token not in freqlistlat:
        freqlistlat[token] = 0
    freqlistlat[token] += 1

# sort the top 10 words with highest frequency
sorted(freqlistlat.items(), key=lambda x:x[1], reverse = True)[:11]

[('libri', 107),
 ('libros', 60),
 ('eiusdem', 54),
 ('liber', 53),
 ('aristotelis', 52),
 ('tres', 43),
 ('ioannis', 42),
 ('quibus', 39),
 ('francisci', 39),
 ('omnia', 36),
 ('petri', 31)]

In [7]:
# export the frequency list of Latin titles as csv file for visulisation in Tableau Public
import csv

with open('latinfreq.csv', 'w', encoding="utf-8") as f:
    for key in freqlistlat.keys():
        f.write("%s,%s\n"%(key,freqlistlat[key]))

### 1-2. Generate collocation bigram of book titles in Latin

In [8]:
# generate a collocation bigram with functions in NLTK
import nltk
from nltk.collocations import BigramCollocationFinder
finderlat = BigramCollocationFinder.from_words(tokenlistlat)
bigram_measures = nltk.collocations.BigramAssocMeasures()

# find the top 10 collocated words in Latin titles
finderlat.nbest(bigram_measures.likelihood_ratio, 10)

[('libri', 'tres'),
 ('tomus', 'primus'),
 ('pont', 'max'),
 ('societatis', 'iesv'),
 ('onvphrii', 'panvinii'),
 ('dvns', 'scoti'),
 ('pars', 'prima'),
 ('librum', 'sententiarum'),
 ('totius', 'anni'),
 ('commentariorvm', 'theologicorvm')]

## 2. Text analysis of book titles in Italian

In [9]:
# extract records with Italian as book language
dfita = df.loc[df['041 $a - Language code of text/sound track or separate title (R)'] == 'Italian' ]

# extract the values of book titles in Italian from the table
titletextita = dfita['245 $a - Title (NR)']
titletextita

0                        Aritmetica prattica facilissima,
8       Le vite di Leon decimo et d'Adriano sesto ... ...
13      Rodolfo Agricola Frisio Della invention dialet...
29      Giovan Lodovico Vives ... De l'vfficio del mar...
140     Espositione del R. P. frate Francesco Titelman...
                              ...                        
2695    Petrarcha con doi cõmenti sopra li Sonetti & C...
2699    Il dvello di M. Dario Attendoli con le avttori...
2700    Eversio dæmonum e corporibvs oppressis, cùm di...
2721    Mistico tempio del rosario con fiori, & frutti...
2750    Cosimo Bartoli ... Del modo di misvrare le dis...
Name: 245 $a - Title (NR), Length: 236, dtype: object

In [10]:
# apply word tokenization function of NLTK module to the Italian book titles 
titletextita.apply(word_tokenize, language='Italian')

# exclude the punctuations in the list of Latin titles
import string
wordita = [line.translate(str.maketrans('', '', string.punctuation)).split() for line in titletextita]

# set list of Italian stop words to be excluded from the tokenlist with stopwords function in NLTK
from nltk.corpus import stopwords
stop_words = set(stopwords.words('Italian'))

# append words of book titles into a tokenlist
tokenlistita = []
for group in wordita:
    for token in group:
        #set the condition excluding words with one or two characters, which are to short for analysation
        if token.lower() not in stop_words and token.lower() and len(token.lower()) > 2:
            tokenlistita.append(token.lower())

tokenlistita

['aritmetica',
 'prattica',
 'facilissima',
 'vite',
 'leon',
 'decimo',
 'dadriano',
 'sesto',
 'cardinal',
 'pompeo',
 'colonna',
 'rodolfo',
 'agricola',
 'frisio',
 'invention',
 'dialettica',
 'giovan',
 'lodovico',
 'vives',
 'lvfficio',
 'marito',
 'listitvtione',
 'femina',
 'ammaestrare',
 'fancivlli',
 'arti',
 'liberali',
 'espositione',
 'frate',
 'francesco',
 'titelmano',
 'misteri',
 'cerimonie',
 'quali',
 'osseruano',
 'santissimo',
 'sacrificio',
 'messa',
 'secondo',
 'lordine',
 'delli',
 'santi',
 'antichi',
 'padri',
 'due',
 'espositioni',
 'sacro',
 'canone',
 'vita',
 'honesta',
 'virtvosa',
 'lettere',
 'amorose',
 'thomae',
 'argentina',
 'commentaria',
 'iiii',
 'libros',
 'sententiarvm',
 'miracoli',
 'santissimo',
 'sacramento',
 'congivratione',
 'gheldresi',
 'contra',
 'citta',
 'danversa',
 'sic',
 'valerii',
 'maximi',
 'dictorum',
 'factorum',
 'memorabilium',
 'libri',
 'novem',
 'vite',
 'leon',
 'decimo',
 'dadriano',
 'sommi',
 'pontefici',
 'car

### 2-1. Generate frequency list of book titles in Italian

In [11]:
# append the value of token frequency into the frequency list 
freqlistita = {}
for token in tokenlistita:
    if token not in freqlistita:
        freqlistita[token] = 0
    freqlistita[token] += 1

# sort the top 10 words with highest frequency
sorted(freqlistita.items(), key=lambda x:x[1], reverse = True)[:11]        

[('libri', 33),
 ('sopra', 29),
 ('libro', 17),
 ('trattato', 16),
 ('tre', 16),
 ('discorso', 15),
 ('dvello', 14),
 ('principi', 14),
 ('battista', 13),
 ('lettere', 12),
 ('vita', 11)]

In [12]:
# export the frequency list of Italian titles as csv file for visulisation in Tableau Public

with open('Italianfreq.csv', 'w', encoding="utf-8") as f:
    for key in freqlistita.keys():
        f.write("%s,%s\n"%(key,freqlistita[key]))

### 2-2. Generate collocation bigram of book titles in Italian

In [13]:
# generate a collocation bigram with functions in NLTK
finderita = BigramCollocationFinder.from_words(tokenlistita)

# find the top 10 collocated words in Latin titles
finderita.nbest(bigram_measures.likelihood_ratio, 10)

[('accessere', 'porrò'),
 ('capponi', 'porrecta'),
 ('cvm', 'elvcidationibvs'),
 ('editis', 'accessere'),
 ('elvcidationibvs', 'formalibvs'),
 ('formalibvs', 'seraphinvm'),
 ('luculentissima', 'subtilissimaq́'),
 ('mvtio', 'ivstinopolitano'),
 ('porrecta', 'editis'),
 ('porrò', 'luculentissima')]

# General interactive visulasation

The interactive storyboard created with Tableau Public for the exploration of the Italian-published Ancient Book Collection in Leuven Libraries is embedded as follows. 

It consists of three aspects of the collection in separated dashboards:
1. Overview of library collections
2. The distribution of books in collection
3. Title frequencies of the two major book languages, Latin and Italian

Users can easily travel between dashboards for an overview of each aspect. And additionally, the statistics of the dashboards were set an interactive filtering function that allows users to see the difference and trend on different filtering conditions.

For instance, in the Library Collection dashboard, users can click on library names to see the carrier/media type, book period and book language composition of the book collection in the chosen library respectively. And in the Collection Distribution dashboard, users can click on different publication periods to see the differences in each attribute. While in the dashboard of Book Title Frequency, there is a volume bar for users to choose the range of frequency and see the corresponding titles.

In [14]:
%%HTML
<div class='tableauPlaceholder' id='viz1673213361373' style='position: relative'><noscript><a href='#'><img alt='Italian-published Ancient Book Collection in Leuven Libraries ' src='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;It&#47;Italian-publishedAncientBookCollectioninLeuvenLibraries&#47;StoryBoard&#47;1_rss.png' style='border: none' /></a></noscript><object class='tableauViz'  style='display:none;'><param name='host_url' value='https%3A%2F%2Fpublic.tableau.com%2F' /> <param name='embed_code_version' value='3' /> <param name='site_root' value='' /><param name='name' value='Italian-publishedAncientBookCollectioninLeuvenLibraries&#47;StoryBoard' /><param name='tabs' value='no' /><param name='toolbar' value='yes' /><param name='static_image' value='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;It&#47;Italian-publishedAncientBookCollectioninLeuvenLibraries&#47;StoryBoard&#47;1.png' /> <param name='animate_transition' value='yes' /><param name='display_static_image' value='yes' /><param name='display_spinner' value='yes' /><param name='display_overlay' value='yes' /><param name='display_count' value='yes' /><param name='language' value='en-US' /></object></div>                <script type='text/javascript'>                    var divElement = document.getElementById('viz1673213361373');                    var vizElement = divElement.getElementsByTagName('object')[0];                    vizElement.style.width='1016px';vizElement.style.height='991px';                    var scriptElement = document.createElement('script');                    scriptElement.src = 'https://public.tableau.com/javascripts/api/viz_v1.js';                    vizElement.parentNode.insertBefore(scriptElement, vizElement);                </script>