<a href="https://colab.research.google.com/github/lucasgneccoh/BDSS_Dauphine/blob/main/notebooks/students/BDSS_TD3_SAX.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Bases de données semi-structurées - TD 3

Welcome to TD 3. This part covers SAX, the event-driven API for parsing XML documents.


## Preamble

In [1]:
from lxml import etree
import re
from xml.dom.minidom import parse
import xml.sax

# Functions to work with XML files

def validate_xml(xml_path:str, dtd_path:str) -> bool:
    ''' Validate an XML file against a DTD using the lxml library.

    Parsing problems in either file are printed (one line per entry of the
    lxml error log) and reported as a failed validation.

    Parameters:
        xml_path: path of the XML document to validate
        dtd_path: path of the DTD file to validate against

    Returns:
        True when the document is valid with respect to the DTD. False when
        it is not, or when the DTD or the XML file could not be parsed.
    '''
    try:
        # The DTD is parsed while the object is built, so the file can be
        # closed right after; the context manager prevents a leaked handle
        with open(dtd_path) as dtd_file:
            dtd = etree.DTD(dtd_file)
    except etree.DTDParseError as ed:
        print(f"DTDParseError: {ed}")
        for i, er in enumerate(ed.error_log):
            print(f"\t{i}-> {er.message}, at line {er.line}")
        etree.clear_error_log()
        return False

    try:
        xml_doc = etree.parse(xml_path)
    except etree.XMLSyntaxError as e:
        print(f"XMLSyntaxError: {e}")
        for i, er in enumerate(e.error_log):
            print(f"\t{i}-> {er.message}, at line {er.line}")
        etree.clear_error_log()
        return False

    result = dtd.validate(xml_doc)
    # Only the first validation error is shown
    if not result: print(dtd.error_log[0])

    return result

def write_xml_dtd_files_from_strings(xml_strings, dtd_strings, identifiers = None):
    ''' Write XML documents and their DTDs, given as strings, into files.

    The i-th XML string is written to "<identifier>.xml" and the i-th DTD
    string to "<identifier>.dtd", relative to the current working directory.

    Parameters:
        xml_strings: one XML document (str) or a list of them
        dtd_strings: one DTD (str) or a list of them, same length as xml_strings
        identifiers: file name stems, a single str or a list of them. When
            None, "file_0", "file_1", ... are used.

    Returns:
        The list of identifiers that determined the file names.

    Raises:
        Exception: if the number of XML and DTD strings differ. Errors raised
            while writing are re-raised after printing a short message.
    '''

    # Encapsulate every bare string on its own. Wrapping only when *both*
    # arguments are strings would let a lone string be iterated character by
    # character further down.
    if isinstance(xml_strings, str):
        xml_strings = [xml_strings]
    if isinstance(dtd_strings, str):
        dtd_strings = [dtd_strings]
    if isinstance(identifiers, str):
        identifiers = [identifiers]

    if len(xml_strings) != len(dtd_strings):
        raise Exception("Different number of XML and DTD strings!")

    # If no identifiers are given, create default ones. This determines file names
    if identifiers is None:
        identifiers = [f"file_{i}" for i in range(len(xml_strings))]

    try:
        for x, d, identifier in zip(xml_strings, dtd_strings, identifiers):
            xml_path, dtd_path = f"{identifier}.xml", f"{identifier}.dtd"
            with open(xml_path,"w") as f:
                f.write(x)
            with open(dtd_path,"w") as f:
                f.write(d)
    except Exception:
        print("Problems while writing XML and DTD files")
        # Bare raise keeps the original traceback intact
        raise

    return identifiers



def test_validation(xml_string, dtd_string, validator):
    ''' Check an XML document against a DTD when both are supplied as strings.

    The two strings are first dumped into "temp.xml" and "temp.dtd"; the
    validator is then called with those two paths (XML first, DTD second)
    and whatever it returns is handed back unchanged.
    '''
    # Dump both strings to disk under the "temp" stem
    write_xml_dtd_files_from_strings(
        xml_string, dtd_string, identifiers = ['temp'])

    # Let the supplied validator work on the files just written
    xml_path, dtd_path = "temp.xml", "temp.dtd"
    return validator(xml_path, dtd_path)

def xpath_query_xml_string(xml_string, query_string):
    ''' Run an XPath query on an XML document given as a string.

    Whitespace between a closing '>' and the next '<' is removed, the result
    is written to "xml_doc.xml" in the current directory, and that file is
    parsed with lxml before the query is evaluated.

    Parameters:
        xml_string: the XML document
        query_string: the XPath expression

    Returns:
        Whatever the compiled XPath expression returns for the document.
    '''
    xml_path = "xml_doc.xml"
    with open(xml_path, "w") as f:
        # Remove all whitespaces to keep the 'real' text of each node.
        # Raw string: "\s" in a plain literal is an invalid escape sequence.
        # \s already covers newlines; the former class [\s|\n] also matched a
        # literal '|' and would have deleted text made only of pipes.
        f.write(re.sub(r">\s*<", "><", xml_string))
    xml_doc = etree.parse(xml_path)
    query = etree.XPath(query_string)
    return query(xml_doc)

def xpath_query_xml_file(xml_path, query_string):
    ''' Evaluate an XPath expression on the XML file found at xml_path.

    The file is parsed with lxml, the expression is compiled with
    etree.XPath, and the result of applying it to the parsed document is
    returned.
    '''
    tree = etree.parse(xml_path)
    return etree.XPath(query_string)(tree)


def print_xpath_query_results(results):
    ''' Print the results of an XPath query in a readable form.

    A header with the number of results is printed first. For every result
    exposing tag, text and items() (element-like objects), the tag, the text
    and the attributes are printed. Any other result (one missing those
    attributes, such as a plain string) is printed as is after a "--Except"
    marker.

    Parameters:
        results: a sized iterable of query results
    '''
    print(f"Total results: {len(results)}")
    print("*"*20 + "\n")
    for e in results:
        try:
            print(f"node tag: {e.tag}")
            print(f"node text: *{e.text}*")
            print(', '.join(f"{k} = {v}" for k, v in e.items()))
            print("-"*20)
        except AttributeError:
            # Only a missing tag/text/items is expected here; a bare except
            # would also swallow KeyboardInterrupt and real bugs
            print("--Except")
            print(e)


## SAX
Download the FILMS dataset

In [2]:
dtd_link = "https://raw.githubusercontent.com/lucasgneccoh/BDSS_Dauphine/main/data/films.dtd"
xml_link = "https://raw.githubusercontent.com/lucasgneccoh/BDSS_Dauphine/main/data/films.xml"

!rm "./films.dtd"
!rm "./films.xml"

# Download the imdb sample file
!wget {dtd_link}
!wget {xml_link}

# If the download fails, you will have to load the files into the Colab session. 
# Go to the Files section on the left panel

if validate_xml("films.xml", "films.dtd"):
    print("Files were downloaded correctly")

--2022-02-22 10:31:00--  https://raw.githubusercontent.com/lucasgneccoh/BDSS_Dauphine/main/data/films.dtd
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 626 [text/plain]
Saving to: ‘films.dtd’


2022-02-22 10:31:05 (23,9 MB/s) - ‘films.dtd’ saved [626/626]

--2022-02-22 10:31:05--  https://raw.githubusercontent.com/lucasgneccoh/BDSS_Dauphine/main/data/films.xml
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 43462 (42K) [text/plain]
Saving to: ‘films.xml’


2022-02-22 10:31:05 (11,5 MB/s) - ‘films.xml’ saved [43462/43462]

Files 

### Example 1: Print only certain elements

In [3]:
class PrinterContentHandler(xml.sax.ContentHandler):
    ''' SAX handler that echoes the parsing events to standard output.

    Opening tags are reported only for the element names found in
    tags_to_print (none at all when it is None). Closing tags and character
    data are reported for every element.
    '''
    def __init__(self, tags_to_print = None):
        super().__init__()
        self.tags_to_print = tags_to_print

    def startElement(self, name, attrs):
        # Report the opening tag only when it was asked for
        wanted = self.tags_to_print
        if wanted is None or name not in wanted:
            return
        print(f"startElement: '{name}'")

    def endElement(self, name):
        # Every closing tag is reported, followed by an empty line
        print(f"endElement: '{name}'\n")

    def characters(self, content):
        # Text met by the parser, whitespace between tags included
        print(f"Characters: {content}")


# Report the opening tag of FILM elements only
tags_to_print = ["FILM"]
handler = PrinterContentHandler(tags_to_print = tags_to_print)

path = "films.xml"
# The context manager closes the file once parsing ends, even on a parse error
with open(path) as f:
    xml.sax.parse(f, handler)

Characters: 

startElement: 'FILM'
Characters: 

Characters: Vertigo
endElement: 'TITRE'

Characters: 

Characters: Drame
endElement: 'GENRE'

Characters: USA
endElement: 'PAYS'

endElement: 'MES'

Characters: 

Characters: 

Characters: James
endElement: 'PRENOM'

Characters: Stewart
endElement: 'NOM'

Characters: 

Characters: John Ferguson
endElement: 'INTITULE'

endElement: 'ROLE'

Characters: Kim
endElement: 'PRENOM'

Characters: Novak
endElement: 'NOM'

Characters: 

Characters: Madeleine Elster
endElement: 'INTITULE'

endElement: 'ROLE'

endElement: 'ROLES'

Characters: 

Characters: Scottie Ferguson, ancien inspecteur de police, est sujet au vertige depuis qu'il a vu mourir son collègue. Elster, son ami, le charge de surveiller sa femme, Madeleine, ayant des tendances suicidaires. Amoureux de la jeune femme Scottie ne remarque pas le piège qui se trame autour de lui et dont il va être la victime... 
endElement: 'RESUME'

Characters: 

endElement: 'FILM'

Characters: 

startElem

### Example 2: Get the titles

In [4]:
class GetTextInsideTag(xml.sax.ContentHandler):
    ''' SAX handler that collects the text found inside every element named tag.

    After parsing, result holds one string per such element with non-empty
    text, in document order. Text located in child elements is included.
    Nested elements with the same name as tag are not supported.
    '''
    def __init__(self, tag = None):
        super().__init__()
        self.tag = tag
        # True while the parser is inside an element named tag
        self.reading = False
        # Text chunks received for the element currently being read
        self.buffer = []
        self.result = []

    def startElement(self, name, attrs):
        if self.tag is not None and name == self.tag:
            self.reading = True

    def endElement(self, name):
        # Only the closing tag of the tracked element ends the capture;
        # closing a child element must not cut the text short
        if name != self.tag:
            return
        if self.buffer:
            # The parser may deliver one text node in several chunks (for
            # instance around entities), so the chunks are concatenated
            # without a separator
            self.result.append(''.join(self.buffer))
        self.reading = False
        self.buffer = []

    def characters(self, content):
        # When text is encountered
        if self.reading:
            self.buffer.append(content)


# Collect the text of every TITRE element
tag = "TITRE"
handler = GetTextInsideTag(tag = tag)

path = "films.xml"
# The context manager closes the file once parsing ends, even on a parse error
with open(path) as f:
    xml.sax.parse(f, handler)

print(handler.result)

['Vertigo', 'Alien', 'Titanic', 'Sacrifice', 'Volte/Face', 'Sleepy Hollow', 'American Beauty', 'Impitoyable', 'Gladiator', 'Blade Runner', 'Piège de cristal', '58 minutes pour vivre', 'Van Gogh', 'Seven', "L'armée des douze singes", 'Le nom de la rose', 'Pulp fiction', 'Mary à tout prix', 'Terminator', 'Les dents de la mer', 'Le silence des agneaux', "Le prince d'Egypte", 'Godzilla', 'Matrix', 'Mission: Impossible', 'Kagemusha', 'Les pleins pouvoirs', 'Le gendarme et les extra-terrestres', 'Les frères pétards', 'Le monde perdu', 'Rain Man', 'Top Gun', 'Les bronzés font du ski', 'MICROCOSMOS', 'Psychose', 'Le retour du Jedi', 'Les oiseaux', 'Reservoir dogs', 'Eyes Wide Shut', 'Shining', 'Pas de printemps pour Marnie', 'Fenêtre sur cour', 'La mort aux trousses', "Jeanne d'Arc", 'Le cinquième élément', 'Léon', 'Nikita', 'Le grand bleu']


### Example 3: Get the titles of films starring a given artist

In [6]:
class GetFilmsByArtistInCast(xml.sax.ContentHandler):
    ''' SAX handler that lists the titles of the films featuring an artist.

    The last TITRE, PRENOM and NOM texts read are remembered. Each time a
    ROLE element closes, the remembered first and last names are compared
    with the searched ones and, when both match, the remembered title is
    appended to result.

    Parameters:
        prenom: first name of the artist to look for
        nom: last name of the artist to look for
    '''
    def __init__(self, prenom, nom):
        super().__init__()
        self.prenomSearch = prenom
        self.nomSearch = nom

        self.result = []

        # Text chunks of the element currently being read
        self.titleBuffer = []
        self.prenomBuffer = []
        self.nomBuffer = []

        # Last complete values read
        self.titleTemp = None
        self.prenomTemp = None
        self.nomTemp = None

        # Name of the last element opened
        self.whereInDoc = None

        # True while inside a TITRE, PRENOM or NOM element
        self.reading = False

    def startElement(self, name, attrs):
        self.whereInDoc = name
        if name in ["PRENOM", "NOM", "TITRE"]:
            self.reading = True

    def endElement(self, name):
        # If we read something that we need, then get the content and use it.
        # Chunks are concatenated without a separator: the parser may split
        # one text node into several characters() calls (e.g. at entities)
        if name == "TITRE":
            self.titleTemp = ''.join(self.titleBuffer)
            self.titleBuffer = []
        elif name == "PRENOM":
            self.prenomTemp = ''.join(self.prenomBuffer)
            self.prenomBuffer = []
        elif name == "NOM":
            self.nomTemp = ''.join(self.nomBuffer)
            self.nomBuffer = []
        elif name == "ROLE":
            # If we end reading a ROLE element, we can check if it contains
            # the artist we want
            if self.prenomSearch == self.prenomTemp and self.nomSearch == self.nomTemp:
                self.result.append(self.titleTemp)
            # Forget the artist so that a later ROLE without a name cannot
            # match on the values left by this one
            self.prenomTemp = None
            self.nomTemp = None
        self.reading = False

    def characters(self, content):
        if not self.reading:
            return
        # Store the text in the buffer of the element being read
        if self.whereInDoc == "TITRE":
            self.titleBuffer.append(content)
        elif self.whereInDoc == "PRENOM":
            self.prenomBuffer.append(content)
        elif self.whereInDoc == "NOM":
            self.nomBuffer.append(content)

    def endDocument(self):
        # Drop any chunk left over at the end of the document
        self.titleBuffer = []
        self.prenomBuffer = []
        self.nomBuffer = []


# Artist to look for in the cast of each film
prenom, nom = "Bruce", "Willis"
handler = GetFilmsByArtistInCast(prenom = prenom, nom = nom)

path = "films.xml"
# The context manager closes the file once parsing ends, even on a parse error
with open(path) as f:
    xml.sax.parse(f, handler)

print(handler.result)

['Piège de cristal', '58 minutes pour vivre', "L'armée des douze singes", 'Pulp fiction', 'Le cinquième élément']


### Ex 1: Queries with SAX on the movie dataset

Try to do some other queries using SAX.
Compare your results (and maybe even running times!) with other tools like XPath or DOM.

I suggest the queries 1 to 8, and then query 11

## Data science with text

Here I want to introduce you to some more advanced topics in Data Science and Machine Learning.

We will use it as an excuse to practice SAX


---



**Bag of words and TF-IDF**

In Data Science and Machine Learning, particularly in Natural Language Processing, the objects to study are text documents. There are different ways to study them, but from a mathematical perspective we need ways of encoding such text documents into more "vectorial" data

The question becomes: ***How do you transform a piece of text into a vector to apply your algorithms on them?***


One very common example is sentiment analysis. The basic idea is that you want to know if some text (for example a movie review or a tweet) is positive or negative towards a subject. This can be seen as a classification problem.


The initial approach to turn a piece of text into a vector is the Bag of Words, where you characterize a document by the words that appear in it and their frequency. A more sophisticated approach is TF-IDF, which takes into account the number of words in each document and also the relative frequency of words across documents.

See this site for a detailed and simple explanation if you have doubts

https://www.analyticsvidhya.com/blog/2020/02/quick-introduction-bag-of-words-bow-tf-idf/


In this exercise we will try to create such document vectors for our movie dataset using SAX as a way to read the data.

### Bag of words

We need to go through the document, get the RESUME and build a vocabulary that contains all the words present in all the RESUMEs.

We do not care about every word. We will remove unnecessary words using a special library

We are going to optimize our code and create the bag of words as we go through the resumes

In [10]:
''' 2.1.1 
Write a function that given some text, it eliminates all non important words
and returns a list of words representing the text
We will use a library called spacy for the stop words in french and we can use
another library called gensim to help us do some other preprocessing

'''

import spacy
from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop
import gensim.parsing.preprocessing as prep
import re

# Filters applied, in this order, by our_prep_func.
# The regex is a raw string: '\W' in a plain literal is an invalid escape
# sequence and triggers a warning on recent Python versions.
CUSTOM_FILTERS = [
    lambda x: x.lower(),
    lambda x: re.sub(r'\W+', ' ', x),
    prep.strip_tags,
    prep.strip_punctuation,
    prep.strip_multiple_whitespaces,
    prep.strip_numeric,
    prep.strip_short,
]

def our_prep_func(x):
    ''' Preprocess the string x with gensim's preprocess_string using
    CUSTOM_FILTERS, and return its result
    '''
    return prep.preprocess_string(x, CUSTOM_FILTERS)


# Text to process: a sample film summary (in French) used to try out the
# preprocessing
resume = "Pulp Fiction décrit l'odyssée sanglante et burlesque de petits malfrats \
dans la jungle de Hollywood, ou s'entrecroisent les destins de deux petits \
tueurs, d'un dangereux gangster marié à une camée, d'un boxeur roublard, de \
prêteurs sur gages sadiques, d'un caïd élégant et dévoué, d'un dealer bon \
mari et de deux tourtereaux à la gachette facile..."

# Let's test the two approaches to see the difference.
# Approach 1: plain whitespace split, then drop the French stop words
new = [s for s in resume.split() if s not in fr_stop]
print(new)

# Approach 2: run the custom filter pipeline first, then drop the French
# stop words
new = [s for s in our_prep_func(resume) if s not in fr_stop]
print(new)


# This is the actual function
def process_text(text, processing, stopwords = fr_stop):
    ''' Turn text into a list of tokens without stop words.

    processing is called on text and must return an iterable of tokens; the
    tokens found in stopwords are discarded and the others are returned as a
    list, in their original order.
    '''
    kept = []
    for token in processing(text):
        if token in stopwords:
            continue
        kept.append(token)
    return kept


# Same result as the second approach, this time through process_text
print("Our final function")
new = process_text(resume, processing = our_prep_func, stopwords = fr_stop)
print(new)

['Pulp', 'Fiction', 'décrit', "l'odyssée", 'sanglante', 'burlesque', 'petits', 'malfrats', 'jungle', 'Hollywood,', "s'entrecroisent", 'destins', 'petits', 'tueurs,', "d'un", 'dangereux', 'gangster', 'marié', 'camée,', "d'un", 'boxeur', 'roublard,', 'prêteurs', 'gages', 'sadiques,', "d'un", 'caïd', 'élégant', 'dévoué,', "d'un", 'dealer', 'bon', 'mari', 'tourtereaux', 'gachette', 'facile...']
['pulp', 'fiction', 'décrit', 'odyssée', 'sanglante', 'burlesque', 'petits', 'malfrats', 'jungle', 'hollywood', 'entrecroisent', 'destins', 'petits', 'tueurs', 'dangereux', 'gangster', 'marié', 'camée', 'boxeur', 'roublard', 'prêteurs', 'gages', 'sadiques', 'caïd', 'élégant', 'dévoué', 'dealer', 'bon', 'mari', 'tourtereaux', 'gachette', 'facile']
Our final function
['pulp', 'fiction', 'décrit', 'odyssée', 'sanglante', 'burlesque', 'petits', 'malfrats', 'jungle', 'hollywood', 'entrecroisent', 'destins', 'petits', 'tueurs', 'dangereux', 'gangster', 'marié', 'camée', 'boxeur', 'roublard', 'prêteurs', '

In [11]:
''' 2.1.2
Defining the word count of a document
Write a function that given a list of words and a vocabulary, computes the word
count representation of the text
The vocabulary will be represented as a dictionary containing pairs (word, index)
where index is the position of the word in the vocabulary

We will suppose that all the words are in the vocabulary
'''

def bag_of_words(tokens, vocab):
    ''' Exercise placeholder for the word count representation of a text.

    tokens is the list of words of the text and vocab a dictionary of
    (word, index) pairs; every token is supposed to be in vocab.
    Nothing is computed yet: the function returns None until it is completed.
    '''
    # Your code here
    return None


# Test it with a simple example
# NOTE(review): once bag_of_words is completed this presumably prints
# [2, 1, 3] (the count of each word at its vocab index) — confirm
tokens = ["test", "sentence", "test", "test", "horse", "sentence"]
vocab = {"sentence":0, "horse":1, "test":2}
bow = bag_of_words(tokens, vocab)
print(bow)



None


In [12]:
''' 2.1.3
Expanding a vocabulary
When we read new text, some new words might appear. We need to add them to the
vocabulary we are considering.
Write a function that given a vocabulary and some new text (already preprocessed),
adds the new words to the vocabulary (if there are new words)
'''

def expand_vocab(vocab, new_tokens):
    ''' Exercise placeholder for the expansion of a vocabulary.

    vocab is the vocabulary considered so far and new_tokens a list of
    already preprocessed words; the words of new_tokens that are new are
    meant to be added to vocab.
    Nothing is done yet: vocab is left untouched and None is returned.
    '''
    # Your code here
    return None


# Test
# NOTE(review): once expand_vocab is completed, vocab should presumably also
# contain "new_word" and "more_novelty" after the call — confirm
tokens = ["test", "new_word", "horse", "more_novelty"]
vocab = {"sentence":0, "horse":1, "test":2}
expand_vocab(vocab, tokens)
print(vocab)

{'sentence': 0, 'horse': 1, 'test': 2}


In [15]:
''' 2.1.4
Put it all together
Now use the functions we created and SAX to build the bag of words representation
for all the resumes

Note: For the tests, use this dummy dataset so that we can see something
    https://universitedauphine-my.sharepoint.com/:u:/g/personal/lucas_gnecco-heredia_universitedauphine_onmicrosoft_com/EV8zxLKQpo5NgV4KUoOllWUBXgBWAOuZoq-OjIBTEsc_YQ?e=dMk2td

Note: We will take care of the length of the vectors later
'''

class ResumeBagOfWordsSax(xml.sax.ContentHandler):
    ''' Exercise skeleton of a SAX handler building bags of words.

    vectors and vocab both start as empty dictionaries. The three SAX
    callbacks are still to be written: for now they ignore every event, so
    both dictionaries stay empty after parsing.
    '''
    def __init__(self):
        super().__init__()
        self.vectors = {}
        self.vocab = {}

    def startElement(self, name, attrs):
        # Nothing done yet when an element opens
        return None

    def endElement(self, name):
        # Nothing done yet when an element closes
        return None

    def characters(self, content):
        # Nothing done yet with the text received
        return None



# Handler instance; it is passed to xml.sax.parse in a later cell
handler = ResumeBagOfWordsSax()

In [16]:
# Download the dummy file
!wget "https://raw.githubusercontent.com/lucasgneccoh/BDSS_Dauphine/main/data/imdb_simple_example.xml"

--2022-02-22 10:34:51--  https://raw.githubusercontent.com/lucasgneccoh/BDSS_Dauphine/main/data/imdb_simple_example.xml
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1219 (1,2K) [text/plain]
Saving to: ‘imdb_simple_example.xml’


2022-02-22 10:34:51 (34,7 MB/s) - ‘imdb_simple_example.xml’ saved [1219/1219]



In [18]:
path = "imdb_simple_example.xml"
# The context manager closes the file once parsing ends, even on a parse error
with open(path) as f:
    xml.sax.parse(f, handler)

# One (key, vector) pair per line
print(*handler.vectors.items(), sep="\n")




In [19]:
'''
Lets fill the vectors with zeros to get them to the right length
'''
# Length every vector must reach: the size of the vocabulary
N = len(handler.vocab)
for vector in handler.vectors.values():
    # Append as many zeros as there are missing positions (in place)
    vector += [0] * (N - len(vector))

print("New vectors after resize")
print(*handler.vectors.items(), sep="\n")

New vectors after resize



In [21]:
''' 
We can create a table to better understand each vector

NOTE: This only makes sense with dummy datasets
'''

import pandas as pd
# One column per entry of handler.vectors, one row per word of the vocabulary
vocab_words = handler.vocab.keys()
data = pd.DataFrame(handler.vectors, index = vocab_words)

data

### TF-IDF
Now that we have the Bag of Words for each resume, we can create the TF-IDF representations

This can be done in several ways, and you can try the one you think is more fun



*   Use plain Python
*   Use *numpy* (vectors, vector operations)
*   Use *pandas* (easiest way IMO)

