<a href="https://colab.research.google.com/github/lucasgneccoh/BDSS_Dauphine/blob/main/BDSS_TD7_PostgreSQLJSON_solutions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Bases de données semi-structurées - TD 7

Welcome to the companion Python notebook for this TD. It follows the paper version of the TD.

The idea is to do the same exercises in a more interactive way, practice some Python, and discover (or get more practice with) Google Colab notebooks.

# Database setup

## Install PostgreSQL

In [2]:
# install
# Install the PostgreSQL server and its contrib package; the apt output is redirected to the file `log`.
!apt install postgresql postgresql-contrib &>log
# Start the PostgreSQL service.
!service postgresql start
# Create a superuser role named `root`, running psql as the `postgres` OS user.
!sudo -u postgres psql -c "CREATE USER root WITH SUPERUSER"
# set connection
# Load the `sql` IPython extension (provides the %sql / %%sql magics used below).
%load_ext sql
# SqlMagic options: feedback off, autopandas on
# (presumably autopandas makes query results come back as pandas DataFrames - TODO confirm).
%config SqlMagic.feedback=False 
%config SqlMagic.autopandas=True
# Connect to the `postgres` database; the URL gives no user, password or host.
%sql postgresql+psycopg2://@/postgres

 * Starting PostgreSQL 10 database server
   ...done.
CREATE ROLE


  """)


'Connected: @postgres'

## Create tables and insert data
First, get the data from the original XML format and transform it to JSON.
Then insert it into the PostgreSQL tables.

In [3]:
# Download the XML data set into the current working directory.
!wget "https://raw.githubusercontent.com/lucasgneccoh/BDSS_Dauphine/main/data/films.xml"
# Local file name of the downloaded data set.
path = "films.xml"

from lxml import etree
import re
from xml.dom.minidom import parse
import copy

# Parse the downloaded XML file into a DOM tree. `path` is set earlier in this
# cell, right after the download, so the file name is written in one place only.
dom = parse(path)
# Names of the child elements whose text is extracted for films, artists and roles.
filmTextElems = ["TITRE", "GENRE", "PAYS", "RESUME"]
artistTextElems = ["ACTNOM", "ACTPNOM", "ANNEENAISS"]
roleTextElements = ["NOM", "PRENOM", "INTITULE"]

def getText(node):
    """Return the `data` of the first child node of `node`.

    If that fails for any reason (for instance the node has no children),
    a message naming the node's tag is printed and the exception is re-raised.
    """
    try:
        first_child = node.childNodes[0]
        return first_child.data
    except Exception as err:
        print(f'Problems getText with node {node.tagName}')
        raise err

def getAttributes(node):
    """Return the XML attributes of `node` as a plain dict (empty when it has none)."""
    if not node.hasAttributes():
        return {}
    return {name: value for name, value in node.attributes.items()}

def getTextElements(node, elements):
    """Collect the text of selected descendant elements of `node`.

    For every tag name in `elements`, all descendants with that tag are
    looked up; those that have child nodes are read with `getText`.
    When several descendants share a tag, the last one wins. Tags without
    any non-empty match do not appear in the returned dict.
    """
    res = {}
    for tag in elements:
        texts = [
            getText(match)
            for match in node.getElementsByTagName(tag)
            if match.hasChildNodes()
        ]
        if texts:
            res[tag] = texts[-1]
    return res

# Get films: one dict per FILM element
films = []
for film_node in dom.getElementsByTagName("FILM"):
    # Text children first, then the attributes of the FILM element itself
    film = getTextElements(film_node, filmTextElems)
    film.update(getAttributes(film_node))

    # Read MES: keep the id_mes attribute (the last MES element wins if there are several)
    for mes_node in film_node.getElementsByTagName('MES'):
        film["MES"] = mes_node.getAttribute('id_mes')

    # Read ROLES: one dict of text fields per ROLE element
    roles = [
        getTextElements(role_node, roleTextElements)
        for role_node in film_node.getElementsByTagName('ROLE')
    ]
    film['ROLES'] = copy.deepcopy(roles)

    # Replace the plain title string with a nested object, so that the JSON
    # contains a sub-document: the original title plus two constant fields.
    film["TITRE"] = {
        "title": film["TITRE"],
        "lang": "@fr",
        "note": "Lorem ipsum"
    }

    films.append(film)


# Get artists: one dict per ARTISTE element
artists = []
for artist_node in dom.getElementsByTagName("ARTISTE"):
    # Merge text fields and element attributes; attributes win on a name clash
    artist = {
        **getTextElements(artist_node, artistTextElems),
        **getAttributes(artist_node),
    }
    artists.append(artist)

# Bundle both collections in a single dict, keyed by collection name
FILMS = {'arrArtistes': artists, 'arrFilms': films}

--2022-02-23 09:17:58--  https://raw.githubusercontent.com/lucasgneccoh/BDSS_Dauphine/main/data/films.xml
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 43462 (42K) [text/plain]
Saving to: ‘films.xml’


2022-02-23 09:17:58 (4.03 MB/s) - ‘films.xml’ saved [43462/43462]



In [4]:
# Sanity check: print every film that has a role played by an actor named "Willis".
for f in films:
    for r in f["ROLES"]:
        # .get() instead of [...]: getTextElements leaves out the key of any
        # element that has no text, so a role dict may lack "NOM".
        if r.get("NOM") == "Willis":
            print(f)

{'TITRE': {'title': 'Piège de cristal', 'lang': '@fr', 'note': 'Lorem ipsum'}, 'GENRE': 'Action', 'PAYS': 'USA', 'RESUME': "John Mc Clane, policier new-yorkais, vient passer Noel a Los Angeles aupres de sa femme. Dans le building ou elle travaille, il se retrouve temoin de la prise en otage de tout le personnel par 12 terroristes. Objectif de ces derniers, vider les coffres de la societe. Cache mais isole, il entreprend de prevenir l'exterieur...", 'Annee': '1988', 'MES': '_26', 'ROLES': [{'NOM': 'Willis', 'PRENOM': 'Bruce', 'INTITULE': 'McClane'}]}
{'TITRE': {'title': '58 minutes pour vivre', 'lang': '@fr', 'note': 'Lorem ipsum'}, 'GENRE': 'Action', 'PAYS': 'USA', 'RESUME': "\nVenu attendre sa femme a l'aéroport, le policier John McClane remarque la présence de terroristes qui ont pris le contrôle des pistes, empêchant tout avion d'atterrir et menaçant de laisser les appareils en vol tourner jusqu'à épuisement de leur kérosène. John n'a devant lui que 58 minutes pour éviter la catastr

In [5]:
%%sql

-- Start from a clean state so that this cell can be run again without errors
DROP TABLE IF EXISTS artistsSQL;
DROP TABLE IF EXISTS filmsSQL; 

-- Each table stores one JSON document per row next to an auto-generated id
CREATE TABLE filmsSQL (
	id serial NOT NULL PRIMARY KEY,
	data json NOT NULL
);
CREATE TABLE artistsSQL (
	id serial NOT NULL PRIMARY KEY,
	data json NOT NULL
);

 * postgresql+psycopg2://@/postgres


In [6]:
# Be careful with the ' character: inside a SQL string literal it has to be doubled
a = "retrouve l'un de ses"
b = a.replace("'", "''")
print(b)

retrouve l''un de ses


In [7]:
%%capture
import json
import re


for f in FILMS["arrArtistes"]:
    json_string = re.sub("\'","''", json.dumps(f))
    cmd = f'''INSERT INTO artistsSQL (data) VALUES('{json_string}')'''
    %sql $cmd;

for f in FILMS["arrFilms"]:
    json_string = re.sub("\'","''", json.dumps(f))
    cmd = f'''INSERT INTO filmsSQL (data) VALUES('{json_string}')'''
    %sql $cmd;

## Make simple SELECT statements to see if the data is right

In [8]:
%%sql
-- Quick look at three of the stored artist rows (id plus JSON document)
SELECT *
FROM artistsSQL
LIMIT 3
;

 * postgresql+psycopg2://@/postgres


Unnamed: 0,id,data
0,1,"{'ACTNOM': 'Cameron', 'ACTPNOM': 'James', 'ANN..."
1,2,"{'ACTNOM': 'Hitchcock', 'ACTPNOM': 'Alfred', '..."
2,3,"{'ACTNOM': 'Scott', 'ACTPNOM': 'Ridley', 'ANNE..."


In [9]:
%%sql
-- Name, first name and birth year of five artists born after 1950.
-- The arrow operator returns a JSON value, the double arrow operator returns text,
-- which is what the CAST to INTEGER in the filter needs.
SELECT data -> 'ACTNOM' as nom,
        data -> 'ACTPNOM' as prenom,
        data -> 'ANNEENAISS' as anneN
FROM artistsSQL
WHERE CAST(data ->> 'ANNEENAISS' as INTEGER) > 1950
LIMIT 5
;

 * postgresql+psycopg2://@/postgres


Unnamed: 0,nom,prenom,annen
0,Cameron,James,1954
1,Travolta,John,1954
2,Cage,Nicolas,1964
3,Burton,Tim,1958
4,Willis,Bruce,1955


# Exercises

Now that we have our data in PostgreSQL we can do a lot of things!

We can do almost everything we know in standard SQL if we are able to create the right tables from our JSON data.

On top of that, PostgreSQL has many functions for working with JSON objects, which make this approach much easier and more powerful.

Here are some examples and documentation that can be helpful:


https://www.postgresqltutorial.com/postgresql-json/

https://www.postgresql.org/docs/current/functions-json.html



## Exercise 1: Simple queries we already know

Do queries 2, 4 and 8. They should not be that hard

Query 2: Films released in 1980

Query 4: Films with Bruce Willis in them

Query 8: Role of Harvey Keitel in Reservoir Dogs

\\

If you want something a bit more challenging, try to do something more general. For example, in Query 4, what if I want to look for some other artist?

In [10]:
%%sql
-- Query 2 - Films released in 1980
SELECT data -> 'TITRE' -> 'title' as title,
        CAST(data ->> 'Annee' as INTEGER) as year
FROM filmsSQL
WHERE CAST(data ->> 'Annee' as INTEGER) = 1980
;

 * postgresql+psycopg2://@/postgres


Unnamed: 0,title,year
0,Kagemusha,1980
1,Shining,1980


In [38]:
%%sql
-- Query 4 - Films with Bruce Willis in them.
-- The inner query produces one row per film and role, the outer query keeps
-- the roles whose actor matches (case-insensitive thanks to LOWER).
SELECT  title, year,
        roles -> 'INTITULE' as intitule
FROM (
    SELECT  data -> 'TITRE' -> 'title' as title,
            CAST(data ->> 'Annee' as INTEGER) as year,
            json_array_elements ( data -> 'ROLES' )  as roles
    FROM filmsSQL
) AS TMP

WHERE LOWER(roles ->> 'NOM') LIKE 'willis' AND LOWER(roles ->> 'PRENOM') LIKE 'bruce'
;

 * postgresql+psycopg2://@/postgres
(psycopg2.ProgrammingError) syntax error at or near "$"
LINE 11: WHERE LOWER(roles ->> 'NOM') LIKE $nom AND LOWER(roles ->> '...
                                           ^

[SQL: SELECT  title, year,
        roles -> 'INTITULE' as intitule
FROM (
    SELECT  data -> 'TITRE' -> 'title' as title,
            CAST(data ->> 'Annee' as INTEGER) as year,
            json_array_elements ( data -> 'ROLES' )  as roles
    FROM filmsSQL
    
) AS TMP

WHERE LOWER(roles ->> 'NOM') LIKE $nom AND LOWER(roles ->> 'PRENOM') LIKE 'bruce' 
;]
(Background on this error at: https://sqlalche.me/e/14/f405)


In [37]:
prenom, nom = 'bruce', 'willis'

query = f"""
SELECT  title, year,
        roles -> 'INTITULE' as intitule
FROM (
    SELECT  data -> 'TITRE' -> 'title' as title,
            CAST(data ->> 'Annee' as INTEGER) as year,
            json_array_elements ( data -> 'ROLES' )  as roles
    FROM filmsSQL
    
) AS TMP

WHERE LOWER(roles ->> 'NOM') LIKE '{nom}' AND LOWER(roles ->> 'PRENOM') LIKE '{prenom}' 
"""

%sql $query;

 * postgresql+psycopg2://@/postgres


Unnamed: 0,title,year,intitule
0,Piège de cristal,1988,McClane
1,58 minutes pour vivre,1990,McLane
2,L'armée des douze singes,1995,Cole
3,Pulp fiction,1994,Butch Coolidge
4,Le cinquième élément,1997,Major Korben Dalla


In [39]:
%%sql
-- Query 8 - Role of Harvey Keitel in Reservoir Dogs.
-- The inner query keeps only that film and expands its roles, the outer query
-- keeps the role played by the requested actor.
SELECT  title, year,
        roles -> 'INTITULE' as intitule
FROM (
    SELECT  data -> 'TITRE' -> 'title' as title,
            CAST(data ->> 'Annee' as INTEGER) as year,
            json_array_elements ( data -> 'ROLES' )  as roles
    FROM filmsSQL
    WHERE LOWER(data -> 'TITRE' ->> 'title') LIKE 'reservoir dogs'
) AS TMP

WHERE LOWER(roles ->> 'NOM') LIKE 'keitel' AND LOWER(roles ->> 'PRENOM') LIKE 'harvey'
;

 * postgresql+psycopg2://@/postgres


Unnamed: 0,title,year,intitule
0,Reservoir dogs,1992,Mr. White/Larry


## Exercise 2: More complex queries we have maybe talked about

Let's use the nice SQL syntax to JOIN the two tables we have (films and artists)

### Exercise 2.1: For every movie, show its title, year and the information about the director

In [None]:
%%sql
-- First get the artist data in a normal table

SELECT  data ->> 'ACTPNOM' as prenom,
        data ->> 'ACTNOM' as nom,
        data ->> 'ANNEENAISS' as yearBirth,
        data ->> 'id_art' as id_art
FROM artistsSQL
;

In [None]:
%%sql
-- Do the same with the films (MES holds the id of the director)

SELECT  data ->> 'Annee' as yearFilm,
        data -> 'TITRE' ->> 'title' as title,
        data ->> 'MES' as id_art
FROM filmsSQL
;

In [None]:
%%sql
-- Join them. Remember the basic JOIN syntax
--     SELECT table1.column1, table2.column2, ...
--     FROM table1
--     INNER JOIN table2
--     ON table1.common_field = table2.common_field
SELECT title, yearFilm, prenom, nom FROM
    (
        SELECT  data ->> 'ACTPNOM' as prenom,
                data ->> 'ACTNOM' as nom,
                data ->> 'ANNEENAISS' as yearBirth,
                data ->> 'id_art' as id_art
        FROM artistsSQL
    ) as TMP_ART 
    INNER JOIN
    ( 
        SELECT  data ->> 'Annee' as yearFilm,
                data -> 'TITRE' ->> 'title' as title,
                data ->> 'MES' as id_art
        FROM filmsSQL
    ) as TMP_FILMS
    ON TMP_ART.id_art = TMP_FILMS.id_art
;

### Exercise 2.2: For each artist, count their participations in films (as an actor, not as a director)

In [66]:
%%sql

-- Number of acting roles per artist, most frequent first.
-- json_array_elements turns the ROLES array of every film into one row per role.
SELECT  role_json ->> 'PRENOM' AS prenom,
        role_json ->> 'NOM' AS nom,
        COUNT(*) AS num_roles
FROM (
    SELECT json_array_elements ( data -> 'ROLES' ) AS role_json
    FROM filmsSQL
) AS film_roles
GROUP BY prenom, nom
ORDER BY num_roles DESC
;

 * postgresql+psycopg2://@/postgres


Unnamed: 0,prenom,nom,num_roles
0,Bruce,Willis,5
1,Tom,Cruise,4
2,Jean,Reno,4
3,Val,Kilmer,2
4,Jeff,Goldblum,2
...,...,...,...
79,Harvey,Keitel,1
80,Quentin,Tarantino,1
81,Kelly,McGillis,1
82,Nicole,Kidman,1


###  Exercise 2.3: For each artist, compute the average year of the films in which he/she has participated

In [72]:
%%sql

-- Average release year of the films each artist has a role in, latest first.
-- The subquery yields one row per film and role, carrying the film year along.
SELECT  role_json ->> 'PRENOM' AS prenom,
        role_json ->> 'NOM' AS nom,
        ROUND(AVG(year),2) AS avgYear
FROM (
    SELECT  CAST(data ->> 'Annee' as INTEGER) AS year,
            json_array_elements ( data -> 'ROLES' ) AS role_json
    FROM filmsSQL
) AS film_roles
GROUP BY prenom, nom
ORDER BY avgYear DESC
;

 * postgresql+psycopg2://@/postgres


Unnamed: 0,prenom,nom,avgyear
0,Russell,Crowe,2000.00
1,Christopher,Walken,1999.00
2,Laurence,Fishburne,1999.00
3,Keanu,Reeves,1999.00
4,Anette,Bening,1999.00
...,...,...,...
79,Cary,Grant,1959.00
80,James,Mason,1959.00
81,Eva Marie,Saint,1959.00
82,Kim,Novak,1958.00


###  Exercise 2.4: For each artist and each participation in a film, compute the age of the artist at the time of the film.

Filter out NaN values !

In [87]:
%%sql

-- Age of each artist in each film they have a role in.
-- TMP_ART has one row per artist, TMP_FILMS one row per film and role.
-- They are joined on the actor name because roles carry no artist id.
SELECT title, TMP_FILMS.prenom, TMP_FILMS.nom, TMP_FILMS.yearFilm, yearBirth, yearFilm - yearBirth as age  
FROM
(
    (
        SELECT  data ->> 'ACTPNOM' as prenom,
                data ->> 'ACTNOM' as nom,
                CAST(data ->> 'ANNEENAISS' as INTEGER) as yearBirth,
                data ->> 'id_art' as id_art
        FROM artistsSQL
    ) as TMP_ART 
    INNER JOIN
    ( 
        SELECT  title, yearFilm,
                roles ->> 'PRENOM' as prenom,
                roles ->> 'NOM' as nom,
                roles ->> 'INTITULE' as intitule
        FROM (
                SELECT  data -> 'TITRE' ->> 'title' as title,
                        CAST(data ->> 'Annee' as INTEGER) as yearFilm,
                        json_array_elements ( data -> 'ROLES' )  as roles
                FROM filmsSQL
            ) AS TMP
    ) as TMP_FILMS
    ON TMP_ART.nom = TMP_FILMS.nom AND TMP_ART.prenom = TMP_FILMS.prenom
)
-- yearBirth is an INTEGER and can never be NaN. A missing birth year is NULL,
-- so the rows to drop are the NULL ones.
WHERE yearBirth IS NOT NULL
ORDER BY TMP_FILMS.nom, TMP_FILMS.prenom
;

 * postgresql+psycopg2://@/postgres


Unnamed: 0,title,prenom,nom,yearfilm,yearbirth,age
0,Nikita,Jean-Hughes,Anglade,1990,1955,35
1,Le grand bleu,Rosanna,Arquette,1988,1959,29
2,Pulp fiction,Rosanna,Arquette,1994,1959,35
3,Les frères pétards,Josiane,Balasko,1986,1950,36
4,Les bronzés font du ski,Josiane,Balasko,1979,1950,29
...,...,...,...,...,...,...
78,Le cinquième élément,Bruce,Willis,1997,1955,42
79,L'armée des douze singes,Bruce,Willis,1995,1955,40
80,58 minutes pour vivre,Bruce,Willis,1990,1955,35
81,Piège de cristal,Bruce,Willis,1988,1955,33
