In [2]:
import pandas as pd

# List of French Writers abritrarily defined and chosen in the 17th, 18th and 19th century
authors = [
    # 17th century
    "Honoré d'Urfé", "Madeleine de Scudéry", "Paul Scarron", "Jean de La Fontaine",
    "Madame de Lafayette", "Charles Sorel", "Gauthier de Costes de La Calprenède",
    "Vincent Voiture", "Jean-Pierre Camus", "Marie-Catherine d'Aulnoy",
    
    # 18th century
    "Montesquieu", "Voltaire", "Jean-Jacques Rousseau", "Denis Diderot", "Marivaux",
    "Claude Crébillon", "Louis-Sébastien Mercier", "Bernardin de Saint-Pierre",
    "Choderlos de Laclos", "Restif de la Bretonne", "Madame de Genlis",
    "Gabriel Sénac de Meilhan", "Abbé Prévost", "François Gayot de Pitaval",
    "Jean Potocki", "Nicolas Edme Restif de La Bretonne", "Jean-François Marmontel",
    "Pierre Choderlos de Laclos", "Jean-Baptiste Louvet de Couvray", "Jean-Baptiste de Boyer d'Argens",
    
    # 19th century
    "Honoré de Balzac", "Victor Hugo", "Alexandre Dumas", "Gustave Flaubert", "Émile Zola",
    "Stendhal", "Alfred de Musset", "George Sand", "Jules Verne", "Alphonse Daudet",
    "Théophile Gautier", "Charles Baudelaire", "Théodore de Banville", "Edmond de Goncourt",
    "Joris-Karl Huysmans", "Octave Mirbeau", "Félicien Champsaur", "Gustave Aimard",
    "Prosper Mérimée", "Paul Féval", "Eugène Sue", "Félicité de La Mennais", "Charles Nodier",
    "Barbey d'Aurevilly", "Léon Bloy", "Georges Ohnet", "Paul de Kock",
    "Jules Barbey d'Aurevilly", "Gaston Leroux", "Édouard Rod"
]

# Create a DataFrame
df = pd.DataFrame(authors, columns=["Authors"])

# Ajouter une colonne pour la période
df["Period"] = (
    ["17th century"] * 10 +
    ["18th century"] * 20 +
    ["19th century"] * 30
)

# Afficher le DataFrame
print(df)


                                Authors        Period
0                         Honoré d'Urfé  17th century
1                  Madeleine de Scudéry  17th century
2                          Paul Scarron  17th century
3                   Jean de La Fontaine  17th century
4                   Madame de Lafayette  17th century
5                         Charles Sorel  17th century
6   Gauthier de Costes de La Calprenède  17th century
7                       Vincent Voiture  17th century
8                     Jean-Pierre Camus  17th century
9              Marie-Catherine d'Aulnoy  17th century
10                          Montesquieu  18th century
11                             Voltaire  18th century
12                Jean-Jacques Rousseau  18th century
13                        Denis Diderot  18th century
14                             Marivaux  18th century
15                     Claude Crébillon  18th century
16              Louis-Sébastien Mercier  18th century
17            Bernardin de S

In [3]:
import requests

url_api = "https://openlibrary.org/search.json?"

# Function to get the title of the books and the themes
def get_random_book(author):
    url  = url_api + f'q=author:{author}' + '&fields=title,first_publish_year,subject'
    req = requests.get(url)
    # Check if the request worked
    if req.status_code == 200:
        books = req.json().get("docs", [])
        # Select only books for which themes are defined in the API and for which the number of themes are above 5 to avoid irrelevant book descriptions and for which the publication dates back before 1900
        books_with_themes = [book for book in books if book.get("subject") and len(book["subject"]) >= 5 and book.get("first_publish_year") and book.get("first_publish_year") <= 1900] # book.get("first_publish_year") to avoid NoneType errors
        num_books = len(books_with_themes)
        if num_books > 0:
            # Si l'auteur a au moins 3 livres, on en sélectionne 3
            if num_books >= 5:
                selected_books = books_with_themes[:5]
            elif num_books == 4 : 
                selected_books = books_with_themes[:4]
            elif num_books == 3 : 
                selected_books = books_with_themes[:3]
            elif num_books == 2:
                selected_books = books_with_themes[:2]
            elif num_books == 1:
                selected_books = books_with_themes
            book_info = []
            for info in selected_books:
                title = info.get("title", "N/A")
                themes = info.get("subject", "N/A")
                year = info.get("first_publish_year", "N/A")
                book_info.append((author, title, year, themes))
            return book_info
        else:
            return [(author, '', '','')]
    else:
        return f"The request for {author} failed"
    




In [None]:
list_books = []
for author in df["Authors"]:
    list_books.append(get_random_book(author))

print(list_books)


[[("Honoré d'Urfé", '', '', '')], [('Madeleine de Scudéry', 'Artamène', 1691, ['Court and courtiers', 'Fiction', 'Social conditions', 'Social life and customs', 'Women', 'Fiction, general', 'France, fiction'])], [('Paul Scarron', 'Svr la conference de Rvel en mars', 1649, ['Fronde', 'Poetry', 'History', 'France. 1649 March 11', 'France'])], [('Jean de La Fontaine', 'Fables', 1678, ['French Fables', 'Translations into English', 'Fables', 'Oversize books', 'Illustrations', 'Adaptations', 'Translations into Malagasy', 'Specimens', "Aesop's fables", 'English Fables', 'French language', 'Translations into French Creole', 'Readers', 'Poetry', 'French language materials', 'Translations into Yiddish', 'Juvenile literature', 'Translations from French', 'Translations into Arabic', 'Translations into Esperanto', 'Translations into Occitan', 'Toy and movable books', 'Dialects', 'Latin language', 'Yiddish Fables', 'Translations into Italian', 'Translations', 'Translations into Vietnamese', 'Transla

In [None]:
# As we have a : list[list[tuples]], we convert it to list[tuples] to transform it to a df afterward
flattened_books = []
for author_data in list_books:
    for book in author_data:
        flattened_books.append(book)

# Convertir en DataFrame
df_books = pd.DataFrame(flattened_books, columns=['Author', 'Title', 'Year', 'Themes'])


df_books


Unnamed: 0,Author,Title,Year,Themes
0,Honoré d'Urfé,,,
1,Madeleine de Scudéry,Artamène,1691,"[Court and courtiers, Fiction, Social conditio..."
2,Paul Scarron,Svr la conference de Rvel en mars,1649,"[Fronde, Poetry, History, France. 1649 March 1..."
3,Jean de La Fontaine,Fables,1678,"[French Fables, Translations into English, Fab..."
4,Jean de La Fontaine,Contes et nouvelles en vers,1685,"[Translations into English, Translations into ..."
...,...,...,...,...
144,Georges Ohnet,,,
145,Paul de Kock,Oeuvres,1864,"[Description and travel, Travel, History, Libr..."
146,Jules Barbey d'Aurevilly,,,
147,Gaston Leroux,,,


In [None]:
# delete lines of the DF for which no book was found
df_books = df_books[df_books['Title'] != '']
df_books

Unnamed: 0,Author,Title,Year,Themes
0,Madeleine de Scudéry,Artamène,1691,"[Court and courtiers, Fiction, Social conditio..."
1,Paul Scarron,Svr la conference de Rvel en mars,1649,"[Fronde, Poetry, History, France. 1649 March 1..."
2,Jean de La Fontaine,Fables,1678,"[French Fables, Translations into English, Fab..."
3,Jean de La Fontaine,Contes et nouvelles en vers,1685,"[Translations into English, Translations into ..."
4,Charles Sorel,De la connoissance des bons livres,1671,"[French language, French literature, History, ..."
...,...,...,...,...
129,Barbey d'Aurevilly,Les bas-bleus,1878,"[History and criticism, French literature, Lit..."
130,Barbey d'Aurevilly,Les philosophes et les écrivains religieux,1860,"[French Philosophy, French Religious literatur..."
131,Barbey d'Aurevilly,Les poètes,1889,"[French Poets, French poetry, Histoire et crit..."
132,Barbey d'Aurevilly,Les vieilles actrices,1884,"[Actresses, Actresses, French, Authors, French..."


In [None]:
# Transform the list of themes in a string
df_books['Themes'] = df_books['Themes'].apply(lambda x: ', '.join(x))
df_books

Unnamed: 0,Author,Title,Year,Themes
0,Madeleine de Scudéry,Artamène,1691,"Court and courtiers, Fiction, Social condition..."
1,Paul Scarron,Svr la conference de Rvel en mars,1649,"Fronde, Poetry, History, France. 1649 March 11..."
2,Jean de La Fontaine,Fables,1678,"French Fables, Translations into English, Fabl..."
3,Jean de La Fontaine,Contes et nouvelles en vers,1685,"Translations into English, Translations into O..."
4,Charles Sorel,De la connoissance des bons livres,1671,"French language, French literature, History, H..."
...,...,...,...,...
129,Barbey d'Aurevilly,Les bas-bleus,1878,"History and criticism, French literature, Litt..."
130,Barbey d'Aurevilly,Les philosophes et les écrivains religieux,1860,"French Philosophy, French Religious literature..."
131,Barbey d'Aurevilly,Les poètes,1889,"French Poets, French poetry, Histoire et criti..."
132,Barbey d'Aurevilly,Les vieilles actrices,1884,"Actresses, Actresses, French, Authors, French,..."


In [65]:
!pip install fuzzywuzzy


Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [None]:
!pip install python-Levenshtein
#erreur sans ce package /opt/conda/lib/python3.12/site-packages/fuzzywuzzy/fuzz.py:11: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')



Collecting python-Levenshtein
  Downloading python_Levenshtein-0.26.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.26.1 (from python-Levenshtein)
  Downloading levenshtein-0.26.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.2 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.26.1->python-Levenshtein)
  Downloading rapidfuzz-3.10.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading python_Levenshtein-0.26.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.26.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (162 kB)
Downloading rapidfuzz-3.10.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m45.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, Levenshtein, python-Levenshtein
Successfully installed Levenshtein-0.26.1 python-Levenshtein-0.26.1 rapidfuzz-3.10

In [4]:
import time


In [None]:

from urllib.parse import quote
from langdetect import detect

url = "https://lingva.ml/api/v1/"
source = 'auto/' # auto as some themes may be written in another language than English
dest = 'fr/'
for i in range(len(df_books['Title'])):
    if detect(df_books['Title'][i]) != 'fr':
        url_api_transl = url + source + dest + quote(df_books['Title'][i])
        response = requests.get(url_api_transl)
        translated_themes = response.json().get("translation", "")
        print(df_books['Title'][i], '/' , translated_themes)
        df_books.loc[i,'Title'] = translated_themes
        time.sleep(2) # The API limit the flow of request


Artamène / Artamène
Svr la conference de Rvel en mars / Conférence de Svr Rvel sur Mars
The Prince of Carency / Le Prince de Carency
The spirit of laws, including d'Alembert's analysis of the work / L'esprit des lois, y compris l'analyse de l'ouvrage par d'Alembert
A view of the English constitution / Une vue de la constitution anglaise
Candide / Candide
Du contrat social / Du contrat social
Emile or Education / Emile ou l'éducation
Religieuse / Religieuse
Paul and Virginia / Paul et Virginie
Studies of nature / Etudes sur la nature
Etudes de la nature / Etudes de la nature
Quatrevingt-treize / Quatrevingt-treize
Hernani / Hernani
Ruy Blas / Ruy Blas
Vingt ans apres̀ / Vingt ans apres̀
Madame Bovary / Madame Bovary
Salammbô / Salammbô
Germinal / Germinal
Nana / Nana
La terre / La terre
Cartuja de Parma, La / Chartreuse de Parme, La
Lucien Leuwen / Lucien Louvain
Gamiani, or Two Nights of Excess / Gamiani, ou deux nuits d'excès
La charca del diablo / L'étang du diable
Indiana / Indiana


In [115]:
def remove_similar_books_from_df(df, threshold=80):
    to_remove = []  # Liste pour enregistrer les indices des livres à supprimer

    # Grouper les livres par auteur
    list_books_per_author = df.groupby('Author')['Title'].apply(list)

    # Parcourir chaque groupe d'auteur
    for author, books in list_books_per_author.items():
        # Comparer tous les titres dans chaque groupe
        for i in range(len(books)):
            for j in range(i + 1, len(books)):  # Comparer un titre avec tous les autres après lui
                similarity = fuzz.ratio(books[i], books[j])
                if similarity >= threshold:
                    # Si les titres sont similaires, ajouter l'indice à supprimer
                    index_to_remove = df[(df['Author'] == author) & (df['Title'] == books[j])].index
                    to_remove.extend(index_to_remove)  # Ajouter l'indice à la liste

    # Supprimer les lignes contenant les indices marqués
    df = df.drop(to_remove).reset_index(drop=True)
    
    return df

In [118]:
df_books = remove_similar_books_from_df(df_books)

In [119]:
df_books

Unnamed: 0,Author,Title,Year,Themes
0,Madeleine de Scudéry,Artamène,1691,"Court and courtiers, Fiction, Social condition..."
1,Paul Scarron,Conférence de Svr Rvel sur Mars,1649,"Fronde, Poetry, History, France. 1649 March 11..."
2,Jean de La Fontaine,Fables,1678,"French Fables, Translations into English, Fabl..."
3,Jean de La Fontaine,Contes et nouvelles en vers,1685,"Translations into English, Translations into O..."
4,Charles Sorel,De la connoissance des bons livres,1671,"French language, French literature, History, H..."
...,...,...,...,...
119,Barbey d'Aurevilly,Les bas-bleus,1878,"History and criticism, French literature, Litt..."
120,Barbey d'Aurevilly,Les philosophes et les écrivains religieux,1860,"French Philosophy, French Religious literature..."
121,Barbey d'Aurevilly,Les poètes,1889,"French Poets, French poetry, Histoire et criti..."
122,Barbey d'Aurevilly,Les vieilles actrices,1884,"Actresses, Actresses, French, Authors, French,..."


In [112]:
df_books.to_csv('text.csv', index=False)

In [44]:
df_books.to_csv('books0.csv', index=False)

In [113]:
df_books = pd.read_csv('/home/onyxia/work/libroguessr/Data/books0.csv')
df_books

Unnamed: 0,Author,Title,Year,Themes
0,Madeleine de Scudéry,Artamène,1691,"Court and courtiers, Fiction, Social condition..."
1,Paul Scarron,Svr la conference de Rvel en mars,1649,"Fronde, Poetry, History, France. 1649 March 11..."
2,Jean de La Fontaine,Fables,1678,"French Fables, Translations into English, Fabl..."
3,Jean de La Fontaine,Contes et nouvelles en vers,1685,"Translations into English, Translations into O..."
4,Charles Sorel,De la connoissance des bons livres,1671,"French language, French literature, History, H..."
...,...,...,...,...
129,Barbey d'Aurevilly,Les bas-bleus,1878,"History and criticism, French literature, Litt..."
130,Barbey d'Aurevilly,Les philosophes et les écrivains religieux,1860,"French Philosophy, French Religious literature..."
131,Barbey d'Aurevilly,Les poètes,1889,"French Poets, French poetry, Histoire et criti..."
132,Barbey d'Aurevilly,Les vieilles actrices,1884,"Actresses, Actresses, French, Authors, French,..."


In [48]:
df_books.sample(20)

Unnamed: 0,Author,Title,Year,Themes
58,Honoré de Balzac,Illusions perdues,1837,"Social life and customs, Translations into Chi..."
88,George Sand,Indiana,1832,"Fiction, Man-woman relationships, Marriage, Wo..."
79,Émile Zola,Au bonheur des dames,1883,"Department stores, Fiction, French fiction, Li..."
50,Jean-Baptiste Louvet de Couvray,Accusation intentée dans la convention nation...,1792,"Trials, litigation, Trials (Political crimes a..."
42,Abbé Prévost,Histoire générale des voyages ou Nouvelle coll...,1746,"Voyages and travels, Voyages, Description and ..."
118,Félicien Champsaur,Masques modernes,1889,"Social life and customs, Popular culture, Thea..."
7,Montesquieu,De l'esprit des lois,1748,"Derecho, Filosofía, Jurisprudence, great_books..."
72,Gustave Flaubert,Salammbô,1863,"Fiction, History, Continental european fiction..."
32,Bernardin de Saint-Pierre,Voyage à l'île de France,1775,"Early works to 1800, Description and travel, F..."
71,Gustave Flaubert,L'Éducation sentimentale,1898,"Fiction, Young men, History, Married women, Un..."


In [120]:
# The goal is now to clean the themes to only keep relevant themes, and to translate them into French
# To translate themes, we use the API Lingva

url = "https://lingva.ml/api/v1/"
source = 'auto/' # auto as some themes may be written in another language than English
dest = 'fr/'

for i in range(len(df_books['Themes'])):
    if detect(df_books['Themes'][i]) != 'fr' :
        url_api_transl = url + source + dest + df_books['Themes'][i]
        response = requests.get(url_api_transl)
        if response.status_code == 200:
            translated_themes = response.json().get("translation", "")
            df_books.loc[i,'Themes'] = translated_themes
        time.sleep(2)

In [122]:
df_books.sample(20)


Unnamed: 0,Author,Title,Year,Themes
82,George Sand,Indiana,1832,"Fiction, Relations homme-femme, Mariage, Femme..."
84,George Sand,Lucrèce Floriani,1846,"Littérature classique, fiction, fiction britan..."
28,Bernardin de Saint-Pierre,Voyage à l'île de France,1775,"Travaux de jeunesse jusqu'à 1800, Description ..."
101,Edmond de Goncourt,Journal,1887,"Journaux, Vie sociale et mœurs, Vie intellectu..."
63,Alexandre Dumas,La dame aux camélias [novel],1848,"Littérature classique, Fiction, OverDrive, Cou..."
67,Gustave Flaubert,Trois contes,1877,"Littérature classique, Fiction, Langue françai..."
86,Jules Verne,Le Tour du Monde en Quatre-Vingts Jours,1872,"Voyages autour du monde, Traductions en gujara..."
26,Louis-Sébastien Mercier,Du théâtre,1773,"Drama, French drama, Théâtre, Théâtre (Genre l..."
102,Edmond de Goncourt,La femme au dix-huitieme siecle,1862,"Questions sociales et morales, Femmes, Salons,..."
25,Louis-Sébastien Mercier,Le nouveau Paris,1798,"Récits personnels, Histoire, Influence, Récits..."


In [6]:
df_books.to_csv('books1.csv', index=False)

NameError: name 'df_books' is not defined

In [82]:
df_books = pd.read_csv('/home/onyxia/work/libroguessr/Data/books1.csv')
df_books

Unnamed: 0,Author,Title,Year,Themes
0,Madeleine de Scudéry,Artamène,1691,"Cour et courtisans, Fiction, Conditions social..."
1,Paul Scarron,Conférence de Svr Rvel sur Mars,1649,"Fronde, Poésie, Histoire, France. 1649 11 mars..."
2,Jean de La Fontaine,Fables,1678,"Fables françaises, Traductions en anglais, Fab..."
3,Jean de La Fontaine,Contes et nouvelles en vers,1685,"Traductions en anglais, Traductions en occitan..."
4,Charles Sorel,De la connoissance des bons livres,1671,"Langue française, Littérature française, Histo..."
...,...,...,...,...
119,Barbey d'Aurevilly,Les bas-bleus,1878,"History and criticism, French literature, Litt..."
120,Barbey d'Aurevilly,Les philosophes et les écrivains religieux,1860,"French Philosophy, French Religious literature..."
121,Barbey d'Aurevilly,Les poètes,1889,"French Poets, French poetry, Histoire et criti..."
122,Barbey d'Aurevilly,Les vieilles actrices,1884,"Actrices, Actrices, Françaises, Auteurs, Franç..."


In [51]:
!pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25ldone
[?25h  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993222 sha256=08c4b253689c442c6442151bdbb46a012ffb439f937b9bd4e5d1912d328f39ef
  Stored in directory: /home/onyxia/.cache/pip/wheels/c1/67/88/e844b5b022812e15a52e4eaa38a1e709e99f06f6639d7e3ba7
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [None]:
#TODO : enlever les mots en trop dans les titres : ex: 67 : Dumas
# The goal is now to clean the themes to only keep relevant themes, and to translate them into French



46

In [83]:
df_books['Themes'][121]

'French Poets, French poetry, Histoire et critique, History and criticism, Poets, French, Poésie française'

In [84]:
#Map themes in order to clean themes in the df 
themes_mapping = {
    "Amour": ["Fiction", "Relations homme-femme", "Amour", "Séduction"],
    "Héroïsme": ["Fiction", "Héroïsme", "Héroïsme"],
    "Morale": ["Conditions sociales", "Conduite de vie", "Morale", "Conscience (Morale)"],
    "Intrigues": ["Fiction", "Intrigues"],
    "Aventures": ["Aventures", "Voyages", "Récits d'aventures"],
    "Critique": ["Critique et interprétation", "Histoire et critique", "Critique", "Philosophie", "Satire"],
    "Liberté": ["Liberté", "Égalité", "Liberté religieuse", "Liberté"],
    "Éducation": ["Éducation", "Éducation", "Ouvrages de jeunesse jusqu'à 1800"],
    "Voyages": ["Voyages", "Voyages imaginaires", "Voyages, imaginary", "Voyages"],
    "Nature": ["Nature", "Histoire naturelle", "La nature dans la littérature", "Aspects religieux de la nature"],
    "Ironie": ["Ironie", "Satire", "Humour"],
    "Passion": ["Passion", "Émotions", "Vie sociale et coutumes"],
    "Mélancolie": ["Mélancolie", "Tristesse"],
    "Fantastique": ["Fantastique", "Fiction fantastique", "Imaginaire"],
    "Mœurs": ["Mœurs et coutumes", "Vie sociale et mœurs", "Mœurs et usages"],
    "Ambitions": ["Ambitions", "Luxe", "Richesse", "Pouvoir"],
    "Pouvoir": ["Pouvoir", "Droit", "Politique et gouvernement"],
    "Argent": ["Argent", "Finance", "Luxe", "Richesse"],
    "Hérédité": ["Généalogie", "Biographie", "Histoire familiale"],
    "Inégalités": ["Inégalités", "Sociale"],
    "Instincts": ["Instincts", "Comportements"],
    "Déclin": ["Déclin", "Chute"],
    "Oppression": ["Oppression", "Répression"],
    "Histoire": ["Histoire", "Historique", "Historique et critique","Société", "Évolutions sociales", "Révolution française", "Histoire", "Politique", "Révolutions"],
    "Religion": ["Religion", "Christianisme", "Religieuses","Jésuites", "Histoire de l'Église", "Église catholique", "Œuvres apologétiques"],
    "Coutumes": ["Coutumes", "Mœurs et coutumes"],
    "Biographie": ["Biographie", "Autobiographie","Fiction biographique"],
    "Esthétique": ["Esthétique", "Art"],
    "Décadence": ["Décadence", "Chute"],
    "Théâtre": ["Théâtre", "Drame", "Théâtre en vers"],
    "Roman": ["Roman", "Fiction", "Roman historique"],
    "Drame": ["Drame", "Fiction", "Théâtre"],
    "Philosophie": ["Philosophie", "Philosophie moderne"],
    "Société": ["Société", "Conditions sociales", "Structure sociale", "Mœurs"],
    "Récits de guerre": ["Récits de guerre", "Conflits", "Histoire militaire"],
    "Nature et environnement": ["Nature", "Écologie", "Environnement", "Conservation"],
    "Techniques": ["Technologie", "Science", "Innovations"],
    "Mythes et légendes": ["Mythes", "Légendes", "Fables", "Folklore"],
    "Épouvante": ["Épouvante", "Horreur", "Suspense", "Thriller"],
    "Justice": ["Justice", "Loi", "Droits", "Éthique"],
    "Réalité et illusion": ["Réalité", "Illusion", "Perception", "L'irréel"],
    "Poésie": ["Poésie", "Poésie française", "Poésie", "Poésie française", "Littérature poétique", "Matériaux en langue française"],
    "Critique d'art": ["Critique d'art", "Histoire", "Art moderne", "Peinture", "Critique d'art", "Arts modernes"],
    "Littérature fantastique": ["Littérature fantastique", "Critique et interprétation", "Fiction fantastique", "Arts"],
    "Décadence": ["Décadence", "Chute", "Esthétique", "Fiction"],
    "Utopie": ["Utopie", "Littérature utopique", "Fiction spéculative", "Socialisme utopique"],
    "Luxe et débauche": ["Luxe et débauche", "Fiction", "Vie de cour", "Aristocratie", "Luxe", "Critique sociale"]
}


In [85]:
# Fonction de mapping des thèmes
def map_themes(row):
    # Diviser les thèmes par virgule
    themes = row.split(", ")
    mapped_themes = []
    
    # Vérifier et mapper chaque thème
    for theme in themes:
        # Chercher le thème dans le dictionnaire
        for key, values in themes_mapping.items():
            if theme in values:
                mapped_themes.append(key)
                break
    
    # Retourner les thèmes mappés sous forme de chaîne
    return ", ".join(mapped_themes)

# Appliquer la fonction de mapping sur la colonne "Themes"
df_books["Themes"] = df_books["Themes"].apply(map_themes)


In [86]:
df_books.sample(20)

Unnamed: 0,Author,Title,Year,Themes
58,Victor Hugo,Ruy Blas,1838,Amour
25,Louis-Sébastien Mercier,Le nouveau Paris,1798,"Histoire, Pouvoir, Passion"
65,Gustave Flaubert,L'Éducation sentimentale,1898,"Amour, Histoire, Amour, Passion, Mœurs, Amour,..."
26,Louis-Sébastien Mercier,Du théâtre,1773,Théâtre
117,Charles Nodier,Bibliothèque sacrée grecque-latine,1826,Hérédité
3,Jean de La Fontaine,Contes et nouvelles en vers,1685,"Amour, Poésie, Mythes et légendes"
15,Jean-Jacques Rousseau,Du contrat social,1762,"Amour, Critique, Liberté, Liberté, Pouvoir"
97,Charles Baudelaire,Essais,1890,"Critique, Esthétique, Critique d'art, Critique..."
98,Charles Baudelaire,,1857,
122,Barbey d'Aurevilly,Les vieilles actrices,1884,


In [87]:
# Remove empty lines of the df as iit means that themes obtained in the API where not relevant for our analysis
df_books = df_books[df_books['Themes'] != '']


In [88]:
df_books.reset_index(drop=True, inplace=True)

In [89]:
df_books

Unnamed: 0,Author,Title,Year,Themes
0,Madeleine de Scudéry,Artamène,1691,"Amour, Morale, Mœurs, Amour"
1,Paul Scarron,Conférence de Svr Rvel sur Mars,1649,"Poésie, Histoire"
2,Jean de La Fontaine,Fables,1678,"Mythes et légendes, Poésie, Amour, Amour"
3,Jean de La Fontaine,Contes et nouvelles en vers,1685,"Amour, Poésie, Mythes et légendes"
4,Charles Sorel,De la connoissance des bons livres,1671,"Histoire, Critique"
...,...,...,...,...
110,Charles Nodier,Mélanges tirés d'une petite bibliothèque,1829,Critique
111,Barbey d'Aurevilly,Les bas-bleus,1878,Critique
112,Barbey d'Aurevilly,Les philosophes et les écrivains religieux,1860,Critique
113,Barbey d'Aurevilly,Les poètes,1889,"Critique, Poésie"


In [90]:
# Remove duplicated themes
df_books['Themes'] = df_books['Themes'].apply(lambda x: set(x.split(", ")) if isinstance(x, str) else x)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_books['Themes'] = df_books['Themes'].apply(lambda x: set(x.split(", ")) if isinstance(x, str) else x)


In [None]:
# Transform the list of themes in a string
df_books['Themes'] = df_books['Themes'].apply(lambda x: ", ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_books['Themes'] = df_books['Themes'].apply(lambda x: ", ".join(x))


In [105]:
df_books['Themes'][15]

'Liberté, Amour, Critique, Pouvoir'

In [108]:
df_books.sample(15)

Unnamed: 0,Author,Title,Year,Themes
64,Gustave Flaubert,Bouvard et Pécuchet,1896,"Amour, Critique"
78,George Sand,Histoire de ma vie,1854,"Amour, Hérédité"
5,Marie-Catherine d'Aulnoy,Relation du voyage d'Espagne,1691,"Mœurs, Histoire, Voyages, Aventures"
85,Théophile Gautier,Mademoiselle de Maupin,1834,Amour
73,Stendhal,Lucien Louvain,1894,"Amour, Pouvoir, Passion"
62,Gustave Flaubert,Salammbô,1863,"Histoire, Amour"
79,Jules Verne,Voyage au Centre de la Terre,1867,"Voyages, Amour, Aventures, Fantastique, Épouvante"
75,Alfred de Musset,"Gamiani, ou deux nuits d'excès",1800,Amour
104,Prosper Mérimée,Nouvelles,1874,Amour
109,Charles Nodier,Bibliothèque sacrée grecque-latine,1826,Hérédité


In [109]:
df_books.to_csv('books2.csv', index=False)