In [32]:
import pandas as pd
import wikipediaapi
import os
from SPARQLWrapper import SPARQLWrapper, JSON
import json
from sklearn.datasets import load_files

## DATA COLLECTION ##########################################################################

In [36]:
def collectBiographiesFromCategories(categorymembers, category, data, number_people, level=0, max_level=1, max_people=130):
    """
    Recursively collects biographical texts and fact graphs for the pages of a Wikipedia category.

    For every article page (namespace 0) whose title does not contain 'List', the function
    fetches the DBpedia facts, writes the page text to
    ``Biographies_<category>/<TitleWithoutSpaces>_<category>.txt``, appends the facts to
    ``knowledge_graph.json`` and records the text in ``data``. Sub-category pages are explored
    recursively until ``max_level`` is reached. Both paths are relative to the current working
    directory, and the ``Biographies_<category>`` directory must already exist.

    Parameters:
        categorymembers (dict): A dictionary of category member pages from the WikipediaAPI.
        category (str): The name of the category being processed (used as label and in file names).
        data (list): A list, mutated in place, receiving one ``{'text', 'category'}`` dict per biography.
        number_people (int): Number of biographies already collected for this category.
        level (int): Current level of recursion through category members.
        max_level (int): Maximum depth of recursion allowed.
        max_people (int): Maximum number of biographies to collect for the category (default 130).

    Returns:
        int: Updated count of processed biographies.
    """

    for page in categorymembers.values():
        # Once the cap is reached neither branch below can do anything, so stop scanning.
        if number_people >= max_people:
            break

        if page.ns == 0:
            if 'List' in page.title:
                continue

            # Some Wikipedia pages cannot be found in DBpedia (or cannot be written to disk);
            # they are reported and skipped so that one bad page does not abort the crawl.
            try:
                # knowledge graph of facts
                kg_graph = fetchDbpediaFacts(page.pageid, page.title)
                text = page.text

                # creation of txt file for the biography; `with` guarantees the handle is closed
                biography_path = f"Biographies_{category}/{(page.title).replace(' ', '')}_{category}.txt"
                with open(biography_path, "w", encoding='utf-8') as biography_file:
                    biography_file.write(text)

                # NOTE(review): every graph is appended as its own JSON document, so the file as
                # a whole is a concatenation of documents rather than one valid JSON value.
                with open('knowledge_graph.json', 'a') as json_file:
                    json.dump(kg_graph, json_file, indent=4)
            except Exception as err:
                print(f"Unexpected {err=}, {type(err)=}, could not process {page.title}")
                continue
            else:
                # Record the biography only after everything was written, so `data`, the files
                # on disk and the counter always agree.
                data.append({'text': text, 'category': category})
                number_people += 1
                print(f"{category} {page.title} processed")

        elif page.ns == wikipediaapi.Namespace.CATEGORY and level < max_level:
            # Recursively call the function to process pages in the subcategory
            number_people = collectBiographiesFromCategories(
                page.categorymembers, category, data, number_people,
                level=level + 1, max_level=max_level, max_people=max_people,
            )

    # return updated count of processed people
    return number_people

In [37]:
def createBiographyDataFrames():
    """
    Collects biographical data from Wikipedia categories 'Sculptors' and 'Journalists',
    creates directories for storing these biographies, and formats the collected data into a pandas DataFrame.

    Everything is written under a ``Biographies`` directory located in the current working
    directory (one ``Biographies_<category>`` sub-directory per category). The working
    directory is switched to ``Biographies`` only for the duration of the collection and is
    restored afterwards, so running this function (or its notebook cell) several times does
    not nest ``Biographies/Biographies/...``.

    Returns:
        DataFrame: A pandas DataFrame with columns 'text' and 'category', one row per biography collected.
    """

    wiki = wikipediaapi.Wikipedia('Mozilla/5.0', 'en')

    data = []

    original_cwd = os.getcwd()
    os.makedirs('Biographies', exist_ok=True)
    # The collector writes to paths relative to the working directory, hence the chdir.
    os.chdir('Biographies')
    try:
        for category in ("Sculptors", "Journalists"):
            # creates a directory to store all biographies of this category
            os.makedirs(f'Biographies_{category}', exist_ok=True)
            category_page = wiki.page(f"Category:{category}")
            collectBiographiesFromCategories(category_page.categorymembers, category, data, 0)
    finally:
        # Always restore the caller's working directory, even if the collection fails.
        os.chdir(original_cwd)

    # Explicit columns keep the frame's schema stable even when nothing was collected.
    return pd.DataFrame(data, columns=['text', 'category'])

In [38]:
def fetchDbpediaFacts(wiki_id, page_title, max_facts=100):
    """
    Fetches RDF triples (facts) about a specific Wikipedia page from DBpedia using the page's ID.

    Two SPARQL queries are sent to the public DBpedia endpoint: the first resolves the
    Wikipedia page ID to a DBpedia resource, the second runs DESCRIBE on that resource.

    Parameters:
        wiki_id (int): The Wikipedia page ID used to fetch the corresponding DBpedia page.
        page_title (str): Title of the Wikipedia page.
        max_facts (int): Maximum number of result bindings kept from the DESCRIBE answer (default 100).

    Returns:
        dict: The converted DESCRIBE result, with 'results'/'bindings' truncated to the first
        ``max_facts`` entries and the page title (spaces removed) stored under 'head'/'person'.

    Raises:
        IndexError: If DBpedia has no resource for the given Wikipedia page ID.
    """

    #initialize the SPARQL wrapper; the return format applies to both queries below
    sparql = SPARQLWrapper("http://dbpedia.org/sparql/")
    sparql.setReturnFormat(JSON)

    #first query to retrieve the dbpedia page of the person from the wikipedia page id
    #int() makes sure only a number can be interpolated into the query text
    sparql.setQuery(f"""
            PREFIX dbo: <http://dbpedia.org/ontology/>
            PREFIX dbr: <http://dbpedia.org/resource/>
            PREFIX dbp: <http://dbpedia.org/property/>

            SELECT *
            WHERE {{
                    ?person dbo:wikiPageID {int(wiki_id)} .
            }}
    """)
    result_link = sparql.query().convert()

    bindings = result_link["results"]["bindings"]
    if not bindings:
        # Same exception type as the former bare list-index failure, but with a usable message.
        raise IndexError(f"no DBpedia resource found for Wikipedia page id {wiki_id} ({page_title})")
    person_dbpedia_uri = bindings[0]["person"]["value"] #the first value is most likely what we are looking for

    #second query to fetch all information as RDF triples
    sparql.setQuery(f"""
            PREFIX dbo: <http://dbpedia.org/ontology/>
            PREFIX dbr: <http://dbpedia.org/resource/>
            PREFIX dbp: <http://dbpedia.org/property/>

            DESCRIBE <{person_dbpedia_uri}>
    """)
    results_facts = sparql.query().convert()
    results_facts['results']['bindings'] = results_facts['results']['bindings'][:max_facts] #first max_facts facts
    results_facts['head']['person'] = page_title.replace(" ", "")

    return results_facts

# around 331 000 lines

## DATA ANALYSIS ############################################################################

In [35]:
"""
def vocabulary():
    #os.chdir('/Users/abigail.berthe/OneDrive/M1_TAL/S2/Data_Science/m1-datascience-project')
    print(os.getcwd())
    # Loading all files in "dir" directory into a pandas dataframe

    DATA_DIR = f"./"
    print(DATA_DIR)
    data = load_files(DATA_DIR, encoding="utf-8", decode_error="replace")
    print(data)
    
    # get the index of 'Biographies_Journalists' and 'Biographies_Sculptors'
    indexes = { data['target_names'].index('Biographies_Journalists') : 'Journalists', data['target_names'].index('Biographies_Sculptors') : 'Sculptors'}
    
    df = pd.DataFrame(list(zip(data['data'], [indexes[x] for x in  data['target']])), columns=['text', 'category'])
    print(df.head(10))
    print(len(df))
    
vocabulary()
"""



c:\Users\Etudiant\M1_TAL\m1-datascience-project\src
./
       './Biographies_Journalists\\PantaziGhica_Journalists.txt',
       './Biographies_Sculptors\\JesúsRafaelSoto_Sculptors.txt',
       './Biographies_Journalists\\FreedomofthepressinUkraine_Journalists.txt',
       './Biographies_Sculptors\\KudjoeAffutu_Sculptors.txt',
       './Biographies_Journalists\\OlympeBhely-Quenum_Journalists.txt',
       './Biographies_Journalists\\CharlesG.Halpine_Journalists.txt',
       './Biographies_Sculptors\\AgimRada_Sculptors.txt',
       './Biographies_Journalists\\I.RobertoEisenmannJr._Journalists.txt',
       './Biographies_Sculptors\\CalixteDakpogan_Sculptors.txt',
       './Biographies_Journalists\\JuhaniAho_Journalists.txt',
       './Biographies_Journalists\\EdwardLofley_Journalists.txt',
       './Biographies_Sculptors\\EricAdjeteyAnang_Sculptors.txt',
       './Biographies_Sculptors\\EdvardEriksen_Sculptors.txt',
       './Biographies_Sculptors\\JulioAguilera_Sculptors.txt',
       './B

## CLUSTERING ###############################################################################

## MAIN #####################################################################################

In [39]:
def main():
    """Run the biography collection and print the resulting DataFrame."""
    biographies = createBiographyDataFrames()
    print(biographies)

In [40]:
# Entry point: start the pipeline only when this code is the top-level program.
if __name__ == "__main__":
    main()

Unexpected err=IndexError('list index out of range'), type(err)=<class 'IndexError'>, could not process Margaret Trowell




Sculptors George Lilanga processed
Sculptors Eric Gill works at the Midland Hotel, Morecambe processed
Sculptors New British Sculpture processed
Sculptors New Sculpture processed
Sculptors Orestes Acquarone processed
Sculptors Georges Adéagbo processed
Sculptors Kudjoe Affutu processed
Sculptors Julio Aguilera processed
Sculptors Adebisi Akanji processed
Sculptors Kwame Akoto-Bamfo processed
Sculptors Sunday Jack Akpan processed
Sculptors Enrique Alférez processed
Sculptors José Simões de Almeida processed
Sculptors Julio Alpuy processed
Sculptors Maximiano Alves processed
Sculptors Eric Adjetey Anang processed
Sculptors El Anatsui processed
Sculptors Mark Antokolsky processed
Sculptors Massoud Arabshahi processed
Sculptors Carmelo Arden Quin processed
Sculptors Pablo Atchugarry processed
Sculptors Kevin Atherton processed
Sculptors Armando de Basto processed
Sculptors Alberto Bautista Gómez processed
Sculptors José Belloni processed
Sculptors Stelio Belloni processed
Sculptors Ahron B