#Trouver les top 100 pollueurs

**Liste des top 100 pollueurs** https://www.theguardian.com/sustainable-business/2017/jul/10/100-fossil-fuel-companies-investors-responsible-71-global-emissions-cdp-study-climate-change?CMP=share_btn_tw

#Extraction des informations du top 100 sur Wikidata

In [None]:
# Mount Google Drive in the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import csv

# Company names read from the first column of the CSV, stored as dict keys;
# every value is a None placeholder.
pollueurs = {}

# The encoding is explicit, like the other CSV files opened in this notebook.
with open('/Data/Top_pollueurs.csv', newline='', encoding='utf-8') as csvfile:
    lecteur_csv = csv.reader(csvfile)
    for row in lecteur_csv:
        # csv.reader yields [] for a blank line, so row[0] would raise
        # IndexError; rows with an empty first cell carry no name either.
        if not row or not row[0].strip():
            continue
        pollueurs[row[0]] = None

print(pollueurs)

À partir de notre liste de pollueurs, on cherche leur ID Wikidata.

In [None]:
import requests

def search_entity(name):
    """Return the Wikidata ID of the first search hit for ``name``.

    Args:
        name: Text to search for (the search is run with ``language=en``).

    Returns:
        The ``id`` field of the first search result, or None when the HTTP
        status is not 200 or the search returns no result.
    """
    # Passing the query through ``params`` lets requests URL-encode it, so a
    # name containing "&", "#" or "+" is sent intact instead of corrupting
    # the query string.
    params = {
        "action": "wbsearchentities",
        "search": name,
        "language": "en",
        "format": "json",
    }
    # A timeout keeps one stalled connection from hanging the whole run.
    response = requests.get(
        "https://www.wikidata.org/w/api.php", params=params, timeout=30
    )
    if response.status_code != 200:
        return None
    search_results = response.json().get("search", [])
    if not search_results:
        return None
    return search_results[0].get("id")


# Map each polluter name to {"entity_id": <Wikidata ID>}; names for which the
# search returns nothing are left out.
polluters_w_id = {}
for company in pollueurs:
    wikidata_id = search_entity(company)
    if not wikidata_id:
        continue
    polluters_w_id[company] = {"entity_id": wikidata_id}

print(polluters_w_id)


On cherche ensuite les informations stockées par Wikidata pour chacun des pollueurs restants.

In [None]:
import json

def get_entity_info(entity_id):
    """Fetch the Wikidata record of ``entity_id`` through ``wbgetentities``.

    Args:
        entity_id: Wikidata entity ID to fetch.

    Returns:
        The entry for ``entity_id`` under the response's ``entities`` key
        (an empty dict when that entry is absent), or None when the HTTP
        status is not 200.
    """
    url = f"https://www.wikidata.org/w/api.php?action=wbgetentities&ids={entity_id}&format=json&languages=en"
    response = requests.get(url)
    if response.status_code != 200:
        return None
    entities = response.json().get("entities", {})
    return entities.get(entity_id, {})

# Write the name -> ID mapping to disk, then read it back.
with open('/Data/polluters_with_id.json', 'w') as f:
    json.dump(polluters_w_id, f)

with open('/Data/polluters_with_id.json', 'r') as f:
    polluters_w_id = json.load(f)

# Attach the Wikidata record of each polluter to its entry.
for value in polluters_w_id.values():
    entity_id = value['entity_id']
    entity_info = get_entity_info(entity_id)
    # get_entity_info returns None on a non-200 response; dict.update(None)
    # would raise TypeError and abort the whole run.
    if entity_info is None:
        continue
    # Stored under "entity_info" because the later cells read the record
    # from that key (company_data.get("entity_info", {})); merging it flat
    # with update() left that key undefined.
    value["entity_info"] = entity_info

with open('/Data/polluters_with_info.json', 'w') as f:
    json.dump(polluters_w_id, f)

On termine en nettoyant le fichier pour le rendre plus lisible.

In [None]:
# Top-level fields dropped from every polluter record.
keys_to_remove = ['pageid', 'ns', 'title', 'lastrevid', 'modified', 'type', 'id', 'labels', 'descriptions', 'aliases', 'claims', 'sitelinks']

with open('/Data/polluters_with_info.json', 'r') as f:
    polluters_with_info = json.load(f)

for entity_info in polluters_with_info.values():
    for key in keys_to_remove:
        entity_info.pop(key, None)
    # When the record also carries a nested 'entity_info' dict, drop its
    # sitelinks too.
    if 'entity_info' in entity_info:
        entity_info['entity_info'].pop('sitelinks', None)

# Overwrite the file with the trimmed records.
with open('/Data/polluters_with_info.json', 'w') as f:
    json.dump(polluters_with_info, f)

Après cela il ne reste plus que 54 pollueurs pour lesquels nous avons des informations complémentaires sur leurs activités.

###Conversion du json en csv pour pouvoir créer un graphe

In [None]:
import json
import csv
import requests

def _snak_value(statement, field="id"):
    """Return ``field`` of a statement's main value, or "" when absent."""
    value = statement.get("mainsnak", {}).get("datavalue", {}).get("value", {})
    # The lookups in this cell expect a dict value; anything else yields "".
    return value.get(field, "") if isinstance(value, dict) else ""


def _first_value(claims, prop, field="id"):
    """Return ``field`` of the first statement of ``prop``, or ""."""
    statements = claims.get(prop)
    if not isinstance(statements, list) or not statements:
        return ""
    return _snak_value(statements[0], field)


def _all_values(claims, prop):
    """Return the ``id`` of every statement of ``prop`` (possibly [])."""
    statements = claims.get(prop)
    if not isinstance(statements, list):
        return []
    return [_snak_value(statement) for statement in statements]


def _fit(values, width):
    """Truncate or right-pad ``values`` with "" to exactly ``width`` items."""
    return (list(values) + [""] * width)[:width]


def _revenue_with_date(claims):
    """Return "<amount> (<date>)" from the first P2139 statement, or ""."""
    statements = claims.get("P2139")
    if not isinstance(statements, list) or not statements:
        return ""
    first = statements[0]
    amount = _snak_value(first, "amount")
    if not amount:
        # An empty cell, not " ()": later cells treat any non-empty cell as
        # real data.
        return ""
    # The date comes from the P585 qualifier of the same statement.
    dated = first.get("qualifiers", {}).get("P585") or [{}]
    date = dated[0].get("datavalue", {}).get("value", {}).get("time", "")
    return f"{amount} ({date})" if date else amount


with open('Data/polluters_with_info.json', 'r') as json_file:
    data = json.load(json_file)

with open('Data/polluters_with_info.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)

    # Only a few fields are kept: rank, company, Wikidata id, country...
    header = ["Rank", "Company", "Id", "Country", "Headquarters", "Revenue", "Product", "Stock Exchange"]
    max_aliases_per_company = 7
    max_sectors_per_company = 4
    max_part_of_per_company = 5
    header += [f"Alias {i + 1}" for i in range(max_aliases_per_company)]
    header += [f"Sector {i + 1}" for i in range(max_sectors_per_company)]
    header += [f"Part of {i + 1}" for i in range(max_part_of_per_company)]

    writer.writerow(header)

    # Each value is looked up in "claims", which is keyed by property ID
    # (for example P17 is the country).
    for rank, (company_name, company_data) in enumerate(data.items(), start=1):
        entity_info = company_data.get("entity_info", {})

        # Claims are read per company, inside the loop: reading them once
        # before the loop used a stale entity_info and gave every row the
        # same values.
        claims = entity_info.get("claims", {})
        if not isinstance(claims, dict):
            claims = {}

        wikidata_id = entity_info.get("title", "")

        aliases_info = entity_info.get("aliases", {})
        entity_aliases = []
        if isinstance(aliases_info, dict):
            for aliases_list in aliases_info.values():
                entity_aliases.extend(alias.get("value", "") for alias in aliases_list)

        country = _first_value(claims, "P17")          # country
        headquarters = _first_value(claims, "P159")    # headquarters location
        revenue = _revenue_with_date(claims)           # revenue and its date
        product = _first_value(claims, "P1056")        # main product or service
        stock_exchange = _first_value(claims, "P414")  # stock exchange
        sector_list = _all_values(claims, "P452")      # activity sector(s)
        part_of_list = _all_values(claims, "P361")     # part of ...

        # Every variable-length group is forced to its header width so a
        # company with more values than columns cannot shift later columns.
        row = [
            rank,
            company_name,
            wikidata_id,
            country,
            headquarters,
            revenue,
            product,
            stock_exchange,
        ]
        row += _fit(entity_aliases, max_aliases_per_company)
        row += _fit(sector_list, max_sectors_per_company)
        row += _fit(part_of_list, max_part_of_per_company)
        writer.writerow(row)

FileNotFoundError: [Errno 2] No such file or directory: 'Data/polluters_with_info.json'

Une dernière étape pour avoir toutes les informations est la conversion des id en entités par recherche dans Wikidata. En effet, nous avons un csv dont les informations ne sont pas complètes.
Exemple avec les deux premières lignes :

Rank, Company, Id, Country, Headquarters...

1, National Iranian Oil Co, Q593733, Q794, Q3616...

Il reste alors à chercher les entités dans Wikidata

In [None]:
import csv
import requests

def get_wikidata_entity(entity_id):
    """Fetch one entity record from the Wikidata ``wbgetentities`` API.

    Args:
        entity_id: Wikidata entity ID to fetch.

    Returns:
        The entry for ``entity_id`` under the response's ``entities`` key,
        or None when the HTTP status is not 200 or that entry is absent.
    """
    url = "https://www.wikidata.org/w/api.php"
    params = {
        "action": "wbgetentities",
        "ids": entity_id,
        "format": "json"
    }
    # A timeout keeps one stalled connection from hanging the whole run.
    response = requests.get(url, params=params, timeout=30)
    # Same status check as the other Wikidata lookups in this file: an error
    # response may not carry a JSON body, so it is not parsed.
    if response.status_code != 200:
        return None
    data = response.json()
    return data.get("entities", {}).get(entity_id)

# English labels already resolved, keyed by entity ID, so an ID that appears
# in many rows is fetched only once.
_ENTITY_NAME_CACHE = {}


def get_entity_name(entity_id):
    """Return the English label of a Wikidata entity from its ID.

    Args:
        entity_id: Wikidata entity ID, or "" for an empty cell.

    Returns:
        The English label, or "" when ``entity_id`` is empty, the entity
        cannot be fetched, or it has no English label.
    """
    if entity_id == "":
        return ""
    if entity_id in _ENTITY_NAME_CACHE:
        return _ENTITY_NAME_CACHE[entity_id]
    entity = get_wikidata_entity(entity_id)
    if not isinstance(entity, dict):
        # A failed fetch is not cached, so a later call can retry it.
        return ""
    name = entity.get("labels", {}).get("en", {}).get("value", "")
    _ENTITY_NAME_CACHE[entity_id] = name
    return name

# Columns whose cells hold Wikidata IDs to be replaced by English labels:
# country, headquarters, product, stock exchange, sectors 1-4, part-of 1-5.
id_columns = (3, 4, 6, 7, 15, 16, 17, 18, 19, 20, 21, 22, 23)

output_header = [
    'Rank', 'Company', 'Id', 'Country', 'Headquarters', 'Revenue', 'Product', 'Stock Exchange',
    'Alias 1', 'Alias 2', 'Alias 3', 'Alias 4', 'Alias 5', 'Alias 6', 'Alias 7',
    'Sector 1', 'Sector 2', 'Sector 3', 'Sector 4',
    'Part of 1', 'Part of 2', 'Part of 3', 'Part of 4', 'Part of 5',
]

with open('Data/polluters_with_info_more_info.csv', 'r', encoding='utf-8') as csv_file:
    reader = csv.reader(csv_file)
    next(reader)  # skip the input header row

    with open('Data/polluters_with_info_more_info_with_names.csv', 'w', newline='', encoding='utf-8') as output_file:
        writer = csv.writer(output_file)
        writer.writerow(output_header)

        for row in reader:
            # Take exactly the 24 expected columns up front (IndexError on a
            # shorter row, before any lookup is made).
            fields = [row[col] for col in range(24)]
            # Look up every ID cell and rewrite the row with the names; the
            # other columns are copied through unchanged.
            for col in id_columns:
                fields[col] = get_entity_name(fields[col])
            writer.writerow(fields)

On obtient un csv contenant les 54 pollueurs avec leurs infos : pays, siège, revenu (dernière année où on a l'info), bourse des valeurs (lieu d'échange des valeurs mobilières), alias, secteurs d'activité, groupes dont fait partie l'entreprise.

#Création de graphe à partir des pollueurs

###Création des noeuds

Nous avons créé des noeuds à partir du csv. Pour cela, nous avons écrit un csv nommé nodes.csv qui contient trois colonnes, comme nous l'avons vu en TP : id, label, type.

In [None]:
import csv

# CSV column index -> (node ID prefix, node type). Columns not listed here
# (rank, Wikidata id, aliases) produce no node.
_NODE_KINDS = {
    1: ('c', 'Company'),
    3: ('country', 'Country'),
    4: ('h', 'Headquarter'),
    5: ('d', 'Revenue'),
    6: ('p', 'Product'),
    7: ('s', 'Stock Exchange Sector'),
}
_NODE_KINDS.update({col: ('sector', 'Activity Sector') for col in range(15, 19)})
_NODE_KINDS.update({col: ('group', 'Group') for col in range(19, 24)})

with open('Data/polluters_with_info_more_info_with_names.csv', 'r', encoding='utf-8') as input_file:
    reader = csv.reader(input_file)
    with open('Data/nodes.csv', 'w', encoding='utf-8', newline='') as output_file:
        # Nodes are de-duplicated on their ID, not their label: two nodes of
        # different types can share a label (a city and a country with the
        # same name), and the edges file targets nodes by these prefixed IDs,
        # so dropping one of them would leave its edges dangling.
        seen_ids = set()
        csv_writer_hq = csv.writer(output_file)
        csv_writer_hq.writerow(['Id', 'Label', 'Type'])
        next(reader)  # skip the header row
        for row in reader:
            # The node type depends on the column; the ID is the cell value
            # with a per-type prefix.
            for col, (prefix, node_type) in _NODE_KINDS.items():
                label = row[col]
                if label == '':
                    continue
                node_id = f'{prefix}{label}'
                if node_id in seen_ids:
                    continue
                seen_ids.add(node_id)
                csv_writer_hq.writerow([node_id, label, node_type])

### Création des arcs

Enfin, nous avons créé des arcs entre chaque entreprise et ses différentes caractéristiques. Pour cela, nous avons écrit un csv nommé edges.csv, qui contient trois colonnes, comme vu en TP : Source, Target, Label.

In [None]:
import csv

# CSV column index -> (target ID prefix, edge label). Columns not listed here
# produce no edge.
edge_kinds = {
    3: ('country', 'country'),
    4: ('h', 'headquarter'),
    5: ('d', 'revenue'),
    6: ('p', 'product'),
    7: ('s', 'stock exchange sector'),
}
for col in range(15, 19):
    edge_kinds[col] = ('sector', 'activity sector')
for col in range(19, 24):
    edge_kinds[col] = ('group', 'group')

with open('Data/polluters_with_info_more_info_with_names.csv', 'r', encoding='utf-8') as input_file:
    reader = csv.reader(input_file)
    with open('Data/edges.csv', 'w', encoding='utf-8', newline='') as output_file:
        csv_writer_hq = csv.writer(output_file)
        csv_writer_hq.writerow(['Source', 'Target', 'Label'])
        next(reader)  # skip the header row
        for row in reader:
            for col in range(2, 24):
                value = row[col]
                if value == '' or col not in edge_kinds:
                    continue
                # The source is always the company; the target is the cell
                # value with the prefix of its column.
                prefix, label = edge_kinds[col]
                csv_writer_hq.writerow([f'c{row[1]}', f'{prefix}{value}', label])

FileNotFoundError: [Errno 2] No such file or directory: 'Data/polluters_with_info_more_info_with_names.csv'