In [1]:
%load_ext autoreload
%autoreload 2

from owlready2 import *
import owlready2
print(owlready2.VERSION)

import pandas as pd
import glob, os, hashlib
import requests, json
from datetime import datetime

import random

def NOW():
    """Return the current local wall-clock time as ``"Current Time = HH:MM:SS"``."""
    timestamp = datetime.now().strftime("%H:%M:%S")
    return f"Current Time = {timestamp}"

from dotenv import load_dotenv
# Read key/value pairs from the local ".env" file into the process environment.
load_dotenv('.env')

# Service base URL and access token are taken from the environment.
# os.getenv returns None when a variable is not set, so a missing entry only
# surfaces later, when URL is concatenated with "fct/" for a request.
URL = os.getenv("KG_URL_FCT")
TOK3N = os.getenv("KG_TOKEN")

# Upper bound on the number of request batches processed per category in the
# second-level classification loop (applied as range(NB)[:NBCAP]).
NBCAP = 5000



0.40


In [2]:
import importlib.util
import sys


def _load_local_module(module_name, file_path):
    """Import a Python source file by path and register it in ``sys.modules``.

    Parameters
    ----------
    module_name : str
        Name under which the module is registered in ``sys.modules``.
    file_path : str
        Path to the ``.py`` file to execute.

    Returns
    -------
    module
        The freshly executed module object.

    Raises
    ------
    ImportError
        If no import spec/loader can be built for ``file_path``.
    """
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot build an import spec for {module_name!r} from {file_path!r}")
    module = importlib.util.module_from_spec(spec)
    # Register before executing so that imports of `module_name` triggered
    # while the file runs resolve to this same module object.
    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Do not leave a half-initialised module behind if execution failed.
        sys.modules.pop(module_name, None)
        raise
    return module


# Helper modules live outside the notebook's package, so they are loaded by path.
h = _load_local_module("llm", "../../../utils/llm.py")
u = _load_local_module("rdfutils", "../../../utils/rdfutils.py")

In [3]:
# Load the working ontology from the local OWL file next to this notebook.
onto = get_ontology("./WIP_2.owl").load()
# Hand the loaded ontology to the rdfutils helper module.
# NOTE(review): cOnto is defined in rdfutils.py (not visible here); presumably it
# registers `onto` for later helper calls — confirm.
u.cOnto(onto)

# Adding classifications

## Benefits


In [None]:
# First label of every BenefReturn individual that carries at least one label.
BENEFRETURNS = [
    individual.label[0]
    for individual in onto.BenefReturn.instances()
    if len(individual.label) > 0
]
# Total vs. distinct count: equal numbers mean no label appears twice.
len(BENEFRETURNS), len(set(BENEFRETURNS))

(1598, 1598)

In [None]:
# Spot check: take the first Benef individual and show its label together with
# the label of its first linked BenefReturn.
# Indexing with [0] raises IndexError if there is no Benef instance or the
# instance has no has_BenefReturn value.
x = list(onto.Benef.instances())[0]
x.label,x.has_BenefReturn[0].label

 ['Improved public health response, reduced transmission, better control of outbreaks'])

In [None]:
def classify_benefit(CategoriesOfBenef):
    """Build the function-calling schema used to classify benefits.

    The returned structure describes a single function, ``classify_benefits``,
    whose only argument ``sorted_benefits`` is an array of objects with three
    required string fields: ``name``, ``category`` (restricted to the supplied
    categories) and ``confidence`` (one of four fixed levels).

    Parameters
    ----------
    CategoriesOfBenef : iterable of str
        Allowed category labels. Any iterable is accepted (list, tuple,
        ``dict.keys()``, ...); it is copied into a new list so the schema is
        JSON-serializable and independent of later changes to the caller's
        object.

    Returns
    -------
    list of dict
        A one-element list holding the function description.
    """
    # Copy once: avoids aliasing the caller's list inside the schema and turns
    # non-list iterables into something json can encode.
    allowed_categories = list(CategoriesOfBenef)
    return [
        {
            "name": "classify_benefits",
            "description": "Function used to classify benefits from a high level perspective.",
            "parameters": {
                "type": "object",
                "properties": {
                    "sorted_benefits": {
                        "type": 'array',
                        "items": {
                            "type": 'object',
                            "description": "A type of benefit ",
                            "properties": {
                                "name": {
                                    "type": 'string',
                                    "description": 'The benefit being considered.'
                                },
                                "category": {
                                    "type": 'string',
                                    "enum": allowed_categories,
                                    "description": 'The category of benefit that matches the benefit being considered. '
                                },
                                "confidence": {
                                    "type": 'string',
                                    "enum": ["High", "Medium-high", "Medium", "Low"],
                                    "description": 'Confidence that this is the correct benefit category.'
                                }
                            },
                            "required": ["name", 'category', "confidence"],
                        }
                    },
                },
                "required": ["sorted_benefits"],
            },
        }
    ]

In [None]:
# Two-level benefit taxonomy: each key is a top-level category and each value
# is the list of subcategory labels offered to the model for that category.
# Some labels (e.g. "Health and Safety Improvements") appear under more than
# one category, so a subcategory is only meaningful together with its category.
DICT = {'Political': [
    "Improved Governance and Decision-Making",
    "Increased Public Trust and Compliance",
    "Enhanced Policy Development and Implementation",
    "Economic Stability and Investment Attraction",
    "Crisis Management and Public Health Response"
],
 'Infrastructure': [
    "Operational Efficiency and Adaptability",
    "Tenant Satisfaction and Attractiveness",
    "Health and Safety Improvements",
    "Environmental Sustainability and Green Practices",
    "Economic Value and Asset Enhancement",
    "Supply Chain Reliability and Market Access",
    "Innovative Technologies and Research Advancements",
    "Urban Resilience and Community Well-being",
    "Project Management and Construction Excellence",
    "Public Health and Epidemiological Contributions"
],
 'Economic': [
    "Health and Safety Improvements",
    "Asset and Property Value Enhancement",
    "Operational and Supply Chain Efficiency",
    "Economic Stability and Growth",
    "Environmental Sustainability",
    "Innovation and Investment Attraction",
    "Workforce and Workplace Enhancement",
    "Consumer and Market Benefits",
    "Resource Management and Cost Savings",
    "Social and Community Development"
],
 'Legal': [
    "Compliance and Regulatory Adherence",
    "Risk Management and Liability Reduction",
    "Contractual Stability and Fair Compensation",
    "Privacy Protection and Data Security",
    "Crisis Management and Effective Governance"
],
 'Environmental': [
    "Air and Water Quality Improvement",
    "Energy Efficiency and Sustainability",
    "Biodiversity Conservation and Ecosystem Enhancement",
    "Health and Well-being Enhancement",
    "Waste Management and Pollution Reduction"
],
 'Technological': [
    "Advancement in Research and Innovation",
    "Enhanced Data Security and Privacy",
    "Improved Healthcare Delivery and Outcomes",
    "Increased Efficiency and Productivity",
    "Facilitation of Collaboration and Communication"
],
'Social':[
    "Improved Public Health and Safety",
    "Enhanced Quality of Life and Well-being",
    "Increased Economic Stability and Growth",
    "Strengthened Community Cohesion and Social Capital",
    "Enhanced Environmental Sustainability and Access to Green Spaces",
    "Improved Education and Skill Development",
    "Reduced Social Inequalities and Enhanced Equity",
    "Enhanced Communication and Public Awareness",
    "Improved Mental Health Support and Services",
    "Increased Public Trust and Confidence in Governance and Healthcare"
]}

In [None]:
# Top-level category labels, in the order they are declared in DICT.
CategoriesOfBenef = list(DICT)

In [None]:
# First-level classification: send the BenefReturn labels to the model in
# batches and persist one top-level category per label. Results are written
# to the parquet file after every batch so an interrupted run can be resumed.
RES = []
NChunks = 40  # number of benefit labels sent per request
if os.path.isfile("newbenef_classified_lv1.parquet.gzip"):
    # Resume: keep earlier results that still exist in the ontology and only
    # submit labels that were never classified.
    GOOD = pd.read_parquet("newbenef_classified_lv1.parquet.gzip")
    alreadythere = {str(name) for name in GOOD["name"].unique()}  # set => O(1) lookups
    GOOD = GOOD[GOOD.name.isin(BENEFRETURNS)]
    newRisks = [x for x in BENEFRETURNS if str(x) not in alreadythere]
    print("Starting",len(newRisks),"items still to process.")
else:
    newRisks = BENEFRETURNS
    GOOD = []

if len(newRisks):
    newRisks = [str(x) for x in newRisks]
    # Stepping by NChunks never yields an empty trailing batch, even when the
    # number of labels is an exact multiple of NChunks.
    batches = [newRisks[start:start + NChunks] for start in range(0, len(newRisks), NChunks)]
    NB = len(batches)
    # Always a list of frames: previous results (if any) followed by one frame
    # per processed batch, so pd.concat receives an iterable in every case.
    CATS_FWK = [GOOD] if len(GOOD) else []
    # NOTE(review): h.svt is defined in llm.py; creating the directory here is
    # harmless if it already does so itself.
    os.makedirs("cache", exist_ok=True)
    for k, batch in enumerate(batches[:150]):  # at most 150 requests per run
        try:
            QUESTION =  "The benefits are:\n\n* "+"\n* ".join(batch)
            prefix ="benefreturns_"
            overwrite = False
            REQ = {
                # The wording below is part of the cache key (see H); changing
                # it, even to fix the grammar, invalidates every cached answer.
                "context": "Out the following list, classify the benefits from a high level perspective.\n\n",
                "question": QUESTION,
                "model":"gpt-3.5-turbo",
                "functions": classify_benefit(CategoriesOfBenef),
                "token": TOK3N,
                "overwrite": overwrite,
                "source": "local-evalbenefitsT3.5",
                "seed" : ""
            }

            H = h.hashme(REQ["context"]+REQ["question"])
            cached = "cache/"+prefix+"_"+H+".json"
            if not os.path.isfile(cached) or overwrite:
                response = requests.post(URL+"fct/", json = REQ)
                response.raise_for_status()  # surface HTTP errors instead of a later KeyError
                answer = json.loads(response.text)["messages"][-1]
                h.svt(cached,answer)
            else:
                answer = h.ldt(cached)
            d = pd.DataFrame(json.loads(answer["function_call"]["arguments"])["sorted_benefits"])
            # Discard rows whose category is not one of the allowed labels.
            d = d[d["category"].isin(CategoriesOfBenef)]
            CATS_FWK.append(d)
            DFC = pd.concat(CATS_FWK).reset_index(drop=True)
            # One row per benefit name; ties resolved by alphabetical category.
            DFC = DFC.sort_values(by=["name","category"],ascending=True).drop_duplicates(subset=["name"]).reset_index(drop=True)
            DFC.to_parquet("newbenef_classified_lv1.parquet.gzip",compression="gzip")
            print("Done:",k)
        except Exception as err:
            # Best effort: report the failing batch and carry on with the next
            # one; KeyboardInterrupt is no longer swallowed.
            print("error in batch", k, "-", repr(err))
else:
    print("All benefs covered")

Starting 25 items still to process.


Done: 0


#### Checks for Benefits, round 1

In [None]:
# Reload the persisted first-level results and count benefits per category.
DFC = pd.read_parquet("newbenef_classified_lv1.parquet.gzip")
print(DFC.shape[0], "items listed")
DFC.groupby(["category"])["name"].count()

1577 items listed


category
Economic          660
Environmental      82
Infrastructure    226
Legal              28
Political          34
Social            424
Technological     123
Name: name, dtype: int64

In [None]:
# Reload the first-level results and make sure the two second-level columns
# exist (filled with None) before subcategories are assigned.
DFC = pd.read_parquet("newbenef_classified_lv1.parquet.gzip")
CATs = list(DFC["category"].unique())
for lv2_column in ("subcategory", "confidence_lv2"):
    if lv2_column not in DFC.columns:
        DFC[lv2_column] = None

In [None]:
# Second-level classification: for every top-level category, ask the model to
# place each still-unclassified benefit into one of that category's
# subcategories (DICT[CAT]). Progress is written to the parquet file after
# every batch so the run can be resumed.
DFC = pd.read_parquet("newbenef_classified_lv1.parquet.gzip")
CATs = list(DFC.category.unique())
if "subcategory" not in DFC.columns:
    DFC["subcategory"] = None
if "confidence_lv2" not in DFC.columns:
    DFC["confidence_lv2"] = None
RES = []
NChunks = 50  # number of benefit labels sent per request
# NOTE(review): h.svt is defined in llm.py; creating the directory here is
# harmless if it already does so itself.
os.makedirs("cache", exist_ok=True)
for CAT in CATs:
    print("Doing",CAT)
    if CAT not in DICT:
        # A stale parquet file may contain a category that DICT no longer has.
        print("No subcategories defined for", CAT, "- skipped")
        continue
    subtechs = DICT[CAT]
    df = DFC[DFC.category == CAT]
    DONE = list(df[~df.subcategory.isna()].name.unique())
    df = df[~df.name.isin(DONE)]

    newRisks = [str(x) for x in df.name]
    # Stepping by NChunks never yields an empty trailing batch.
    batches = [newRisks[start:start + NChunks] for start in range(0, len(newRisks), NChunks)]
    NB = len(batches)

    for k, batch in enumerate(batches[:NBCAP]):
        try:
            QUESTION =  "The benefits are:\n\n* "+"\n* ".join(batch)
            prefix ="benefits_"
            overwrite = False
            REQ = {
                # The wording below is part of the cache key (see H); changing
                # it, even to fix the grammar, invalidates every cached answer.
                "context": "Out the following list, classify the benefits from a high level perspective.\n\n",
                "question": QUESTION,
                "model":"gpt-3.5-turbo",
                "functions": classify_benefit(subtechs),
                "token": TOK3N,
                "overwrite": overwrite,
                "source": "local-evalBenefsT3.5",
                "seed" : ""
            }

            H = h.hashme(REQ["context"]+REQ["question"])
            cached = "cache/"+prefix+"_"+H+".json"
            if not os.path.isfile(cached) or overwrite:
                response = requests.post(URL+"fct/", json = REQ)
                response.raise_for_status()  # surface HTTP errors instead of a later KeyError
                answer = json.loads(response.text)["messages"][-1]
                h.svt(cached,answer)
            else:
                answer = h.ldt(cached)
            d = pd.DataFrame(json.loads(answer["function_call"]["arguments"])["sorted_benefits"])
            # Select and rename by key: the model may return the fields in any
            # order or add extra ones, so positional renaming is unsafe.
            d = d.loc[d["category"].isin(subtechs), ["name","category","confidence"]]
            d = d.rename(columns={"category": "subcategory", "confidence": "confidence_lv2"})
            # If a name is returned twice the last answer wins, as with
            # sequential row-by-row assignment.
            d = d.drop_duplicates(subset=["name"], keep="last").set_index("name")
            # Only touch rows of the current category, so a name echoed back
            # incorrectly cannot receive a subcategory from another category.
            target = (DFC.category == CAT) & DFC.name.isin(d.index)
            new_subcategory = DFC.loc[target, "name"].map(d["subcategory"])
            new_confidence = DFC.loc[target, "name"].map(d["confidence_lv2"])
            DFC.loc[target, "subcategory"] = new_subcategory
            DFC.loc[target, "confidence_lv2"] = new_confidence
            DFC = DFC.sort_values(by=["name","category"],ascending=True).drop_duplicates(subset=["name"]).reset_index(drop=True)
            DFC.to_parquet("newbenef_classified_lv1.parquet.gzip",compression="gzip")
            print("Done:",k)
        except Exception as err:
            # Best effort: report the failing batch and carry on with the next
            # one; KeyboardInterrupt is no longer swallowed.
            print("error in batch", k, "of", CAT, "-", repr(err))

Doing Infrastructure


error
error
error
error
Done: 4
Doing Technological
error
Done: 1
Done: 2
Doing Social
error
error
error
error


In [None]:
# Reload the persisted results and count benefits per assigned subcategory.
DFC = pd.read_parquet("newbenef_classified_lv1.parquet.gzip")
print(DFC.shape[0], "items listed")
DFC.groupby(["subcategory"])["name"].count()

21072 items listed


subcategory
Access to infrastructure                              255
Air quality and pollution effects                     243
Artificial intelligence and machine learning risks    130
Biodiversity and ecological impacts                   142
Bioterrorism threats                                    3
                                                     ... 
Wastewater and sanitation risks                       138
Weak local decision-making mechanisms                 204
Workforce and labor shortages                          99
Workplace Safety and Health                             7
Workplace safety and occupational exposure            804
Name: name, Length: 91, dtype: int64