In [1]:
%load_ext autoreload
%autoreload 2

from owlready2 import *
import owlready2
print(owlready2.VERSION)

import pandas as pd
import glob, os, hashlib
import requests, json
from datetime import datetime

import random

def NOW():
    now = datetime.now()
    current_time = now.strftime("%H:%M:%S")
    return "Current Time = "+ str(current_time)

from dotenv import load_dotenv
load_dotenv('.env')

URL = os.getenv("KG_URL_FCT")
TOK3N = os.getenv("KG_TOKEN")

NBCAP = 5000



0.40


In [2]:
import importlib.util
import sys

spec = importlib.util.spec_from_file_location("llm", "../../../utils/llm.py")
h = importlib.util.module_from_spec(spec)
sys.modules["llm"] = h
spec.loader.exec_module(h)

spec = importlib.util.spec_from_file_location("rdfutils", "../../../utils/rdfutils.py")
u = importlib.util.module_from_spec(spec)
sys.modules["rdfutils"] = u
spec.loader.exec_module(u)

In [3]:
onto = get_ontology("./WIP_2.owl").load()
u.cOnto(onto)


# Overview

*  _PBNThing_  --  87682 instances.
*  _BenefReturn_  --  1598 instances.
*  _Benef_  --  1406 instances.
*  _Article_  --  370 instances.
*  _Risk_  --  21139 instances.
*  _ISO_Scale_  --  2 instances.
*  _RiskHealth_  --  11 instances.
*  _RiskType_  --  20 instances.
*  _Stakeholder_  --  16120 instances.
*  _Stakeholder_Type_  --  3 instances.
*  _Technology_  --  23135 instances.
*  _ISO_Purpose_  --  6 instances.
*  _StakeholderGroup_  --  18 instances.
*  _StakeholderSubgroup_  --  168 instances.
*  _TechGroup_  --  15 instances.
*  _TechSubgroup_  --  127 instances.
*  _Mitigation_  --  22944 instances.
*  _BP_Enabler_  --  27 instances.
*  _BP_Transmission_  --  7 instances.
*  _Blueprint_  --  23 instances.
*  _BP_Scale_  --  2 instances.
*  _BP_Phase_  --  4 instances.
*  _BP_Permanent_  --  3 instances.
*  _BP_Type_  --  2 instances.
*  _BP_Intervention_  --  40 instances.
*  _PBNCategory_  --  330 instances.
*  _RiskGroup_  --  9 instances.
*  _RiskSubgroup_ 

# Adding classifications

## Benefits


In [4]:
BENEFRETURNS = [x.label[0] for x in onto.BenefReturn.instances() if len(x.label) >= 1]
len(BENEFRETURNS),len(list(set(BENEFRETURNS)))

(1598, 1598)

In [5]:
x = list(onto.Benef.instances())[0]
x.label,x.has_BenefReturn[0].label

 ['Improved public health response, reduced transmission, better control of outbreaks'])

In [6]:
def classify_benefit(CategoriesOfBenef):
    return [
        {
        "name": "classify_benefits",
        "description": "Function used to classify benefits from a high level perspective.",
        "parameters": {
            "type": "object",
            "properties": {
                "sorted_benefits": {
                    "type": 'array',
                    "items": {
                        "type": 'object',
                        "description": "A type of benefit ",
                        "properties": {
                            "name" :{
                                "type": 'string', 
                                "description": 'The benefit being considered.'
                            },
                            "category" :{
                                "type": 'string', 
                                "enum":CategoriesOfBenef,
                                "description": 'The category of benefit that matches the benefit being considered. '
                            },
                            "confidence" :{
                                "type": 'string',  
                                "enum":["High","Medium-high","Medium","Low"],
                                "description": 'Confidence that this is the correct benefit category.'
                            }                            
                        },
                        "required": ["name",'category',"confidence"],
                    }
                },
            },
            "required": ["sorted_benefits"],
        },
    }
]

In [7]:
DICT = {'Political': [
    "Improved Governance and Decision-Making",
    "Increased Public Trust and Compliance",
    "Enhanced Policy Development and Implementation",
    "Economic Stability and Investment Attraction",
    "Crisis Management and Public Health Response"
],
 'Infrastructure': [
    "Operational Efficiency and Adaptability",
    "Tenant Satisfaction and Attractiveness",
    "Health and Safety Improvements",
    "Environmental Sustainability and Green Practices",
    "Economic Value and Asset Enhancement",
    "Supply Chain Reliability and Market Access",
    "Innovative Technologies and Research Advancements",
    "Urban Resilience and Community Well-being",
    "Project Management and Construction Excellence",
    "Public Health and Epidemiological Contributions"
],
 'Economic': [
    "Health and Safety Improvements",
    "Asset and Property Value Enhancement",
    "Operational and Supply Chain Efficiency",
    "Economic Stability and Growth",
    "Environmental Sustainability",
    "Innovation and Investment Attraction",
    "Workforce and Workplace Enhancement",
    "Consumer and Market Benefits",
    "Resource Management and Cost Savings",
    "Social and Community Development"
],
 'Legal': [
    "Compliance and Regulatory Adherence",
    "Risk Management and Liability Reduction",
    "Contractual Stability and Fair Compensation",
    "Privacy Protection and Data Security",
    "Crisis Management and Effective Governance"
],
 'Environmental': [
    "Air and Water Quality Improvement",
    "Energy Efficiency and Sustainability",
    "Biodiversity Conservation and Ecosystem Enhancement",
    "Health and Well-being Enhancement",
    "Waste Management and Pollution Reduction"
],
 'Technological': [
    "Advancement in Research and Innovation",
    "Enhanced Data Security and Privacy",
    "Improved Healthcare Delivery and Outcomes",
    "Increased Efficiency and Productivity",
    "Facilitation of Collaboration and Communication"
],
'Social':[
    "Improved Public Health and Safety",
    "Enhanced Quality of Life and Well-being",
    "Increased Economic Stability and Growth",
    "Strengthened Community Cohesion and Social Capital",
    "Enhanced Environmental Sustainability and Access to Green Spaces",
    "Improved Education and Skill Development",
    "Reduced Social Inequalities and Enhanced Equity",
    "Enhanced Communication and Public Awareness",
    "Improved Mental Health Support and Services",
    "Increased Public Trust and Confidence in Governance and Healthcare"
]}

In [8]:
CategoriesOfBenef = list(DICT.keys())

In [9]:
RES = []
NChunks = 40
#Randomizing list
if os.path.isfile("newbenef_classified_lv1.parquet.gzip"):
    GOOD = pd.read_parquet("newbenef_classified_lv1.parquet.gzip")
    alreadythere = GOOD["name"].unique()
    GOOD = GOOD[GOOD.name.isin(BENEFRETURNS)]
    newRisks = [x for x in BENEFRETURNS if x not in alreadythere]
    print("Starting",len(newRisks),"items still to process.")
else:
    newRisks = BENEFRETURNS
    GOOD = []
    
if len(newRisks):
    newRisks = [str(x) for x in newRisks]
    NB = len(newRisks)//NChunks + 1
    CATS_FWK = [GOOD]
    for k in range(NB)[:150]:
        try:
            QUESTION =  "The benefits are:\n\n* "+"\n* ".join(newRisks[(k*NChunks):(k+1)*NChunks]) 
            prefix ="benefreturns_"
            overwrite = False
            REQ = {
                "context": "Out the following list, classify the benefits from a high level perspective.\n\n",
                "question": QUESTION,
                "model":"gpt-3.5-turbo",
                "functions": classify_benefit(CategoriesOfBenef),
                "token": TOK3N,
                "overwrite": overwrite,
                "source": "local-evalbenefitsT3.5",
                "seed" : ""
            }

            H = h.hashme(REQ["context"]+REQ["question"])
            cached = "cache/"+prefix+"_"+H+".json"
            if not os.path.isfile(cached) or overwrite:
                x = requests.post(URL+"fct/", json = REQ)
                answer = json.loads(x.text)["messages"][-1]
                h.svt(cached,answer)
            else:
                answer = h.ldt(cached)
            d = pd.DataFrame(json.loads(answer["function_call"]["arguments"])["sorted_benefits"])
            d = d [d["category"].isin(CategoriesOfBenef)]
            if not len(GOOD):
                CATS_FWK = d
            else:
                CATS_FWK.append(d)
            DFC = pd.concat(CATS_FWK).reset_index(drop=True)
            DFC = DFC.sort_values(by=["name","category"],ascending=True).drop_duplicates(subset=["name"]).reset_index(drop=True)
            DFC.to_parquet("newbenef_classified_lv1.parquet.gzip",compression="gzip")
            #DFC = DFC.sort_values(by=["name","subcategory"],ascending=True).drop_duplicates(subset=["name"]).reset_index(drop=True)
            print("Done:",k)
        except:
            print("error")
            pass
else:
    print("All benefs covered")

Starting 24 items still to process.


Done: 0


#### Checks for Benefits, round 1

In [10]:
DFC = pd.read_parquet("newbenef_classified_lv1.parquet.gzip")
print(len(DFC),"items listed")
DFC.groupby(["category"]).name.count()

1597 items listed


category
Economic          659
Environmental      83
Infrastructure    229
Legal              28
Political          34
Social            440
Technological     124
Name: name, dtype: int64

In [11]:
DFC = pd.read_parquet("newbenef_classified_lv1.parquet.gzip")
CATs = list(DFC.category.unique())
if "subcategory" not in DFC.columns:
    DFC["subcategory"] = None
if "confidence_lv2" not in DFC.columns:
    DFC["confidence_lv2"] = None

In [58]:
overwrite=False

In [72]:
DFC = pd.read_parquet("newbenef_classified_lv1.parquet.gzip")
CATs = list(DFC.category.unique())
if "subcategory" not in DFC.columns:
    DFC["subcategory"] = None
if "confidence_lv2" not in DFC.columns:
    DFC["confidence_lv2"] = None
for CAT in CATs:
    print("Doing",CAT)
    df = DFC[DFC.category == CAT]
    DONE = list(df[~df.subcategory.isna()].name.unique())
    df = df[~df.name.isin(DONE)]
    subtechs = DICT[CAT] # classify_technology(CategoriesOfTechs)

    newRisks = list(df.name)
    RES = []
    NChunks = 7

    if len(newRisks):
        newRisks = [str(x) for x in newRisks]
        NB = len(newRisks)//NChunks + 2

        for k in range(NB)[:NBCAP]:
            try:
                QUESTION =  "The benefits are:\n\n* "+"\n* ".join(newRisks[(k*NChunks):(k+1)*NChunks]) 
                prefix ="benefits_"
                overwrite = True
                REQ = {
                    "context": "Out the following list, classify the benefits from a high level perspective.\n\n",
                    "question": QUESTION,
                    "model":"gpt-3.5-turbo",
                    "functions": classify_benefit(subtechs),
                    "token": TOK3N,
                    "overwrite": overwrite,
                    "source": "local-evalBenefsT3.5",
                    "seed" : "1"
                }

                H = h.hashme(REQ["context"]+REQ["question"])
                cached = "cache/"+prefix+"_"+H+".json"
                if not os.path.isfile(cached) or overwrite:
                    x = requests.post(URL+"fct/", json = REQ)
                    answer = json.loads(x.text)["messages"][-1]
                    h.svt(cached,answer)
                else:
                    answer = h.ldt(cached)
                d = pd.DataFrame(json.loads(answer["function_call"]["arguments"])["sorted_benefits"])
                d = d [d["category"].isin(subtechs)]
                d.columns = ["name","subcategory","confidence_lv2"]
                for ix, row in d.iterrows():
                    DFC.loc[DFC.name == row["name"], "subcategory"] = row["subcategory"]
                    DFC.loc[DFC.name == row["name"], "confidence_lv2"] = row["confidence_lv2"]
                DFC = DFC.sort_values(by=["name","category"],ascending=True).drop_duplicates(subset=["name"]).reset_index(drop=True)
                DFC.to_parquet("newbenef_classified_lv1.parquet.gzip",compression="gzip") 
                print("Done:",k)
            except:
                print("error")
                pass

Doing Infrastructure
Doing Technological
Doing Social
Done: 0
Done: 1
Doing Environmental
Doing Economic
Done: 0
Done: 1
Done: 2
Done: 3
Doing Political
Doing Legal


In [74]:
DFC = pd.read_parquet("newbenef_classified_lv1.parquet.gzip")
print(len(DFC),"items listed")
DFC.groupby(["subcategory"]).name.count().sum()

1597 items listed


1579

In [75]:
DFC = pd.read_parquet("newbenef_classified_lv1.parquet.gzip")
print(len(DFC),"items listed")
DFC[DFC.subcategory.isna()].groupby(["category"]).name.count()

1597 items listed


category
Economic    17
Social       1
Name: name, dtype: int64

In [71]:
DFC[DFC.subcategory.isna()].name.unique()

array(['Comfort and reduced annoyance.',
       'Private investors can benefit from increased property value and demand for buildings with isolation rooms, as it provides a sense of safety and security for residents.',
       'Private investors can benefit from increased property value and demand for buildings with well-designed outdoor spaces, as it enhances the overall living experience and promotes a healthier lifestyle.',
       'Private investors can benefit from increased property value and reduced energy costs for homeowners, leading to increased demand for energy-efficient buildings.',
       'Private investors can benefit from increased property value as residents are willing to pay more for a healthier living environment.',
       'Private investors can benefit from increased property values and demand for housing in areas with green infrastructure.',
       'Private investors can benefit from increased property values and demand for housing in areas with resilient green infr