In [1]:
%load_ext autoreload
%autoreload 2

from owlready2 import *
import owlready2
print(owlready2.VERSION)

import pandas as pd
import glob, os, hashlib
import requests, json
from datetime import datetime

import random

def NOW():
    """Return the current local time formatted as ``"Current Time = HH:MM:SS"``."""
    stamp = datetime.now().strftime("%H:%M:%S")
    return f"Current Time = {stamp}"

from dotenv import load_dotenv
# Read the local .env file so the settings below can be taken from the environment.
load_dotenv('.env')

# Base URL and token of the function-calling service used by the classification
# cells; either one is None when its variable is not defined.
URL = os.environ.get("KG_URL_FCT")
TOK3N = os.environ.get("KG_TOKEN")

# Maximum number of chunks processed per category in the level-2 loops.
NBCAP = 5000



0.40


In [2]:
import importlib.util
import sys

def _load_module_from_path(module_name, file_path):
    """Import a Python source file by path and register it in ``sys.modules``.

    Args:
        module_name: Name under which the module is registered.
        file_path: Path of the ``.py`` file to execute.

    Returns:
        The executed module object.

    Raises:
        ImportError: If no import spec/loader can be built for ``file_path``.
        Exception: Whatever executing the file raises; in that case the
            partially initialised module is removed from ``sys.modules``.
    """
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot build an import spec for {module_name!r} from {file_path!r}")
    module = importlib.util.module_from_spec(spec)
    # Register before executing, as in the importlib recipe for importing a source file directly.
    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Do not leave a half-initialised module importable after a failure.
        sys.modules.pop(module_name, None)
        raise
    return module


# Helper modules used throughout the notebook: `h` (llm.py) and `u` (rdfutils.py).
h = _load_module_from_path("llm", "../../../utils/llm.py")
u = _load_module_from_path("rdfutils", "../../../utils/rdfutils.py")

In [3]:
# Load the work-in-progress ontology from the local OWL file.
onto = get_ontology("./WIP.owl").load()
# Hand the loaded ontology to the rdfutils helper module.
# NOTE(review): presumably registers `onto` as the helper's working ontology — confirm in utils/rdfutils.py.
u.cOnto(onto)

# Adding classifications

## Techs


In [4]:
# First label of every Technology individual that has at least one label.
TECHS = [
    tech.label[0]
    for tech in onto.Technology.instances()
    if len(tech.label) >= 1
]
len(TECHS)

23103

In [5]:
def classify_technology(CategoriesOfTechs):
    """Build the function-calling schema used to classify technologies.

    Args:
        CategoriesOfTechs: Labels offered as the allowed values (``enum``) of
            each item's ``category`` field; stored as given, not copied.

    Returns:
        A one-element list holding the ``classify_technology`` function
        description, whose single required parameter ``sorted_technology`` is
        an array of ``{name, category, confidence}`` objects.
    """
    name_field = {
        "type": 'string',
        "description": 'The technology being considered.'
    }
    category_field = {
        "type": 'string',
        "enum": CategoriesOfTechs,
        "description": 'The category of technology that matches the technology being considered. '
    }
    confidence_field = {
        "type": 'string',
        "enum": ["High", "Medium-high", "Medium", "Low"],
        "description": 'Confidence that this is the correct technology category.'
    }
    item_schema = {
        "type": 'object',
        "description": "A type of technology ",
        "properties": {
            "name": name_field,
            "category": category_field,
            "confidence": confidence_field,
        },
        "required": ["name", 'category', "confidence"],
    }
    parameters = {
        "type": "object",
        "properties": {
            "sorted_technology": {
                "type": 'array',
                "items": item_schema,
            },
        },
        "required": ["sorted_technology"],
    }
    return [
        {
            "name": "classify_technology",
            "description": "Function used to classify technologies from a high level perspective.",
            "parameters": parameters,
        }
    ]

In [6]:
# Two-level technology taxonomy: level-1 category -> list of level-2 subcategories.
# Key order matters: it becomes the order of CategoriesOfTechs.
DICT = {
    "Air Control": [
        'Air Filtering',
        'Indoor Environment',
        'HVAC Systems',
        'Air Quality Monitoring',
        'Air Purification',
        'Ventilation',
        'HEPA Filters',
        'UV-C Disinfection',
    ],
    "Data Analytics": [
        'Machine Learning and AI Algorithms',
        'Real-Time Data Tracking and Monitoring',
        'Data Visualization and Modeling',
        'Cloud-Based Data Sharing and Analysis',
        'Advanced Statistical Modeling and Analysis',
        'Social Media Monitoring and Analysis',
        'Predictive Analytics and Modeling Tools',
        'Information Dissemination Platforms',
    ],
    "Infrastructure Management": [
        'Sanitation and hygiene facilities',
        'Ventilation and air quality systems',
        'Geographic information and mapping technology',
        'Healthcare infrastructure and telecommunication networks',
        'Energy-efficient technologies and smart grid systems',
        'Remote work tools and workforce management technologies',
        'Logistics and inventory management systems',
        'Capacity-building and infrastructure development tools',
        'Disaster management and resilience planning technologies',
    ],
    "Communication": [
        'Communication and Collaboration Platforms',
        'Public Awareness Campaign Tools',
        'Video Conferencing and Collaboration Software',
        'Social Media Platforms',
        'Virtual Communication and Collaboration Technologies',
        'Digital Signage and Messaging Systems',
        'Multilingual Communication and Outreach Tools',
        'Online Public Engagement Platforms',
        'Remote Work and Telecommuting Technologies',
        'Mobile App Notification Systems',
    ],
    "Sensors and Monitoring": [
        'Monitoring solutions',
        'IoT devices',
        'Surveillance systems',
        'Environmental sensors',
        'Sensor technologies',
        'Remote monitoring tools',
        'Crowd monitoring technology',
        'Data collection systems',
        'Real-time tracking systems',
    ],
    "Healthcare": [
        'Diagnostic Testing',
        'Sequencing and Genomic Analysis',
        'Analytical Chemistry',
        'Physical Health Monitoring',
        'Wellbeing and Mental Health',
        'Social Distancing Technologies',
        'Vaccination Development and Distribution',
        'Personal Hygiene and Sanitation',
        'Personal Protective Equipment (PPE)',
        'Remote Healthcare and Telemedicine',
    ],
    "Urban Planning": [
        'Community Management Technologies',
        'Spatial Planning and Layout Tools',
        'Green Infrastructure Management',
        'Crowd Control Solutions',
        'Smart City Technologies',
        'Urban Planning Solutions',
        'Resilience Planning Frameworks',
        'Urban Design and Mapping Tools',
        'Environmental Conservation Technologies',
    ],
    "Digital and Information Management": [
        'Data Management',
        'Web Development',
        'Network Security',
        'Cloud Services',
        'Mobile Applications',
        'Knowledge Management',
        'Ontology Tools',
        'Data Governance',
        'Digital Infrastructure',
        'Information Systems',
    ],
    "Building design": [
        'Building design',
        'Ventilation systems',
        'Home design and architecture',
        'Sustainable design',
        'Soundproofing technology',
        'Green building strategies',
        'Space optimization technologies',
        'Architectural design software',
        'Adaptive construction methods',
        'Virtual planning and design tools',
    ],
    "Privacy Enhancing Technologies": [
        'Encryption and authentication',
        'Biometrics and identification',
        'Face and voice recognition',
        'Data encryption and security',
        'Cybersecurity and access control',
        'Quantum cryptography',
        'Ethical guidelines and advocacy',
        'Fact-checking and information verification',
        'Blockchain for data privacy and transparency',
    ],
    "Building Management": [
        'Access control and security technology',
        'Lighting and daylighting systems',
        'Maintenance and landscaping equipment',
        'Door-locking and automatic systems',
        'Operation of HVAC and ventilation systems',
        'Building management and automation systems',
        'Industry standard compliance tools',
        'Safety and hazard detection systems',
    ],
    "Cleaning": [
        'Automated sanitation equipment',
        'Advanced disinfection systems',
        'Surface cleaning',
        'Waste management technologies group',
        'Sanitization equipment group',
        'Disinfectant formulation',
        'Self-cleaning technologies',
        'Hygiene maintenance tools',
    ],
    "Supply-chain": [
        'Diversification and suppliers',
        'Supply chain and risk management',
        'Manufacturing of PPE',
        'Digital supply chain management',
        'Contactless delivery',
    ],
    "Water Control": [
        'Water and Wastewater Management',
        'Water Treatment Systems',
        'Water Quality Monitoring',
        'Water Disinfection Solutions',
        'Advanced Filtration Technologies',
        'Sewage Treatment Facilities',
        'Water Recycling and Conservation',
    ],
    "Waste Management": [
        'Recycling and waste handling technologies',
        'Advanced waste sorting and processing systems',
        'Smart waste handling and monitoring technologies',
        'Contactless recycling and waste disposal solutions',
        'Automated waste collection and segregation technologies',
        'Sanitization and sterilization technologies for waste',
        'Improved waste storage and disposal systems',
    ],
}

# Level-1 category labels, in declaration order.
CategoriesOfTechs = list(DICT)

### Level 1: classifying technologies into top-level categories

In [7]:
# Level-1 pass: assign each technology label one of the categories in
# CategoriesOfTechs, NChunks labels per request. The combined table is written
# to parquet after every successful chunk so an interrupted run can be resumed.
NChunks = 50
if os.path.isfile("../../support/newtechs_classified_lv1.parquet.gzip"):
    # Resume: keep earlier rows whose name is still a current label and only
    # submit the labels that have no row yet.
    GOOD = pd.read_parquet("../../support/newtechs_classified_lv1.parquet.gzip")
    GOOD = GOOD[GOOD.name.isin(TECHS)]
    alreadythere = set(GOOD["name"].unique())  # set: constant-time membership tests
    newRisks = [str(x) for x in TECHS if str(x) not in alreadythere]
    CATS_FWK = [GOOD]
else:
    newRisks = [str(x) for x in TECHS]
    CATS_FWK = []

# Slice on chunk boundaries: nothing is sent when the list is empty, and no
# empty trailing chunk is sent when its length is a multiple of NChunks.
chunks = [newRisks[i:i + NChunks] for i in range(0, len(newRisks), NChunks)]

prefix ="technologies_"
overwrite = False

for k, chunk in enumerate(chunks):
    try:
        QUESTION =  "The technologies are:\n\n* "+"\n* ".join(chunk)
        REQ = {
            "context": "Out the following list, classify the technologies from a high level perspective.\n\n",
            "question": QUESTION,
            "functions": classify_technology(CategoriesOfTechs),
            "token": TOK3N,
            "overwrite": overwrite,
            "source": "local-evalTechsT3.5",
            "seed" : ""
        }

        # NOTE(review): the cache key covers context + question only, not the
        # category enum, so cached replies survive changes to the taxonomy.
        H = h.hashme(REQ["context"]+REQ["question"])
        cached = "cache/"+prefix+"_"+H+".json"
        if overwrite or not os.path.isfile(cached):
            x = requests.post(URL+"fct/", json = REQ)
            x.raise_for_status()
            answer = json.loads(x.text)["messages"][-1]
            # Parse before caching so that an unusable reply is not stored and
            # then replayed (and failing) on every later run.
            rows = json.loads(answer["function_call"]["arguments"])["sorted_technology"]
            h.svt(cached,answer)
        else:
            answer = h.ldt(cached)
            rows = json.loads(answer["function_call"]["arguments"])["sorted_technology"]

        d = pd.DataFrame(rows)
        # Drop rows whose category is not one of the offered labels.
        d = d[d["category"].isin(CategoriesOfTechs)]
        CATS_FWK.append(d)
        # Checkpoint everything classified so far.
        DFC = pd.concat(CATS_FWK).reset_index(drop=True)
        DFC.to_parquet("../../support/newtechs_classified_lv1.parquet.gzip",compression="gzip")
        print("Done:",k)
    except Exception as err:
        # Best effort: report which chunk failed and why, then carry on.
        # Ctrl-C (KeyboardInterrupt) is deliberately not caught here.
        print("error", k, repr(err))

Done: 0
Done: 1
Done: 2


### Level 2: classifying technologies into subcategories

In [8]:
# Reload the level-1 technology table and show its size and first two rows.
DFC = pd.read_parquet("../../support/newtechs_classified_lv1.parquet.gzip")
print(DFC.shape[0])
DFC.head(2)

24986


Unnamed: 0,name,category,confidence,co```nfidence,&confidence,subcategory,confidence_lv2
0,Nan,Privacy Enhancing Technologies,Low,,,Encryption and authentication,Low
1,Floor markings,Urban Planning,Medium-high,,,Spatial Planning and Layout Tools,High


In [9]:
# Level-2 pass: within each level-1 category, assign every technology that has
# no subcategory yet one of that category's subcategories (DICT[CAT]).
# Results are written into DFC and checkpointed to parquet after each chunk.
DFC = pd.read_parquet("../../support/newtechs_classified_lv1.parquet.gzip")
CATs = list(DFC.category.unique())
if "subcategory" not in DFC.columns:
    DFC["subcategory"] = None
if "confidence_lv2" not in DFC.columns:
    DFC["confidence_lv2"] = None

NChunks = 50
prefix ="technologies_"
overwrite = False

for CAT in CATs:
    print("Doing",CAT)
    in_cat = DFC.category == CAT
    df = DFC[in_cat]
    # Names that already carry a subcategory in this category are skipped.
    DONE = list(df[~df.subcategory.isna()].name.unique())
    df = df[~df.name.isin(DONE)]
    # Raises KeyError if DICT is not the technology taxonomy (the name DICT is
    # reused for other taxonomies further down the notebook).
    subtechs = DICT[CAT]

    newRisks = [str(x) for x in df.name]
    # Slice on chunk boundaries so no empty trailing chunk is ever sent.
    chunks = [newRisks[i:i + NChunks] for i in range(0, len(newRisks), NChunks)]

    for k, chunk in enumerate(chunks[:NBCAP]):
        try:
            QUESTION =  "The technologies are:\n\n* "+"\n* ".join(chunk)
            REQ = {
                "context": "Out the following list, classify the technologies from a high level perspective.\n\n",
                "question": QUESTION,
                "functions": classify_technology(subtechs),
                "token": TOK3N,
                "overwrite": overwrite,
                "source": "local-evalTechsT3.5",
                "seed" : ""
            }

            # NOTE(review): same prefix and context as the level-1 pass and the
            # enum is not part of the key, so identical chunks would share a
            # cache file across the two passes.
            H = h.hashme(REQ["context"]+REQ["question"])
            cached = "cache/"+prefix+"_"+H+".json"
            if overwrite or not os.path.isfile(cached):
                x = requests.post(URL+"fct/", json = REQ)
                x.raise_for_status()
                answer = json.loads(x.text)["messages"][-1]
                # Parse before caching so an unusable reply is not stored and replayed.
                rows = json.loads(answer["function_call"]["arguments"])["sorted_technology"]
                h.svt(cached,answer)
            else:
                answer = h.ldt(cached)
                rows = json.loads(answer["function_call"]["arguments"])["sorted_technology"]

            d = pd.DataFrame(rows)
            d = d[d["category"].isin(subtechs)]
            # Rename by key rather than by position: the reply's key order is
            # not guaranteed and stray keys would otherwise break the mapping.
            d = d.rename(columns={"category": "subcategory", "confidence": "confidence_lv2"})
            d = d.reindex(columns=["name", "subcategory", "confidence_lv2"])
            for ix, row in d.iterrows():
                # Restrict the update to the current category: the same name may
                # also be listed under other level-1 categories, whose rows must
                # not receive this category's subcategory.
                hit = in_cat & (DFC.name == row["name"])
                DFC.loc[hit, "subcategory"] = row["subcategory"]
                DFC.loc[hit, "confidence_lv2"] = row["confidence_lv2"]
            DFC.to_parquet("../../support/newtechs_classified_lv1.parquet.gzip",compression="gzip")
            print("Done:",k)
        except Exception as err:
            # Best effort: report the failing chunk and continue; Ctrl-C is not caught.
            print("error", k, repr(err))

Doing Privacy Enhancing Technologies
Done: 0
Doing Urban Planning
Done: 0
Done: 1
Done: 2
Doing Healthcare
Done: 0
Done: 1
Done: 2
Done: 3
Done: 4
Done: 5
Done: 6
Doing Communication
Done: 0
Done: 1
Done: 2
Doing Building design
Done: 0
Done: 1
Doing Water Control
Done: 0
Doing Data Analytics
Done: 0
error
Done: 2
Done: 3
Done: 4
Doing Infrastructure Management
error
Done: 1
Done: 2
Doing Sensors and Monitoring
Done: 0
Done: 1
Done: 2
Doing Building Management
Done: 0
Done: 1
Doing Digital and Information Management
Done: 0
Done: 1
Done: 2
Done: 3
Doing Cleaning
Done: 0
Done: 1
Doing Air Control
Done: 0
Done: 1
Done: 2
Doing Waste Management
Done: 0
Doing Supply-chain
Done: 0
Done: 1


### Classification 

In [1]:
import pandas as pd
# Number of technologies per (category, subcategory) pair, ignoring rows
# that have no subcategory yet.
DFC = pd.read_parquet("../../support/newtechs_classified_lv1.parquet.gzip")
with_sub = DFC[DFC.subcategory.notna()]
with_sub.groupby(["category","subcategory"])["name"].count().to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,name
category,subcategory,Unnamed: 2_level_1
Air Control,Advanced Filtration Technologies,2
Air Control,Advanced disinfection systems,1
Air Control,Air Filtering,82
Air Control,Air Purification,127
Air Control,Air Quality Monitoring,136
...,...,...
Water Control,Water Disinfection Solutions,29
Water Control,Water Quality Monitoring,66
Water Control,Water Recycling and Conservation,20
Water Control,Water Treatment Systems,60


## Stakeholders

In [11]:
# First label of every Stakeholder individual that has at least one label.
SH = [
    holder.label[0]
    for holder in onto.Stakeholder.instances()
    if len(holder.label) >= 1
]
len(SH)

16071

In [12]:
# Two-level stakeholder taxonomy: each key is a level-1 category and each value
# is the list of level-2 subcategories offered for that category.
# The labels are passed verbatim as enum values and used to filter the replies,
# so the trailing spaces and spellings below are significant.
# NOTE(review): this rebinds DICT, which earlier held the technology taxonomy;
# cells that index DICT[CAT] depend on which taxonomy cell was run last.
DICT = {'Health Organisations': ['Public health authorities',
  'Occupational health and safety teams',
  'Government health agencies ',
  'International health organizations',
  'Medical and healthcare personnel',
  'Private health organisations',
  'Mental healthcare providers',
  'Veterinary organization'],
 'Research Organisations': ['AI and Data analysis researchers',
  'Research institutions ',
  'Technology developers',
  'Academic institutions',
  'Data repository administrators',
  'Local researchers',
  'Citizen researchers',
  'Scientific communities'],
 'Government institutions': ['City authorities',
  'Government departments ',
  'Policymakers',
  'Diplomatic bodies',
  'Local government',
  'Educational organizations',
  'Political communities',
  'Administrative authorities',
  'Regulatory bodies ',
  'Emergency services and management',
  'International development agencies'],
 'General public': ['Immigrant population',
  'Patients',
  'Travelers',
  'Civil society stakeholders',
  'Recreational and fitness enthusiasts',
  'Visitors in indoor spaces',
  'General population ',
  'Digitally connected individuals',
  'Fragile and/or isolated population',
  'Individual with high health risks',
  'Green spaces users '],
 'Environmental and sustainability organizations': ['Environmental agencies',
  'Conservation organizations',
  'Park management stakeholders',
  'Waste management organizations',
  'Sustainability teams ',
  'Energy and environmental consultants',
  'Green area managers and organizations',
  'Wildlife and ecosystem conservationists',
  'Climate and sustainability research institutions'],
 'Residents and occupants of spaces': ['Homeowners ',
  'Building managers',
  'People with lower income',
  'People in high-contamination areas',
  'Occupants of shared household spaces',
  'Visitors to green spaces',
  'Individuals working from home'],
 'Workers': ['Office workers',
  'Physical workers',
  'Healthcare workers',
  'Real estate agents',
  'Facility maintenance team',
  'AI system developers',
  'Waste management workers',
  'Project managers'],
 'Regulatory bodies': ['Global governing bodies',
  'Trade regulators',
  'Enforcement organizations',
  'Certification system organizations',
  'Standardization bodies',
  'Data governance organizations',
  'Governments and industry regulatory bodies',
  'Security agency',
  'Occupational health and safety regulators',
  'Local regulatory authorities',
  'International trade regulators'],
 'Public Entities': ['Sports-related organizations',
  'Appointed personnel responsible for prevention measures',
  'Education-related entities',
  'Water utilities',
  'Energy Utilities',
  'Tourism organizations ',
  'Culture organizations',
  'Police ',
  'Public communication authorities'],
 'Financial groups': ['Banking institutions',
  'Funding Bodies',
  'Economic analysts and regulatory authorities',
  'International financial institutions',
  'Treasury departments',
  'Small-and medium-sized enterprises',
  'Companies in general',
  'Global markets',
  'Insurances providers',
  'Finance professionals',
  'Social entrepreneurship fund managers',
  'Investors',
  'Social entrepreneurship funds'],
 'Construction and infrastructure industry': ['Building design team',
  'Workers in the construction industry',
  'Construction industry organizations',
  'Contractors ',
  'HVAC equipment providers',
  'Site-supervision workers',
  'Operational staff',
  'Design and construction professionals',
  'Building maintenance and green building professionals',
  'Facility maintenance staff'],
 'Manufacturing and production sector': ['Production staff',
  'Non-medical cechnology manufacturers',
  'Medical technology manufacturers',
  'Food industry companies',
  'International suppliers',
  'Supply chain professionals',
  'Wearable technology industry',
  'Pharmaceutical industry'],
 'Transportation and mobility sector': ['Logistics professionals',
  'Ride-sharing ',
  'City transport authorities',
  'Vehicle users',
  'Safe mobility',
  'Soft mobility',
  'Route optimization specialists',
  'Airports',
  'Ports',
  'Railways',
  'Delivery services',
  'Logistics and transportation managers'],
 'Cities professionals': ['City planning professionals',
  'Urban development authorities',
  'Individual urban planners and designers',
  'Urban planning institutions',
  'Smart city professionals',
  'Sustainable city professionals',
  'International urban institutions',
  'Public space managers'],
 'Retail and service industry': ['Cleaning personnel',
  'Retail staff',
  'Marketing agency',
  'Local businesses',
  'Food distribution',
  'Animal rearing industry',
  'Online retailers',
  'Drive-through staff',
  'Supermarket and health-care facility staff',
  'Parking lot owners',
  'Local fitness businesses'],
 'Energy Sector': ['Energy monitoring system providers',
  'Operators and maintenance staff ',
  'Renewable energy investors',
  'Energy management authorities',
  'Energy communities',
  'Communities reliant on the coal and oil industries',
  'Global energy markets',
  'Local energy markets',
  'Energy grid operators'],
 'Media organizations': ['Local communication experts',
  'Individuals consuming digital content',
  'Media and news entities',
  'Marketing/advertising agencies',
  'Reporter',
  'Social media platform providers',
  'Academic journals',
  'News outlet'],
 'Cities Officials': ['Municipality officials',
  'Public building managers',
  'City authorities ',
  'Public space managers',
  'Local government and urban development authorities',
  'Community leaders',
  'Smart city managers',
  'Other cities departments']}

In [13]:
def classify_SH(CategoriesOfStakeholders):
    """Build the function-calling schema used to classify stakeholders.

    Args:
        CategoriesOfStakeholders: Labels offered as the allowed values
            (``enum``) of each item's ``category`` field; stored as given.

    Returns:
        A one-element list holding the ``classify_SH`` function description,
        whose single required parameter ``sorted_stakeholders`` is an array of
        ``{name, category, confidence}`` objects.
    """
    item_properties = {
        "name": {
            "type": 'string',
            "description": 'The stakeholder being considered.'
        },
        "category": {
            "type": 'string',
            "enum": CategoriesOfStakeholders,
            "description": 'The category of stakeholder that matches the stakeholder category being considered. '
        },
        "confidence": {
            "type": 'string',
            "enum": ["High", "Medium-high", "Medium", "Low"],
            "description": 'Confidence that this is the correct stakeholder category.'
        },
    }
    item_schema = {
        "type": 'object',
        "description": "A type of stakeholder ",
        "properties": item_properties,
        "required": ["name", 'category', "confidence"],
    }
    function_spec = {
        "name": "classify_SH",
        "description": "Function used to classify stakeholders from a high level perspective, based on what group they are most closely belonging to. ",
        "parameters": {
            "type": "object",
            "properties": {
                "sorted_stakeholders": {
                    "type": 'array',
                    "items": item_schema,
                },
            },
            "required": ["sorted_stakeholders"],
        },
    }
    return [function_spec]

In [14]:
# Level-1 pass: assign each stakeholder label one of the top-level stakeholder
# categories, NChunks labels per request. The combined table is written to
# parquet after every successful chunk so an interrupted run can be resumed.
# A list rather than a dict_keys view, matching the technology and risk cells.
CategoriesOfStakeholders = list(DICT.keys())
NChunks = 50
if os.path.isfile("../../support/newstakeholders_classified_lv1.parquet.gzip"):
    # Resume: keep earlier rows whose name is still a current label and only
    # submit the labels that have no row yet.
    GOOD = pd.read_parquet("../../support/newstakeholders_classified_lv1.parquet.gzip")
    GOOD = GOOD[GOOD.name.isin(SH)]
    alreadythere = set(GOOD["name"].unique())  # set: constant-time membership tests
    newRisks = [str(x) for x in SH if str(x) not in alreadythere]
    CATS_FWK = [GOOD]
else:
    newRisks = [str(x) for x in SH]
    CATS_FWK = []

# Slice on chunk boundaries so no empty trailing chunk is sent when the number
# of labels is an exact multiple of NChunks.
chunks = [newRisks[i:i + NChunks] for i in range(0, len(newRisks), NChunks)]

prefix ="sh_"
overwrite = False

if len(newRisks):
    for k, chunk in enumerate(chunks):
        try:
            QUESTION =  "The stakeholders are:\n\n* "+"\n* ".join(chunk)
            REQ = {
                "context": "Out the following list, classify the stakeholders from a high level perspective.\n\n",
                "question": QUESTION,
                "functions": classify_SH(CategoriesOfStakeholders),
                "token": TOK3N,
                "overwrite": overwrite,
                "source": "local-evalSHT3.5",
                "seed" : ""
            }

            # NOTE(review): the cache key covers context + question only, not the category enum.
            H = h.hashme(REQ["context"]+REQ["question"])
            cached = "cache/"+prefix+"_"+H+".json"
            if overwrite or not os.path.isfile(cached):
                x = requests.post(URL+"fct/", json = REQ)
                x.raise_for_status()
                answer = json.loads(x.text)["messages"][-1]
                # Parse before caching so an unusable reply is not stored and replayed.
                rows = json.loads(answer["function_call"]["arguments"])["sorted_stakeholders"]
                h.svt(cached,answer)
            else:
                answer = h.ldt(cached)
                rows = json.loads(answer["function_call"]["arguments"])["sorted_stakeholders"]

            d = pd.DataFrame(rows)
            # Drop rows whose category is not one of the offered labels.
            d = d[d["category"].isin(CategoriesOfStakeholders)]
            CATS_FWK.append(d)
            # Checkpoint everything classified so far.
            DFC = pd.concat(CATS_FWK).reset_index(drop=True)
            DFC.to_parquet("../../support/newstakeholders_classified_lv1.parquet.gzip",compression="gzip")
            print("Done:",k)
        except Exception as err:
            # Best effort: report the failing chunk and continue; Ctrl-C is not caught.
            print("error", k, repr(err))
else:
    print("All stakeholders covered")

Done: 0
Done: 1
Done: 2
Done: 3
Done: 4
Done: 5
Done: 6
Done: 7


In [15]:
import pandas as pd
# Reload the level-1 stakeholder table; report its size and the number of
# stakeholders per category.
DFC = pd.read_parquet("../../support/newstakeholders_classified_lv1.parquet.gzip")
print(f"{len(DFC)} items listed")
DFC.groupby(["category"])["name"].count()

17241 items listed


category
Cities Officials                                   113
Cities professionals                               135
Construction and infrastructure industry           534
Energy Sector                                      118
Environmental and sustainability organizations     698
Financial groups                                   541
General public                                    2692
Government institutions                           1279
Health Organisations                              2037
Manufacturing and production sector                501
Media organizations                                201
Public Entities                                    615
Regulatory bodies                                  380
Research Organisations                            1619
Residents and occupants of spaces                 3393
Retail and service industry                        318
Transportation and mobility sector                 275
Workers                                           1792
N

In [16]:
# Level-2 pass: within each level-1 category, assign every stakeholder that has
# no subcategory yet one of that category's subcategories (DICT[CAT]).
# Results are written into DFC and checkpointed to parquet after each chunk.
DFC = pd.read_parquet("../../support/newstakeholders_classified_lv1.parquet.gzip")
CATs = list(DFC.category.unique())
if "subcategory" not in DFC.columns:
    DFC["subcategory"] = None
if "confidence_lv2" not in DFC.columns:
    DFC["confidence_lv2"] = None

NChunks = 50
# NOTE(review): the prefix and the "source" tag below still say "technologies"/
# "Techs" although this cell classifies stakeholders; they are left unchanged so
# existing cache files keep matching — confirm before renaming.
prefix ="technologies_"
overwrite = False

for CAT in CATs:
    print("Doing",CAT)
    in_cat = DFC.category == CAT
    df = DFC[in_cat]
    # Names that already carry a subcategory in this category are skipped.
    DONE = list(df[~df.subcategory.isna()].name.unique())
    df = df[~df.name.isin(DONE)]
    # Raises KeyError if DICT is not the stakeholder taxonomy (the name DICT is
    # shared with the technology and risk taxonomies).
    subtechs = DICT[CAT]

    newRisks = [str(x) for x in df.name]
    # Slice on chunk boundaries so no empty trailing chunk is ever sent.
    chunks = [newRisks[i:i + NChunks] for i in range(0, len(newRisks), NChunks)]

    for k, chunk in enumerate(chunks[:NBCAP]):
        try:
            QUESTION =  "The stakeholders are:\n\n* "+"\n* ".join(chunk)
            REQ = {
                "context": "Out the following list, classify the stakeholders from a high level perspective.\n\n",
                "question": QUESTION,
                "functions": classify_SH(subtechs),
                "token": TOK3N,
                "overwrite": overwrite,
                "source": "local-evalTechsT3.5",
                "seed" : ""
            }

            H = h.hashme(REQ["context"]+REQ["question"])
            cached = "cache/"+prefix+"_"+H+".json"
            if overwrite or not os.path.isfile(cached):
                x = requests.post(URL+"fct/", json = REQ)
                x.raise_for_status()
                answer = json.loads(x.text)["messages"][-1]
                # Parse before caching so an unusable reply is not stored and replayed.
                rows = json.loads(answer["function_call"]["arguments"])["sorted_stakeholders"]
                h.svt(cached,answer)
            else:
                answer = h.ldt(cached)
                rows = json.loads(answer["function_call"]["arguments"])["sorted_stakeholders"]

            d = pd.DataFrame(rows)
            d = d[d["category"].isin(subtechs)]
            # Rename by key rather than by position: the reply's key order is
            # not guaranteed and stray keys would otherwise break the mapping.
            d = d.rename(columns={"category": "subcategory", "confidence": "confidence_lv2"})
            d = d.reindex(columns=["name", "subcategory", "confidence_lv2"])
            for ix, row in d.iterrows():
                # Restrict the update to the current category: the same name may
                # also be listed under other level-1 categories, whose rows must
                # not receive this category's subcategory.
                hit = in_cat & (DFC.name == row["name"])
                DFC.loc[hit, "subcategory"] = row["subcategory"]
                DFC.loc[hit, "confidence_lv2"] = row["confidence_lv2"]
            DFC.to_parquet("../../support/newstakeholders_classified_lv1.parquet.gzip",compression="gzip")
            print("Done:",k)
        except Exception as err:
            # Best effort: report the failing chunk and continue; Ctrl-C is not caught.
            print("error", k, repr(err))

Doing Research Organisations
Done: 0
Done: 1
Done: 2
Done: 3
Done: 4
Doing Government institutions
Done: 0
Done: 1
Done: 2
Doing General public
Done: 0
Done: 1
Done: 2
Done: 3
Done: 4
Done: 5
Done: 6
Doing Environmental and sustainability organizations
Done: 0
Done: 1
Done: 2
Doing Public Entities
Done: 0
Done: 1
Done: 2
Done: 3
Done: 4
Doing Financial groups
Done: 0
Done: 1
Doing Residents and occupants of spaces
Done: 0
Done: 1
Done: 2
Done: 3
Done: 4
Done: 5
Done: 6
Done: 7
Done: 8
Done: 9
Done: 10
Done: 11
Done: 12
Done: 13
Done: 14
Done: 15
Done: 16
Done: 17
Doing Health Organisations
Done: 0
Done: 1
Done: 2
Done: 3
Done: 4
Done: 5
Done: 6
Done: 7
Doing Workers
Done: 0
Done: 1
Done: 2
Done: 3
Done: 4
Doing Regulatory bodies
Done: 0
Done: 1
Done: 2
Doing Construction and infrastructure industry
Done: 0
Done: 1
Doing Cities Officials
Done: 0
Done: 1
Doing Cities professionals
Done: 0
Doing Retail and service industry
Done: 0
Done: 1
Doing Transportation and mobility sector
Done: 0
D

## Risks


In [17]:
# First label of every Risk individual that has at least one label.
RISKS = [
    risk.label[0]
    for risk in onto.Risk.instances()
    if len(risk.label) >= 1
]
len(RISKS)

20903

In [18]:
# Risk taxonomy: each key is a top-level risk category and each value is a
# list of labels under it. The keys are collected into CategoriesOfRisks in the
# following cell.
# NOTE(review): the value lists presumably serve as level-2 subcategories, as in
# the technology and stakeholder taxonomies — confirm against the risk level-2 cell.
# NOTE(review): this rebinds DICT, which earlier held the stakeholder taxonomy.
# Several labels carry trailing or doubled spaces; they are part of the strings.
DICT = {'Political risks': ['Trust and transparency challenges',
  'Globalization challenges',
  'Lack of global governance and coordination',
  'Weak local decision-making mechanisms',
  'Lack of coordination and consistency in mitigation measures',
  'Ineffectiveness of interventions',
  'Bioterrorism threats',
  'Global coordination challenges',
  'Lack of transparency and clarity'],
 'Social risks': ['Disinformation and Misinformation',
  'Social and Psychological Effects ',
  'Social Justice Concerns',
  'Communication and Messaging',
  'Access to infrastructure  ',
  'Physical activity and mobility',
  'Misinformation and Media Risks',
  'Cultural and Lifestyle Risks',
  'Vulnerability risks',
  'Occupational and Employment Risks',
  'Mental and Emotional Wellbeing',
  'Health Behaviors and Compliance'],
 'Infrastructure risks': ['Inadequate remote work setup for employees',
  'Improper architectural designs',
  'Inadequate mobility schemes',
  'Poor urban planning',
  'Infrastructure maintenance risk ',
  'Lack of emphasis on air quality ',
  'Lack of emphasis on water quality ',
  'Insufficient adoption of health-focused building standards',
  'Insufficient adoption of health-focused urban standards',
  'Inadequate hospital design ',
  'Inadequate adaptability to emerging problems  ',
  'Disruptions from outbreaks and other major external events'],
 'Public health risks': ['Failure to implement preventive measures',
  'Misinterpretation of data and results',
  'Lack of access to health services',
  'Public health messaging and education',
  'Lack of effective medical solutions',
  'Healthcare resource shortage and management',
  'Workplace safety and occupational exposure',
  'Poor access to Diagnostics and Treatment',
  'Poor vector-borne diseases control'],
 'Data Privacy and Security risks': ['Data Security risks',
  'Privacy risks',
  'Cybersecurity Threats',
  'Inaccurate or Incomplete Data',
  'Misuse of Personal Data',
  'Reliability and Quality of Data',
  'Surveillance and Monitoring Privacy Concerns',
  'Limited Accessibility and Data Equity',
  'Transparency and Accountability Issues',
  'Potential Misinformation and False Data'],
 'Economic risks': ['Supply chain disruptions',
  'Pandemic impact on markets',
  'Lower investment',
  'Lower employment',
  'Discontinuity of services',
  'Global recession and economic instability',
  'Construction industry impacts',
  'Resource and capability risks',
  'Workforce and labor shortages',
  'Financial strains and liquidity crises',
  'Food supply chain disruptions '],
 'Legal risks': ['Compliance and Contractual',
  'Data and Research Integrity',
  'Intellectual Property',
  'Workplace Safety and Health',
  'Privacy and Civil Liberties',
  'Fraud and Misconduct',
  'Healthcare Legislation',
  'Building Codes and Certification',
  'Technology Deployment and Surveillance'],
 'Environmental risks': ['Climate change',
  'Pollution increase',
  'Environmental health risks',
  'Air quality and pollution effects',
  'Urbanization and habitat loss',
  'Climate change effects',
  'Wastewater and sanitation risks',
  'Biodiversity and ecological impacts',
  'Energy consumption and sustainability issues'],
 'Technological risks': ['Data privacy and security risks',
  'Healthcare and medical risks',
  'Infrastructure and connectivity risks',
  'Modeling and simulation risks',
  'Modeling and prediction risks',
  'Artificial intelligence and machine learning risks',
  'Inadequate ventilation and air circulation',
  'Operational and implementation risks ',
  'Innovation and adoption risks',
  'Healthcare infrastructure and technology adoption risks ']}

In [19]:
# Top-level risk category labels, in the order they are declared in DICT.
CategoriesOfRisks = list(DICT)

In [20]:
def classify_risks_PESTLE(CategoriesOfRisks):
    """Build the function specification used to classify risks.

    Args:
        CategoriesOfRisks: list of allowed category labels; it is placed
            as-is in the ``enum`` of the ``category`` field.

    Returns:
        A one-element list holding the specification dict. Its parameters
        describe an object with a required ``sorted_risks`` array whose items
        each carry ``name``, ``category`` and ``confidence``.
    """
    risk_fields = {
        "name": {
            "type": "string",
            "description": "The risk being considered.",
        },
        "category": {
            "type": "string",
            "enum": CategoriesOfRisks,
            "description": "The category of risk that matches the risk category being considered. ",
        },
        "confidence": {
            "type": "string",
            "enum": ["High", "Medium-high", "Medium", "Low"],
            "description": "Confidence that this is the correct risk category.",
        },
    }
    one_risk = {
        "type": "object",
        "description": "A type of risk ",
        "properties": risk_fields,
        "required": ["name", "category", "confidence"],
    }
    spec = {
        "name": "classify_risks_PESTLE",
        "description": "Function used to classify risks from a high level perspective, based on what they  impact the most . ",
        "parameters": {
            "type": "object",
            "properties": {
                "sorted_risks": {"type": "array", "items": one_risk},
            },
            "required": ["sorted_risks"],
        },
    }
    return [spec]

In [21]:
# Level-1 classification of risks.
# Risk labels that are not yet in the parquet checkpoint are sent to the
# classification endpoint in chunks of NChunks; the accumulated table is written
# back to the checkpoint after every successful chunk.
RES = []  # not used in this cell; NOTE(review): confirm no other cell reads it before removing
NChunks = 50
if os.path.isfile("../../support/newrisks_classified_lv1.parquet.gzip"):
    GOOD = pd.read_parquet("../../support/newrisks_classified_lv1.parquet.gzip")
    # A set gives constant-time membership tests (the array form scans every time).
    alreadythere = set(GOOD["name"].unique())
    GOOD = GOOD[GOOD.name.isin(RISKS)]
    newRisks = [x for x in RISKS if x not in alreadythere]
    CATS_FWK = [GOOD]
else:
    # First run: no checkpoint yet, so there is nothing to carry over.
    # (Previously GOOD was referenced below without being defined in this case.)
    newRisks = RISKS
    CATS_FWK = []

if len(newRisks):
    newRisks = [str(x) for x in newRisks]
    # Ceiling division: no trailing empty chunk when the length is a multiple of NChunks.
    NB = -(-len(newRisks) // NChunks)
    for k in range(NB)[:150]:  # at most 150 chunks per run
        try:
            QUESTION =  "The risks are:\n\n* "+"\n* ".join(newRisks[(k*NChunks):(k+1)*NChunks]) 
            prefix ="risks_"
            overwrite = False
            REQ = {
                "context": "Out the following list, classify the risks from a high level perspective.\n\n",
                "question": QUESTION,
                "functions": classify_risks_PESTLE(CategoriesOfRisks),
                "token": TOK3N,
                "overwrite": overwrite,
                "source": "local-evalrisksT3.5",
                "seed" : ""
            }

            # The cache key is derived from context + question only.
            H = h.hashme(REQ["context"]+REQ["question"])
            cached = "cache/"+prefix+"_"+H+".json"
            if not os.path.isfile(cached) or overwrite:
                x = requests.post(URL+"fct/", json = REQ)
                answer = json.loads(x.text)["messages"][-1]
                h.svt(cached,answer)
            else:
                answer = h.ldt(cached)
            d = pd.DataFrame(json.loads(answer["function_call"]["arguments"])["sorted_risks"])
            # Keep only rows whose category is one of the allowed labels.
            d = d [d["category"].isin(CategoriesOfRisks)]
            CATS_FWK.append(d)
            DFC = pd.concat(CATS_FWK).reset_index(drop=True)
            DFC.to_parquet("../../support/newrisks_classified_lv1.parquet.gzip",compression="gzip")
            print("Done:",k)
        except Exception as exc:
            # Best effort: a failed chunk is skipped, but the cause is reported and
            # KeyboardInterrupt is no longer swallowed.
            print("error", k, repr(exc))
else:
    print("All risks covered")

error
error
Done: 2


In [22]:
import pandas as pd

# Reload the level-1 risk checkpoint and count the rows per category.
DFC = pd.read_parquet("../../support/newrisks_classified_lv1.parquet.gzip")
print(DFC.shape[0],"items listed")
DFC.groupby("category")["name"].count()

20802 items listed


category
Data Privacy and Security risks     914
Economic risks                     2250
Environmental risks                1615
Infrastructure risks               2899
Legal risks                         208
Political risks                    1126
Public health risks                6277
Social risks                       3446
Technological risks                2067
Name: name, dtype: int64

In [23]:
# Display the current DFC frame (the risk table read by the cell above).
DFC

Unnamed: 0,name,category,confidence,subcategory,confidence_lv2
0,Inadequate government interventions for revita...,Political risks,Medium,Weak local decision-making mechanisms,Medium
1,Encouragement of remote working and digital tw...,Social risks,Medium,Social and Psychological Effects,High
2,Inadequate workplace modification activities a...,Infrastructure risks,Medium,Inadequate remote work setup for employees,High
3,Substantial underestimation of infection cases...,Public health risks,Medium,Misinterpretation of data and results,Medium-high
4,The use of bluetooth and gps location data for...,Data Privacy and Security risks,High,Privacy risks,Medium
...,...,...,...,...,...
20797,Regulaci\u00f3n del teletrabajo a nivel mundial,Political risks,High,,
20798,Propagaci\u00f3n acelerada del virus debido a ...,Social risks,High,,
20799,Transmisi\u00f3n de persona a persona del viru...,Social risks,High,,
20800,El virus sars-cov-2 afecta a una amplia gama d...,Environmental risks,High,,


In [24]:
# Reload the level-1 risk table and make sure the two level-2 columns exist.
DFC = pd.read_parquet("../../support/newrisks_classified_lv1.parquet.gzip")
CATs = list(DFC["category"].unique())
if "subcategory" not in DFC:
    DFC["subcategory"] = None
if "confidence_lv2" not in DFC:
    DFC["confidence_lv2"] = None

In [25]:
# Level-2 classification of risks.
# For each level-1 category, the risks that have no subcategory yet are sent to
# the classification endpoint in chunks, using that category's subcategories
# (DICT[CAT]) as the allowed values. Results are written into DFC and the table
# is saved to the checkpoint after every successful chunk.
DFC = pd.read_parquet("../../support/newrisks_classified_lv1.parquet.gzip")
CATs = list(DFC.category.unique())
if "subcategory" not in DFC.columns:
    DFC["subcategory"] = None
if "confidence_lv2" not in DFC.columns:
    DFC["confidence_lv2"] = None
for CAT in CATs:
    print("Doing",CAT)
    if CAT not in DICT:
        # A stored category without a subcategory list would raise KeyError
        # outside the try block and abort the whole loop.
        print("No subcategories defined for", CAT)
        continue
    in_cat = DFC.category == CAT
    df = DFC[in_cat]
    DONE = list(df[~df.subcategory.isna()].name.unique())
    df = df[~df.name.isin(DONE)]
    subtechs = DICT[CAT]  # allowed subcategories for this category

    newRisks = list(df.name)
    RES = []
    NChunks = 50

    if len(newRisks):
        newRisks = [str(x) for x in newRisks]
        # Ceiling division: no trailing empty chunk when the length is a multiple of NChunks.
        NB = -(-len(newRisks) // NChunks)

        for k in range(NB)[:NBCAP]:
            try:
                QUESTION =  "The risks are:\n\n* "+"\n* ".join(newRisks[(k*NChunks):(k+1)*NChunks]) 
                prefix ="technologies_"
                overwrite = False
                REQ = {
                    "context": "Out the following list, classify the risks from a high level perspective.\n\n",
                    "question": QUESTION,
                    "functions": classify_risks_PESTLE(subtechs),
                    "token": TOK3N,
                    "overwrite": overwrite,
                    "source": "local-evalTechsT3.5",
                    "seed" : ""
                }

                # The cache key is derived from context + question only.
                H = h.hashme(REQ["context"]+REQ["question"])
                cached = "cache/"+prefix+"_"+H+".json"
                if not os.path.isfile(cached) or overwrite:
                    x = requests.post(URL+"fct/", json = REQ)
                    answer = json.loads(x.text)["messages"][-1]
                    h.svt(cached,answer)
                else:
                    answer = h.ldt(cached)
                d = pd.DataFrame(json.loads(answer["function_call"]["arguments"])["sorted_risks"])
                d = d [d["category"].isin(subtechs)]
                # Rename by key, not by position: the key order of the reply is not
                # guaranteed, and positional relabelling would silently swap columns.
                d = d.rename(columns={"category": "subcategory", "confidence": "confidence_lv2"})
                for ix, row in d.iterrows():
                    # Restrict the write to rows of the current category so that a
                    # name present under several categories is not given a
                    # subcategory belonging to another category.
                    target = in_cat & (DFC.name == row["name"])
                    DFC.loc[target, "subcategory"] = row["subcategory"]
                    DFC.loc[target, "confidence_lv2"] = row.get("confidence_lv2")
                DFC.to_parquet("../../support/newrisks_classified_lv1.parquet.gzip",compression="gzip")
                print("Done:",k)
            except Exception as exc:
                # Best effort: skip the failed chunk but report why it failed.
                print("error", k, repr(exc))

Doing Political risks
Done: 0
Done: 1
error
error
Done: 4
Doing Social risks
Done: 0
error
Done: 2
Done: 3
Done: 4
Done: 5
Done: 6
Done: 7
Done: 8
error
Done: 10
Doing Infrastructure risks
Done: 0
Done: 1
Done: 2
Done: 3
Done: 4
Done: 5
error
Done: 7
Done: 8
Done: 9
error
Done: 11
error
Done: 13
Done: 14
Done: 15
Done: 16
Done: 17
Done: 18
Done: 19
Doing Public health risks
Done: 0
Done: 1
Done: 2
Done: 3
Done: 4
Done: 5
Done: 6
Done: 7
error
error
Done: 10
Done: 11
Done: 12
Done: 13
Done: 14
Done: 15
Done: 16
Done: 17
error
error
Done: 20
Done: 21
Done: 22
Done: 23
error
Done: 25
error
error
error
Done: 29
Doing Data Privacy and Security risks
Done: 0
Done: 1
Done: 2
Doing Economic risks
Done: 0
Done: 1
Done: 2
Done: 3
Done: 4
Done: 5
Done: 6
Done: 7
Doing Legal risks
Done: 0
Doing Environmental risks
Done: 0
Done: 1
Done: 2
Done: 3
Doing Technological risks
error
error
Done: 2
error
Done: 4
error
Done: 6
Done: 7
error
Done: 9
Done: 10


In [26]:
import pandas as pd

# Reload the risk checkpoint and count the rows per subcategory.
DFC = pd.read_parquet("../../support/newrisks_classified_lv1.parquet.gzip")
print(DFC.shape[0],"items listed")
DFC.groupby("subcategory")["name"].count()

20802 items listed


subcategory
Access to infrastructure                              240
Air quality and pollution effects                     237
Artificial intelligence and machine learning risks     99
Biodiversity and ecological impacts                   140
Bioterrorism threats                                    3
                                                     ... 
Wastewater and sanitation risks                       136
Weak local decision-making mechanisms                 188
Workforce and labor shortages                          99
Workplace Safety and Health                             7
Workplace safety and occupational exposure            730
Name: name, Length: 91, dtype: int64

In [27]:
import pandas as pd

# Load the mitigation checkpoint and count the rows per category.
DFC = pd.read_parquet("../../support/newmitigation_classified_lv1.parquet.gzip")
print(DFC.shape[0],"items listed")
DFC.groupby("category")["name"].count()

20784 items listed


category
Absorb resource shortages                                             301
Absorb surge in service demande                                       436
Air filtration                                                        339
Animal vectors control                                                362
Cleaning technologies (robots)                                        140
Control airflows in spaces & Avoid air recirculation                  535
Control of airflows (airtight / negative pressure)                     74
Disinfection of air                                                   288
Early detection                                                       527
Ensure use of personal protection equipment                           468
Environmental parameters control                                     1839
Facilitate transparent communication                                  740
Greenery                                                              408
Greywater management         

In [28]:
# Display the current DFC frame (the mitigation table read by the cell above).
DFC

Unnamed: 0,name,category,confidence,...
0,"Implement measures to reduce air pollution, pr...",Environmental parameters control,High,
1,Promote energy efficiency measures and renewab...,Environmental parameters control,High,
2,"Implement energy management systems, automate ...",Occupancy control (eg separating at/risk conta...,Medium,
3,"Implement demand response programs, promote en...",Shift controls (occupancy control of total users),Medium,
4,Higher ventilation rates,Natural ventilation increase,High,
...,...,...,...,...
20779,Increase the number of studies included or use...,Infection tracking / monitoring,Medium,
20780,Recruit additional volunteers or use alternati...,Infection tracking / monitoring,Medium,
20781,Seek alternative platforms or publications for...,Facilitate transparent communication,Medium-high,
20782,"Implementation of additional, more effective p...",Maintenance / decontamination of building wate...,Medium-high,


## Mitigs


In [29]:
# First label of every Mitigation individual that has at least one label.
m = [indiv.label[0] for indiv in onto.Mitigation.instances() if len(indiv.label) > 0]
len(m)

22926

In [30]:
# First label of every BP_Intervention individual; used as the allowed categories
# for the mitigation classification. Individuals without any label are skipped
# (the same guard is used for Mitigation and Risk) so that an unlabeled
# individual cannot raise IndexError here.
interventions = [x.label[0] for x in onto.BP_Intervention.instances() if len(x.label) >= 1]

In [31]:
def classify_mitigation(interventions):
    """Build the function specification used to classify mitigations.

    Args:
        interventions: list of allowed category labels; it is placed as-is in
            the ``enum`` of the ``category`` field.

    Returns:
        A one-element list holding the specification dict. Its parameters
        describe an object with a required ``sorted_mitigations`` array whose
        items each carry ``name``, ``category`` and ``confidence``.
    """
    name_field = {
        "type": "string",
        "description": "The mitigation being considered.",
    }
    category_field = {
        "type": "string",
        "enum": interventions,
        "description": "The category of mitigation that matches the mitigation category being considered. ",
    }
    confidence_field = {
        "type": "string",
        "enum": ["High", "Medium-high", "Medium", "Low"],
        "description": "Confidence that this is the correct mitigation category.",
    }
    mitigation_item = {
        "type": "object",
        "description": "A type of mitigation ",
        "properties": {
            "name": name_field,
            "category": category_field,
            "confidence": confidence_field,
        },
        "required": ["name", "category", "confidence"],
    }
    parameters = {
        "type": "object",
        "properties": {
            "sorted_mitigations": {"type": "array", "items": mitigation_item},
        },
        "required": ["sorted_mitigations"],
    }
    return [
        {
            "name": "classify_mitigation",
            "description": "Function used to classify mitigations from a high level perspective, based on what group they are most closely belonging to. ",
            "parameters": parameters,
        }
    ]

In [32]:
# Level-1 classification of mitigations.
# Mitigation labels that are not yet in the parquet checkpoint are sent to the
# classification endpoint in chunks of NChunks; the accumulated table is written
# back to the checkpoint after every successful chunk.
RES = []  # not used in this cell; NOTE(review): confirm no other cell reads it before removing
NChunks = 50
if os.path.isfile("../../support/newmitigation_classified_lv1.parquet.gzip"):
    GOOD = pd.read_parquet("../../support/newmitigation_classified_lv1.parquet.gzip")
    # A set gives constant-time membership tests (the array form scans every time).
    alreadythere = set(GOOD["name"].unique())
    GOOD = GOOD[GOOD.name.isin(m)]
    newRisks = [x for x in m if x not in alreadythere]
    CATS_FWK = [GOOD]
else:
    newRisks = m
    CATS_FWK = []
if len(newRisks):
    # Coerce to str, as the risk cells do, so that "\n* ".join cannot fail on a
    # non-string label.
    newRisks = [str(x) for x in newRisks]
    # Ceiling division: no trailing empty chunk when the length is a multiple of NChunks.
    NB = -(-len(newRisks) // NChunks)
    for k in range(NB)[:NBCAP]:
        try:
            QUESTION =  "The mitigations are:\n\n* "+"\n* ".join(newRisks[(k*NChunks):(k+1)*NChunks]) 
            prefix ="sh_"
            overwrite = False
            REQ = {
                "context": "Out the following list, classify the mitigations from a high level perspective.\n\n",
                "question": QUESTION,
                "functions": classify_mitigation(interventions),
                "token": TOK3N,
                "overwrite": overwrite,
                "source": "local-evalMitigationsT3.5",
                "seed" : ""
            }

            # The cache key is derived from context + question only.
            H = h.hashme(REQ["context"]+REQ["question"])
            cached = "cache/"+prefix+"_"+H+".json"
            if not os.path.isfile(cached) or overwrite:
                x = requests.post(URL+"fct/", json = REQ)
                answer = json.loads(x.text)["messages"][-1]
                h.svt(cached,answer)
            else:
                answer = h.ldt(cached)
            d = pd.DataFrame(json.loads(answer["function_call"]["arguments"])["sorted_mitigations"])
            # Keep only rows whose category is one of the allowed labels.
            d = d [d["category"].isin(interventions)]
            CATS_FWK.append(d)
            DFC = pd.concat(CATS_FWK).reset_index(drop=True)
            DFC.to_parquet("../../support/newmitigation_classified_lv1.parquet.gzip",compression="gzip")
            print("Done:",k)
        except Exception as exc:
            # Best effort: skip the failed chunk but report why it failed;
            # KeyboardInterrupt is no longer swallowed.
            print("error", k, repr(exc))
else:
    print("All mitigations covered")

Done: 0
Done: 1
Done: 2
Done: 3
Done: 4
Done: 5
Done: 6
Done: 7
Done: 8
Done: 9
Done: 10
Done: 11
Done: 12
error
error
Done: 15
Done: 16
error
Done: 18
Done: 19
error
Done: 21
Done: 22
Done: 23
Done: 24
Done: 25
error
Done: 27
Done: 28
Done: 29
Done: 30
error
Done: 32
Done: 33
Done: 34
Done: 35
Done: 36
error
error
Done: 39
Done: 40
Done: 41
Done: 42
Done: 43


In [33]:
import pandas as pd

# Reload the mitigation checkpoint and count the rows per category.
DFC = pd.read_parquet("../../support/newmitigation_classified_lv1.parquet.gzip")
print(DFC.shape[0],"items listed")
DFC.groupby("category")["name"].count()

21791 items listed


category
Absorb resource shortages                                             306
Absorb surge in service demande                                       472
Air filtration                                                        346
Animal vectors control                                                369
Cleaning technologies (robots)                                        139
Cleaning technologies robots                                            2
Control airflows in spaces & Avoid air recirculation                  542
Control airflows in spaces & avoid air recirculation                   10
Control of airflows (airtight / negative pressure)                     75
Control of airflows airtight / negative pressure                        5
Disinfection of air                                                   291
Early detection                                                       599
Ensure use of personal protection equipment                           485
Environmental parameters cont

# Adding CAO Classification

In [4]:
# First label of every Risk individual that has at least one label.
RISKS = [indiv.label[0] for indiv in onto.Risk.instances() if len(indiv.label) > 0]
len(RISKS)

20919

In [5]:
# CAO framework taxonomy: top-level group -> list of sub-categories.
# The keys are used as the level-1 categories and each list as the allowed
# values for the level-2 classification of that group.
# "Land Use" used to carry a leading space (" Land Use"), which an exact-match
# filter on the classifier's reply would not match against "Land Use".
# NOTE(review): rows classified earlier may still hold the old " Land Use" value.
DICTS_CAO = {
    "Structure": [
        "Settlement", "Biodiversity", "Air", "Soil", "Water",
        "Communication Network", "Water Cycle", "Energy Cycle", "Mobility Network",
        "Nature", "Dwelling", "Buildings/Blocks", "Neighbourhoods/Districts",
        "City/Metropolis", "Public Space", "Land Use",
    ],
    "Interactions": [
        "Living", "Working", "Shopping", "Transport", "Health", "Education",
        "Arts", "Sports", "Security", "Wealth production", "Wealth Distribution",
        "Commerce/Trade", "Finances", "Competitiveness", "Entrepreneurship",
        "Culture/Diversity", "Social Expression", "Heritage", "Tools&Apps",
        "Open data", "Data in/out",
        "Performance (equity, resilience, self-sufficiency)",
    ],
    "Society": [
        "Person", "Family", "Visitors", "Organizations", "Business",
        "Participation", "Capacity Development", "Leadership",
        "Visions&Priorities", "Laws&Regulations", "Accountability",
    ],
}


1

In [6]:
# Long description of the three CAO groups.
# NOTE(review): this value is discarded by the plain assignment that follows, so
# only the short instruction reaches the prompt. Confirm whether `+=` was
# intended; changing it would alter the request context.
DESC_Step1 = (
    "'Structures' contains Environment (Settlements, Biodiversity, Air, Soil, Water), Infrastructure (communication networks, water, energy, matter cycle, mobility,), and Build domain (Dwelling, housing, land use, public spaces). "
    "'Interactions' contains Functions, Economy, Culture and Information, focusing on companies, economic sphere, societal, social, economic and information topics. "
    "'Society' contains Citizens and Government (inc Persons, familities, organizations, businesses, leadership, vision, laws and regulation, accountability)."
)
# Short instruction that is the effective value of DESC_Step1.
# NOTE(review): the text says 'Structures' while the category key is "Structure" — confirm.
DESC_Step1 = "Use 'Structures' if the risk is in the physical world, 'Interactions' if it is about 'societal, social, economic and information' aspects and Society if talking about governance. "

In [7]:
def classifFwk_step1(lst):
    """Build the function specification used to classify risks in the framework.

    Args:
        lst: list of allowed category labels; it is placed as-is in the
            ``enum`` of the ``category`` field.

    Returns:
        A one-element list holding the specification dict, named
        ``classify_risks_framework``. Its parameters describe an object with a
        required ``sorted_risks`` array whose items each carry ``name``,
        ``category`` and ``confidence``.
    """
    item_properties = dict(
        name={
            "type": "string",
            "description": "The risk being considered.",
        },
        category={
            "type": "string",
            "enum": lst,
            "description": "The category of risk that matches the risk category being considered. ",
        },
        confidence={
            "type": "string",
            "enum": ["High", "Medium-high", "Medium", "Low"],
            "description": "Confidence that this is the correct risk category.",
        },
    )
    sorted_risks = {
        "type": "array",
        "items": {
            "type": "object",
            "description": "A type of risk ",
            "properties": item_properties,
            "required": ["name", "category", "confidence"],
        },
    }
    specification = {
        "name": "classify_risks_framework",
        "description": "Function used to classify risks according to the evaluation framework, based on what they  impact the most . ",
        "parameters": {
            "type": "object",
            "properties": {"sorted_risks": sorted_risks},
            "required": ["sorted_risks"],
        },
    }
    return [specification]

In [8]:
# Level-1 CAO categories: the keys of DICTS_CAO, in insertion order.
CAO_Cats = list(DICTS_CAO)

In [9]:
# Level-1 CAO classification of risks.
# Risk labels that are not yet in the parquet checkpoint are sent to the
# classification endpoint in chunks of NChunks; the accumulated table is written
# back to the checkpoint after every successful chunk.
RES = []  # not used in this cell; NOTE(review): confirm no other cell reads it before removing
NChunks = 50
if os.path.isfile("../../support/newCAO_classified_lv1.parquet.gzip"):
    GOOD = pd.read_parquet("../../support/newCAO_classified_lv1.parquet.gzip")
    # A set gives constant-time membership tests (the array form scans every time).
    alreadythere = set(GOOD["name"].unique())
    GOOD = GOOD[GOOD.name.isin(RISKS)]
    newRisks = [x for x in RISKS if x not in alreadythere]
    CATS_FWK = [GOOD]
else:
    newRisks = RISKS
    CATS_FWK = []
if len(newRisks):
    # Coerce to str, as the other risk cells do, so that "\n* ".join cannot fail
    # on a non-string label.
    newRisks = [str(x) for x in newRisks]
    # Ceiling division: no trailing empty chunk when the length is a multiple of NChunks.
    NB = -(-len(newRisks) // NChunks)
    for k in range(NB)[:NBCAP]:
        try:
            QUESTION =  "The risks are:\n\n* "+"\n* ".join(newRisks[(k*NChunks):(k+1)*NChunks]) 
            prefix ="sh_"
            overwrite = False
            REQ = {
                "context": DESC_Step1+"\n\nOut the following list, classify the risks from a high level perspective.\n\n",
                "question": QUESTION,
                "functions": classifFwk_step1(CAO_Cats),
                "token": TOK3N,
                "overwrite": overwrite,
                # NOTE(review): this label mentions mitigations although risks are
                # classified here; left unchanged because its use by the service is not visible.
                "source": "local-evalMitigationsT3.5",
                "seed" : ""
            }

            # The cache key is derived from context + question only.
            H = h.hashme(REQ["context"]+REQ["question"])
            cached = "cache/"+prefix+"_"+H+".json"
            if not os.path.isfile(cached) or overwrite:
                x = requests.post(URL+"fct/", json = REQ)
                answer = json.loads(x.text)["messages"][-1]
                h.svt(cached,answer)
            else:
                answer = h.ldt(cached)
            d = pd.DataFrame(json.loads(answer["function_call"]["arguments"])["sorted_risks"])
            # Keep only rows whose category is one of the allowed labels.
            d = d [d["category"].isin(CAO_Cats)]
            CATS_FWK.append(d)
            DFC = pd.concat(CATS_FWK).reset_index(drop=True)
            DFC.to_parquet("../../support/newCAO_classified_lv1.parquet.gzip",compression="gzip")
            print("Done:",k)
        except Exception as exc:
            # Best effort: skip the failed chunk but report why it failed;
            # KeyboardInterrupt is no longer swallowed.
            print("error", k, repr(exc))
else:
    print("All Risks CAO covered")

Done: 0
Done: 1
Done: 2
Done: 3
Done: 4
Done: 5
Done: 6
Done: 7
Done: 8
Done: 9
Done: 10
Done: 11
Done: 12
Done: 13
Done: 14
Done: 15
Done: 16
Done: 17
Done: 18
Done: 19
Done: 20
Done: 21
Done: 22
Done: 23
Done: 24
Done: 25
Done: 26
Done: 27
Done: 28
Done: 29
Done: 30
Done: 31
Done: 32
Done: 33
Done: 34
error
error
Done: 37
error
Done: 39
Done: 40
Done: 41
Done: 42
Done: 43
Done: 44
Done: 45
Done: 46
Done: 47
Done: 48
Done: 49
Done: 50
Done: 51
error
Done: 53
Done: 54
Done: 55
error
Done: 57
Done: 58
Done: 59
Done: 60
error
Done: 62
Done: 63
Done: 64
Done: 65
Done: 66
Done: 67
Done: 68
Done: 69
Done: 70
Done: 71
Done: 72
Done: 73
Done: 74
Done: 75
Done: 76
Done: 77
Done: 78
Done: 79
Done: 80
Done: 81
Done: 82
Done: 83
Done: 84
error
Done: 86
Done: 87
Done: 88
Done: 89
Done: 90
Done: 91
Done: 92
Done: 93
error
Done: 95
Done: 96
Done: 97
Done: 98
error
Done: 100
Done: 101
Done: 102
Done: 103
Done: 104
Done: 105
Done: 106
Done: 107
Done: 108
Done: 109
Done: 110
error
Done: 112
Done: 113
D

In [10]:
# Reload the CAO checkpoint and count the rows per level-1 category.
DFC = pd.read_parquet("../../support/newCAO_classified_lv1.parquet.gzip")
DFC.groupby("category")["name"].count()

category
Interactions    6536
Society         5068
Structure       7596
Name: name, dtype: int64

In [11]:
# Persona sentence prepended to the request context of the level-2 CAO classification.
OVERARCH = (
    "You are a worldwide epidemiologist. "
    "You are studying the spread of contagious diseases in green building neighbourhoods "
    "(for example of sars-cov-2, which is the COVID)."
)

In [12]:
# Level-2 CAO classification of risks.
# For each level-1 CAO category, the risks that have no subcategory yet are sent
# to the classification endpoint in chunks, using that category's sub-categories
# (DICTS_CAO[CAT]) as the allowed values. Results are written into DFC and the
# table is saved to the checkpoint after every successful chunk.
DFC = pd.read_parquet("../../support/newCAO_classified_lv1.parquet.gzip")
CATs = list(DFC.category.unique())
if "subcategory" not in DFC.columns:
    DFC["subcategory"] = None
if "confidence_lv2" not in DFC.columns:
    DFC["confidence_lv2"] = None
for CAT in CATs:
    if CAT not in DICTS_CAO:
        # A stored category without a sub-category list would raise KeyError
        # outside the try block and abort the whole loop.
        print("No subcategories defined for", CAT)
        continue
    in_cat = DFC.category == CAT
    df = DFC[in_cat]
    DONE = list(df[~df.subcategory.isna()].name.unique())
    df = df[~df.name.isin(DONE)]
    subtechs = DICTS_CAO[CAT]  # allowed sub-categories for this category
    newRisks = list(df.name)
    RES = []
    NChunks = 50

    if len(newRisks):
        newRisks = [str(x) for x in newRisks]
        # Ceiling division: no trailing empty chunk when the length is a multiple of NChunks.
        NB = -(-len(newRisks) // NChunks)
        for k in range(NB)[:NBCAP]:
            try:
                QUESTION =  "The risks are:\n\n* "+"\n* ".join(newRisks[(k*NChunks):(k+1)*NChunks]) 
                prefix ="CAO_risks_"
                overwrite = False
                REQ = {
                    "context": OVERARCH+"\nOut the following list, classify the risks from a high level perspective.\n\n",
                    "question": QUESTION,
                    "functions": classifFwk_step1(subtechs),
                    "token": TOK3N,
                    "overwrite": overwrite,
                    "source": "local-evalTechsT3.5",
                    "seed" : ""
                }

                # The cache key is derived from context + question only.
                H = h.hashme(REQ["context"]+REQ["question"])
                cached = "cache/"+prefix+"_"+H+".json"
                if not os.path.isfile(cached) or overwrite:
                    x = requests.post(URL+"fct/", json = REQ)
                    answer = json.loads(x.text)["messages"][-1]
                    h.svt(cached,answer)
                else:
                    answer = h.ldt(cached)
                d = pd.DataFrame(json.loads(answer["function_call"]["arguments"])["sorted_risks"])
                d = d [d["category"].isin(subtechs)]
                # Rename by key, not by position: the key order of the reply is not
                # guaranteed, and positional relabelling would silently swap columns.
                d = d.rename(columns={"category": "subcategory", "confidence": "confidence_lv2"})
                for ix, row in d.iterrows():
                    # Restrict the write to rows of the current category so that a
                    # name present under several categories is not given a
                    # sub-category belonging to another group.
                    target = in_cat & (DFC.name == row["name"])
                    DFC.loc[target, "subcategory"] = row["subcategory"]
                    DFC.loc[target, "confidence_lv2"] = row.get("confidence_lv2")
                DFC.to_parquet("../../support/newCAO_classified_lv1.parquet.gzip",compression="gzip")
                print("Done:",k)
            except Exception as exc:
                # Best effort: skip the failed chunk but report why it failed.
                print("error", k, repr(exc))

Done: 0
Done: 1
Done: 2
Done: 3
Done: 4
Done: 5
Done: 6
Done: 7
error
Done: 9
Done: 10
Done: 11
Done: 12
Done: 13
Done: 14
Done: 15
Done: 16
Done: 17
Done: 18
Done: 19
error
error
Done: 22
Done: 23
Done: 24
Done: 25
Done: 26
Done: 27
Done: 28
Done: 29
Done: 30
error
error
Done: 33
Done: 34
Done: 35
Done: 36
Done: 37
Done: 38
Done: 39
Done: 40
Done: 41
Done: 42
Done: 43
Done: 44
Done: 45
Done: 46
Done: 47
error
Done: 49
error
Done: 51
Done: 52
Done: 53
Done: 54
Done: 55
Done: 56
Done: 57
Done: 58
Done: 59
Done: 60
Done: 61
Done: 62
Done: 63
Done: 64
Done: 65
Done: 66
Done: 67
Done: 68
Done: 69
Done: 70
Done: 71
Done: 72
Done: 73
Done: 74
Done: 75
Done: 76
Done: 77
Done: 78
Done: 79
Done: 80
Done: 81
Done: 82
Done: 83
Done: 84
Done: 85
Done: 86
Done: 87
Done: 88
Done: 89
Done: 90
Done: 91
Done: 92
Done: 93
Done: 94
Done: 95
Done: 96
Done: 97
Done: 98
Done: 99
Done: 100
Done: 101
Done: 102
Done: 103
Done: 104
Done: 105
Done: 106
Done: 107
Done: 108
Done: 109
Done: 110
Done: 111
Done: 112


In [13]:
import pandas as pd
# Reload the CAO classification table from its parquet checkpoint.
DFC = pd.read_parquet("../../support/newCAO_classified_lv1.parquet.gzip")

In [14]:
# Number of risks per (category, subcategory) pair, restricted to rows that have a subcategory.
DFC[DFC["subcategory"].notna()].groupby(["category", "subcategory"])["name"].count().to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,name
category,subcategory,Unnamed: 2_level_1
Interactions,Arts,32
Interactions,Business,1
Interactions,Commerce/Trade,172
Interactions,Competitiveness,9
Interactions,Culture/Diversity,28
Interactions,Data in/out,142
Interactions,Education,90
Interactions,Entrepreneurship,12
Interactions,Finances,31
Interactions,Health,4346
