In [2]:
import json
import os
import pickle
import warnings

import backoff
import openai
import openml
import spacy
from openai import OpenAI
# Silence every Python warning for the rest of the notebook session.
warnings.simplefilter(action='ignore')


# Download spaCy's large English model; find_closest_tag loads it with spacy.load('en_core_web_lg').
!python -m spacy download en_core_web_lg # Using word vectors using a large model

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [2]:
# Gather all OpenML datasets for semantic tagging
all_datasets = openml.datasets.list_datasets(output_format="dataframe")

# Dataset ids ('did') are used as the identifier of every dataset.
data_id = all_datasets['did'].tolist()

# Dictionary holding {did: dataset_description}; a dataset whose metadata could
# not be fetched keeps the empty-string default.
all_data_description = dict.fromkeys(data_id, "")

# Ids whose metadata download failed, kept so failures are visible instead of silent.
failed_ids = []

for did in data_id:
  try:
    # Fetch by id rather than by name: a name can map to several versions, so a
    # lookup by name may return the description of a different dataset than `did`.
    data = openml.datasets.get_dataset(int(did), download_data = False, download_features_meta_data = False, download_qualities = False)
    all_data_description[did] = data.description
  except Exception:
    # Best effort: skip datasets that cannot be fetched, but let
    # KeyboardInterrupt / SystemExit stop the (long) loop.
    failed_ids.append(did)
    continue

print(len(all_data_description))
if failed_ids:
  print(f"Could not fetch metadata for {len(failed_ids)} datasets")

# # # Save data description as pickle file
# # with open('data/dataset_description.pickle', 'wb') as f:
# #     pickle.dump(all_data_description, f, protocol=pickle.HIGHEST_PROTOCOL)


In [4]:
# Alternatively, the descriptions of all OpenML datasets saved in a file
# (data/dataset_description.pickle) can be loaded here instead of re-downloading them.

# Load data description.
# NOTE: pickle.load can execute arbitrary code - only load a file you created or trust.
with open('data/dataset_description.pickle', 'rb') as infile:
  all_data_description = pickle.load(infile)

In [5]:
# Closed vocabulary of semantic tags; every dataset is assigned tags from this list only.
tags = [
    "Agriculture",
    "Astronomy",
    "Chemistry",
    "Computational Universe",
    "Computer Systems",
    "Culture",
    "Demographics",
    "Earth Science",
    "Economics",
    "Education",
    "Geography",
    "Government",
    "Health",
    "History",
    "Human Activities",
    "Images",
    "Language",
    "Life Science",
    "Machine Learning",
    "Manufacturing",
    "Mathematics",
    "Medicine",
    "Meteorology",
    "Physical Sciences",
    "Politics",
    "Social Media",
    "Sociology",
    "Statistics",
    "Text & Literature",
    "Transportation",
]

In [6]:
"""
Prompt GPT-3.5-turbo to pick semantic tags from the given list of tags, given a dataset description.
An exponential backoff strategy is used when prompting. Other wait strategies can be used to speed things up (more information - https://github.com/litl/backoff)
"""

# Set up the OpenAI client. The key is read from the OPENAI_API_KEY environment
# variable so it never has to be committed with the notebook; the blank
# placeholder is only a fallback and will be rejected by the API.
client = OpenAI(api_key = os.environ.get("OPENAI_API_KEY", " "))

MAX_RETRIES = 5

# Only transient failures are worth retrying. Authentication or bad-request
# errors (e.g. prompt too long) fail identically on every attempt, so they are
# raised to the caller immediately. Every entry must be an exception class:
# `openai.Timeout` is the httpx timeout *configuration* object, the matching
# exception is `openai.APITimeoutError`.
RETRYABLE_ERRORS = (
    openai.RateLimitError,
    openai.APIConnectionError,
    openai.APITimeoutError,
    openai.InternalServerError,
)

@backoff.on_exception(backoff.expo,
                      RETRYABLE_ERRORS,
                      max_tries = MAX_RETRIES,
                      # backoff handlers receive a details dict, not the exception itself
                      on_backoff=lambda details: print(f"Retrying in {details['wait']:.1f} seconds..."),
                      on_giveup=lambda details: print(f"Max retries reached. Unable to complete the request after {details['tries']} tries.")
                      )
def completion_using_message(messages, model = "gpt-3.5-turbo"):
  """
  Send a chat completion request and return the text of the first choice.

  messages: list of {"role": ..., "content": ...} dicts forming the conversation.
  model: name of the chat model to query.

  Returns the message content of the first choice (may be None or empty).
  Transient API errors are retried with exponential backoff up to MAX_RETRIES
  attempts; the last error is re-raised once the retries are exhausted.
  """
  completion = client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=0.5,
            max_tokens=500,
        )
  return completion.choices[0].message.content

In [7]:
# Helper functions


def clean(response):
  """
  Clean a GPT response and split it into candidate tags.

  Removes a leading "Tag"/"Tags" label, splits the answer on commas and
  newlines, and strips list punctuation (brackets, single/double quotes,
  colons, trailing periods) and whitespace from every item.

  response: raw text returned by the model (may be None or empty).

  Returns a list of non-empty candidate tag strings; [] when nothing is left.
  """
  if not response:
    return []

  text = response.replace("Tags", '').replace("Tag", '')

  # Characters that only decorate the answer, e.g. "Tags: ['Health', \"Medicine\"]."
  decoration = " \t\r\n:'\"[]."

  candidates = []
  for piece in text.replace("\n", ",").split(","):
    piece = piece.strip(decoration)
    if piece:
      candidates.append(piece)

  return candidates


# Lazily filled caches: loading the large spaCy model takes seconds, so it must
# happen once per session and not once per lookup.
_NLP_MODEL = None
_TAG_DOCS = {}  # tag text -> parsed spaCy doc


def _get_nlp():
  """Load the spaCy model on first use and reuse it afterwards."""
  global _NLP_MODEL
  if _NLP_MODEL is None:
    _NLP_MODEL = spacy.load('en_core_web_lg')
  return _NLP_MODEL


def find_closest_tag(word, tags):
  """
  Find the tag semantically closest to a GPT generated word.

  word: free-text word or phrase returned by the model.
  tags: list of allowed tag strings.

  Returns the tag with the highest similarity to `word`, or an empty list
  when no tag has a similarity greater than 0 (callers test for `[]`).
  """
  nlp = _get_nlp()
  word_doc = nlp(word)

  highest_similarity = 0
  semantic_tag = []
  for tag in tags:
    # Parse every tag only once; the tag vocabulary is reused on every call.
    if tag not in _TAG_DOCS:
      _TAG_DOCS[tag] = nlp(tag)
    similarity = word_doc.similarity(_TAG_DOCS[tag])
    if similarity > highest_similarity:
      highest_similarity = similarity
      semantic_tag = tag

  return semantic_tag


In [None]:
# Query GPT one by one for each dataset description to get the corresponding semantic tags.
# If the dataset description is '' or None, semantic_tag == ['No description'].

output = {} # dictionary with {'did':[semantic tags]}
null_output = [] #  response by GPT == NULL and dataset description != None
exceptions_dict = {} #  error produced by GPT, contains {'did': Exception}
no_description = [] #  if dataset description is None or ''

max_token = 4097 # 1 token ~= 4 chars in English, source: https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them

# To avoid token limit errors the dataset description is shortened to this many
# characters, determined experimentally.
MAX_DESCRIPTION_CHARS = 8000

# Each dataset can have a maximum of 2 semantic tags
MAX_TAGS_PER_DATASET = 2


def _build_messages(description, tags, single_tag = False):
  """Build the chat prompt; `single_tag` selects the stricter fallback prompt."""
  if single_tag:
    system_prompt = "You are an expert in assigning one semantic tag from given list of tags to each given dataset description. You will reply by only picking one tag from the given list of tags. Only pick tag from the given list of tags. Answer as concisely as possible."
    user_prompt = f"Dataset Description: {description}\nTags: {tags}\n Assign one Tag to the dataset description: "
  else:
    system_prompt = "You are an expert in assigning one or two semantic tags from given list of tags to each given dataset description. You will reply by only picking tags from the given list of tags. Answer as concisely as possible."
    user_prompt = f"Dataset Description: {description}\nTags: {tags}\nTags: "
  return [
      {"role": "system", "content": system_prompt},
      {"role": "user", "content": user_prompt},
  ]


def _resolve_tags(response, tags):
  """Map a raw GPT response onto at most MAX_TAGS_PER_DATASET distinct tags from `tags`."""
  resolved = []
  for candidate in clean(response):
    candidate = candidate.strip()
    if not candidate:
      continue
    if candidate not in tags:
      # Not an exact match: fall back to the semantically closest tag
      # (find_closest_tag returns [] when nothing is similar).
      candidate = find_closest_tag(candidate, tags)
    if candidate and candidate not in resolved:
      resolved.append(candidate)
  return resolved[:MAX_TAGS_PER_DATASET]


with open('output_files/temp.txt', 'w') as writefile:
  for key, value in all_data_description.items():

    semantic_tag = []

    if not value:
      semantic_tag = ["No description"]
      output.update({key:['No description']})
      no_description.append(key)
      continue

    try:
      description = value[:MAX_DESCRIPTION_CHARS]

      # GPT Prompt attempt 1
      response = completion_using_message(_build_messages(description, tags))

      # GPT Prompt attempt 2: the model answers with a string (or None), so an
      # empty answer is detected by truthiness, not by comparing against [].
      if not (response and response.strip()):
        response = completion_using_message(_build_messages(description, tags, single_tag = True))

      if response and response.strip():
        semantic_tag = _resolve_tags(response, tags)

      if semantic_tag == []:
        null_output.append(key)

      print(f"Tag for dataset: {key} is {semantic_tag}")
      output.update({key:semantic_tag})

    except Exception as e:
      # Record the failure and keep going with the next dataset.
      exceptions_dict.update({key:e})
      print(f"exception for dataset: {key} is {e}")


  # Save the output as a json object. JSON object keys are strings anyway;
  # converting explicitly keeps json.dumps from failing on non-int id types.
  json_object = json.dumps({str(did): did_tags for did, did_tags in output.items()}, indent=4)

  writefile.write('{}\n'.format(json_object))

