In [None]:
import json
import os
from typing import Dict, List

import orjson
from dotenv import load_dotenv
from langchain.llms import OpenAI
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
from pydantic import BaseModel, Field

In [None]:
# Load variables from a local .env file into the process environment, then
# read the OpenAI key from it (None when the variable is not set).
load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [None]:
# Completion model used for the extraction; the key was read from the
# environment in the previous cell.
llm = OpenAI(model_name="text-davinci-003", openai_api_key=OPENAI_API_KEY)
# Display only the model name: echoing the whole object may print its fields,
# the API key included, into the saved notebook output.
llm.model_name

In [None]:
# Raw, free-text location strings to be normalized into city/country pairs.
# The entries are inconsistent in format: bare city names, street addresses,
# several sites in one string, remote/hybrid notes, and even a full sentence.
locations = [
    "Neuchâtel",
    "Josefstrasse 219, Zurich",
    "Zurich",
    "EDF R&D Renardières",
    "Lausanne, EPFL Innovation Park",
    "Fribourg",
    "Renens",
    "Antony, France",
    "Genève",
    "Le Bourget du Lac (France,73)",
    "Swisscom Digital Lab, EPFL Innovation Park, Bat F, 1015 Lausanne",
    "Zürich, Nyon or Homeoffice",
    "Oracle Labs Zurich, Switzerland (other locations or work from home available upon agreement)",
    "Geneve",
    "remote in Switzerland (we also have an office in Zürich)",
    "Zürich",
    "Chilly-Mazarin",
    "Sophia Antipolis - France",
    "CERN/Geneva",
    ":NANTERRE",
    "Versoix",
    "Palaiseau, FR",
    "Hybrid",
    "GRENOBLE (France)",
    "Sachseln, OW",
    "Europe",
    "Nyon",
    "RENENS VD",
    "Vers-chez-les-blanc, VD",
    "Neuchâtel or Bern",
    "Cambridge, MA, USA",
    "Princeton, NJ, USA",
    "Milano",
    "Lausanne/Geneva",
    "E-Scopics, Aix-en-Provence, France",
    "Martigny",
    "The student will have the opportunity to work in a stimulating environment with other students in different locations (Lausanne, Bern, Zurich) in Switzerland depending on the needs of the projects.",
    "Hinwil, Switzerland",
    "Lyon, 69009 France",
    "Payerne",
    "Lausanne / Geneva",
    "CROLLES",
    "France, PACA, 13 - Istres",
    "Aix en Provence, France",
    "Lausanne-Prilly",
    "LIRIS Lab and INRIA  (Lyon)"
]

In [None]:
# Schema for a single extracted place: one city and one country, both strings.
# NOTE: documented with a comment on purpose -- a class docstring would be
# picked up by pydantic as the schema description and so would alter the
# format instructions generated from this model.
class Location(BaseModel):
    city: str = Field(description="city of a location")
    country: str = Field(description="country of a location")

# Top-level schema given to the output parser: maps each original location
# string to the list of Location entries extracted from it (one input string
# may mention several places).
# The annotation uses typing.Dict/typing.List consistently; the builtin
# `dict[...]` form is not subscriptable on Python 3.8.
# NOTE: documented with a comment on purpose -- a class docstring would be
# picked up by pydantic as the schema description and so would alter the
# format instructions generated from this model.
class LocationDict(BaseModel):
    locations: Dict[str, List[Location]] = Field(description="dictionary of lists of locations with the original text as key")

# Parser built around the LocationDict schema; it also supplies the format
# instructions printed at the end of this cell.
parser = PydanticOutputParser(pydantic_object=LocationDict)

# Task description that is inserted into the prompt as the `query` variable.
location_query = """I have a list of locations but it is really badly designed.
I want you to extract the city and the country from a location in a json format.
I don't want a zipcode. Only city and country. Cities and countries should only be strings.
Countries should not be acronyms: for example "USA" should be change to "United States".
Infer the country if needed.
"""

print(parser.get_format_instructions())

In [None]:
# The format instructions and the task description are fixed up front as
# partial variables; only the locations are supplied at format time.
prompt = PromptTemplate(
    template=(
        "Answer the user query.\n"
        "{format_instructions}\n"
        "{query}\n"
        "Format the following locations:\n"
        "{locations}\n"
    ),
    input_variables=["locations"],
    partial_variables={
        "format_instructions": parser.get_format_instructions(),
        "query": location_query,
    },
)

# Render the final prompt with the location list and print it for inspection.
_input = prompt.format_prompt(locations=locations)
print(_input.to_string())

In [None]:
# Send the rendered prompt to the model and print the raw completion text.
prompt_text = _input.to_string()
output = llm(prompt_text, max_tokens=2000)
print(output)

In [None]:
# Parse the raw completion with the LocationDict-based parser and display
# the resulting object.
data = parser.parse(output)
data

In [None]:
# Round-trip the parsed model through JSON to view it as plain dicts and lists.
locations_json = data.json()
orjson.loads(locations_json)

In [None]:
# Normalize the spelling of Zürich to "Zurich" across every parsed entry.
# The Location objects are updated in place.
for parsed_location in (
    entry for group in data.locations.values() for entry in group
):
    if parsed_location.city == "Zürich":
        parsed_location.city = "Zurich"

In [None]:
# Persist the parsed locations as pretty-printed JSON.
# ensure_ascii=False writes accented characters (e.g. "Neuchâtel") verbatim,
# so the file is opened explicitly as UTF-8; otherwise the platform's default
# encoding decides the bytes on disk and may fail to encode them.
with open('test.json', 'w', encoding='utf-8') as f:
    json.dump(orjson.loads(data.json()), f, indent=2, ensure_ascii=False)
