# Vantager Technical Assessment



In [1]:
import csv
import re

import pandas as pd

from openai import OpenAI

# Read the OpenAI API key from the environment rather than embedding it in the
# notebook, so the secret never ends up in version control or shared exports.
import os

# `None` when the variable is unset; the OpenAI client then reports the
# missing credential itself.
api_key = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was too old on your system - pyarrow 10.0.1 is the current minimum supported version as of this release.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


### Data loading

In [4]:
# Load the raw haystack text that will be searched for needles.
file_path = 'input-datasets/haystack.txt'

# Decode explicitly as UTF-8 so the text read does not depend on the
# platform's default locale encoding.
# NOTE(review): assumes the dataset is UTF-8 encoded - confirm.
with open(file_path, 'r', encoding='utf-8') as file:
    haystack = file.read()

### Provided Pydantic schemas

In [5]:
from typing import Optional
from pydantic import BaseModel, Field

# Schema describing one technology company. Every field is optional and
# defaults to None, so a record with missing details can still be built.
class TechCompany(BaseModel):
    # Company name.
    name: Optional[str] = Field(
        default=None,
        description="The full name of the technology company",
    )
    # Headquarters location.
    location: Optional[str] = Field(
        default=None,
        description="City and country where the company is headquartered",
    )
    # Headcount.
    employee_count: Optional[int] = Field(
        default=None,
        description="Total number of employees",
    )
    # Year of establishment.
    founding_year: Optional[int] = Field(
        default=None,
        description="Year the company was established",
    )
    # True for publicly traded, False for privately held.
    is_public: Optional[bool] = Field(
        default=None,
        description="Whether the company is publicly traded (True) or privately held (False)",
    )
    # Valuation, expressed in billions of dollars.
    valuation: Optional[float] = Field(
        default=None,
        description="Company's valuation in billions of dollars",
    )
    # Main technology or industry focus.
    primary_focus: Optional[str] = Field(
        default=None,
        description="Main area of technology or industry the company focuses on",
    )

### Task

We accomplished the task in three key steps:

- Keyword Matching: We filtered the raw data to create a subset of relevant documents, avoiding the need to process the entire dataset.

- Transformation: The filtered subset contains unstructured data, which we transformed into structured data using NLP tooling. In this phase, we employed an LLM, namely OpenAI's gpt-3.5-turbo, for the transformation.

- Parsing: We used regular expressions to extract the specific company information from the structured data.

Note that the provided question does not clearly define the purpose of the `example_needles` argument. It could be used in two ways: first, to construct the sets of keywords for the keyword-matching step; alternatively, as part of the system or user prompt to give the LLM relevant context. In the current implementation, we pass `None` for `example_needles`.

In [8]:
from typing import List, Type, TypeVar
from pydantic import BaseModel

# Type variable bounded by pydantic's BaseModel; extract_multi_needle uses it
# to tie the schema class it receives to the element type of the list it returns.
T = TypeVar('T', bound=BaseModel)

# Substrings (matched case-insensitively) that a paragraph must ALL contain to
# be treated as a candidate needle.
_NEEDLE_KEYWORDS = ('$', 'billion')

# Spellings of the "missing value" marker as captured by the regex groups:
# the quoted string fields capture the bare word, while the unquoted
# numeric/boolean fields capture it together with its surrounding quotes.
_UNKNOWN_MARKERS = frozenset({"unknown", "'unknown'"})

# Matches one company dictionary in the LLM reply. It expects single-quoted
# keys in exactly this order, i.e. a Python-style dict literal.
_COMPANY_PATTERN = re.compile(
    r"\{\s*"
    r"'name':\s*'(?P<name>[^']+|\'unknown\')',\s*"
    r"'location':\s*'(?P<location>[^']+|\'unknown\')',\s*"
    r"'employee_count':\s*(?P<employee_count>\d+|\'unknown\'),\s*"
    r"'founding_year':\s*(?P<founding_year>\d+|\'unknown\'),\s*"
    r"'is_public':\s*(?P<is_public>True|False|\'unknown\'),\s*"
    r"'valuation':\s*(?P<valuation>[\d.]+|\'unknown\'),\s*"
    r"'primary_focus':\s*'(?P<primary_focus>[^']+|\'unknown\')'\s*"
    r"\}"
)


def _filter_paragraphs(haystack: str, keywords) -> List[str]:
    """Return the paragraphs containing every keyword, each flattened to one line."""
    lowered_keywords = [keyword.lower() for keyword in keywords]
    selected = []
    # Paragraphs are separated by blank lines.
    for paragraph in haystack.split('\n\n'):
        lowered_paragraph = paragraph.lower()
        if all(keyword in lowered_paragraph for keyword in lowered_keywords):
            selected.append(paragraph.strip().replace('\n', ' '))
    return selected


def _known(value: str) -> Optional[str]:
    """Return *value*, or None when it is the 'unknown' placeholder."""
    return None if value in _UNKNOWN_MARKERS else value


def _match_to_fields(match) -> dict:
    """Convert one regex match into a dict of typed field values (None if unknown)."""
    employee_count = _known(match.group('employee_count'))
    founding_year = _known(match.group('founding_year'))
    is_public = _known(match.group('is_public'))
    valuation = _known(match.group('valuation'))
    return {
        "name": _known(match.group('name')),
        "location": _known(match.group('location')),
        "employee_count": int(employee_count) if employee_count is not None else None,
        "founding_year": int(founding_year) if founding_year is not None else None,
        "is_public": (is_public == 'True') if is_public is not None else None,
        "valuation": float(valuation) if valuation is not None else None,
        "primary_focus": _known(match.group('primary_focus')),
    }


def extract_multi_needle(schema: Type[T], haystack: str, example_needles: Optional[List[str]] = None) -> List[T]:
    """Extract structured company records ("needles") from free text.

    The work is done in three steps: keyword-filter the paragraphs of
    ``haystack``, ask the LLM (through the module-level ``client``) to turn
    the surviving paragraphs into a list of dictionaries, then parse that
    reply with a regular expression and build one ``schema`` instance per
    match.

    Args:
        schema: Class instantiated for every parsed company. It is called
            with the keyword arguments ``name``, ``location``,
            ``employee_count``, ``founding_year``, ``is_public``,
            ``valuation`` and ``primary_focus``.
        haystack: Raw text whose paragraphs are separated by blank lines.
        example_needles: Currently unused; kept for interface compatibility.

    Returns:
        The list of ``schema`` instances, in the order the companies appear
        in the LLM reply. Fields reported as 'unknown' are set to ``None``.
        An empty list is returned, without calling the LLM, when no
        paragraph passes the keyword filter.
    """
    if not haystack:
        return []

    # Keyword matching: keep only paragraphs likely to describe a company,
    # so the LLM does not have to process the entire haystack.
    filtered_content = _filter_paragraphs(haystack, _NEEDLE_KEYWORDS)
    if not filtered_content:
        return []

    # Use the LLM to transform the unstructured data into structured data
    system_prompt = (
    """
    You are an information extraction assistant. Your task is to read through a list of sentences describing technology companies and extract the following details from each sentence:
    
    name: The name of the company.
    location: The full location of the company, including the city and planet (or any specified place).
    employee_count: The total number of employees working for the company.
    founding_year: The year the company was founded.
    is_public: A boolean value indicating whether the company is public or private (True for public, False for private).
    valuation: The company's valuation in billions of dollars.
    primary_focus: A brief description of the company’s primary area of focus or specialization.
    
    For each company, ensure that the data is structured clearly, even if some information is missing. Use 'unknown' for any fields where information is not provided. Return the extracted information as a list of dictionaries, where each dictionary corresponds to a company and contains the extracted details.
    """
    )
    user_input = "\n".join(filtered_content)

    # Create the completion request
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_input}
        ],
        temperature=0
    )
    # The message content may be None; treat that as an empty reply.
    llm_transform = completion.choices[0].message.content or ""

    # Parse every company dictionary found in the reply and build one schema
    # instance per match.
    return [
        schema(**_match_to_fields(match))
        for match in _COMPANY_PATTERN.finditer(llm_transform)
    ]

In [9]:
# Run the extraction over the loaded haystack; no example needles are supplied.
companies = extract_multi_needle(
    schema=TechCompany,
    haystack=haystack,
    example_needles=None,
)

### Prepare the dataframe 

In [10]:
# One row per extracted company, with every column cast to an explicit dtype.
# 'Int64' and 'boolean' are pandas' nullable dtypes, so missing values stay
# missing instead of forcing the column to float/object.
df = pd.DataFrame(
    [company.dict() for company in companies]
).astype(
    {
        'name': 'string',
        'location': 'string',
        'employee_count': 'Int64',
        'founding_year': 'Int64',
        'is_public': 'boolean',
        'valuation': 'float64',
        'primary_focus': 'string',
    }
)

We identified 22 needles within the provided haystack and organized the results into a data frame. While we permitted certain fields to be left empty, one key assumption is that valuation information exists for all needles.

In [12]:
# Bare expression: in a notebook cell this renders the DataFrame as the cell output.
df

Unnamed: 0,name,location,employee_count,founding_year,is_public,valuation,primary_focus
0,TetraSol,"Helios, Titan",4100.0,2080.0,True,8.3,solar energy solutions
1,ChronosTech,"New Shanghai, Earth",2800.0,2077.0,True,6.2,time-manipulation devices
2,Cyberion Systems,"Olympus Mons, Mars",6700.0,2050.0,True,12.9,quantum networking
3,Quantum Forge,"Orion City, Earth",12500.0,2030.0,True,15.4,quantum computing advancements
4,AstraCom,"Hyperion City, Jupiter",7800.0,2075.0,True,5.6,deep-space communication systems
5,Vortex AI,"Neo London, Earth",1100.0,2038.0,False,5.4,predictive algorithms for financial markets
6,Ryoshi,"Neo Tokyo, Japan",1200.0,2031.0,False,8.7,quantum cryptography
7,Galactica Energy,"Ceres Station, Asteroid Belt",3600.0,2062.0,False,9.2,fusion power generation
8,NeuraNet,"Atlantis City, Pacific Ocean",950.0,2022.0,False,2.6,neural interface technologies
9,Photonix Labs,"Solaris Base, Mercury",2100.0,2060.0,False,7.2,advanced photon-based computing systems


### Exporting the company list

In [13]:
# Write the company table to disk; the DataFrame index is left out of the CSV.
csv_file_path = "companies.csv"
df.to_csv(path_or_buf=csv_file_path, index=False)