# Vantager online assessment

### Step 1: Keyword Matching

Find the paragraphs in the DUNE raw text that contain all of the chosen keywords. Candidate keywords include company, billion, employee, etc.; the filter below uses `$` and `billion`.

In [1]:
from typing import List, Type, TypeVar
from pydantic import BaseModel

from typing import Optional
from pydantic import BaseModel, Field

import re
import pandas as pd
import json
import csv

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was too old on your system - pyarrow 10.0.1 is the current minimum supported version as of this release.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
file_path = 'input-datasets/haystack.txt'

# Use a context manager so the file handle is closed as soon as the text is read.
# The encoding is pinned explicitly: without it, open() falls back to the
# platform's locale encoding, so the same file can decode differently (or
# produce mojibake) from one machine to the next.
with open(file_path, 'r', encoding='utf-8') as file:
    contents = file.read()

In [3]:
def filter_paragraphs_by_keywords(content, keywords: List[str]) -> List[str]:
    """Return the paragraphs of ``content`` that contain every keyword.

    Matching is a case-insensitive substring test, and a paragraph is kept
    only when *all* keywords occur in it.

    Args:
        content: Raw text whose paragraphs are separated by a blank line.
        keywords: Substrings that must all be present in a paragraph.

    Returns:
        The matching paragraphs, in document order, with leading and trailing
        whitespace stripped. If ``keywords`` is empty every paragraph matches.
    """
    # Lower-case the keywords once instead of once per paragraph.
    lowered_keywords = [keyword.lower() for keyword in keywords]

    # A paragraph break is a blank line. Accept Windows line endings and
    # "blank" lines that only hold spaces or tabs, not just a literal '\n\n'.
    paragraphs = re.split(r'\r?\n[ \t]*\r?\n', content)

    filtered_paragraphs = []
    for paragraph in paragraphs:
        lowered_paragraph = paragraph.lower()
        if all(keyword in lowered_paragraph for keyword in lowered_keywords):
            filtered_paragraphs.append(paragraph.strip())

    return filtered_paragraphs

In [4]:
# Broader candidate list kept for reference:
# ['company', 'firm', 'employ', 'worker', 'billion', '$']
keywords = ['$', 'billion']

# Keep only the paragraphs containing every keyword, then flatten each one
# onto a single line by turning its internal newlines into spaces.
filtered_content = [
    paragraph.replace('\n', ' ')
    for paragraph in filter_paragraphs_by_keywords(contents, keywords)
]


In [5]:
# Number of paragraphs that contained every keyword.
len(filtered_content)

22

In [6]:
# Display the matched paragraphs (each already flattened onto one line).
filtered_content

['TetraSol, headquartered in Helios, Titan, has been a public company since 2080, specializing in solar energy solutions, employing 4,100 workers and valued at $8.3 billion.',
 'ChronosTech, located in New Shanghai, Earth, was founded in 2077, employs 2,800 people, and focuses on time-manipulation devices, with a public status and a valuation of $6.2 billion.',
 'Cyberion Systems, a public quantum networking company headquartered in Olympus Mons, Mars, was founded in 2050 and employs 6,700 people with a current valuation of $12.9 billion.',
 'Quantum Forge, a public company located in Orion City, Earth, was founded in 2030 and currently employs 12,500 people, with a valuation of $15.4 billion focused on quantum computing advancements.',
 'AstraCom, based in Hyperion City, Jupiter, is a public telecommunications company founded in 2075, with 7,800 employees and a valuation of $5.6 billion, focusing on deep-space communication systems.',
 'Vortex AI, based in Neo London, Earth, is a priv

### Step 2: Convert the needles into structured data

At this stage, we've identified all the candidate needles in the haystack in their unstructured form. Next, we'll use an NLP tool (an LLM) to transform this unstructured text into a structured format that is easier to parse, while also filtering out any irrelevant text that is not actually a needle.

In [12]:
import os

from openai import OpenAI

# Never hard-code the key in the notebook: read it from the environment so it
# cannot leak through version control or a shared copy of this file.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the OPENAI_API_KEY environment variable before running this cell."
    )
client = OpenAI(api_key=api_key)


In [10]:
# Instructions sent as the system message: which fields to extract from each
# sentence, the 'unknown' placeholder for missing values, and the output shape
# (a list of dictionaries, one per company).
system_prompt = (
    """
    You are an information extraction assistant. Your task is to read through a list of sentences describing technology companies and extract the following details from each sentence:
    
    name: The name of the company.
    location: The full location of the company, including the city and planet (or any specified place).
    employee_count: The total number of employees working for the company.
    founding_year: The year the company was founded.
    is_public: A boolean value indicating whether the company is public or private (True for public, False for private).
    valuation: The company's valuation in billions of dollars.
    primary_focus: A brief description of the company's primary area of focus or specialization.
    
    For each company, ensure that the data is structured clearly, even if some information is missing. Use 'unknown' for any fields where information is not provided. Return the extracted information as a list of dictionaries, where each dictionary corresponds to a company and contains the extracted details.
    """
)

In [11]:
# One candidate sentence per line forms the user message.
user_input = "\n".join(filtered_content)

# Ask the chat model to extract the structured fields; temperature 0 is used
# for the request, as in the system/user exchange defined above.
completion = client.chat.completions.create(
    temperature=0,
    model="gpt-3.5-turbo",
    messages=[
        dict(role="system", content=system_prompt),
        dict(role="user", content=user_input),
    ],
)


In [16]:
# Text of the first (and only requested) choice in the model's reply; this is
# the string that the parsing step below consumes.
llm_transform = completion.choices[0].message.content
print(llm_transform)

```python
[
    {
        'name': 'TetraSol',
        'location': 'Helios, Titan',
        'employee_count': 4100,
        'founding_year': 2080,
        'is_public': True,
        'valuation': 8.3,
        'primary_focus': 'solar energy solutions'
    },
    {
        'name': 'ChronosTech',
        'location': 'New Shanghai, Earth',
        'employee_count': 2800,
        'founding_year': 2077,
        'is_public': True,
        'valuation': 6.2,
        'primary_focus': 'time-manipulation devices'
    },
    {
        'name': 'Cyberion Systems',
        'location': 'Olympus Mons, Mars',
        'employee_count': 6700,
        'founding_year': 2050,
        'is_public': True,
        'valuation': 12.9,
        'primary_focus': 'quantum networking'
    },
    {
        'name': 'Quantum Forge',
        'location': 'Orion City, Earth',
        'employee_count': 12500,
        'founding_year': 2030,
        'is_public': True,
        'valuation': 15.4,
        'primary_focus': 'quantum co

### Step 3: Parsing

With the structured data in place, we can easily extract the required fields and convert them into the appropriate data types.

In [13]:
class TechCompany(BaseModel):
    """Structured record for one company extracted from the haystack.

    Every field is optional and defaults to None, so a record can still be
    built when the source text does not provide a value.
    """

    name: Optional[str] = Field(default=None, description="The full name of the technology company")
    # Locations in this dataset are "city, planet" (or another named place), matching the extraction prompt.
    location: Optional[str] = Field(default=None, description="City and planet (or other specified place) where the company is headquartered")
    employee_count: Optional[int] = Field(default=None, description="Total number of employees")
    founding_year: Optional[int] = Field(default=None, description="Year the company was established")
    is_public: Optional[bool] = Field(default=None, description="Whether the company is publicly traded (True) or privately held (False)")
    valuation: Optional[float] = Field(default=None, description="Company's valuation in billions of dollars")
    primary_focus: Optional[str] = Field(default=None, description="Main area of technology or industry the company focuses on")

In [14]:
T = TypeVar('T', bound=BaseModel)

def extract_multi_needle(schema: Type[T], haystack: str, example_needles: List[str]) -> List[T]:
    """Parse company records out of ``haystack`` and build ``schema`` instances.

    ``haystack`` is expected to contain Python-style dict literals with the
    keys ``name``, ``location``, ``employee_count``, ``founding_year``,
    ``is_public``, ``valuation`` and ``primary_focus`` in exactly that order,
    using single-quoted strings. Text that does not match this layout is
    skipped silently.

    Args:
        schema: Model class instantiated once per matched record, with the
            seven fields above passed as keyword arguments.
        haystack: Text to scan. ``None`` or an empty string yields ``[]``.
        example_needles: Currently unused; kept so existing callers still work.

    Returns:
        One ``schema`` instance per matched record, in order of appearance.
        A field whose value is the placeholder ``'unknown'`` is passed as None.
    """
    # Nothing to scan (e.g. the model returned no content).
    if not haystack:
        return []

    # Text fields are captured *without* their surrounding quotes; numeric and
    # boolean fields are bare tokens, or the quoted placeholder 'unknown'.
    company_pattern = re.compile(
        r"\{\s*"
        r"'name':\s*'(?P<name>[^']+)',\s*"
        r"'location':\s*'(?P<location>[^']+)',\s*"
        r"'employee_count':\s*(?P<employee_count>\d+|'unknown'),\s*"
        r"'founding_year':\s*(?P<founding_year>\d+|'unknown'),\s*"
        r"'is_public':\s*(?P<is_public>True|False|'unknown'),\s*"
        # Accept 5, 5., 5.3 and .5 but not strings such as "1.2.3" that float() rejects.
        r"'valuation':\s*(?P<valuation>(?:\d+\.?\d*|\.\d+)|'unknown'),\s*"
        # Tolerate an optional trailing comma after the last field.
        r"'primary_focus':\s*'(?P<primary_focus>[^']+)'\s*,?\s*"
        r"\}"
    )

    # How each captured string is converted before being handed to ``schema``.
    converters = {
        'name': str,
        'location': str,
        'employee_count': int,
        'founding_year': int,
        'is_public': lambda raw: raw == 'True',
        'valuation': float,
        'primary_focus': str,
    }

    def _parse_field(raw, convert):
        """Return None for the 'unknown' placeholder, else the converted value."""
        # Text groups hold ``unknown`` while numeric/boolean groups hold
        # ``'unknown'`` (with quotes), so drop the quotes before comparing.
        if raw.strip("'").strip().lower() == 'unknown':
            return None
        return convert(raw)

    extracted_needles = []
    for match in company_pattern.finditer(haystack):
        data = {
            field: _parse_field(match.group(field), convert)
            for field, convert in converters.items()
        }
        # Create an instance of the schema with the extracted data
        extracted_needles.append(schema(**data))

    return extracted_needles

In [17]:
# Turn the model's textual reply into TechCompany records. The third argument
# (example_needles) is not read by extract_multi_needle, so None is passed.
companies = extract_multi_needle(TechCompany, llm_transform, None)

In [18]:
# Target dtype for each column; the nullable 'Int64' / 'boolean' / 'string'
# dtypes keep missing values as <NA> instead of coercing the column.
company_dtypes = {
    'name': 'string',
    'location': 'string',
    'employee_count': 'Int64',
    'founding_year': 'Int64',
    'is_public': 'boolean',
    'valuation': 'float64',
    'primary_focus': 'string',
}

# Passing columns= explicitly means an empty `companies` list still produces a
# frame with the expected columns, so astype() does not fail with a KeyError.
df = pd.DataFrame(
    [company.dict() for company in companies],
    columns=list(company_dtypes),
).astype(company_dtypes)

In [19]:
# Display the typed table of extracted companies.
df

Unnamed: 0,name,location,employee_count,founding_year,is_public,valuation,primary_focus
0,TetraSol,"Helios, Titan",4100.0,2080.0,True,8.3,solar energy solutions
1,ChronosTech,"New Shanghai, Earth",2800.0,2077.0,True,6.2,time-manipulation devices
2,Cyberion Systems,"Olympus Mons, Mars",6700.0,2050.0,True,12.9,quantum networking
3,Quantum Forge,"Orion City, Earth",12500.0,2030.0,True,15.4,quantum computing advancements
4,AstraCom,"Hyperion City, Jupiter",7800.0,2075.0,True,5.6,deep-space communication systems
5,Vortex AI,"Neo London, Earth",1100.0,2038.0,False,5.4,predictive algorithms for financial markets
6,Ryoshi,"Neo Tokyo, Japan",1200.0,2031.0,False,8.7,quantum cryptography
7,Galactica Energy,"Ceres Station, Asteroid Belt",3600.0,2062.0,False,9.2,fusion power generation
8,NeuraNet,"Atlantis City, Pacific Ocean",950.0,2022.0,False,2.6,neural interface technologies
9,Photonix Labs,"Solaris Base, Mercury",2100.0,2060.0,False,7.2,advanced photon-based computing systems
