In [4]:
import pandas as pd

# Load data
data = pd.read_csv('house_pricing.csv')

# Extract feature names
feature_names = data.columns.tolist()
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
import openai
import dotenv

# Load environment variables
dotenv.load_dotenv()

# Set up API key
openai.api_key = dotenv.get_key('.env', 'OPENAI_API_KEY')

# Function to parse the query using LLM
def parse_query(query):
    messages = [
        {"role": "system", "content": "You are a helpful assistant who extracts and formats feature values from queries about house prices."},
        {"role": "user", "content": f"Extract the values for the following features from the query: {', '.join(feature_names)}. Query: '{query}'"}
    ]
    
    response = openai.ChatCompletion.create(
        model="gpt-4o-mini",  # Use the specific model you are working with
        messages=messages,
        max_tokens=150
    )
    
    return response.choices[0].message['content'].strip()


In [7]:
# Load description file
with open('data_description.txt', 'r') as file:
    description = file.read()

# Incorporate description in the context if needed
def get_description_context():
    return description


In [10]:
def handle_query(query):
    # Extract feature values using LLM
    extracted_values = parse_query(query)
    
    # Perform analysis or insights extraction
    # Here you might want to map extracted values to your data
    # and perform some analysis to provide insights about SalesPrice
    
    # Example: Print extracted values
    print(f"Extracted Values: {extracted_values}")

# Example usage
user_query = "What are the expected SalesPrice for houses in the Neighborhood of Sawyer"
handle_query(user_query)

Extracted Values: Based on the provided query, here are the extracted feature values:

- **Id**: Not provided
- **MSSubClass**: Not provided
- **MSZoning**: Not provided
- **LotFrontage**: Not provided
- **LotArea**: Not provided
- **Street**: Not provided
- **Alley**: Not provided
- **LotShape**: Not provided
- **LandContour**: Not provided
- **Utilities**: Not provided
- **LotConfig**: Not provided
- **LandSlope**: Not provided
- **Neighborhood**: Sawyer
- **Condition1**: Not provided
- **Condition2**: Not provided
- **Bld


In [18]:
# Function to parse the query using LLM
def parse_query(query):
    messages = [
        {"role": "system", "content": "You are a helpful assistant who extracts and formats feature values from queries about house prices."},
        {"role": "user", "content": f"Extract the values for the following features from the query: {', '.join(feature_names)}. Query: '{query}'"}
    ]
    
    response = openai.ChatCompletion.create(
        model="gpt-4o-mini",  # Use the specific model you are working with
        messages=messages,
        max_tokens=150
    )
    
    result = response.choices[0].message['content'].strip()
    print(f"Extracted Values: {result}")  # Debugging statement
    return result

def clean_extracted_values(extracted_values):
    features = {}
    for line in extracted_values.split('\n'):
        if ':' in line:
            key, value = line.split(':', 1)
            key = key.strip().lstrip('- ')  # Remove any leading dashes or spaces
            features[key] = value.strip()
    
    print(f"Cleaned Features for Filtering: {features}")  # Debugging statement
    return features

def filter_data(features):
    filtered_data = data.copy()
    for feature, value in features.items():
        if feature in filtered_data.columns:
            filtered_data = filtered_data[filtered_data[feature] == value]
        else:
            print(f"Feature '{feature}' not found in dataset columns.")
    
    print(f"Filtered Data Shape: {filtered_data.shape}")  # Debugging statement
    return filtered_data

def estimate_sales_price(filtered_data):
    if 'SalePrice' not in filtered_data.columns:
        return "The 'SalePrice' column is missing in the filtered data."
    
    if not filtered_data.empty:
        average_price = filtered_data['SalePrice'].mean()
        return average_price
    else:
        return "No data available for the given features."

def handle_query(query):
    # Extract feature values using LLM
    extracted_values = parse_query(query)
    
    # Clean and parse feature values
    features = clean_extracted_values(extracted_values)
    
    # Filter the data based on the cleaned feature values
    filtered_data = filter_data(features)
    
    # Estimate the SalesPrice
    estimated_price = estimate_sales_price(filtered_data)
    
    # Print the result
    print(f"Estimated SalesPrice: {estimated_price}")

# Example usage
user_query = "What are the expected SalesPrice for houses with an OverallQual of 9?"
handle_query(user_query)

Extracted Values: Here are the extracted values based on the query provided:

- Id: Not specified
- MSSubClass: Not specified
- MSZoning: Not specified
- LotFrontage: Not specified
- LotArea: Not specified
- Street: Not specified
- Alley: Not specified
- LotShape: Not specified
- LandContour: Not specified
- Utilities: Not specified
- LotConfig: Not specified
- LandSlope: Not specified
- Neighborhood: Not specified
- Condition1: Not specified
- Condition2: Not specified
- BldgType: Not specified
- HouseStyle: Not specified
- OverallQual: 9
- OverallCond: Not specified
- YearBuilt: Not specified
Cleaned Features for Filtering: {'Here are the extracted values based on the query provided': '', 'Id': 'Not specified', 'MSSubClass': 'Not specified', 'MSZoning': 'Not specified', 'LotFrontage': 'Not specified', 'LotArea': 'Not specified', 'Street': 'Not specified', 'Alley': 'Not specified', 'LotShape': 'Not specified', 'LandContour': 'Not specified', 'Utilities': 'Not specified', 'LotConfig': 