In [6]:
# These are all needed imports for this part of the project to run
import pandas as pd
import json as js
from openai import OpenAI

In [3]:
# This cell imports the scraped data from the main part & reduces it to the most important columns
df = pd.read_csv('data/immoscout24_zh_all_20240402.csv', sep=';')
df_reduced = df[['Title', 'Description', 'Address', 'Rooms', 'SquareMeters', 'CoreAttributes', 'Features']]

In [4]:
# Within this cell you can test the chat completions with the GPT-3.5 model
client = OpenAI()

testRow = 15

completion = client.chat.completions.create(
  model="gpt-3.5-turbo-0125",
  response_format={ "type": "json_object" },
  messages=[
    {"role": "system", "content": "You are a pricing assistant for swiss appartment rentals and are always to output JSON. You are given the following tasks: Estimate and only output the price per month based on the details given. Always output the price as only a number readable as an int which should always be in the currency CHF. Key to be used = price_per_month"},
    {"role": "user", "content": "Please predict the price based on these details:" + df_reduced.iloc[testRow].to_json()}
  ]
)

print(completion.choices[0].message.content)

{
    "price_per_month": 2800
}


In [59]:
# Parse JSON data
data = js.loads(completion.choices[0].message.content)

# Accessing the value
price_per_month = data["price_per_month"]
print("Price Per Month:", price_per_month)

Price Per Month: 3500


In [5]:
df.rename(columns={'Price': 'ActualPrice'}, inplace=True)
df['PredictedPrice'] = None

client = OpenAI()

for i in range(len(df_reduced)):
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo-0125",
        response_format={"type": "json_object"},
        messages=[
            {"role": "system", "content": "You are a pricing assistant for swiss appartment rentals and are always to output JSON. You are given the following tasks: Estimate and only output the price per month based on the details given. Always output the price as only a number readable as an int which should always be in the currency CHF. Key to be used = price_per_month"},
            {"role": "user", "content": "Please predict the price based on these details:" + df_reduced.iloc[i].to_json()}
        ]
    )

    df.at[i, 'PredictedPrice'] = js.loads(completion.choices[0].message.content)

In [7]:
df.to_csv('data/immoscout24_openAI_3.5_untrained_V2.csv', index=False, sep=';')

In [8]:
df_predicted = df[['ActualPrice', 'PredictedPrice']]

df_predicted.head()

Unnamed: 0,ActualPrice,PredictedPrice
0,CHF 2’570.–,{'price_per_month': 2800}
1,CHF 1’921.–,{'price_per_month': 1700}
2,CHF 2’830.–,{'price_per_month': 3500}
3,CHF 2’950.–,{'price_per_month': 3000}
4,CHF 2’950.–,{'price_per_month': 3000}


In [13]:
df_predicted = df[['ActualPrice', 'PredictedPrice']]

def extract_and_convert_price(price_data):
    # Determine the correct price string based on data type
    if isinstance(price_data, dict):
        # Extracting price from dictionary with different possible keys
        key_variations = ['PricePerMonth', 'Price per month', 'price_per_month']
        for key in key_variations:
            if key in price_data:
                price_str = str(price_data[key])  # Convert to string to ensure compatibility
                break
        else:
            return None  # If none of the keys are found
    else:
        price_str = str(price_data)  # Convert to string if not a dictionary

    # Clean the price string by removing non-digit characters
    cleaned_price = ''.join([char for char in price_str if char.isdigit()])
    return int(cleaned_price) if cleaned_price else None  

# Apply the updated function to the DataFrame columns
df_predicted['ActualPrice'] = df_predicted['ActualPrice'].apply(extract_and_convert_price)
df_predicted['PredictedPrice'] = df_predicted['PredictedPrice'].apply(extract_and_convert_price)

df_predicted.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_predicted['ActualPrice'] = df_predicted['ActualPrice'].apply(extract_and_convert_price)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_predicted['PredictedPrice'] = df_predicted['PredictedPrice'].apply(extract_and_convert_price)


Unnamed: 0,ActualPrice,PredictedPrice
0,2570.0,2800
1,1921.0,1700
2,2830.0,3500
3,2950.0,3000
4,2950.0,3000


In [32]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

r_squared = r2_score(df_predicted['ActualPrice'], df_predicted['PredictedPrice'])
mse = mean_squared_error(df_predicted['ActualPrice'], df_predicted['PredictedPrice'])
mae = mean_absolute_error(df_predicted['ActualPrice'], df_predicted['PredictedPrice'])

print(f'R^2 Score: {r_squared}')
print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')


R^2 Score: 0.6834336633716507
Mean Squared Error: 7242880.472081218
Mean Absolute Error: 820.0091370558375


In [31]:
df_predicted = df_predicted.dropna()
df_predicted.isna().sum()

ActualPrice       0
PredictedPrice    0
dtype: int64

In [21]:
df_predicted.iloc[764]

ActualPrice       2415.0
PredictedPrice    2550.0
Name: 764, dtype: float64