In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures

# Load the dataset
office_annual_data = pd.read_csv('path_to_your_file/Office Annual CSV.csv')

# Filter for CBSA 47900 and convert 'Period' to datetime
office_annual_data = office_annual_data[office_annual_data['CBSA Code'] == 47900]
office_annual_data['Period'] = pd.to_datetime(office_annual_data['Period'].str.replace(' Q', '-Q'))

# Set 'Period' as index and sort by date
office_annual_data.set_index('Period', inplace=True)
office_annual_data.sort_index(inplace=True)

# Clean the data by dropping columns and rows with excessive missing values
office_annual_data.dropna(axis=1, thresh=int(0.5*len(office_annual_data)), inplace=True)
office_annual_data.dropna(thresh=10, inplace=True)  # Adjust based on dataset specifics

# Feature Engineering
office_annual_data['Economic Cycle Phase'] = np.sin(2 * np.pi * office_annual_data.index.year / 10)
office_annual_data['Rent_Vacancy_Interaction'] = office_annual_data['Market Rent/SF'] * office_annual_data['Vacancy Rate']

# Preparing the data for modeling
features = ['Market Rent/SF', 'Vacancy Rate', 'Economic Cycle Phase', 'Rent_Vacancy_Interaction']
target = 'Market Sale Price Per SF'

# Aligning and splitting the dataset
X = office_annual_data[features].dropna()
y = office_annual_data.loc[X.index, target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model training with Gradient Boosting Regressor
model_gbr = GradientBoostingRegressor(n_estimators=200, learning_rate=0.1, max_depth=3, random_state=42)
model_gbr.fit(X_train, y_train)

# Model evaluation
predictions_gbr = model_gbr.predict(X_test)
mse_gbr = mean_squared_error(y_test, predictions_gbr)
print(f'Gradient Boosting Regressor MSE: {mse_gbr:.2f}')

# EDA: Seasonal pattern analysis
office_annual_data['Month'] = office_annual_data.index.month
monthly_avg_prices = office_annual_data.groupby('Month')['Market Sale Price Per SF'].mean()

plt.figure(figsize=(10, 6))
sns.lineplot(x=monthly_avg_prices.index, y=monthly_avg_prices.values)
plt.title('Seasonal Pattern of Market Sale Price Per SF')
plt.xlabel('Month')
plt.ylabel('Average Market Sale Price Per SF ($)')
plt.xticks(ticks=np.arange(1, 13), labels=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
plt.grid(True)
plt.show()

# Note: Before running this script, replace 'path_to_your_file/Office Annual CSV.csv' with the actual path to your dataset.


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures

In [2]:
# Load the dataset
office_annual_data = pd.read_csv('/Users/owenkadis/Downloads/Office Annual CSV.csv')

# Filter for CBSA 47900 and convert 'Period' to datetime
office_annual_data = office_annual_data[office_annual_data['CBSA Code'] == 47900]
office_annual_data['Period'] = pd.to_datetime(office_annual_data['Period'].str.replace(' Q', '-Q'))

# Set 'Period' as index and sort by date
office_annual_data.set_index('Period', inplace=True)
office_annual_data.sort_index(inplace=True)

# Clean the data by dropping columns and rows with excessive missing values
office_annual_data.dropna(axis=1, thresh=int(0.5*len(office_annual_data)), inplace=True)
office_annual_data.dropna(thresh=10, inplace=True)  # Adjust based on dataset specifics


In [3]:
# Feature Engineering
office_annual_data['Economic Cycle Phase'] = np.sin(2 * np.pi * office_annual_data.index.year / 10)
office_annual_data['Rent_Vacancy_Interaction'] = office_annual_data['Market Rent/SF'] * office_annual_data['Vacancy Rate']

# Preparing the data for modeling
features = ['Market Rent/SF', 'Vacancy Rate', 'Economic Cycle Phase', 'Rent_Vacancy_Interaction']
target = 'Market Sale Price Per SF'

# Aligning and splitting the dataset
X = office_annual_data[features].dropna()
y = office_annual_data.loc[X.index, target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


ValueError: Found input variables with inconsistent numbers of samples: [3822, 434126]

In [4]:
# Model training with Gradient Boosting Regressor
model_gbr = GradientBoostingRegressor(n_estimators=200, learning_rate=0.1, max_depth=3, random_state=42)
model_gbr.fit(X_train, y_train)

# Model evaluation
predictions_gbr = model_gbr.predict(X_test)
mse_gbr = mean_squared_error(y_test, predictions_gbr)
print(f'Gradient Boosting Regressor MSE: {mse_gbr:.2f}')

# EDA: Seasonal pattern analysis
office_annual_data['Month'] = office_annual_data.index.month
monthly_avg_prices = office_annual_data.groupby('Month')['Market Sale Price Per SF'].mean()

plt.figure(figsize=(10, 6))
sns.lineplot(x=monthly_avg_prices.index, y=monthly_avg_prices.values)
plt.title('Seasonal Pattern of Market Sale Price Per SF')
plt.xlabel('Month')
plt.ylabel('Average Market Sale Price Per SF ($)')
plt.xticks(ticks=np.arange(1, 13), labels=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
plt.grid(True)
plt.show()

# Note: Before running this script, replace 'path_to_your_file/Office Annual CSV.csv' with the actual path to your dataset.


NameError: name 'X_train' is not defined