In [1]:
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import numpy as np
import os

In [2]:
# Folder that receives the charts rendered by this notebook.
output_directory = '../Charts and Graphs Output'

# exist_ok=True makes this a no-op when the folder is already present,
# so the cell can be re-run without raising.
os.makedirs(name=output_directory, exist_ok=True)

In [3]:
# Read the combined rental CSV into `df`; the path is relative to the
# notebook's working directory.
df = pd.read_csv(
    filepath_or_buffer='../Data File Repository/Combined_Rental_Median_Price_Inventory.csv'
)


In [4]:
# Preprocess data: reshape from wide to long and split the old column headers.
#
# After this cell `df` holds one row per (Borough, original column) with the
# columns Borough, Value, Year and Type.
# NOTE: this cell rebinds `df`, so it must run exactly once after the load cell.
df = df.melt(id_vars=['Borough'], var_name='Year_Rentals', value_name='Value')

# Headers matching "<4-digit year> Total Rentals" or "<4-digit year> Median Rent"
# are split into Year and Type; any other header leaves both as NaN.
# Raw string: '\d' inside a plain literal is an invalid escape sequence, which
# newer Python versions warn about and will eventually reject.
df[['Year', 'Type']] = df['Year_Rentals'].str.extract(r'(\d{4}) (Total Rentals|Median Rent)')

# Reassign rather than drop in place so the cell has no hidden mutation.
df = df.drop(columns='Year_Rentals')
df['Year'] = pd.to_numeric(df['Year'])

# Strip "$" and "," from string values, then coerce to numbers; anything that
# still fails to parse becomes NaN and is replaced by 0.
df['Value'] = pd.to_numeric(df['Value'].replace(r'[\$,]', '', regex=True), errors='coerce').fillna(0)


In [5]:
# Keep only strictly positive values: the regression cell takes log(Value),
# which is undefined for zero and negative numbers.
df = df.loc[df['Value'].gt(0)]


In [6]:
# Total Rentals rows only; the Type column is constant after the filter,
# so it is dropped.
df_total_rentals = df.loc[df['Type'].eq('Total Rentals')].drop(columns=['Type'])


In [7]:
# One shared 12 x 8 inch canvas on which every borough is drawn.
fig, ax = plt.subplots(figsize=(12, 8))

# Palette used to tell boroughs apart; the plotting loop indexes into it by
# position and wraps around when there are more boroughs than colors.
colors = [
    'blue',
    'green',
    'red',
    'cyan',
    'magenta',
    'yellow',
    'black',
]


In [8]:
# Fit a log-linear (exponential) trend per borough on the pre-COVID years and
# overlay it on the observed inventory, then save the combined chart.
#
# Reads `df_total_rentals`, `fig`, `ax`, `colors` and `output_directory` from
# earlier cells; writes one PNG into `output_directory` and closes the figure.
LAST_PRE_COVID_YEAR = 2019   # last year included in the trend fit
LAST_ACTUAL_YEAR = 2023      # last year of observed data drawn as points
MIN_POINTS_FOR_TREND = 2     # a trend line cannot be estimated from fewer observations

# Years the fitted trend is drawn over. Identical for every borough, so it is
# built once instead of on every loop iteration.
X_predict = pd.DataFrame({'Year': range(2010, 2024)})

# groupby(sort=False) yields boroughs in order of first appearance (the order
# unique() would give) and skips rows whose Borough is missing, which would
# otherwise produce an empty selection and make the fit raise.
for index, (borough, borough_inventory) in enumerate(df_total_rentals.groupby('Borough', sort=False)):
    color = colors[index % len(colors)]  # wrap around if boroughs outnumber colors

    # Training data: pre-COVID rows with a usable year and value.
    borough_data_for_regression = borough_inventory[
        borough_inventory['Year'] <= LAST_PRE_COVID_YEAR
    ].dropna(subset=['Year', 'Value'])

    # Only fit when there is enough data; LinearRegression.fit raises on an
    # empty frame, and a single point cannot define a trend.
    if len(borough_data_for_regression) >= MIN_POINTS_FOR_TREND:
        X = borough_data_for_regression[['Year']]
        y = np.log(borough_data_for_regression['Value'])  # linear fit in log space

        model = LinearRegression()
        model.fit(X, y)

        # Predict in log space, then convert back to the original scale.
        y_predict_exp = np.exp(model.predict(X_predict))

        # Plotting trend line
        ax.plot(X_predict['Year'], y_predict_exp, color=color, linestyle='-', linewidth=2, label=f'Projected Inventory ({borough})')

    # Plotting actual data points (all observations up to LAST_ACTUAL_YEAR)
    actual_data = borough_inventory[borough_inventory['Year'] <= LAST_ACTUAL_YEAR]
    ax.scatter(actual_data['Year'], actual_data['Value'], color=color, alpha=0.6, edgecolor='black', s=50, label=f'Actual Inventory ({borough})')

ax.set_yscale('log')

ax.set_title('Total Rental Inventory Across Boroughs (Projection From Pre-COVID and Actual)')
ax.set_xlabel('Year')
ax.set_ylabel('Total Rentals')

# Create a legend with no duplicate labels. Query `ax` directly rather than
# plt.gca(), which depends on whichever figure is globally "current".
handles, labels = ax.get_legend_handles_labels()
by_label = dict(zip(labels, handles))
ax.legend(by_label.values(), by_label.keys(), loc='upper left', bbox_to_anchor=(1, 1))

# bbox_inches='tight' keeps the legend, which sits outside the axes, in the image.
plt.savefig(f"{output_directory}/Combined_Boroughs_Total_Rentals_Chart.png", bbox_inches='tight')

# Close the figure to free memory
plt.close(fig)