In [51]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report, mean_squared_error
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import EarlyStopping

In [52]:
# Seed NumPy's legacy global RNG so any NumPy-based randomness is repeatable.
# The scikit-learn estimators and splits below also pass random_state=42 explicitly.
np.random.seed(42)

In [53]:
# Load the cleaned property-level dataset from the working directory.
# NOTE(review): the recorded output under this cell looks like the tail of a pandas
# warning (possibly a DtypeWarning about mixed-type columns) — confirm, and consider
# passing an explicit dtype= mapping for the affected columns.
df = pd.read_csv("cbsa_data_final_cleaned.csv")

  df = pd.read_csv("cbsa_data_final_cleaned.csv")


In [54]:
# List the columns available in the loaded dataset.
df.columns

Index(['propertyId', 'propertyName', 'zipCode', 'address', 'city', 'state',
       'averageSquareFootage', 'stories', 'stable', 'class', 'latitude',
       'longitude', 'imageHero', 'heroSource', 'submarketName', 'unitcount',
       'yearBuilt', 'siteStatus', 'type', 'MPF-ANN-RENT-CHG',
       'MPF-HIST-ASK-RENT', 'MPF-HIST-AVG-ASK-RPSF', 'MPF-HIST-CONC-RATIO',
       'MPF-OCC', 'MPF-RENT', 'MPF-RPSF', 'data_index', 'cbsa_code', 'status',
       'uses_realpage'],
      dtype='object')

In [55]:
# Derive the yearsold column: the current year minus the year the property was built.
# datetime.now() is evaluated at run time, so these values depend on when the notebook is executed.
current_year = datetime.now().year
df['yearsold'] = current_year - df['yearBuilt']

In [56]:
# Columns used by the two models below (predictors and targets for both).
cols = [
    'cbsa_code', 'averageSquareFootage', 'stories', 'class', 'unitcount', 'yearsold', 'MPF-OCC', 'MPF-RPSF', 'uses_realpage'
]
# Drop every row that has a missing value in any of the model columns.
# The result is rebound to `df` rather than mutating the frame with inplace=True:
# the filtered frame is the same, but the step reads as an explicit assignment.
df = df.dropna(subset=cols)

# Model 1: Predicting the Usage of RealPage

In this section, we'll develop a binary classification model to predict whether a property uses RealPage. The key steps include:

- **Preprocessing & Feature Engineering:**
  - Compute `yearsold` from the `yearBuilt` column (Current year - yearBuilt).  
  - Standardize numerical features using StandardScaler.
  - One-hot encode categorical features (CBSA code and class).
  - Eliminate any NA rows

- **Modeling:**  
  - Use logistic regression to estimate the propensity (probability) that a property uses RealPage.
  - Perform an 80/20 train-test split for model evaluation.

- **Evaluation:**  
  - Generate a classification report with metrics like precision, recall, and F1-score.

Goal: Prompt a user for the input variables and predict, based on those inputs, whether the property is a RealPage user.


Columns used in model:
<br>  - `cbsa_code`
<br>    - `averageSquareFootage`
<br>    - `stories`
<br>    - `class`
<br>    - `unitcount`
<br>    - `yearsold`
<br>    - `MPF-OCC`
<br>    - `MPF-RPSF`
<br>    - `uses_realpage` (prediction target)

Please refer to the website (https://uc-berkeley-i-school.github.io/realpage-collusion/) for definitions of the columns.


In [57]:
# Model 1 inputs: predictor columns and the binary target (uses_realpage).
features_treatment = [
    'averageSquareFootage', 'stories', 'cbsa_code', 'class',
    'yearsold', 'MPF-OCC', 'unitcount', 'MPF-RPSF',
]
y_treatment = df['uses_realpage']
X_treatment = df[features_treatment]

# Partition the predictors by type: the categorical columns are named explicitly,
# and every remaining predictor (in its original order) is treated as numeric.
categorical_features_treatment = ['cbsa_code', 'class']
numeric_features_treatment = [
    column
    for column in features_treatment
    if column not in categorical_features_treatment
]

In [58]:
# Column-wise preprocessing for the RealPage-usage model:
#   'num' standardizes the numeric columns,
#   'cat' one-hot encodes the categorical columns (handle_unknown='ignore' means
#         a category not seen during fit does not raise at transform time).
numeric_step_treatment = ('num', StandardScaler(), numeric_features_treatment)
categorical_step_treatment = (
    'cat',
    OneHotEncoder(handle_unknown='ignore'),
    categorical_features_treatment,
)
preprocessor_treatment = ColumnTransformer(
    transformers=[numeric_step_treatment, categorical_step_treatment]
)

In [59]:
# Classification pipeline: preprocess the raw columns, then fit a
# logistic-regression classifier (liblinear solver, fixed random_state).
treatment_steps = [
    ('preprocessor', preprocessor_treatment),
    ('classifier', LogisticRegression(solver='liblinear', random_state=42)),
]
clf_pipeline = Pipeline(steps=treatment_steps)

In [60]:
# 80/20 train/test split for the RealPage-usage model, with a fixed seed.
treatment_split = train_test_split(
    X_treatment,
    y_treatment,
    test_size=0.2,
    random_state=42,
)
X_treat_train, X_treat_test, y_treat_train, y_treat_test = treatment_split

In [61]:
# Fit the preprocessing + logistic-regression pipeline on the training split only.
clf_pipeline.fit(X_treat_train, y_treat_train)

In [62]:
# Predict class labels for the held-out test split.
y_treat_pred = clf_pipeline.predict(X_treat_test)

In [63]:
# Print a heading followed by per-class precision / recall / F1 on the test split.
print(
    "Treatment Model Classification Report:",
    classification_report(y_treat_test, y_treat_pred),
    sep="\n",
)

Treatment Model Classification Report:
              precision    recall  f1-score   support

           0       0.88      1.00      0.93      6253
           1       0.49      0.02      0.05       887

    accuracy                           0.88      7140
   macro avg       0.68      0.51      0.49      7140
weighted avg       0.83      0.88      0.82      7140



# Model 2: Predicting the Price of Properties

In this section, our goal is to build a regression model to predict the rent per square foot (MPF-RPSF) of properties. The process involves:

- **Preprocessing & Feature Engineering:**
  - Compute `yearsold` from the `yearBuilt` column (Current year - yearBuilt).  
  - Standardize numerical features using StandardScaler.
  - One-hot encode categorical features (CBSA code and class).
  - Eliminate any NA rows

- **Modeling:**  
  - Implement a Random Forest Regressor for robust prediction.
  - Split the data into an 80/20 train-test set.

- **Evaluation:**  
  - Assess the model using the Mean Squared Error (MSE) metric.

This setup will allow us to test, tweak, and improve both models based on the performance and insights we gain from the evaluation.

Goal: Allow a user to query the model and see how its price prediction changes as they change the RealPage usage variable while keeping all other variables constant.

Columns used in model:
<br>    - `cbsa_code`
<br>    - `averageSquareFootage`
<br>    - `stories`
<br>    - `class`
<br>    - `unitcount`
<br>    - `yearsold`
<br>    - `MPF-OCC`
<br>    - `MPF-RPSF` (prediction target)
<br>    - `uses_realpage`

Please refer to the website (https://uc-berkeley-i-school.github.io/realpage-collusion/) for definitions of the columns.

In [82]:
# Model 2 inputs: predictor columns and the regression target (MPF-RPSF).
features_rent = [
    'averageSquareFootage', 'stories', 'cbsa_code', 'class',
    'yearsold', 'MPF-OCC', 'unitcount', 'uses_realpage',
]
y_rent = df['MPF-RPSF']
X_rent = df[features_rent]

# Partition the predictors by type: the categorical columns are named explicitly,
# and every remaining predictor (in its original order, including uses_realpage)
# is treated as numeric.
categorical_features_rent = ['cbsa_code', 'class']
numeric_features_rent = [
    column
    for column in features_rent
    if column not in categorical_features_rent
]

In [83]:
# Column-wise preprocessing for the rent model:
#   'num' standardizes the numeric columns,
#   'cat' one-hot encodes the categorical columns (handle_unknown='ignore' means
#         a category not seen during fit does not raise at transform time).
numeric_step_rent = ('num', StandardScaler(), numeric_features_rent)
categorical_step_rent = (
    'cat',
    OneHotEncoder(handle_unknown='ignore'),
    categorical_features_rent,
)
preprocessor_rent = ColumnTransformer(
    transformers=[numeric_step_rent, categorical_step_rent]
)

In [84]:
# Regression pipeline: inputs go through the preprocessor first, then into a
# random forest regressor (100 trees, fixed random_state).
rent_steps = [
    ('preprocessor', preprocessor_rent),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42)),
]
reg_pipeline = Pipeline(steps=rent_steps)

In [85]:
# 80/20 train/test split for the rent model, with a fixed seed.
rent_split = train_test_split(
    X_rent,
    y_rent,
    test_size=0.2,
    random_state=42,
)
X_rent_train, X_rent_test, y_rent_train, y_rent_test = rent_split

In [86]:
# Fit the preprocessing + random-forest pipeline on the rent training split only.
reg_pipeline.fit(X_rent_train, y_rent_train)

In [87]:
# Score the rent model on the held-out split using mean squared error.
y_rent_pred = reg_pipeline.predict(X_rent_test)
mse = mean_squared_error(
    y_true=y_rent_test,
    y_pred=y_rent_pred,
)
print("Rent Model Mean Squared Error:", mse)

Rent Model Mean Squared Error: 0.06020848502946076


# Try it Yourself! Predict the RPSF

In this section, we're building an interactive function that allows a user to input key property details. Based on these inputs, our regression model will predict the rent per square foot (RPSF). This serves as a prototype for the similar functionality we plan to integrate into our website.

## What Does the Function Do?

- **User Prompts:**  
  The function asks the user to enter several input variables, including:
  - Average square footage
  - Number of stories
  - CBSA code (categorical)
  - Property class (categorical)
  - Year built (from which we calculate the property's age)
  - MPF-OCC (occupancy)
  - Unit count
  - Whether the property uses RealPage (`uses_realpage`)

- **Output:**  
  The function displays the inputs the user provided and then outputs the prediction in a clear format, e.g.,  
  `Predicted RPSF: $X.XX`

## How This Relates to Our Website

This interactive function is a small-scale prototype of what we plan to implement on our website. On the live site:
- Users can navigate to the Try it Yourself section within the website and test inputs
- The website backend will send these inputs to our model through Amazon SageMaker.
- The predicted rent per square foot will be displayed to the user immediately.


In [88]:
def _parse_binary_flag(raw):
    """Convert a typed yes/no style answer into the integer 1 or 0.

    Raises ValueError for anything that is not a recognised yes/no answer so the
    caller can report it together with the other input-parsing errors.
    """
    answer = raw.strip().lower()
    if answer in ("1", "1.0", "true", "yes", "y"):
        return 1
    if answer in ("0", "0.0", "false", "no", "n"):
        return 0
    raise ValueError(f"uses_realpage must be 1 or 0 (yes/no), got {raw!r}")


def predict_rpsf(model):
    """
    Interactively predict rent per square foot (RPSF) for a single property.

    Prompts the user for each model input, echoes the parsed values, builds a
    one-row DataFrame using the column names the rent pipeline was trained on,
    and prints and returns the model's prediction.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(DataFrame)``; only the first
        element of the returned predictions is used.

    Returns
    -------
    float or None
        The predicted RPSF, or None when any input could not be parsed
        (an error message is printed in that case).
    """
    try:
        averageSquareFootage = float(input("Enter average square footage: "))
        stories = float(input("Enter number of stories: "))
        # NOTE(review): input() returns a string. If the training column
        # `cbsa_code` was parsed as a number, a one-hot encoder configured with
        # handle_unknown='ignore' would treat this string as an unseen category
        # instead of matching it — confirm the training dtype and cast to match.
        cbsa_code = input("Enter CBSA code: ").strip()
        property_class = input("Enter property class: ").strip()
        yearbuilt = int(input("Enter the year built: "))
        MPF_OCC = float(input("Enter MPF-OCC (occupancy): "))
        unitcount = float(input("Enter unit count: "))
        # The rent pipeline standardizes uses_realpage as a numeric column, so
        # hand the model an integer flag rather than the raw typed string.
        uses_realpage = _parse_binary_flag(input("Enter uses_realpage: "))
    except Exception as e:
        # Deliberately broad, but scoped to input collection/parsing only:
        # report the problem and abort instead of predicting on bad input.
        print(f"Error in input: {e}")
        return

    # Property age is derived the same way as the training feature:
    # current year minus year built.
    current_year = datetime.now().year
    yearsold = current_year - yearbuilt

    print("\nUser Inputs:")
    print(f"  Average Square Footage: {averageSquareFootage}")
    print(f"  Stories: {stories}")
    print(f"  CBSA Code: {cbsa_code}")
    print(f"  Property Class: {property_class}")
    print(f"  Year Built: {yearbuilt}")
    print(f"  Years Old: {yearsold}")
    print(f"  MPF-OCC (Occupancy): {MPF_OCC}")
    print(f"  Unit Count: {unitcount}")
    print(f"  Uses RealPage: {uses_realpage}\n")

    # One-row frame whose column names match the rent model's feature names.
    input_df = pd.DataFrame({
        'averageSquareFootage': [averageSquareFootage],
        'stories': [stories],
        'cbsa_code': [cbsa_code],
        'class': [property_class],
        'yearsold': [yearsold],
        'MPF-OCC': [MPF_OCC],
        'unitcount': [unitcount],
        'uses_realpage': [uses_realpage]
    })

    prediction = model.predict(input_df)
    predicted_rpsf = prediction[0]

    print(f"Predicted RPSF: ${predicted_rpsf:.2f}")
    return predicted_rpsf


In [89]:
# Run the interactive prompt against the fitted rent pipeline; the returned
# prediction is also echoed as the cell's output value.
predict_rpsf(reg_pipeline)


User Inputs:
  Average Square Footage: 980.0
  Stories: 2.0
  CBSA Code: 10180
  Property Class: A
  Year Built: 2010
  Years Old: 15
  MPF-OCC (Occupancy): 0.974
  Unit Count: 200.0

  Uses RealPage: 1
Predicted RPSF: $1.53


1.5283099999999998

In [None]:
"""
Next steps for me to work on:
-   Balance the dataset and retest the model
-   Limit the data to only CBSAs that have RealPage users
-   Add market share to the model and see how that affects the results
-   Try different model types
-   Do some hyperparameter tuning
-   Deploy to SageMaker and try calling the model via its API
"""