## Gentrification Prediction Model

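Classify the dataset: label each row as gentrified (1) when it meets at least three of five percentile-based criteria, otherwise 0.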

In [5]:

import pandas as pd

# Read the merged dataset from disk
data = pd.read_csv("../data/Final_Dataset/final_merged_dataset.csv")

# Percentile cut-offs used by the labelling step:
#   * the 75th percentile of each indicator column, and
#   * the 45th / 55th percentiles of SizeRank as the edges of a middle band.
thresholds = {
    column: data[column].quantile(0.75)
    for column in (
        "ZHVF 1-Year Forecast (%)",
        "Percent Change",
        "Mean Income",
        "luxury_business_count",
    )
}
thresholds["SizeRank_lower"] = data["SizeRank"].quantile(0.45)
thresholds["SizeRank_upper"] = data["SizeRank"].quantile(0.55)

# Function to count criteria met for gentrification
def count_criteria_met(row, criteria_thresholds=None):
    """Count how many of the five gentrification criteria a row satisfies.

    A criterion is met when the row's value is strictly greater than the
    threshold for "ZHVF 1-Year Forecast (%)", "Percent Change", "Mean Income"
    or "luxury_business_count", or when its "SizeRank" lies strictly between
    the "SizeRank_lower" and "SizeRank_upper" thresholds. Comparisons against
    NaN are False, so a missing value never counts as a met criterion.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series or a dict)
        Must support ``row[key]`` for the five column names above.
    criteria_thresholds : dict, optional
        Threshold values keyed as described above. When omitted, the
        notebook-level ``thresholds`` dict is used, so existing one-argument
        calls (such as ``DataFrame.apply(count_criteria_met, axis=1)``)
        behave as before.

    Returns
    -------
    int
        Number of criteria met, between 0 and 5.
    """
    if criteria_thresholds is None:
        # Fall back to the module-level dict for backward compatibility.
        criteria_thresholds = thresholds

    above_threshold_columns = (
        "ZHVF 1-Year Forecast (%)",
        "Percent Change",
        "Mean Income",
        "luxury_business_count",
    )
    criteria_count = sum(
        bool(row[column] > criteria_thresholds[column])
        for column in above_threshold_columns
    )

    # SizeRank must fall strictly inside the (lower, upper) band.
    in_size_rank_band = (
        criteria_thresholds["SizeRank_lower"]
        < row["SizeRank"]
        < criteria_thresholds["SizeRank_upper"]
    )
    return criteria_count + int(bool(in_size_rank_band))

# Label a row 1 (gentrified) when it meets at least three of the criteria
# counted by count_criteria_met, and 0 otherwise.
data["gentrified"] = (data.apply(count_criteria_met, axis=1) >= 3).astype("int64")

# Write the labelled dataset to disk without the index column
data.to_csv("modified_dataset.csv", index=False)


In [6]:
%pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-learn
  Obtaining dependency information for scikit-learn from https://files.pythonhosted.org/packages/63/87/6cd5450f0385966bf2a5b865a2043cf68c2a41676193afdbccb40f8719dc/scikit_learn-1.3.0-cp39-cp39-macosx_12_0_arm64.whl.metadata
  Downloading scikit_learn-1.3.0-cp39-cp39-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting scipy>=1.5.0 (from scikit-learn)
  Obtaining dependency information for scipy>=1.5.0 from https://files.pythonhosted.org/packages/77/31/b063f21370c6050a663aae5a9868d2fe112b21caeface3c248016dfea092/scipy-1.11.2-cp39-cp39-macosx_12_0_arm64.whl.metadata
  Downloading scipy-1.11.2-cp39-cp39-macosx_12_0_arm64.whl.metadata (54 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 54.1/54.1 kB 2.2 MB/s eta 0:00:00
Collecting joblib>=1.1.1 (from scikit-learn)
  Obtaining dependency information for joblib>=1.1.1 from https://files.pythonhoste

In [8]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import joblib

# Model inputs (five indicator columns) and the binary label
features = [
    "SizeRank",
    "ZHVF 1-Year Forecast (%)",
    "Percent Change",
    "Mean Income",
    "luxury_business_count",
]
X, y = data[features], data["gentrified"]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Standardise the features with statistics learned from the training rows only,
# then apply the same transformation to the held-out rows
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the logistic-regression classifier on the scaled training data
clf = LogisticRegression(random_state=42).fit(X_train_scaled, y_train)

# Persist the fitted classifier and the scaler
joblib.dump(clf, 'gentrification_model.pkl')
joblib.dump(scaler, 'scaler.pkl')


['scaler.pkl']

In [10]:

# Make predictions on the test set
y_pred = clf.predict(X_test_scaled)

# `data` is read with pandas' default integer index, so X_test.index holds row
# labels, not ZIP codes. When `data` has a ZIP-named column, look the real ZIP
# codes up by row label; otherwise fall back to the index (the previous
# behaviour, which is still correct if the index was set to the ZIP code).
# NOTE(review): the actual ZIP column name is not visible here — confirm it is
# covered by the aliases below.
zip_column_aliases = {"zip", "zipcode", "zipcodes", "postalcode"}
zip_column = next(
    (
        column
        for column in data.columns
        if str(column).lower().replace(" ", "").replace("_", "") in zip_column_aliases
    ),
    None,
)
if zip_column is not None:
    zip_codes = data.loc[X_test.index, zip_column].to_numpy()
else:
    zip_codes = X_test.index

# Create a dataframe with test set ZIP codes, actual values, and predictions
results = pd.DataFrame({
    'ZIP Code': zip_codes,
    'Actual Value': y_test,
    'Predicted Value': y_pred
})

# Save the predictions to a CSV
results.to_csv('model_predictions.csv', index=False)
