## Gentrification Prediction Model

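Classify the dataset: label each row as gentrified (1) or not (0), where a row is labeled gentrified when it meets at least three of the five percentile-based criteria defined below.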

In [3]:

import pandas as pd

# Read the merged dataset that every later cell works from.
data = pd.read_csv("../data/Final_Dataset/final_merged_dataset.csv")

# Percentile-based cut-offs used by the labelling rule.
# The first four entries are the column medians (50th percentile), keyed by
# the column name itself.
thresholds = {
    column: data[column].quantile(0.50)
    for column in (
        "ZHVF 1-Year Forecast (%)",
        "Percent Change",
        "Mean Income",
        "luxury_business_count",
    )
}
# SizeRank uses a band rather than a single cut-off: the 40th and 60th
# percentiles are stored as its lower and upper bounds.
thresholds["SizeRank_lower"] = data["SizeRank"].quantile(0.40)
thresholds["SizeRank_upper"] = data["SizeRank"].quantile(0.60)

def count_criteria_met(row):
    """Return how many of the five gentrification criteria `row` satisfies.

    Four criteria require the row's value to be strictly greater than the
    matching entry in the module-level `thresholds` mapping; the fifth
    requires `SizeRank` to lie strictly between `SizeRank_lower` and
    `SizeRank_upper`. Values equal to a threshold do not count.

    Args:
        row: Any object supporting `row[column_name]` lookup for the five
            columns used below (e.g. a DataFrame row passed by `apply`).

    Returns:
        The number of criteria met, from 0 to 5.
    """
    above_threshold_columns = (
        "ZHVF 1-Year Forecast (%)",
        "Percent Change",
        "Mean Income",
        "luxury_business_count",
    )

    criteria_count = 0
    for column in above_threshold_columns:
        # Booleans add as 0/1, so each satisfied criterion bumps the tally.
        criteria_count += row[column] > thresholds[column]

    within_size_band = (
        thresholds["SizeRank_lower"] < row["SizeRank"] < thresholds["SizeRank_upper"]
    )
    criteria_count += within_size_band
    return criteria_count

# Label a row as gentrified (1) when it satisfies at least 3 of the criteria
# counted by count_criteria_met, otherwise 0.
criteria_counts = data.apply(count_criteria_met, axis=1)
data["gentrified"] = (criteria_counts >= 3).astype("int64")

# Persist the labelled dataset (row index is not written).
data.to_csv("modified_dataset.csv", index=False)


In [4]:
# Install scikit-learn so the sklearn imports in the next cell resolve.
# NOTE(review): no version is pinned here, so the installed release can differ
# between runs — consider pinning a version.
%pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [5]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import joblib

# Model inputs: the five columns the label was derived from; target is the
# "gentrified" column.
features = [
    "SizeRank",
    "ZHVF 1-Year Forecast (%)",
    "Percent Change",
    "Mean Income",
    "luxury_business_count",
]
X = data[features]
y = data["gentrified"]

# Hold out 20% of the rows for testing; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training split only, then apply the same
# transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic regression on the standardized training features.
clf = LogisticRegression(random_state=42).fit(X_train_scaled, y_train)

# Persist both fitted objects so they can be reloaded together later.
joblib.dump(clf, 'gentrification_model.pkl')
joblib.dump(scaler, 'scaler.pkl')


['scaler.pkl']

In [6]:

# Predict labels for the held-out test rows.
y_pred = clf.predict(X_test_scaled)

# Assemble actual vs. predicted labels, keyed by the test rows' index labels.
# NOTE(review): the 'ZIP Code' column is filled from X_test.index, i.e. the
# row labels of the source frame, not from a ZIP code column. These are ZIP
# codes only if the frame is indexed by ZIP — confirm and fix if it is not.
results = y_test.to_frame(name='Actual Value')
results.insert(0, 'ZIP Code', X_test.index)
results['Predicted Value'] = y_pred

# Write the comparison table to disk (row index is not written).
results.to_csv('model_predictions.csv', index=False)


In [7]:
import pandas as pd

# Load the dataset to score.
# NOTE(review): this is the same CSV path the training data was read from,
# so these predictions cover rows the model was fitted on, not unseen data.
new_data = pd.read_csv("../data/Final_Dataset/final_merged_dataset.csv")

# Select the model's input columns, in the order used for training.
required_features = [
    "SizeRank",
    "ZHVF 1-Year Forecast (%)",
    "Percent Change",
    "Mean Income",
    "luxury_business_count",
]
X_new = new_data[required_features]

# Standardize with the already-fitted scaler (no refitting), then predict.
X_new_scaled = scaler.transform(X_new)
y_new_pred = clf.predict(X_new_scaled)

# Attach the predicted label to every row and save the result.
new_data = new_data.assign(predicted_gentrified=y_new_pred)
new_data.to_csv("new_dataset_with_predictions.csv", index=False)

In [8]:
# Reload the saved predictions and report how many rows were predicted as
# gentrified (label 1); falls back to 0 when that label never occurs.
predicted_data = pd.read_csv("new_dataset_with_predictions.csv")
label_counts = predicted_data["predicted_gentrified"].value_counts()
gentrified = label_counts.get(1, 0)
print(gentrified)

306


In [9]:
import sqlite3
# Export the scored dataset to a SQLite database file.
db_file = "data.db"
table_name = "table1"

conn = sqlite3.connect(db_file)
try:
    # Replace the table if it already exists; the DataFrame index is not
    # written as a column.
    new_data.to_sql(table_name, conn, if_exists="replace", index=False)
    conn.commit()
finally:
    # Always release the connection, even when the write or commit fails,
    # so a failed run does not leave the database file held open.
    conn.close()