diff --git a/PDFM_embeddings_to_predict_PM2_5_in_US.ipynb b/PDFM_embeddings_to_predict_PM2_5_in_US.ipynb new file mode 100644 index 0000000..be872f5 --- /dev/null +++ b/PDFM_embeddings_to_predict_PM2_5_in_US.ipynb @@ -0,0 +1,546 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "view-in-github" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tE3akitQdA-m" + }, + "source": [ + "**Predicting US PM2.5 levels using Google's Population Dynamics Foundation Model**\n", + "\n", + "Useful Resources:\n", + "1. https://github.com/opengeos/GeoAI-Tutorials/blob/main/docs/PDFM/zillow_home_value.ipynb\n", + "2. https://github.com/google-research/population-dynamics/tree/master/notebooks" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TbTs9lKqddKS" + }, + "source": [ + "Acknowledgements:\n", + "This notebook is based on tutorials - [PDFM notebook](https://github.com/google-research/population-dynamics/tree/master/notebooks) and awesome tutorial by giswqs opengeos PDFM [zillow home price](https://github.com/opengeos/GeoAI-Tutorials/blob/main/docs/PDFM/zillow_home_value.ipynb)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "JqjIF4kAKZGR" + }, + "outputs": [], + "source": [ + "%%capture\n", + "!pip install leafmap" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "lIYdn1woOS1n" + }, + "outputs": [], + "source": [ + "# import libraries\n", + "import pandas as pd\n", + "import os\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.linear_model import LinearRegression\n", + "from sklearn.ensemble import RandomForestRegressor\n", + "from sklearn.neighbors import KNeighborsRegressor\n", + "from leafmap.common import evaluate_model, plot_actual_vs_predicted, download_file" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0z3FKT1fgePa" + }, + "source": [ + "# Get US PM2.5 data\n", + "Link to data: https://usc-geohealth-hub-uscssi.hub.arcgis.com/documents/7fc448343d6643f3bb13157fd65aed4f/about" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "8VYkB_TeA0kP" + }, + "outputs": [], + "source": [ + "df0 = pd.read_excel(\"/content/pm25_and_disparity.xlsx\", sheet_name=\"data_part1\")\n", + "df1 = pd.read_excel(\"/content/pm25_and_disparity.xlsx\", sheet_name=\"data_part2\")\n", + "df2 = pd.read_excel(\"/content/pm25_and_disparity.xlsx\", sheet_name=\"data_part3\")\n", + "df3 = pd.read_excel(\"/content/pm25_and_disparity.xlsx\", sheet_name=\"data_part4\")\n", + "df4 = pd.read_excel(\"/content/pm25_and_disparity.xlsx\", sheet_name=\"data_part5\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GYC_GWkdgodj" + }, + "source": [ + "# Process PM2.5 data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "QE3MZiRYA6q1", + "outputId": "b194649f-7e4a-4dfe-e1ed-8bc825fd5cb5" + }, + "outputs": [], + "source": [ + "df = pd.concat([df0, df1, df2, df3, df4], ignore_index=True)\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "p9NCYMyxGBQB", + "outputId": "530a1697-c230-4199-d9f2-1cd07130d07e" + }, + "outputs": [], + "source": [ + "df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ip7NBwl-EDsf", + "outputId": "4f5b299d-b6a3-4859-fa0a-f14b96bf3221" + }, + "outputs": [], + "source": [ + "df[\"zcta\"].nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 272 + }, + "id": "8qtI9VgiCcvj", + "outputId": "c2365f08-5de6-45aa-a4fb-1d5b3f7a86a4" + }, + "outputs": [], + "source": [ + "pm25_df = df.groupby([\"zcta\"]).mean()[\"pm25\"]\n", + "pm25_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 272 + }, + "id": "6-qSy8lVDDoM", + "outputId": "1f086442-221a-4f64-ee9d-db456bb21808" + }, + "outputs": [], + "source": [ + "pm25_df.dropna(axis=0, inplace=True)\n", + "pm25_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 290 + }, + "id": "5kAxQ3uVGswD", + "outputId": "5c326987-a0c0-4f4b-f6ec-8094288375c3" + }, + "outputs": [], + "source": [ + "pm25_df.index = pm25_df.index.astype(int)\n", + "print(pm25_df.shape)\n", + "pm25_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 238 + }, + "id": "dBOUhYaOIU0-", + "outputId": "32a40749-015a-4d2a-b27f-0ac61bf8b7dc" + }, + "outputs": [], + "source": [ + "pm25_df = pm25_df.reset_index(drop=False) # Remove inplace=True\n", + "pm25_df.index = pm25_df[\"zcta\"].apply(lambda x: f\"zip/{x}\") # Access 'zcta' column\n", + "pm25_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0CUwjIxaeWCm" + }, + "source": [ + "# Request access to PDFM Embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "yxEkU1GjibgC", + "outputId": "b0a29cdb-1b14-49d1-9302-604ca2ead218" + }, + "outputs": [], + "source": [ + "!unzip /content/pdfm_embeddings.zip" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-DugeFaSG2Pi" + }, + "outputs": [], + "source": [ + "embeddings_file_path = \"/content/pdfm_embeddings/v0/us/zcta_embeddings.csv\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "JZZX7tzlHYa9" + }, + "outputs": [], + "source": [ + "if not os.path.exists(embeddings_file_path):\n", + " raise FileNotFoundError(\"Please request the embeddings from Google\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 374 + }, + "id": "cWNluHJYHY57", + "outputId": "3d62af41-8d41-40c7-ab82-5b9174406cfd" + }, + "outputs": [], + "source": [ + "zipcode_embeddings = pd.read_csv(embeddings_file_path).set_index(\"place\")\n", + "zipcode_embeddings.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "L7XAkE0fecVU" + }, + "source": [ + "# Join PDFM embeddings and Groud Truth (PM2.5 data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 429 + }, + "id": "KjwmUj5SH9-P", + "outputId": "626b6758-821c-486e-9f2b-0b9f3366f433" + }, + "outputs": [], + "source": [ + "data = pm25_df.join(zipcode_embeddings, how=\"inner\")\n", + "data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "UUKLoxY6JkUt", + "outputId": "6518e2c5-8dff-4d07-851f-c486f2da50b2" + }, + "outputs": [], + "source": [ + "data.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NvHQ97WYJmOd" + }, + "outputs": [], + "source": [ + "embedding_features = [f\"feature{x}\" for x in range(330)]\n", + "label = \"pm25\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "35igKORxJ15A" + }, + "outputs": [], + "source": [ + "data = data.dropna(subset=[label])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tuCYm8dcevqo" + }, + "source": [ + "# Split Train and Test Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "IgSmPhJTJ2QW" + }, + "outputs": [], + "source": [ + "data = data[embedding_features + [label]]\n", + "X = data[embedding_features]\n", + "y = data[label]\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + " X, y, test_size=0.2, random_state=42\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "iHys75z1fFFJ" + }, + "source": [ + "# Fit K-Nearest Neighbors Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "KCgs-cyoJ5Nm" + }, + "outputs": [], + "source": [ + "k = 5\n", + "model = KNeighborsRegressor(n_neighbors=k)\n", + "model.fit(X_train, y_train)\n", + "\n", + "y_pred = model.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "TkWTkqPCKEYu", + "outputId": "01404579-141a-41c3-e7ac-d1576130f5f9" + }, + "outputs": [], + "source": [ + "evaluation_df = pd.DataFrame({\"y\": y_test, \"y_pred\": y_pred})\n", + "# Evaluate the model\n", + "metrics = evaluate_model(evaluation_df)\n", + "print(metrics)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ywbC7hLSfMZB" + }, + "source": [ + "# Evaluate K-Nearest Neighbors Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 817 + }, + "id": "LZORS0JQKjQ4", + "outputId": "05aa4e15-7008-4663-a364-4406ef00a5a3" + }, + "outputs": [], + "source": [ + "xy_lim = (0, 30)\n", + "plot_actual_vs_predicted(\n", + " evaluation_df,\n", + " xlim=xy_lim,\n", + " ylim=xy_lim,\n", + " title=\"Actual vs Predicted PM2.5\",\n", + " x_label=\"Actual PM2.5\",\n", + " y_label=\"Predicted PM2.5\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SjOVAvPYkDYx" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LPXkNlenjGp_" + }, + "source": [ + "# Fit Random Forest Regressor model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "M7S4naYnjN4l", + "outputId": "55c7d7d8-b304-4fc2-d8f6-5ed014577bf9" + }, + "outputs": [], + "source": [ + "model = RandomForestRegressor(n_estimators=10, verbose=10, n_jobs=-1)\n", + "model.fit(X_train, y_train)\n", + "\n", + "y_pred = model.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "CaxFfeEtjRiH", + "outputId": "590e443f-6f93-46d9-ccfe-d1723600d0a3" + }, + "outputs": [], + "source": [ + "evaluation_df = pd.DataFrame({\"y\": y_test, \"y_pred\": y_pred})\n", + "# Evaluate the model\n", + "metrics = evaluate_model(evaluation_df)\n", + "print(metrics)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "v96jVyGRjWtE" + }, + "source": [ + "# Evaluate Random Forest Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 817 + }, + "id": "bVnYyd-zjb85", + "outputId": "76b5b25d-a9d7-4c16-c1d7-015e814f278d" + }, + "outputs": [], + "source": [ + "xy_lim = (0, 30)\n", + "plot_actual_vs_predicted(\n", + " evaluation_df,\n", + " xlim=xy_lim,\n", + " ylim=xy_lim,\n", + " title=\"Actual vs Predicted PM2.5\",\n", + " x_label=\"Actual PM2.5\",\n", + " y_label=\"Predicted PM2.5\",\n", + ")" + ] + } + ], + "metadata": { + "colab": { + "include_colab_link": true, + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}