From ab12976b4a17764be14dbf0ada4eb1e2cea2cb4f Mon Sep 17 00:00:00 2001 From: Aninda Goswamy <39881731+anindabitm@users.noreply.github.com> Date: Thu, 20 Feb 2025 16:02:11 +0530 Subject: [PATCH 1/2] Predicting US PM2.5 levels using PDFM Predicting US PM2.5 levels using Google's Population Dynamics Foundation Model --- PDFM_embeddings_to_predict_PM2_5_in_US.ipynb | 3522 ++++++++++++++++++ 1 file changed, 3522 insertions(+) create mode 100644 PDFM_embeddings_to_predict_PM2_5_in_US.ipynb diff --git a/PDFM_embeddings_to_predict_PM2_5_in_US.ipynb b/PDFM_embeddings_to_predict_PM2_5_in_US.ipynb new file mode 100644 index 0000000..ce42a4f --- /dev/null +++ b/PDFM_embeddings_to_predict_PM2_5_in_US.ipynb @@ -0,0 +1,3522 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "source": [ + "**Predicting US PM2.5 levels using Google's Population Dynamics Foundation Model**\n", + "\n", + "Useful Resources:\n", + "1. https://github.com/opengeos/GeoAI-Tutorials/blob/main/docs/PDFM/zillow_home_value.ipynb\n", + "2. 
https://github.com/google-research/population-dynamics/tree/master/notebooks" + ], + "metadata": { + "id": "tE3akitQdA-m" + } + }, + { + "cell_type": "markdown", + "source": [ + "Acknowledgements:\n", + "This notebook is based on tutorials - [PDFM notebook](https://github.com/google-research/population-dynamics/tree/master/notebooks) and awesome tutorial by giswqs opengeos PDFM [zillow home price](https://github.com/opengeos/GeoAI-Tutorials/blob/main/docs/PDFM/zillow_home_value.ipynb)" + ], + "metadata": { + "id": "TbTs9lKqddKS" + } + }, + { + "cell_type": "code", + "source": [ + "%%capture\n", + "!pip install leafmap" + ], + "metadata": { + "id": "JqjIF4kAKZGR" + }, + "execution_count": 2, + "outputs": [] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "lIYdn1woOS1n" + }, + "outputs": [], + "source": [ + "#import libraries\n", + "import pandas as pd\n", + "import os\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.linear_model import LinearRegression\n", + "from sklearn.ensemble import RandomForestRegressor\n", + "from sklearn.neighbors import KNeighborsRegressor\n", + "from leafmap.common import evaluate_model, plot_actual_vs_predicted, download_file" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Get US PM2.5 data\n", + "Link to data: https://usc-geohealth-hub-uscssi.hub.arcgis.com/documents/7fc448343d6643f3bb13157fd65aed4f/about" + ], + "metadata": { + "id": "0z3FKT1fgePa" + } + }, + { + "cell_type": "code", + "source": [ + "# Parse the workbook once: passing a list as sheet_name returns {sheet name: DataFrame}\n", + "PM25_XLSX = \"/content/pm25_and_disparity.xlsx\"\n", + "PM25_SHEETS = [f\"data_part{i}\" for i in range(1, 6)]\n", + "pm25_sheets = pd.read_excel(PM25_XLSX, sheet_name=PM25_SHEETS)\n", + "df0, df1, df2, df3, df4 = (pm25_sheets[name] for name in PM25_SHEETS)" + ], + "metadata": { + 
"id": "8VYkB_TeA0kP" + }, + "execution_count": 4, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# Process PM2.5 data" + ], + "metadata": { + "id": "GYC_GWkdgodj" + } + }, + { + "cell_type": "code", + "source": [ + "df = pd.concat((df0, df1, df2, df3, df4), ignore_index=True)  # one frame from the five sheet parts, rows renumbered from 0\n", + "df.head()" + ], + "metadata": { + "id": "QE3MZiRYA6q1", + "outputId": "b194649f-7e4a-4dfe-e1ed-8bc825fd5cb5", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + } + }, + "execution_count": 5, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " Unnamed: 0 year ZIP zcta popdensity ... asian_pop white_pop medhouseholdincome pm25 urban\n", + "0 1 2000 1 NaN NaN ... NaN NaN NaN NaN NaN\n", + "1 2 2001 1 NaN NaN ... NaN NaN NaN NaN NaN\n", + "2 3 2002 1 NaN NaN ... NaN NaN NaN NaN NaN\n", + "3 4 2003 1 NaN NaN ... NaN NaN NaN NaN NaN\n", + "4 5 2004 1 NaN NaN ... NaN NaN NaN NaN NaN\n", + "\n", + "[5 rows x 21 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0yearZIPzctapopdensitypopulationpovertyeducationpct_blkpct_hisppct_nativepct_asianpct_whiteblack_pophisp_popnative_popasian_popwhite_popmedhouseholdincomepm25urban
0120001NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1220011NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
2320021NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
3420031NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4520041NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "df" + } + }, + "metadata": {}, + "execution_count": 5 + } + ] + }, + { + "cell_type": "code", + "source": [ + "df.shape" + ], + "metadata": { + "id": "p9NCYMyxGBQB", + "outputId": "530a1697-c230-4199-d9f2-1cd07130d07e", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "execution_count": 6, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "(789260, 21)" + ] + }, + "metadata": {}, + "execution_count": 6 + } + ] + }, + { + "cell_type": "code", + "source": [ + "df[\"zcta\"].nunique()" + ], + "metadata": { + "id": "ip7NBwl-EDsf", + "outputId": "4f5b299d-b6a3-4859-fa0a-f14b96bf3221", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "execution_count": 7, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "32406" + ] + }, + "metadata": {}, + "execution_count": 7 + } + ] + }, + { + "cell_type": "code", + "source": [ + "pm25_df = df.groupby(\"zcta\")[\"pm25\"].mean()  # mean pm25 over all rows of each ZCTA; select the column first so only it is aggregated\n", + "pm25_df.head()" + ], + "metadata": { + "id": "8qtI9VgiCcvj", + "outputId": "c2365f08-5de6-45aa-a4fb-1d5b3f7a86a4", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 272 + } + }, + "execution_count": 8, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "zcta\n", + "601.0 NaN\n", + "602.0 NaN\n", + "603.0 NaN\n", + "606.0 NaN\n", + "610.0 NaN\n", + "Name: pm25, dtype: float64" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pm25
zcta
601.0NaN
602.0NaN
603.0NaN
606.0NaN
610.0NaN
\n", + "

" + ] + }, + "metadata": {}, + "execution_count": 8 + } + ] + }, + { + "cell_type": "code", + "source": [ + "pm25_df = pm25_df.dropna()  # discard ZCTAs whose mean PM2.5 is NaN; rebind rather than mutate in place\n", + "pm25_df.head()" + ], + "metadata": { + "id": "6-qSy8lVDDoM", + "outputId": "1f086442-221a-4f64-ee9d-db456bb21808", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 272 + } + }, + "execution_count": 9, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "zcta\n", + "1001.0 9.398180\n", + "1002.0 8.026795\n", + "1003.0 8.949020\n", + "1005.0 6.409811\n", + "1007.0 7.375929\n", + "Name: pm25, dtype: float64" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pm25
zcta
1001.09.398180
1002.08.026795
1003.08.949020
1005.06.409811
1007.07.375929
\n", + "

" + ] + }, + "metadata": {}, + "execution_count": 9 + } + ] + }, + { + "cell_type": "code", + "source": [ + "pm25_df = pm25_df.set_axis(pm25_df.index.astype(int))  # float ZCTA labels (1001.0) become integers (1001)\n", + "print(pm25_df.shape)\n", + "pm25_df.head()" + ], + "metadata": { + "id": "5kAxQ3uVGswD", + "outputId": "5c326987-a0c0-4f4b-f6ec-8094288375c3", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 290 + } + }, + "execution_count": 10, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "(31956,)\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "zcta\n", + "1001 9.398180\n", + "1002 8.026795\n", + "1003 8.949020\n", + "1005 6.409811\n", + "1007 7.375929\n", + "Name: pm25, dtype: float64" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pm25
zcta
10019.398180
10028.026795
10038.949020
10056.409811
10077.375929
\n", + "

" + ] + }, + "metadata": {}, + "execution_count": 10 + } + ] + }, + { + "cell_type": "code", + "source": [ + "pm25_df = pm25_df.reset_index(drop=False)  # move the integer ZCTA out of the index into a 'zcta' column\n", + "pm25_df.index = [f\"zip/{zcta:05d}\" for zcta in pm25_df[\"zcta\"]]  # zero-pad to 5 digits (zip/01001) so leading-zero ZCTAs match the embeddings' place ids; a plain list leaves the index unnamed, so it cannot clash with the 'zcta' column\n", + "pm25_df.head()" + ], + "metadata": { + "id": "dBOUhYaOIU0-", + "outputId": "32a40749-015a-4d2a-b27f-0ac61bf8b7dc", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 238 + } + }, + "execution_count": 11, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " zcta pm25\n", + "zcta \n", + "zip/1001 1001 9.398180\n", + "zip/1002 1002 8.026795\n", + "zip/1003 1003 8.949020\n", + "zip/1005 1005 6.409811\n", + "zip/1007 1007 7.375929" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
zctapm25
zcta
zip/100110019.398180
zip/100210028.026795
zip/100310038.949020
zip/100510056.409811
zip/100710077.375929
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "pm25_df", + "repr_error": "cannot insert zcta, already exists" + } + }, + "metadata": {}, + "execution_count": 11 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Request access to PDFM Embeddings" + ], + "metadata": { + "id": "0CUwjIxaeWCm" + } + }, + { + "cell_type": "code", + "source": [ + "!unzip -n /content/pdfm_embeddings.zip  # -n: skip files that already exist, so a re-run never stops at an overwrite prompt" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "yxEkU1GjibgC", + "outputId": "b0a29cdb-1b14-49d1-9302-604ca2ead218" + }, + "execution_count": 14, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Archive: /content/pdfm_embeddings.zip\n", + " creating: pdfm_embeddings/\n", + " creating: pdfm_embeddings/v0/\n", + " creating: pdfm_embeddings/v0/us/\n", + " inflating: pdfm_embeddings/v0/us/county.geojson \n", + " inflating: pdfm_embeddings/v0/us/county_embeddings.csv \n", + " inflating: pdfm_embeddings/v0/us/zcta.geojson \n", + " inflating: pdfm_embeddings/v0/us/zcta_embeddings.csv \n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "embeddings_file_path = \"/content/pdfm_embeddings/v0/us/zcta_embeddings.csv\"" + ], + "metadata": { + "id": "-DugeFaSG2Pi" + }, + "execution_count": 12, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "if not os.path.exists(embeddings_file_path):\n", + " raise FileNotFoundError(\"Please request the embeddings from Google\")" + ], + "metadata": { + "id": "JZZX7tzlHYa9" + }, + "execution_count": 15, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "zipcode_embeddings = pd.read_csv(embeddings_file_path).set_index(\"place\")\n", + "zipcode_embeddings.head()" + ], + "metadata": { + "id": "cWNluHJYHY57", + "outputId": "3d62af41-8d41-40c7-ab82-5b9174406cfd", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 374 + } + }, + "execution_count": 16, + "outputs": [ + { + 
"output_type": "execute_result", + "data": { + "text/plain": [ + " state county city ... feature327 feature328 feature329\n", + "place ... \n", + "zip/97910 OR Malheur County Jordan Valley ... -0.001661 -0.001010 4.495589\n", + "zip/89412 NV Washoe County Gerlach ... -0.024385 -0.000295 3.399393\n", + "zip/88030 NM Luna County Deming ... -0.116499 -0.051163 3.866543\n", + "zip/82633 WY Converse County Douglas ... -0.047864 -0.000042 7.453567\n", + "zip/59538 MT Phillips County Malta ... -0.161916 -0.001087 0.972243\n", + "\n", + "[5 rows x 336 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
statecountycitypopulationlatitudelongitudefeature0feature1feature2feature3feature4feature5feature6feature7feature8feature9feature10feature11feature12feature13feature14feature15feature16feature17feature18feature19feature20feature21feature22feature23feature24feature25feature26feature27feature28feature29feature30feature31feature32feature33feature34feature35feature36feature37feature38feature39feature40feature41feature42feature43...feature280feature281feature282feature283feature284feature285feature286feature287feature288feature289feature290feature291feature292feature293feature294feature295feature296feature297feature298feature299feature300feature301feature302feature303feature304feature305feature306feature307feature308feature309feature310feature311feature312feature313feature314feature315feature316feature317feature318feature319feature320feature321feature322feature323feature324feature325feature326feature327feature328feature329
place
zip/97910ORMalheur CountyJordan Valley60942.749076-117.511459-0.1382271.1203770.0729000.2974420.7726730.2864670.8023980.6208470.0608100.1259260.4529050.3172101.5604880.1737170.338584-0.0118760.3699180.734241-0.0231610.9279180.1311290.1749150.1869620.777327-0.1252550.2529970.1267030.2827130.2862170.1082220.1380431.8938930.0343410.1161971.5786180.5945980.0833960.2177050.0858330.0509590.0713890.0467940.9007410.089737...0.7954634.0488264.411071-0.0725851.1728042.780721-0.007037-0.167074-0.1690712.4516424.7057452.8610440.9647615.425952-0.086446-0.0040441.692680-0.1293041.2627670.5843934.957980-0.1136193.977844-0.0562660.1546796.833614-0.1685953.852105-0.0086000.1683672.6726796.9380711.4625264.7003793.523755-0.1699710.279797-0.030630-0.0000144.489360-0.158891-0.1687081.231994-0.1557653.043214-0.1697490.177463-0.001661-0.0010104.495589
zip/89412NVWashoe CountyGerlach9841.102934-119.695361-0.1413791.4227820.2342690.1591560.8902410.2154270.5332001.1258300.1598910.3054490.6734480.2223711.1131960.1470470.2708580.1400510.5913811.3218580.1829561.6001260.1494801.2394540.3261860.724274-0.0570490.3277220.2250440.0928650.6364250.5410340.1688661.489893-0.0032550.5243511.6726420.4212620.7015390.2310940.2235120.1344800.4798520.2044051.0437660.396896...-0.0807315.3577233.973490-0.1005552.9424364.120401-0.027611-0.084821-0.0000681.3647406.0856805.550473-0.0579297.677364-0.1114880.4422422.686013-0.008651-0.0032874.6853846.117020-0.0833244.441121-0.002397-0.1199434.539424-0.0066764.993636-0.118508-0.1690391.8649534.146715-0.1182203.6888824.046134-0.0455371.627209-0.012242-0.0166434.668972-0.157417-0.0436062.788701-0.0625473.700745-0.169827-0.137990-0.024385-0.0002953.399393
zip/88030NMLuna CountyDeming2413932.191634-107.729431-0.0466661.4144240.1468031.1132561.1195761.0931990.9606360.1796420.7294882.4474392.2742042.7653250.9032840.5201622.6043480.6885200.1644362.7558281.3122460.4522690.6126601.1352951.4404660.507069-0.1408090.4813061.0687172.1886970.2543981.0192340.2777270.7164910.8611361.2322560.2103260.6940311.5040181.4303610.8425510.2220431.1145560.8564251.5187911.487212...0.4724494.0895551.3473470.1283492.5178920.7180340.5460532.494347-0.0248883.6581475.0963043.687950-0.1625214.8446360.7890833.0256700.667166-0.169164-0.1098913.0951284.816823-0.1698834.487709-0.058803-0.1673332.843048-0.0605443.279263-0.1586991.5351892.7872313.8619161.5691193.4872992.3346930.068785-0.162307-0.0538490.1605041.895565-0.0006540.4374754.2292950.2291992.0984691.1504970.716122-0.116499-0.0511633.866543
zip/82633WYConverse CountyDouglas947843.022270-105.410250-0.0902931.2662800.4478680.7818611.7318130.6027220.7370660.7433920.8236581.4752001.6397340.2023400.5459460.4861710.4257580.9515570.4481310.8894091.1162650.3313080.6942450.7740920.8934762.088896-0.0127671.3794200.5419440.9030941.2451580.5147470.2405201.8533850.6914781.0950860.1027790.9763971.5081521.0987090.6589311.5449333.2679901.0330220.9482430.768377...4.7097112.6199310.377791-0.1360901.5469291.914665-0.038279-0.1582913.8462241.6008722.5562405.0282413.1315691.8852511.7231523.2876590.592335-0.1696790.7995711.7110866.4347992.2594575.137226-0.1573763.7392571.8493440.8171784.254727-0.0314550.8603553.1857684.8155371.8895623.1471585.9028750.248916-0.013526-0.035991-0.0374672.813852-0.0337710.5797752.6886650.1756690.9909211.6448790.222517-0.047864-0.0000427.453567
zip/59538MTPhillips CountyMalta293648.112019-107.845520-0.0928861.256203-0.0508970.3219541.2818640.7377931.6621780.4510610.190265-0.1277650.5061150.7921370.3855070.3949260.4777611.0282060.0476810.8797400.7957300.2391350.2820840.1933261.2620940.453796-0.1693510.6013230.6703640.5819920.5400120.2189761.1954830.0351990.2742110.6815940.8199161.2347350.289213-0.0108910.9113120.7801662.9065060.5247231.004237-0.098108...0.7414100.4358250.415687-0.1685351.068465-0.014837-0.058268-0.168225-0.0843274.2620600.4449362.5040240.5346121.3660060.0862761.7662713.652062-0.162912-0.1028372.1234316.335544-0.1585362.916174-0.0005544.0131701.245277-0.1461091.6305250.193676-0.1324761.4476614.800499-0.0099520.2171681.960558-0.0804720.211844-0.045951-0.012506-0.169497-0.169915-0.0888290.338914-0.102962-0.1565831.4936962.259007-0.161916-0.0010870.972243
\n", + "

5 rows × 336 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "zipcode_embeddings" + } + }, + "metadata": {}, + "execution_count": 16 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Join PDFM embeddings and Ground Truth (PM2.5 data)" + ], + "metadata": { + "id": "L7XAkE0fecVU" + } + }, + { + "cell_type": "code", + "source": [ + "data = pm25_df.join(zipcode_embeddings, how=\"inner\")\n", + "data.head()" + ], + "metadata": { + "id": "KjwmUj5SH9-P", + "outputId": "626b6758-821c-486e-9f2b-0b9f3366f433", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 429 + } + }, + "execution_count": 17, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " zcta pm25 state county ... feature326 feature327 feature328 feature329\n", + "zip/10001 10001 13.636975 NY New York County ... 1.581655 -0.015907 -0.0 3.977647\n", + "zip/10002 10002 12.896079 NY New York County ... 5.388407 -0.112462 -0.0 4.431747\n", + "zip/10003 10003 12.829412 NY New York County ... 4.039156 -0.156848 -0.0 5.094444\n", + "zip/10004 10004 13.851765 NY New York County ... 6.799802 -0.078682 -0.0 4.140815\n", + "zip/10005 10005 13.730000 NY New York County ... 7.295258 -0.169108 -0.0 3.934241\n", + "\n", + "[5 rows x 338 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
zctapm25statecountycitypopulationlatitudelongitudefeature0feature1feature2feature3feature4feature5feature6feature7feature8feature9feature10feature11feature12feature13feature14feature15feature16feature17feature18feature19feature20feature21feature22feature23feature24feature25feature26feature27feature28feature29feature30feature31feature32feature33feature34feature35feature36feature37feature38feature39feature40feature41...feature280feature281feature282feature283feature284feature285feature286feature287feature288feature289feature290feature291feature292feature293feature294feature295feature296feature297feature298feature299feature300feature301feature302feature303feature304feature305feature306feature307feature308feature309feature310feature311feature312feature313feature314feature315feature316feature317feature318feature319feature320feature321feature322feature323feature324feature325feature326feature327feature328feature329
zip/100011000113.636975NYNew York CountyNew York2696640.750672-73.997281-0.0730250.6216113.2040591.2192500.0866660.4227331.3622520.4482270.7807890.0761760.7448890.2061531.4862222.3416262.9349663.1976502.2272430.5367041.6419040.2157330.0847270.1646794.5111022.252191-0.0243150.2122653.8570312.2177710.3087664.0347351.6955190.567224-0.0773463.9538241.4250992.2758401.0427351.4901751.5572180.2061690.0495732.313412...1.501178-0.123201-0.167260-0.159472-0.001858-0.1299800.175728-1.200000e-07-0.008414-0.1553630.315214-0.009080-0.0-0.000000e+000.770695-0.152233-0.0496598.2094900.7008201.0008275.112181-0.1469531.654920-0.002328-0.004713-0.1620209.8142372.8720817.9058579.6326217.280638-0.0172870.4797471.1915476.030477-0.028483-0.002579-0.150458-0.145124-0.000732-0.0068552.571859-0.0060764.313338-0.105290-9.500000e-061.581655-0.015907-0.03.977647
zip/100021000212.896079NYNew York CountyNew York7680740.715762-73.986258-0.1447691.1022723.9807741.7518270.0829820.4745150.9498660.3798721.3949390.2391890.8318120.2316311.9853572.1120553.2094433.6519241.3691140.4494092.2784840.1879110.3427430.1780305.8587402.435344-0.0111250.2031723.5862072.0160040.2929034.1687022.1469751.026136-0.0851504.9311331.6647302.5723700.7167052.2254872.0631850.2763540.2723573.370650...0.361146-0.096761-0.146408-0.155304-0.074113-0.0150541.665584-1.102000e-05-0.002935-0.1696030.145326-0.025443-0.0-5.000000e-08-0.161510-0.153942-0.1664106.3761851.1634281.1389285.270078-0.1599241.996964-0.000041-0.020128-0.0506408.7715382.0861008.6025968.0601938.714463-0.0046880.1361990.5139825.643821-0.002350-0.008118-0.089103-0.156498-0.034799-0.1030222.710451-0.0000214.655519-0.159014-0.000000e+005.388407-0.112462-0.04.431747
zip/100031000312.829412NYNew York CountyNew York5444740.731829-73.989181-0.1314611.1320393.5121901.1768000.1617160.4593701.2910740.4754961.1464700.1990020.9686850.2490581.7881392.2959153.0391283.7267801.8224610.5469151.9554720.1888800.1981700.1385935.4824262.644991-0.0309930.2567503.7471682.1609550.3117894.1316262.0502550.924002-0.0777894.4016831.4903322.3791280.8712062.1519701.9291990.3612980.2296503.038753...0.461322-0.141985-0.085863-0.156253-0.005502-0.0468461.596130-4.810000e-06-0.089342-0.0995400.272569-0.009312-0.0-0.000000e+000.092946-0.169363-0.0425576.3766210.6064840.7780495.227136-0.1698882.206395-0.000619-0.166381-0.1673458.3668282.2904467.7625577.9475796.410265-0.0209470.9355080.4151925.355077-0.027503-0.000485-0.1231360.030198-0.001412-0.0047392.824203-0.0002984.061482-0.068253-1.350000e-064.039156-0.156848-0.05.094444
zip/100041000413.851765NYNew York CountyNew York479540.688630-74.018244-0.1476250.5467873.2294921.8868640.1086210.6030101.4427130.4711501.0288820.4117300.7083820.3468371.7986581.9404263.4354312.6930041.9563800.3004292.1739640.4409530.3398110.2757735.2715662.132799-0.0183420.3981953.2074841.4888410.5269574.3309311.8064101.111506-0.1184414.1217511.6049981.9961960.6748021.7560822.7415660.2118580.2898912.604892...0.393506-0.142817-0.135064-0.018993-0.057043-0.1316300.758817-0.000000e+00-0.000007-0.1346960.017068-0.007268-0.0-3.080000e-06-0.152431-0.105453-0.1410281.6790860.5341551.8516125.1750221.4382962.622611-0.000090-0.004313-0.1196872.1665973.7678998.0355784.9235876.152561-0.007616-0.0721990.0525213.980608-0.000069-0.000103-0.0270720.367801-0.003337-0.0104221.127201-0.0000793.496159-0.079587-8.000000e-086.799802-0.078682-0.04.140815
zip/100051000513.730000NYNew York CountyNew York863740.705974-74.008768-0.1428610.7785213.6405631.7745480.1031400.4621981.2381120.3626211.0670660.2403921.0121440.4501932.0653492.0761803.0301343.4653731.8889400.3740232.3237220.3071620.2439690.2053385.6401882.588103-0.0153750.3245943.7378641.6779350.3796594.3782352.0422961.018322-0.1093114.7153981.7573642.5008150.8889961.9595772.2038780.2273360.2436043.124810...-0.117290-0.084119-0.169264-0.115257-0.010591-0.0262481.397036-0.000000e+00-0.000242-0.1674690.550521-0.013257-0.0-3.500000e-07-0.1569890.212693-0.1332203.3823430.7180640.7326485.0143840.1573892.529759-0.000866-0.002382-0.0721323.9055902.8311048.1657866.1364556.366154-0.0074920.567445-0.1017463.834963-0.009529-0.0005050.0286451.454115-0.001952-0.0118452.385040-0.0000173.454340-0.159854-1.320000e-067.295258-0.169108-0.03.934241
\n", + "

5 rows × 338 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "data" + } + }, + "metadata": {}, + "execution_count": 17 + } + ] + }, + { + "cell_type": "code", + "source": [ + "data.shape" + ], + "metadata": { + "id": "UUKLoxY6JkUt", + "outputId": "6518e2c5-8dff-4d07-851f-c486f2da50b2", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "execution_count": 18, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "(29298, 338)" + ] + }, + "metadata": {}, + "execution_count": 18 + } + ] + }, + { + "cell_type": "code", + "source": [ + "embedding_features = [f\"feature{x}\" for x in range(330)]\n", + "label = \"pm25\"" + ], + "metadata": { + "id": "NvHQ97WYJmOd" + }, + "execution_count": 19, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "data = data.dropna(subset=[label])" + ], + "metadata": { + "id": "35igKORxJ15A" + }, + "execution_count": 20, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# Split Train and Test Data" + ], + "metadata": { + "id": "tuCYm8dcevqo" + } + }, + { + "cell_type": "code", + "source": [ + "\n", + "data = data[embedding_features + [label]]\n", + "X = data[embedding_features]\n", + "y = data[label]\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + " X, y, test_size=0.2, random_state=42\n", + ")" + ], + "metadata": { + "id": "IgSmPhJTJ2QW" + }, + "execution_count": 21, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# Fit K-Nearest Neighbors Model" + ], + "metadata": { + "id": "iHys75z1fFFJ" + } + }, + { + "cell_type": "code", + "source": [ + "k = 5\n", + "model = KNeighborsRegressor(n_neighbors=k)\n", + "model.fit(X_train, y_train)\n", + "\n", + "y_pred = model.predict(X_test)" + ], + "metadata": { + "id": "KCgs-cyoJ5Nm" + }, + "execution_count": 22, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "\n", + "evaluation_df = pd.DataFrame({\"y\": y_test, 
\"y_pred\": y_pred})\n", + "# Evaluate the model\n", + "metrics = evaluate_model(evaluation_df)\n", + "print(metrics)" + ], + "metadata": { + "id": "TkWTkqPCKEYu", + "outputId": "01404579-141a-41c3-e7ac-d1576130f5f9", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "execution_count": 23, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "{'r2': 0.9423382724431623, 'r': 0.9708871328512507, 'rmse': 0.6494730499555952, 'mae': 0.4204252179218217, 'mape': 0.05619889943563576}\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Evaluate K-Nearest Neighbors Model" + ], + "metadata": { + "id": "ywbC7hLSfMZB" + } + }, + { + "cell_type": "code", + "source": [ + "xy_lim = (0, 30)\n", + "plot_actual_vs_predicted(\n", + " evaluation_df,\n", + " xlim=xy_lim,\n", + " ylim=xy_lim,\n", + " title=\"Actual vs Predicted PM2.5\",\n", + " x_label=\"Actual PM2.5\",\n", + " y_label=\"Predicted PM2.5\",\n", + ")" + ], + "metadata": { + "id": "LZORS0JQKjQ4", + "outputId": "05aa4e15-7008-4663-a364-4406ef00a5a3", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 817 + } + }, + "execution_count": 28, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "
\n", + "
\n", + "\n", + "" + ] + }, + "metadata": {} + } + ] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "SjOVAvPYkDYx" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# Fit Random Forest Regressor model" + ], + "metadata": { + "id": "LPXkNlenjGp_" + } + }, + { + "cell_type": "code", + "source": [ + "model = RandomForestRegressor(n_estimators=10,verbose=10,n_jobs=-1)\n", + "model.fit(X_train, y_train)\n", + "\n", + "y_pred = model.predict(X_test)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "M7S4naYnjN4l", + "outputId": "55c7d7d8-b304-4fc2-d8f6-5ed014577bf9" + }, + "execution_count": 29, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "building tree 1 of 10building tree 2 of 10\n", + "\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "[Parallel(n_jobs=-1)]: Done 1 tasks | elapsed: 20.0s\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "building tree 3 of 10\n", + "building tree 4 of 10\n", + "building tree 5 of 10\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "[Parallel(n_jobs=-1)]: Done 4 tasks | elapsed: 38.8s\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "building tree 6 of 10\n", + "building tree 7 of 10\n", + "building tree 8 of 10\n", + "building tree 9 of 10\n", + "building tree 10 of 10\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "[Parallel(n_jobs=-1)]: Done 10 out of 10 | elapsed: 1.6min finished\n", + "[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.\n", + "[Parallel(n_jobs=2)]: Done 1 tasks | elapsed: 0.0s\n", + "[Parallel(n_jobs=2)]: Done 4 tasks | elapsed: 0.0s\n", + 
"[Parallel(n_jobs=2)]: Done 10 out of 10 | elapsed: 0.0s finished\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "evaluation_df = pd.DataFrame({\"y\": y_test, \"y_pred\": y_pred})\n", + "# Evaluate the model\n", + "metrics = evaluate_model(evaluation_df)\n", + "print(metrics)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "CaxFfeEtjRiH", + "outputId": "590e443f-6f93-46d9-ccfe-d1723600d0a3" + }, + "execution_count": 30, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "{'r2': 0.9086928793143653, 'r': 0.9539294211355118, 'rmse': 0.817277670967286, 'mae': 0.568816793219495, 'mape': 0.07572915522396176}\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Evaluate Random Forest Model" + ], + "metadata": { + "id": "v96jVyGRjWtE" + } + }, + { + "cell_type": "code", + "source": [ + "xy_lim = (0, 30)\n", + "plot_actual_vs_predicted(\n", + " evaluation_df,\n", + " xlim=xy_lim,\n", + " ylim=xy_lim,\n", + " title=\"Actual vs Predicted PM2.5\",\n", + " x_label=\"Actual PM2.5\",\n", + " y_label=\"Predicted PM2.5\",\n", + ")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 817 + }, + "id": "bVnYyd-zjb85", + "outputId": "76b5b25d-a9d7-4c16-c1d7-015e814f278d" + }, + "execution_count": 31, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "
\n", + "
\n", + "\n", + "" + ] + }, + "metadata": {} + } + ] + } + ], + "metadata": { + "colab": { + "provenance": [], + "include_colab_link": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file From 62b19ac7c6e892ab52adbc6f47be7b7a5ed56114 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 20 Feb 2025 10:34:00 +0000 Subject: [PATCH 2/2] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- PDFM_embeddings_to_predict_PM2_5_in_US.ipynb | 4062 +++--------------- 1 file changed, 543 insertions(+), 3519 deletions(-) diff --git a/PDFM_embeddings_to_predict_PM2_5_in_US.ipynb b/PDFM_embeddings_to_predict_PM2_5_in_US.ipynb index ce42a4f..be872f5 100644 --- a/PDFM_embeddings_to_predict_PM2_5_in_US.ipynb +++ b/PDFM_embeddings_to_predict_PM2_5_in_US.ipynb @@ -1,3522 +1,546 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "view-in-github", - "colab_type": "text" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "markdown", - "source": [ - "**Predicting US PM2.5 levels using Google's Population Dynamics Foundation Model**\n", - "\n", - "Useful Resources:\n", - "1. https://github.com/opengeos/GeoAI-Tutorials/blob/main/docs/PDFM/zillow_home_value.ipynb\n", - "2. 
https://github.com/google-research/population-dynamics/tree/master/notebooks" - ], - "metadata": { - "id": "tE3akitQdA-m" - } - }, - { - "cell_type": "markdown", - "source": [ - "Acknowledgements:\n", - "This notebook is based on tutorials - [PDFM notebook](https://github.com/google-research/population-dynamics/tree/master/notebooks) and awesome tutorial by giswqs opengeos PDFM [zillow home price](https://github.com/opengeos/GeoAI-Tutorials/blob/main/docs/PDFM/zillow_home_value.ipynb)" - ], - "metadata": { - "id": "TbTs9lKqddKS" - } - }, - { - "cell_type": "code", - "source": [ - "%%capture\n", - "!pip install leafmap" - ], - "metadata": { - "id": "JqjIF4kAKZGR" - }, - "execution_count": 2, - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "id": "lIYdn1woOS1n" - }, - "outputs": [], - "source": [ - "#import libraries\n", - "import pandas as pd\n", - "import os\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.linear_model import LinearRegression\n", - "from sklearn.ensemble import RandomForestRegressor\n", - "from sklearn.neighbors import KNeighborsRegressor\n", - "from leafmap.common import evaluate_model, plot_actual_vs_predicted, download_file" - ] - }, - { - "cell_type": "markdown", - "source": [ - "# Get US PM2.5 data\n", - "Link to data: https://usc-geohealth-hub-uscssi.hub.arcgis.com/documents/7fc448343d6643f3bb13157fd65aed4f/about" - ], - "metadata": { - "id": "0z3FKT1fgePa" - } - }, - { - "cell_type": "code", - "source": [ - "df0 = pd.read_excel(\"/content/pm25_and_disparity.xlsx\",sheet_name=\"data_part1\")\n", - "df1 = pd.read_excel(\"/content/pm25_and_disparity.xlsx\",sheet_name=\"data_part2\")\n", - "df2 = pd.read_excel(\"/content/pm25_and_disparity.xlsx\",sheet_name=\"data_part3\")\n", - "df3 = pd.read_excel(\"/content/pm25_and_disparity.xlsx\",sheet_name=\"data_part4\")\n", - "df4 = pd.read_excel(\"/content/pm25_and_disparity.xlsx\",sheet_name=\"data_part5\")" - ], - "metadata": { - 
"id": "8VYkB_TeA0kP" - }, - "execution_count": 4, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "# Process PM2.5 data" - ], - "metadata": { - "id": "GYC_GWkdgodj" - } - }, - { - "cell_type": "code", - "source": [ - "df = pd.concat([df0,df1,df2,df3,df4],ignore_index=True)\n", - "df.head()" - ], - "metadata": { - "id": "QE3MZiRYA6q1", - "outputId": "b194649f-7e4a-4dfe-e1ed-8bc825fd5cb5", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 206 - } - }, - "execution_count": 5, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " Unnamed: 0 year ZIP zcta popdensity ... asian_pop white_pop medhouseholdincome pm25 urban\n", - "0 1 2000 1 NaN NaN ... NaN NaN NaN NaN NaN\n", - "1 2 2001 1 NaN NaN ... NaN NaN NaN NaN NaN\n", - "2 3 2002 1 NaN NaN ... NaN NaN NaN NaN NaN\n", - "3 4 2003 1 NaN NaN ... NaN NaN NaN NaN NaN\n", - "4 5 2004 1 NaN NaN ... NaN NaN NaN NaN NaN\n", - "\n", - "[5 rows x 21 columns]" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Unnamed: 0yearZIPzctapopdensitypopulationpovertyeducationpct_blkpct_hisppct_nativepct_asianpct_whiteblack_pophisp_popnative_popasian_popwhite_popmedhouseholdincomepm25urban
0120001NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1220011NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
2320021NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
3420031NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4520041NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", - "
\n", - "
\n", - "\n", - "
\n", - " \n", - "\n", - " \n", - "\n", - " \n", - "
\n", - "\n", - "\n", - "
\n", - " \n", - "\n", - "\n", - "\n", - " \n", - "
\n", - "\n", - "
\n", - "
\n" - ], - "application/vnd.google.colaboratory.intrinsic+json": { - "type": "dataframe", - "variable_name": "df" - } - }, - "metadata": {}, - "execution_count": 5 - } - ] - }, - { - "cell_type": "code", - "source": [ - "df.shape" - ], - "metadata": { - "id": "p9NCYMyxGBQB", - "outputId": "530a1697-c230-4199-d9f2-1cd07130d07e", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "execution_count": 6, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "(789260, 21)" - ] - }, - "metadata": {}, - "execution_count": 6 - } - ] - }, - { - "cell_type": "code", - "source": [ - "df[\"zcta\"].nunique()" - ], - "metadata": { - "id": "ip7NBwl-EDsf", - "outputId": "4f5b299d-b6a3-4859-fa0a-f14b96bf3221", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "execution_count": 7, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "32406" - ] - }, - "metadata": {}, - "execution_count": 7 - } - ] - }, - { - "cell_type": "code", - "source": [ - "pm25_df = df.groupby([\"zcta\"]).mean()[\"pm25\"]\n", - "pm25_df.head()" - ], - "metadata": { - "id": "8qtI9VgiCcvj", - "outputId": "c2365f08-5de6-45aa-a4fb-1d5b3f7a86a4", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 272 - } - }, - "execution_count": 8, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "zcta\n", - "601.0 NaN\n", - "602.0 NaN\n", - "603.0 NaN\n", - "606.0 NaN\n", - "610.0 NaN\n", - "Name: pm25, dtype: float64" - ], - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
pm25
zcta
601.0NaN
602.0NaN
603.0NaN
606.0NaN
610.0NaN
\n", - "

" - ] - }, - "metadata": {}, - "execution_count": 8 - } - ] - }, - { - "cell_type": "code", - "source": [ - "pm25_df.dropna(axis=0,inplace=True)\n", - "pm25_df.head()" - ], - "metadata": { - "id": "6-qSy8lVDDoM", - "outputId": "1f086442-221a-4f64-ee9d-db456bb21808", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 272 - } - }, - "execution_count": 9, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "zcta\n", - "1001.0 9.398180\n", - "1002.0 8.026795\n", - "1003.0 8.949020\n", - "1005.0 6.409811\n", - "1007.0 7.375929\n", - "Name: pm25, dtype: float64" - ], - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
pm25
zcta
1001.09.398180
1002.08.026795
1003.08.949020
1005.06.409811
1007.07.375929
\n", - "

" - ] - }, - "metadata": {}, - "execution_count": 9 - } - ] - }, - { - "cell_type": "code", - "source": [ - "pm25_df.index = pm25_df.index.astype(int)\n", - "print(pm25_df.shape)\n", - "pm25_df.head()" - ], - "metadata": { - "id": "5kAxQ3uVGswD", - "outputId": "5c326987-a0c0-4f4b-f6ec-8094288375c3", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 290 - } - }, - "execution_count": 10, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "(31956,)\n" - ] - }, - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "zcta\n", - "1001 9.398180\n", - "1002 8.026795\n", - "1003 8.949020\n", - "1005 6.409811\n", - "1007 7.375929\n", - "Name: pm25, dtype: float64" - ], - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
pm25
zcta
10019.398180
10028.026795
10038.949020
10056.409811
10077.375929
\n", - "

" - ] - }, - "metadata": {}, - "execution_count": 10 - } - ] - }, - { - "cell_type": "code", - "source": [ - "pm25_df = pm25_df.reset_index(drop=False) # Remove inplace=True\n", - "pm25_df.index = pm25_df[\"zcta\"].apply(lambda x: f\"zip/{x}\") # Access 'zcta' column\n", - "pm25_df.head()" - ], - "metadata": { - "id": "dBOUhYaOIU0-", - "outputId": "32a40749-015a-4d2a-b27f-0ac61bf8b7dc", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 238 - } - }, - "execution_count": 11, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " zcta pm25\n", - "zcta \n", - "zip/1001 1001 9.398180\n", - "zip/1002 1002 8.026795\n", - "zip/1003 1003 8.949020\n", - "zip/1005 1005 6.409811\n", - "zip/1007 1007 7.375929" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
zctapm25
zcta
zip/100110019.398180
zip/100210028.026795
zip/100310038.949020
zip/100510056.409811
zip/100710077.375929
\n", - "
\n", - "
\n", - "\n", - "
\n", - " \n", - "\n", - " \n", - "\n", - " \n", - "
\n", - "\n", - "\n", - "
\n", - " \n", - "\n", - "\n", - "\n", - " \n", - "
\n", - "\n", - "
\n", - "
\n" - ], - "application/vnd.google.colaboratory.intrinsic+json": { - "type": "dataframe", - "variable_name": "pm25_df", - "repr_error": "cannot insert zcta, already exists" - } - }, - "metadata": {}, - "execution_count": 11 - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "# Request access to PDFM Embeddings" - ], - "metadata": { - "id": "0CUwjIxaeWCm" - } - }, - { - "cell_type": "code", - "source": [ - "!unzip /content/pdfm_embeddings.zip" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "yxEkU1GjibgC", - "outputId": "b0a29cdb-1b14-49d1-9302-604ca2ead218" - }, - "execution_count": 14, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Archive: /content/pdfm_embeddings.zip\n", - " creating: pdfm_embeddings/\n", - " creating: pdfm_embeddings/v0/\n", - " creating: pdfm_embeddings/v0/us/\n", - " inflating: pdfm_embeddings/v0/us/county.geojson \n", - " inflating: pdfm_embeddings/v0/us/county_embeddings.csv \n", - " inflating: pdfm_embeddings/v0/us/zcta.geojson \n", - " inflating: pdfm_embeddings/v0/us/zcta_embeddings.csv \n" - ] - } - ] - }, - { - "cell_type": "code", - "source": [ - "embeddings_file_path = \"/content/pdfm_embeddings/v0/us/zcta_embeddings.csv\"" - ], - "metadata": { - "id": "-DugeFaSG2Pi" - }, - "execution_count": 12, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "if not os.path.exists(embeddings_file_path):\n", - " raise FileNotFoundError(\"Please request the embeddings from Google\")" - ], - "metadata": { - "id": "JZZX7tzlHYa9" - }, - "execution_count": 15, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "zipcode_embeddings = pd.read_csv(embeddings_file_path).set_index(\"place\")\n", - "zipcode_embeddings.head()" - ], - "metadata": { - "id": "cWNluHJYHY57", - "outputId": "3d62af41-8d41-40c7-ab82-5b9174406cfd", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 374 - } - }, - "execution_count": 16, - "outputs": [ - { - 
"output_type": "execute_result", - "data": { - "text/plain": [ - " state county city ... feature327 feature328 feature329\n", - "place ... \n", - "zip/97910 OR Malheur County Jordan Valley ... -0.001661 -0.001010 4.495589\n", - "zip/89412 NV Washoe County Gerlach ... -0.024385 -0.000295 3.399393\n", - "zip/88030 NM Luna County Deming ... -0.116499 -0.051163 3.866543\n", - "zip/82633 WY Converse County Douglas ... -0.047864 -0.000042 7.453567\n", - "zip/59538 MT Phillips County Malta ... -0.161916 -0.001087 0.972243\n", - "\n", - "[5 rows x 336 columns]" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
statecountycitypopulationlatitudelongitudefeature0feature1feature2feature3feature4feature5feature6feature7feature8feature9feature10feature11feature12feature13feature14feature15feature16feature17feature18feature19feature20feature21feature22feature23feature24feature25feature26feature27feature28feature29feature30feature31feature32feature33feature34feature35feature36feature37feature38feature39feature40feature41feature42feature43...feature280feature281feature282feature283feature284feature285feature286feature287feature288feature289feature290feature291feature292feature293feature294feature295feature296feature297feature298feature299feature300feature301feature302feature303feature304feature305feature306feature307feature308feature309feature310feature311feature312feature313feature314feature315feature316feature317feature318feature319feature320feature321feature322feature323feature324feature325feature326feature327feature328feature329
place
zip/97910ORMalheur CountyJordan Valley60942.749076-117.511459-0.1382271.1203770.0729000.2974420.7726730.2864670.8023980.6208470.0608100.1259260.4529050.3172101.5604880.1737170.338584-0.0118760.3699180.734241-0.0231610.9279180.1311290.1749150.1869620.777327-0.1252550.2529970.1267030.2827130.2862170.1082220.1380431.8938930.0343410.1161971.5786180.5945980.0833960.2177050.0858330.0509590.0713890.0467940.9007410.089737...0.7954634.0488264.411071-0.0725851.1728042.780721-0.007037-0.167074-0.1690712.4516424.7057452.8610440.9647615.425952-0.086446-0.0040441.692680-0.1293041.2627670.5843934.957980-0.1136193.977844-0.0562660.1546796.833614-0.1685953.852105-0.0086000.1683672.6726796.9380711.4625264.7003793.523755-0.1699710.279797-0.030630-0.0000144.489360-0.158891-0.1687081.231994-0.1557653.043214-0.1697490.177463-0.001661-0.0010104.495589
zip/89412NVWashoe CountyGerlach9841.102934-119.695361-0.1413791.4227820.2342690.1591560.8902410.2154270.5332001.1258300.1598910.3054490.6734480.2223711.1131960.1470470.2708580.1400510.5913811.3218580.1829561.6001260.1494801.2394540.3261860.724274-0.0570490.3277220.2250440.0928650.6364250.5410340.1688661.489893-0.0032550.5243511.6726420.4212620.7015390.2310940.2235120.1344800.4798520.2044051.0437660.396896...-0.0807315.3577233.973490-0.1005552.9424364.120401-0.027611-0.084821-0.0000681.3647406.0856805.550473-0.0579297.677364-0.1114880.4422422.686013-0.008651-0.0032874.6853846.117020-0.0833244.441121-0.002397-0.1199434.539424-0.0066764.993636-0.118508-0.1690391.8649534.146715-0.1182203.6888824.046134-0.0455371.627209-0.012242-0.0166434.668972-0.157417-0.0436062.788701-0.0625473.700745-0.169827-0.137990-0.024385-0.0002953.399393
zip/88030NMLuna CountyDeming2413932.191634-107.729431-0.0466661.4144240.1468031.1132561.1195761.0931990.9606360.1796420.7294882.4474392.2742042.7653250.9032840.5201622.6043480.6885200.1644362.7558281.3122460.4522690.6126601.1352951.4404660.507069-0.1408090.4813061.0687172.1886970.2543981.0192340.2777270.7164910.8611361.2322560.2103260.6940311.5040181.4303610.8425510.2220431.1145560.8564251.5187911.487212...0.4724494.0895551.3473470.1283492.5178920.7180340.5460532.494347-0.0248883.6581475.0963043.687950-0.1625214.8446360.7890833.0256700.667166-0.169164-0.1098913.0951284.816823-0.1698834.487709-0.058803-0.1673332.843048-0.0605443.279263-0.1586991.5351892.7872313.8619161.5691193.4872992.3346930.068785-0.162307-0.0538490.1605041.895565-0.0006540.4374754.2292950.2291992.0984691.1504970.716122-0.116499-0.0511633.866543
zip/82633WYConverse CountyDouglas947843.022270-105.410250-0.0902931.2662800.4478680.7818611.7318130.6027220.7370660.7433920.8236581.4752001.6397340.2023400.5459460.4861710.4257580.9515570.4481310.8894091.1162650.3313080.6942450.7740920.8934762.088896-0.0127671.3794200.5419440.9030941.2451580.5147470.2405201.8533850.6914781.0950860.1027790.9763971.5081521.0987090.6589311.5449333.2679901.0330220.9482430.768377...4.7097112.6199310.377791-0.1360901.5469291.914665-0.038279-0.1582913.8462241.6008722.5562405.0282413.1315691.8852511.7231523.2876590.592335-0.1696790.7995711.7110866.4347992.2594575.137226-0.1573763.7392571.8493440.8171784.254727-0.0314550.8603553.1857684.8155371.8895623.1471585.9028750.248916-0.013526-0.035991-0.0374672.813852-0.0337710.5797752.6886650.1756690.9909211.6448790.222517-0.047864-0.0000427.453567
zip/59538MTPhillips CountyMalta293648.112019-107.845520-0.0928861.256203-0.0508970.3219541.2818640.7377931.6621780.4510610.190265-0.1277650.5061150.7921370.3855070.3949260.4777611.0282060.0476810.8797400.7957300.2391350.2820840.1933261.2620940.453796-0.1693510.6013230.6703640.5819920.5400120.2189761.1954830.0351990.2742110.6815940.8199161.2347350.289213-0.0108910.9113120.7801662.9065060.5247231.004237-0.098108...0.7414100.4358250.415687-0.1685351.068465-0.014837-0.058268-0.168225-0.0843274.2620600.4449362.5040240.5346121.3660060.0862761.7662713.652062-0.162912-0.1028372.1234316.335544-0.1585362.916174-0.0005544.0131701.245277-0.1461091.6305250.193676-0.1324761.4476614.800499-0.0099520.2171681.960558-0.0804720.211844-0.045951-0.012506-0.169497-0.169915-0.0888290.338914-0.102962-0.1565831.4936962.259007-0.161916-0.0010870.972243
\n", - "

5 rows × 336 columns

\n", - "
\n", - "
\n", - "\n", - "
\n", - " \n", - "\n", - " \n", - "\n", - " \n", - "
\n", - "\n", - "\n", - "
\n", - " \n", - "\n", - "\n", - "\n", - " \n", - "
\n", - "\n", - "
\n", - "
\n" - ], - "application/vnd.google.colaboratory.intrinsic+json": { - "type": "dataframe", - "variable_name": "zipcode_embeddings" - } - }, - "metadata": {}, - "execution_count": 16 - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "# Join PDFM embeddings and Groud Truth (PM2.5 data)" - ], - "metadata": { - "id": "L7XAkE0fecVU" - } - }, - { - "cell_type": "code", - "source": [ - "data = pm25_df.join(zipcode_embeddings, how=\"inner\")\n", - "data.head()" - ], - "metadata": { - "id": "KjwmUj5SH9-P", - "outputId": "626b6758-821c-486e-9f2b-0b9f3366f433", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 429 - } - }, - "execution_count": 17, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " zcta pm25 state county ... feature326 feature327 feature328 feature329\n", - "zip/10001 10001 13.636975 NY New York County ... 1.581655 -0.015907 -0.0 3.977647\n", - "zip/10002 10002 12.896079 NY New York County ... 5.388407 -0.112462 -0.0 4.431747\n", - "zip/10003 10003 12.829412 NY New York County ... 4.039156 -0.156848 -0.0 5.094444\n", - "zip/10004 10004 13.851765 NY New York County ... 6.799802 -0.078682 -0.0 4.140815\n", - "zip/10005 10005 13.730000 NY New York County ... 7.295258 -0.169108 -0.0 3.934241\n", - "\n", - "[5 rows x 338 columns]" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
zctapm25statecountycitypopulationlatitudelongitudefeature0feature1feature2feature3feature4feature5feature6feature7feature8feature9feature10feature11feature12feature13feature14feature15feature16feature17feature18feature19feature20feature21feature22feature23feature24feature25feature26feature27feature28feature29feature30feature31feature32feature33feature34feature35feature36feature37feature38feature39feature40feature41...feature280feature281feature282feature283feature284feature285feature286feature287feature288feature289feature290feature291feature292feature293feature294feature295feature296feature297feature298feature299feature300feature301feature302feature303feature304feature305feature306feature307feature308feature309feature310feature311feature312feature313feature314feature315feature316feature317feature318feature319feature320feature321feature322feature323feature324feature325feature326feature327feature328feature329
zip/100011000113.636975NYNew York CountyNew York2696640.750672-73.997281-0.0730250.6216113.2040591.2192500.0866660.4227331.3622520.4482270.7807890.0761760.7448890.2061531.4862222.3416262.9349663.1976502.2272430.5367041.6419040.2157330.0847270.1646794.5111022.252191-0.0243150.2122653.8570312.2177710.3087664.0347351.6955190.567224-0.0773463.9538241.4250992.2758401.0427351.4901751.5572180.2061690.0495732.313412...1.501178-0.123201-0.167260-0.159472-0.001858-0.1299800.175728-1.200000e-07-0.008414-0.1553630.315214-0.009080-0.0-0.000000e+000.770695-0.152233-0.0496598.2094900.7008201.0008275.112181-0.1469531.654920-0.002328-0.004713-0.1620209.8142372.8720817.9058579.6326217.280638-0.0172870.4797471.1915476.030477-0.028483-0.002579-0.150458-0.145124-0.000732-0.0068552.571859-0.0060764.313338-0.105290-9.500000e-061.581655-0.015907-0.03.977647
zip/100021000212.896079NYNew York CountyNew York7680740.715762-73.986258-0.1447691.1022723.9807741.7518270.0829820.4745150.9498660.3798721.3949390.2391890.8318120.2316311.9853572.1120553.2094433.6519241.3691140.4494092.2784840.1879110.3427430.1780305.8587402.435344-0.0111250.2031723.5862072.0160040.2929034.1687022.1469751.026136-0.0851504.9311331.6647302.5723700.7167052.2254872.0631850.2763540.2723573.370650...0.361146-0.096761-0.146408-0.155304-0.074113-0.0150541.665584-1.102000e-05-0.002935-0.1696030.145326-0.025443-0.0-5.000000e-08-0.161510-0.153942-0.1664106.3761851.1634281.1389285.270078-0.1599241.996964-0.000041-0.020128-0.0506408.7715382.0861008.6025968.0601938.714463-0.0046880.1361990.5139825.643821-0.002350-0.008118-0.089103-0.156498-0.034799-0.1030222.710451-0.0000214.655519-0.159014-0.000000e+005.388407-0.112462-0.04.431747
zip/100031000312.829412NYNew York CountyNew York5444740.731829-73.989181-0.1314611.1320393.5121901.1768000.1617160.4593701.2910740.4754961.1464700.1990020.9686850.2490581.7881392.2959153.0391283.7267801.8224610.5469151.9554720.1888800.1981700.1385935.4824262.644991-0.0309930.2567503.7471682.1609550.3117894.1316262.0502550.924002-0.0777894.4016831.4903322.3791280.8712062.1519701.9291990.3612980.2296503.038753...0.461322-0.141985-0.085863-0.156253-0.005502-0.0468461.596130-4.810000e-06-0.089342-0.0995400.272569-0.009312-0.0-0.000000e+000.092946-0.169363-0.0425576.3766210.6064840.7780495.227136-0.1698882.206395-0.000619-0.166381-0.1673458.3668282.2904467.7625577.9475796.410265-0.0209470.9355080.4151925.355077-0.027503-0.000485-0.1231360.030198-0.001412-0.0047392.824203-0.0002984.061482-0.068253-1.350000e-064.039156-0.156848-0.05.094444
zip/100041000413.851765NYNew York CountyNew York479540.688630-74.018244-0.1476250.5467873.2294921.8868640.1086210.6030101.4427130.4711501.0288820.4117300.7083820.3468371.7986581.9404263.4354312.6930041.9563800.3004292.1739640.4409530.3398110.2757735.2715662.132799-0.0183420.3981953.2074841.4888410.5269574.3309311.8064101.111506-0.1184414.1217511.6049981.9961960.6748021.7560822.7415660.2118580.2898912.604892...0.393506-0.142817-0.135064-0.018993-0.057043-0.1316300.758817-0.000000e+00-0.000007-0.1346960.017068-0.007268-0.0-3.080000e-06-0.152431-0.105453-0.1410281.6790860.5341551.8516125.1750221.4382962.622611-0.000090-0.004313-0.1196872.1665973.7678998.0355784.9235876.152561-0.007616-0.0721990.0525213.980608-0.000069-0.000103-0.0270720.367801-0.003337-0.0104221.127201-0.0000793.496159-0.079587-8.000000e-086.799802-0.078682-0.04.140815
zip/100051000513.730000NYNew York CountyNew York863740.705974-74.008768-0.1428610.7785213.6405631.7745480.1031400.4621981.2381120.3626211.0670660.2403921.0121440.4501932.0653492.0761803.0301343.4653731.8889400.3740232.3237220.3071620.2439690.2053385.6401882.588103-0.0153750.3245943.7378641.6779350.3796594.3782352.0422961.018322-0.1093114.7153981.7573642.5008150.8889961.9595772.2038780.2273360.2436043.124810...-0.117290-0.084119-0.169264-0.115257-0.010591-0.0262481.397036-0.000000e+00-0.000242-0.1674690.550521-0.013257-0.0-3.500000e-07-0.1569890.212693-0.1332203.3823430.7180640.7326485.0143840.1573892.529759-0.000866-0.002382-0.0721323.9055902.8311048.1657866.1364556.366154-0.0074920.567445-0.1017463.834963-0.009529-0.0005050.0286451.454115-0.001952-0.0118452.385040-0.0000173.454340-0.159854-1.320000e-067.295258-0.169108-0.03.934241
\n", - "

5 rows × 338 columns

\n", - "
\n", - "
\n", - "\n", - "
\n", - " \n", - "\n", - " \n", - "\n", - " \n", - "
\n", - "\n", - "\n", - "
\n", - " \n", - "\n", - "\n", - "\n", - " \n", - "
\n", - "\n", - "
\n", - "
\n" - ], - "application/vnd.google.colaboratory.intrinsic+json": { - "type": "dataframe", - "variable_name": "data" - } - }, - "metadata": {}, - "execution_count": 17 - } - ] - }, - { - "cell_type": "code", - "source": [ - "data.shape" - ], - "metadata": { - "id": "UUKLoxY6JkUt", - "outputId": "6518e2c5-8dff-4d07-851f-c486f2da50b2", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "execution_count": 18, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "(29298, 338)" - ] - }, - "metadata": {}, - "execution_count": 18 - } - ] - }, - { - "cell_type": "code", - "source": [ - "embedding_features = [f\"feature{x}\" for x in range(330)]\n", - "label = \"pm25\"" - ], - "metadata": { - "id": "NvHQ97WYJmOd" - }, - "execution_count": 19, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "data = data.dropna(subset=[label])" - ], - "metadata": { - "id": "35igKORxJ15A" - }, - "execution_count": 20, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "# Split Train and Test Data" - ], - "metadata": { - "id": "tuCYm8dcevqo" - } - }, - { - "cell_type": "code", - "source": [ - "\n", - "data = data[embedding_features + [label]]\n", - "X = data[embedding_features]\n", - "y = data[label]\n", - "\n", - "X_train, X_test, y_train, y_test = train_test_split(\n", - " X, y, test_size=0.2, random_state=42\n", - ")" - ], - "metadata": { - "id": "IgSmPhJTJ2QW" - }, - "execution_count": 21, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "# Fit K-Nearest Neighbors Model" - ], - "metadata": { - "id": "iHys75z1fFFJ" - } - }, - { - "cell_type": "code", - "source": [ - "k = 5\n", - "model = KNeighborsRegressor(n_neighbors=k)\n", - "model.fit(X_train, y_train)\n", - "\n", - "y_pred = model.predict(X_test)" - ], - "metadata": { - "id": "KCgs-cyoJ5Nm" - }, - "execution_count": 22, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "\n", - "evaluation_df = pd.DataFrame({\"y\": y_test, 
\"y_pred\": y_pred})\n", - "# Evaluate the model\n", - "metrics = evaluate_model(evaluation_df)\n", - "print(metrics)" - ], - "metadata": { - "id": "TkWTkqPCKEYu", - "outputId": "01404579-141a-41c3-e7ac-d1576130f5f9", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "execution_count": 23, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "{'r2': 0.9423382724431623, 'r': 0.9708871328512507, 'rmse': 0.6494730499555952, 'mae': 0.4204252179218217, 'mape': 0.05619889943563576}\n" - ] - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "# Evaluate K-Nearest Neighbors Model" - ], - "metadata": { - "id": "ywbC7hLSfMZB" - } - }, - { - "cell_type": "code", - "source": [ - "xy_lim = (0, 30)\n", - "plot_actual_vs_predicted(\n", - " evaluation_df,\n", - " xlim=xy_lim,\n", - " ylim=xy_lim,\n", - " title=\"Actual vs Predicted PM2.5\",\n", - " x_label=\"Actual PM2.5\",\n", - " y_label=\"Predicted PM2.5\",\n", - ")" - ], - "metadata": { - "id": "LZORS0JQKjQ4", - "outputId": "05aa4e15-7008-4663-a364-4406ef00a5a3", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 817 - } - }, - "execution_count": 28, - "outputs": [ - { - "output_type": "display_data", - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "
\n", - "
\n", - "\n", - "" - ] - }, - "metadata": {} - } - ] - }, - { - "cell_type": "code", - "source": [], - "metadata": { - "id": "SjOVAvPYkDYx" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "# Fit Random Forest Regressor model" - ], - "metadata": { - "id": "LPXkNlenjGp_" - } - }, - { - "cell_type": "code", - "source": [ - "model = RandomForestRegressor(n_estimators=10,verbose=10,n_jobs=-1)\n", - "model.fit(X_train, y_train)\n", - "\n", - "y_pred = model.predict(X_test)" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "M7S4naYnjN4l", - "outputId": "55c7d7d8-b304-4fc2-d8f6-5ed014577bf9" - }, - "execution_count": 29, - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": [ - "[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.\n" - ] - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "building tree 1 of 10building tree 2 of 10\n", - "\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "[Parallel(n_jobs=-1)]: Done 1 tasks | elapsed: 20.0s\n" - ] - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "building tree 3 of 10\n", - "building tree 4 of 10\n", - "building tree 5 of 10\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "[Parallel(n_jobs=-1)]: Done 4 tasks | elapsed: 38.8s\n" - ] - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "building tree 6 of 10\n", - "building tree 7 of 10\n", - "building tree 8 of 10\n", - "building tree 9 of 10\n", - "building tree 10 of 10\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "[Parallel(n_jobs=-1)]: Done 10 out of 10 | elapsed: 1.6min finished\n", - "[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.\n", - "[Parallel(n_jobs=2)]: Done 1 tasks | elapsed: 0.0s\n", - "[Parallel(n_jobs=2)]: Done 4 tasks | elapsed: 0.0s\n", - 
"[Parallel(n_jobs=2)]: Done 10 out of 10 | elapsed: 0.0s finished\n" - ] - } - ] - }, - { - "cell_type": "code", - "source": [ - "evaluation_df = pd.DataFrame({\"y\": y_test, \"y_pred\": y_pred})\n", - "# Evaluate the model\n", - "metrics = evaluate_model(evaluation_df)\n", - "print(metrics)" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "CaxFfeEtjRiH", - "outputId": "590e443f-6f93-46d9-ccfe-d1723600d0a3" - }, - "execution_count": 30, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "{'r2': 0.9086928793143653, 'r': 0.9539294211355118, 'rmse': 0.817277670967286, 'mae': 0.568816793219495, 'mape': 0.07572915522396176}\n" - ] - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "# Evaluate Random Forest Model" - ], - "metadata": { - "id": "v96jVyGRjWtE" - } - }, - { - "cell_type": "code", - "source": [ - "xy_lim = (0, 30)\n", - "plot_actual_vs_predicted(\n", - " evaluation_df,\n", - " xlim=xy_lim,\n", - " ylim=xy_lim,\n", - " title=\"Actual vs Predicted PM2.5\",\n", - " x_label=\"Actual PM2.5\",\n", - " y_label=\"Predicted PM2.5\",\n", - ")" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 817 - }, - "id": "bVnYyd-zjb85", - "outputId": "76b5b25d-a9d7-4c16-c1d7-015e814f278d" - }, - "execution_count": 31, - "outputs": [ - { - "output_type": "display_data", - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "
\n", - "
\n", - "\n", - "" - ] - }, - "metadata": {} - } - ] - } - ], - "metadata": { + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "view-in-github" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tE3akitQdA-m" + }, + "source": [ + "**Predicting US PM2.5 levels using Google's Population Dynamics Foundation Model**\n", + "\n", + "Useful Resources:\n", + "1. https://github.com/opengeos/GeoAI-Tutorials/blob/main/docs/PDFM/zillow_home_value.ipynb\n", + "2. https://github.com/google-research/population-dynamics/tree/master/notebooks" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TbTs9lKqddKS" + }, + "source": [ + "Acknowledgements:\n", + "This notebook is based on tutorials - [PDFM notebook](https://github.com/google-research/population-dynamics/tree/master/notebooks) and awesome tutorial by giswqs opengeos PDFM [zillow home price](https://github.com/opengeos/GeoAI-Tutorials/blob/main/docs/PDFM/zillow_home_value.ipynb)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "JqjIF4kAKZGR" + }, + "outputs": [], + "source": [ + "%%capture\n", + "!pip install leafmap" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "lIYdn1woOS1n" + }, + "outputs": [], + "source": [ + "# import libraries\n", + "import pandas as pd\n", + "import os\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.linear_model import LinearRegression\n", + "from sklearn.ensemble import RandomForestRegressor\n", + "from sklearn.neighbors import KNeighborsRegressor\n", + "from leafmap.common import evaluate_model, plot_actual_vs_predicted, download_file" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0z3FKT1fgePa" + }, + "source": [ + "# Get US PM2.5 data\n", + "Link to data: https://usc-geohealth-hub-uscssi.hub.arcgis.com/documents/7fc448343d6643f3bb13157fd65aed4f/about" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": { + "id": "8VYkB_TeA0kP" + }, + "outputs": [], + "source": [ + "df0 = pd.read_excel(\"/content/pm25_and_disparity.xlsx\", sheet_name=\"data_part1\")\n", + "df1 = pd.read_excel(\"/content/pm25_and_disparity.xlsx\", sheet_name=\"data_part2\")\n", + "df2 = pd.read_excel(\"/content/pm25_and_disparity.xlsx\", sheet_name=\"data_part3\")\n", + "df3 = pd.read_excel(\"/content/pm25_and_disparity.xlsx\", sheet_name=\"data_part4\")\n", + "df4 = pd.read_excel(\"/content/pm25_and_disparity.xlsx\", sheet_name=\"data_part5\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GYC_GWkdgodj" + }, + "source": [ + "# Process PM2.5 data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { "colab": { - "provenance": [], - "include_colab_link": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "QE3MZiRYA6q1", + "outputId": "b194649f-7e4a-4dfe-e1ed-8bc825fd5cb5" + }, + "outputs": [], + "source": [ + "df = pd.concat([df0, df1, df2, df3, df4], ignore_index=True)\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "p9NCYMyxGBQB", + "outputId": "530a1697-c230-4199-d9f2-1cd07130d07e" + }, + "outputs": [], + "source": [ + "df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ip7NBwl-EDsf", + "outputId": "4f5b299d-b6a3-4859-fa0a-f14b96bf3221" + }, + "outputs": [], + "source": [ + "df[\"zcta\"].nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 272 + }, + "id": "8qtI9VgiCcvj", + "outputId": "c2365f08-5de6-45aa-a4fb-1d5b3f7a86a4" + }, + "outputs": [], + "source": [ + "pm25_df = 
df.groupby([\"zcta\"]).mean()[\"pm25\"]\n", + "pm25_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 272 + }, + "id": "6-qSy8lVDDoM", + "outputId": "1f086442-221a-4f64-ee9d-db456bb21808" + }, + "outputs": [], + "source": [ + "pm25_df.dropna(axis=0, inplace=True)\n", + "pm25_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 290 + }, + "id": "5kAxQ3uVGswD", + "outputId": "5c326987-a0c0-4f4b-f6ec-8094288375c3" + }, + "outputs": [], + "source": [ + "pm25_df.index = pm25_df.index.astype(int)\n", + "print(pm25_df.shape)\n", + "pm25_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 238 + }, + "id": "dBOUhYaOIU0-", + "outputId": "32a40749-015a-4d2a-b27f-0ac61bf8b7dc" + }, + "outputs": [], + "source": [ + "pm25_df = pm25_df.reset_index(drop=False) # Remove inplace=True\n", + "pm25_df.index = pm25_df[\"zcta\"].apply(lambda x: f\"zip/{x}\") # Access 'zcta' column\n", + "pm25_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0CUwjIxaeWCm" + }, + "source": [ + "# Request access to PDFM Embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "yxEkU1GjibgC", + "outputId": "b0a29cdb-1b14-49d1-9302-604ca2ead218" + }, + "outputs": [], + "source": [ + "!unzip /content/pdfm_embeddings.zip" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-DugeFaSG2Pi" + }, + "outputs": [], + "source": [ + "embeddings_file_path = \"/content/pdfm_embeddings/v0/us/zcta_embeddings.csv\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "JZZX7tzlHYa9" + }, + "outputs": [], + "source": [ + "if 
not os.path.exists(embeddings_file_path):\n", + " raise FileNotFoundError(\"Please request the embeddings from Google\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 374 + }, + "id": "cWNluHJYHY57", + "outputId": "3d62af41-8d41-40c7-ab82-5b9174406cfd" + }, + "outputs": [], + "source": [ + "zipcode_embeddings = pd.read_csv(embeddings_file_path).set_index(\"place\")\n", + "zipcode_embeddings.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "L7XAkE0fecVU" + }, + "source": [ + "# Join PDFM embeddings and Ground Truth (PM2.5 data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 429 + }, + "id": "KjwmUj5SH9-P", + "outputId": "626b6758-821c-486e-9f2b-0b9f3366f433" + }, + "outputs": [], + "source": [ + "data = pm25_df.join(zipcode_embeddings, how=\"inner\")\n", + "data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "UUKLoxY6JkUt", + "outputId": "6518e2c5-8dff-4d07-851f-c486f2da50b2" + }, + "outputs": [], + "source": [ + "data.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NvHQ97WYJmOd" + }, + "outputs": [], + "source": [ + "embedding_features = [f\"feature{x}\" for x in range(330)]\n", + "label = \"pm25\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "35igKORxJ15A" + }, + "outputs": [], + "source": [ + "data = data.dropna(subset=[label])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tuCYm8dcevqo" + }, + "source": [ + "# Split Train and Test Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "IgSmPhJTJ2QW" + }, + "outputs": [], + "source": [ + "data = data[embedding_features + [label]]\n", + "X = 
data[embedding_features]\n", + "y = data[label]\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + " X, y, test_size=0.2, random_state=42\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "iHys75z1fFFJ" + }, + "source": [ + "# Fit K-Nearest Neighbors Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "KCgs-cyoJ5Nm" + }, + "outputs": [], + "source": [ + "# k-nearest-neighbours regression on the embedding features; predictions are for the held-out test rows.\n", + "k = 5\n", + "model = KNeighborsRegressor(n_neighbors=k)\n", + "model.fit(X_train, y_train)\n", + "\n", + "y_pred = model.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "TkWTkqPCKEYu", + "outputId": "01404579-141a-41c3-e7ac-d1576130f5f9" + }, + "outputs": [], + "source": [ + "evaluation_df = pd.DataFrame({\"y\": y_test, \"y_pred\": y_pred})\n", + "# Evaluate the model\n", + "metrics = evaluate_model(evaluation_df)\n", + "print(metrics)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ywbC7hLSfMZB" + }, + "source": [ + "# Evaluate K-Nearest Neighbors Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 817 + }, + "id": "LZORS0JQKjQ4", + "outputId": "05aa4e15-7008-4663-a364-4406ef00a5a3" + }, + "outputs": [], + "source": [ + "xy_lim = (0, 30)\n", + "plot_actual_vs_predicted(\n", + " evaluation_df,\n", + " xlim=xy_lim,\n", + " ylim=xy_lim,\n", + " title=\"Actual vs Predicted PM2.5\",\n", + " x_label=\"Actual PM2.5\",\n", + " y_label=\"Predicted PM2.5\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SjOVAvPYkDYx" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LPXkNlenjGp_" + }, + "source": [ + "# Fit Random Forest Regressor model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "M7S4naYnjN4l", + "outputId": "55c7d7d8-b304-4fc2-d8f6-5ed014577bf9" + }, + "outputs": [], + "source": [ + "# random_state is pinned to the same seed as the train/test split so the forest,\n", + "# and the metrics computed from it, are identical on every Restart & Run All.\n", + "model = RandomForestRegressor(n_estimators=10, verbose=10, n_jobs=-1, random_state=42)\n", + "model.fit(X_train, y_train)\n", + "\n", + "y_pred = model.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "CaxFfeEtjRiH", + "outputId": "590e443f-6f93-46d9-ccfe-d1723600d0a3" + }, + "outputs": [], + "source": [ + "evaluation_df = pd.DataFrame({\"y\": y_test, \"y_pred\": y_pred})\n", + "# Evaluate the model\n", + "metrics = evaluate_model(evaluation_df)\n", + "print(metrics)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "v96jVyGRjWtE" + }, + "source": [ + "# Evaluate Random Forest Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 817 + }, + "id": "bVnYyd-zjb85", + "outputId": "76b5b25d-a9d7-4c16-c1d7-015e814f278d" + }, + "outputs": [], + "source": [ + "xy_lim = (0, 30)\n", + "plot_actual_vs_predicted(\n", + " evaluation_df,\n", + " xlim=xy_lim,\n", + " ylim=xy_lim,\n", + " title=\"Actual vs Predicted PM2.5\",\n", + " x_label=\"Actual PM2.5\",\n", + " y_label=\"Predicted PM2.5\",\n", + ")" + ] + } + ], + "metadata": { + "colab": { + "include_colab_link": true, + "provenance": [] }, - "nbformat": 4, - "nbformat_minor": 0 -} \ No newline at end of file + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}