From ab12976b4a17764be14dbf0ada4eb1e2cea2cb4f Mon Sep 17 00:00:00 2001
From: Aninda Goswamy <39881731+anindabitm@users.noreply.github.com>
Date: Thu, 20 Feb 2025 16:02:11 +0530
Subject: [PATCH 1/2] Predicting US PM2.5 levels using PDFM
Predicting US PM2.5 levels using Google's Population Dynamics Foundation Model
---
PDFM_embeddings_to_predict_PM2_5_in_US.ipynb | 3522 ++++++++++++++++++
1 file changed, 3522 insertions(+)
create mode 100644 PDFM_embeddings_to_predict_PM2_5_in_US.ipynb
diff --git a/PDFM_embeddings_to_predict_PM2_5_in_US.ipynb b/PDFM_embeddings_to_predict_PM2_5_in_US.ipynb
new file mode 100644
index 0000000..ce42a4f
--- /dev/null
+++ b/PDFM_embeddings_to_predict_PM2_5_in_US.ipynb
@@ -0,0 +1,3522 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "view-in-github",
+ "colab_type": "text"
+ },
+ "source": [
+ "
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "**Predicting US PM2.5 levels using Google's Population Dynamics Foundation Model**\n",
+ "\n",
+ "Useful Resources:\n",
+ "1. https://github.com/opengeos/GeoAI-Tutorials/blob/main/docs/PDFM/zillow_home_value.ipynb\n",
+ "2. https://github.com/google-research/population-dynamics/tree/master/notebooks"
+ ],
+ "metadata": {
+ "id": "tE3akitQdA-m"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Acknowledgements:\n",
+ "This notebook is based on two tutorials: the [PDFM notebooks](https://github.com/google-research/population-dynamics/tree/master/notebooks) from Google Research, and the excellent [Zillow home value tutorial](https://github.com/opengeos/GeoAI-Tutorials/blob/main/docs/PDFM/zillow_home_value.ipynb) by giswqs (opengeos)."
+ ],
+ "metadata": {
+ "id": "TbTs9lKqddKS"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "%%capture\n",
+ "%pip install leafmap"
+ ],
+ "metadata": {
+ "id": "JqjIF4kAKZGR"
+ },
+ "execution_count": 2,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "id": "lIYdn1woOS1n"
+ },
+ "outputs": [],
+ "source": [
+ "#import libraries\n",
+ "import pandas as pd\n",
+ "import os\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn.linear_model import LinearRegression\n",
+ "from sklearn.ensemble import RandomForestRegressor\n",
+ "from sklearn.neighbors import KNeighborsRegressor\n",
+ "from leafmap.common import evaluate_model, plot_actual_vs_predicted, download_file"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# Get US PM2.5 data\n",
+ "Link to data: https://usc-geohealth-hub-uscssi.hub.arcgis.com/documents/7fc448343d6643f3bb13157fd65aed4f/about"
+ ],
+ "metadata": {
+ "id": "0z3FKT1fgePa"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "pm25_xlsx_path = \"/content/pm25_and_disparity.xlsx\"\n",
+ "pm25_sheet_names = [f\"data_part{i}\" for i in range(1, 6)]\n",
+ "# A list-valued sheet_name parses all sheets from a single workbook open and returns {sheet name: DataFrame}.\n",
+ "pm25_sheets = pd.read_excel(pm25_xlsx_path, sheet_name=pm25_sheet_names)\n",
+ "df0, df1, df2, df3, df4 = (pm25_sheets[name] for name in pm25_sheet_names)"
+ ],
+ "metadata": {
+ "id": "8VYkB_TeA0kP"
+ },
+ "execution_count": 4,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# Process PM2.5 data"
+ ],
+ "metadata": {
+ "id": "GYC_GWkdgodj"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df = pd.concat([df0,df1,df2,df3,df4],ignore_index=True)\n",
+ "df.head()"
+ ],
+ "metadata": {
+ "id": "QE3MZiRYA6q1",
+ "outputId": "b194649f-7e4a-4dfe-e1ed-8bc825fd5cb5",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 206
+ }
+ },
+ "execution_count": 5,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " Unnamed: 0 year ZIP zcta popdensity ... asian_pop white_pop medhouseholdincome pm25 urban\n",
+ "0 1 2000 1 NaN NaN ... NaN NaN NaN NaN NaN\n",
+ "1 2 2001 1 NaN NaN ... NaN NaN NaN NaN NaN\n",
+ "2 3 2002 1 NaN NaN ... NaN NaN NaN NaN NaN\n",
+ "3 4 2003 1 NaN NaN ... NaN NaN NaN NaN NaN\n",
+ "4 5 2004 1 NaN NaN ... NaN NaN NaN NaN NaN\n",
+ "\n",
+ "[5 rows x 21 columns]"
+ ],
+ "text/html": [
+ "\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Unnamed: 0 | \n",
+ " year | \n",
+ " ZIP | \n",
+ " zcta | \n",
+ " popdensity | \n",
+ " population | \n",
+ " poverty | \n",
+ " education | \n",
+ " pct_blk | \n",
+ " pct_hisp | \n",
+ " pct_native | \n",
+ " pct_asian | \n",
+ " pct_white | \n",
+ " black_pop | \n",
+ " hisp_pop | \n",
+ " native_pop | \n",
+ " asian_pop | \n",
+ " white_pop | \n",
+ " medhouseholdincome | \n",
+ " pm25 | \n",
+ " urban | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1 | \n",
+ " 2000 | \n",
+ " 1 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 2 | \n",
+ " 2001 | \n",
+ " 1 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 3 | \n",
+ " 2002 | \n",
+ " 1 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 4 | \n",
+ " 2003 | \n",
+ " 1 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 5 | \n",
+ " 2004 | \n",
+ " 1 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "variable_name": "df"
+ }
+ },
+ "metadata": {},
+ "execution_count": 5
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df.shape"
+ ],
+ "metadata": {
+ "id": "p9NCYMyxGBQB",
+ "outputId": "530a1697-c230-4199-d9f2-1cd07130d07e",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ }
+ },
+ "execution_count": 6,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "(789260, 21)"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 6
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df[\"zcta\"].nunique()"
+ ],
+ "metadata": {
+ "id": "ip7NBwl-EDsf",
+ "outputId": "4f5b299d-b6a3-4859-fa0a-f14b96bf3221",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ }
+ },
+ "execution_count": 7,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "32406"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 7
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "pm25_df = df.groupby(\"zcta\")[\"pm25\"].mean()  # select pm25 first so only that column is aggregated\n",
+ "pm25_df.head()"
+ ],
+ "metadata": {
+ "id": "8qtI9VgiCcvj",
+ "outputId": "c2365f08-5de6-45aa-a4fb-1d5b3f7a86a4",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 272
+ }
+ },
+ "execution_count": 8,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "zcta\n",
+ "601.0 NaN\n",
+ "602.0 NaN\n",
+ "603.0 NaN\n",
+ "606.0 NaN\n",
+ "610.0 NaN\n",
+ "Name: pm25, dtype: float64"
+ ],
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pm25 | \n",
+ "
\n",
+ " \n",
+ " | zcta | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 601.0 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 602.0 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 603.0 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 606.0 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 610.0 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 8
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "pm25_df = pm25_df.dropna()  # discard ZCTAs whose mean PM2.5 is NaN\n",
+ "pm25_df.head()"
+ ],
+ "metadata": {
+ "id": "6-qSy8lVDDoM",
+ "outputId": "1f086442-221a-4f64-ee9d-db456bb21808",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 272
+ }
+ },
+ "execution_count": 9,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "zcta\n",
+ "1001.0 9.398180\n",
+ "1002.0 8.026795\n",
+ "1003.0 8.949020\n",
+ "1005.0 6.409811\n",
+ "1007.0 7.375929\n",
+ "Name: pm25, dtype: float64"
+ ],
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pm25 | \n",
+ "
\n",
+ " \n",
+ " | zcta | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 1001.0 | \n",
+ " 9.398180 | \n",
+ "
\n",
+ " \n",
+ " | 1002.0 | \n",
+ " 8.026795 | \n",
+ "
\n",
+ " \n",
+ " | 1003.0 | \n",
+ " 8.949020 | \n",
+ "
\n",
+ " \n",
+ " | 1005.0 | \n",
+ " 6.409811 | \n",
+ "
\n",
+ " \n",
+ " | 1007.0 | \n",
+ " 7.375929 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 9
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "pm25_df.index = pm25_df.index.astype(int)\n",
+ "print(pm25_df.shape)\n",
+ "pm25_df.head()"
+ ],
+ "metadata": {
+ "id": "5kAxQ3uVGswD",
+ "outputId": "5c326987-a0c0-4f4b-f6ec-8094288375c3",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 290
+ }
+ },
+ "execution_count": 10,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "(31956,)\n"
+ ]
+ },
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "zcta\n",
+ "1001 9.398180\n",
+ "1002 8.026795\n",
+ "1003 8.949020\n",
+ "1005 6.409811\n",
+ "1007 7.375929\n",
+ "Name: pm25, dtype: float64"
+ ],
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pm25 | \n",
+ "
\n",
+ " \n",
+ " | zcta | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 1001 | \n",
+ " 9.398180 | \n",
+ "
\n",
+ " \n",
+ " | 1002 | \n",
+ " 8.026795 | \n",
+ "
\n",
+ " \n",
+ " | 1003 | \n",
+ " 8.949020 | \n",
+ "
\n",
+ " \n",
+ " | 1005 | \n",
+ " 6.409811 | \n",
+ "
\n",
+ " \n",
+ " | 1007 | \n",
+ " 7.375929 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 10
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "pm25_df = pm25_df.reset_index(drop=False)  # keep 'zcta' as a regular column\n",
+ "pm25_df.index = [f\"zip/{z:05d}\" for z in pm25_df[\"zcta\"]]  # zero-pad to 5 digits (\"zip/01001\") to match PDFM place IDs; unnamed index avoids clashing with the 'zcta' column\n",
+ "pm25_df.head()"
+ ],
+ "metadata": {
+ "id": "dBOUhYaOIU0-",
+ "outputId": "32a40749-015a-4d2a-b27f-0ac61bf8b7dc",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 238
+ }
+ },
+ "execution_count": 11,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " zcta pm25\n",
+ "zcta \n",
+ "zip/1001 1001 9.398180\n",
+ "zip/1002 1002 8.026795\n",
+ "zip/1003 1003 8.949020\n",
+ "zip/1005 1005 6.409811\n",
+ "zip/1007 1007 7.375929"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " zcta | \n",
+ " pm25 | \n",
+ "
\n",
+ " \n",
+ " | zcta | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | zip/1001 | \n",
+ " 1001 | \n",
+ " 9.398180 | \n",
+ "
\n",
+ " \n",
+ " | zip/1002 | \n",
+ " 1002 | \n",
+ " 8.026795 | \n",
+ "
\n",
+ " \n",
+ " | zip/1003 | \n",
+ " 1003 | \n",
+ " 8.949020 | \n",
+ "
\n",
+ " \n",
+ " | zip/1005 | \n",
+ " 1005 | \n",
+ " 6.409811 | \n",
+ "
\n",
+ " \n",
+ " | zip/1007 | \n",
+ " 1007 | \n",
+ " 7.375929 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "variable_name": "pm25_df",
+ "repr_error": "cannot insert zcta, already exists"
+ }
+ },
+ "metadata": {},
+ "execution_count": 11
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# Request access to PDFM Embeddings"
+ ],
+ "metadata": {
+ "id": "0CUwjIxaeWCm"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "!unzip -n /content/pdfm_embeddings.zip -d /content"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "yxEkU1GjibgC",
+ "outputId": "b0a29cdb-1b14-49d1-9302-604ca2ead218"
+ },
+ "execution_count": 14,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Archive: /content/pdfm_embeddings.zip\n",
+ " creating: pdfm_embeddings/\n",
+ " creating: pdfm_embeddings/v0/\n",
+ " creating: pdfm_embeddings/v0/us/\n",
+ " inflating: pdfm_embeddings/v0/us/county.geojson \n",
+ " inflating: pdfm_embeddings/v0/us/county_embeddings.csv \n",
+ " inflating: pdfm_embeddings/v0/us/zcta.geojson \n",
+ " inflating: pdfm_embeddings/v0/us/zcta_embeddings.csv \n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "embeddings_file_path = \"/content/pdfm_embeddings/v0/us/zcta_embeddings.csv\""
+ ],
+ "metadata": {
+ "id": "-DugeFaSG2Pi"
+ },
+ "execution_count": 12,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "if not os.path.exists(embeddings_file_path):\n",
+ " raise FileNotFoundError(\"Please request the embeddings from Google\")"
+ ],
+ "metadata": {
+ "id": "JZZX7tzlHYa9"
+ },
+ "execution_count": 15,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "zipcode_embeddings = pd.read_csv(embeddings_file_path).set_index(\"place\")\n",
+ "zipcode_embeddings.head()"
+ ],
+ "metadata": {
+ "id": "cWNluHJYHY57",
+ "outputId": "3d62af41-8d41-40c7-ab82-5b9174406cfd",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 374
+ }
+ },
+ "execution_count": 16,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " state county city ... feature327 feature328 feature329\n",
+ "place ... \n",
+ "zip/97910 OR Malheur County Jordan Valley ... -0.001661 -0.001010 4.495589\n",
+ "zip/89412 NV Washoe County Gerlach ... -0.024385 -0.000295 3.399393\n",
+ "zip/88030 NM Luna County Deming ... -0.116499 -0.051163 3.866543\n",
+ "zip/82633 WY Converse County Douglas ... -0.047864 -0.000042 7.453567\n",
+ "zip/59538 MT Phillips County Malta ... -0.161916 -0.001087 0.972243\n",
+ "\n",
+ "[5 rows x 336 columns]"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " state | \n",
+ " county | \n",
+ " city | \n",
+ " population | \n",
+ " latitude | \n",
+ " longitude | \n",
+ " feature0 | \n",
+ " feature1 | \n",
+ " feature2 | \n",
+ " feature3 | \n",
+ " feature4 | \n",
+ " feature5 | \n",
+ " feature6 | \n",
+ " feature7 | \n",
+ " feature8 | \n",
+ " feature9 | \n",
+ " feature10 | \n",
+ " feature11 | \n",
+ " feature12 | \n",
+ " feature13 | \n",
+ " feature14 | \n",
+ " feature15 | \n",
+ " feature16 | \n",
+ " feature17 | \n",
+ " feature18 | \n",
+ " feature19 | \n",
+ " feature20 | \n",
+ " feature21 | \n",
+ " feature22 | \n",
+ " feature23 | \n",
+ " feature24 | \n",
+ " feature25 | \n",
+ " feature26 | \n",
+ " feature27 | \n",
+ " feature28 | \n",
+ " feature29 | \n",
+ " feature30 | \n",
+ " feature31 | \n",
+ " feature32 | \n",
+ " feature33 | \n",
+ " feature34 | \n",
+ " feature35 | \n",
+ " feature36 | \n",
+ " feature37 | \n",
+ " feature38 | \n",
+ " feature39 | \n",
+ " feature40 | \n",
+ " feature41 | \n",
+ " feature42 | \n",
+ " feature43 | \n",
+ " ... | \n",
+ " feature280 | \n",
+ " feature281 | \n",
+ " feature282 | \n",
+ " feature283 | \n",
+ " feature284 | \n",
+ " feature285 | \n",
+ " feature286 | \n",
+ " feature287 | \n",
+ " feature288 | \n",
+ " feature289 | \n",
+ " feature290 | \n",
+ " feature291 | \n",
+ " feature292 | \n",
+ " feature293 | \n",
+ " feature294 | \n",
+ " feature295 | \n",
+ " feature296 | \n",
+ " feature297 | \n",
+ " feature298 | \n",
+ " feature299 | \n",
+ " feature300 | \n",
+ " feature301 | \n",
+ " feature302 | \n",
+ " feature303 | \n",
+ " feature304 | \n",
+ " feature305 | \n",
+ " feature306 | \n",
+ " feature307 | \n",
+ " feature308 | \n",
+ " feature309 | \n",
+ " feature310 | \n",
+ " feature311 | \n",
+ " feature312 | \n",
+ " feature313 | \n",
+ " feature314 | \n",
+ " feature315 | \n",
+ " feature316 | \n",
+ " feature317 | \n",
+ " feature318 | \n",
+ " feature319 | \n",
+ " feature320 | \n",
+ " feature321 | \n",
+ " feature322 | \n",
+ " feature323 | \n",
+ " feature324 | \n",
+ " feature325 | \n",
+ " feature326 | \n",
+ " feature327 | \n",
+ " feature328 | \n",
+ " feature329 | \n",
+ "
\n",
+ " \n",
+ " | place | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | zip/97910 | \n",
+ " OR | \n",
+ " Malheur County | \n",
+ " Jordan Valley | \n",
+ " 609 | \n",
+ " 42.749076 | \n",
+ " -117.511459 | \n",
+ " -0.138227 | \n",
+ " 1.120377 | \n",
+ " 0.072900 | \n",
+ " 0.297442 | \n",
+ " 0.772673 | \n",
+ " 0.286467 | \n",
+ " 0.802398 | \n",
+ " 0.620847 | \n",
+ " 0.060810 | \n",
+ " 0.125926 | \n",
+ " 0.452905 | \n",
+ " 0.317210 | \n",
+ " 1.560488 | \n",
+ " 0.173717 | \n",
+ " 0.338584 | \n",
+ " -0.011876 | \n",
+ " 0.369918 | \n",
+ " 0.734241 | \n",
+ " -0.023161 | \n",
+ " 0.927918 | \n",
+ " 0.131129 | \n",
+ " 0.174915 | \n",
+ " 0.186962 | \n",
+ " 0.777327 | \n",
+ " -0.125255 | \n",
+ " 0.252997 | \n",
+ " 0.126703 | \n",
+ " 0.282713 | \n",
+ " 0.286217 | \n",
+ " 0.108222 | \n",
+ " 0.138043 | \n",
+ " 1.893893 | \n",
+ " 0.034341 | \n",
+ " 0.116197 | \n",
+ " 1.578618 | \n",
+ " 0.594598 | \n",
+ " 0.083396 | \n",
+ " 0.217705 | \n",
+ " 0.085833 | \n",
+ " 0.050959 | \n",
+ " 0.071389 | \n",
+ " 0.046794 | \n",
+ " 0.900741 | \n",
+ " 0.089737 | \n",
+ " ... | \n",
+ " 0.795463 | \n",
+ " 4.048826 | \n",
+ " 4.411071 | \n",
+ " -0.072585 | \n",
+ " 1.172804 | \n",
+ " 2.780721 | \n",
+ " -0.007037 | \n",
+ " -0.167074 | \n",
+ " -0.169071 | \n",
+ " 2.451642 | \n",
+ " 4.705745 | \n",
+ " 2.861044 | \n",
+ " 0.964761 | \n",
+ " 5.425952 | \n",
+ " -0.086446 | \n",
+ " -0.004044 | \n",
+ " 1.692680 | \n",
+ " -0.129304 | \n",
+ " 1.262767 | \n",
+ " 0.584393 | \n",
+ " 4.957980 | \n",
+ " -0.113619 | \n",
+ " 3.977844 | \n",
+ " -0.056266 | \n",
+ " 0.154679 | \n",
+ " 6.833614 | \n",
+ " -0.168595 | \n",
+ " 3.852105 | \n",
+ " -0.008600 | \n",
+ " 0.168367 | \n",
+ " 2.672679 | \n",
+ " 6.938071 | \n",
+ " 1.462526 | \n",
+ " 4.700379 | \n",
+ " 3.523755 | \n",
+ " -0.169971 | \n",
+ " 0.279797 | \n",
+ " -0.030630 | \n",
+ " -0.000014 | \n",
+ " 4.489360 | \n",
+ " -0.158891 | \n",
+ " -0.168708 | \n",
+ " 1.231994 | \n",
+ " -0.155765 | \n",
+ " 3.043214 | \n",
+ " -0.169749 | \n",
+ " 0.177463 | \n",
+ " -0.001661 | \n",
+ " -0.001010 | \n",
+ " 4.495589 | \n",
+ "
\n",
+ " \n",
+ " | zip/89412 | \n",
+ " NV | \n",
+ " Washoe County | \n",
+ " Gerlach | \n",
+ " 98 | \n",
+ " 41.102934 | \n",
+ " -119.695361 | \n",
+ " -0.141379 | \n",
+ " 1.422782 | \n",
+ " 0.234269 | \n",
+ " 0.159156 | \n",
+ " 0.890241 | \n",
+ " 0.215427 | \n",
+ " 0.533200 | \n",
+ " 1.125830 | \n",
+ " 0.159891 | \n",
+ " 0.305449 | \n",
+ " 0.673448 | \n",
+ " 0.222371 | \n",
+ " 1.113196 | \n",
+ " 0.147047 | \n",
+ " 0.270858 | \n",
+ " 0.140051 | \n",
+ " 0.591381 | \n",
+ " 1.321858 | \n",
+ " 0.182956 | \n",
+ " 1.600126 | \n",
+ " 0.149480 | \n",
+ " 1.239454 | \n",
+ " 0.326186 | \n",
+ " 0.724274 | \n",
+ " -0.057049 | \n",
+ " 0.327722 | \n",
+ " 0.225044 | \n",
+ " 0.092865 | \n",
+ " 0.636425 | \n",
+ " 0.541034 | \n",
+ " 0.168866 | \n",
+ " 1.489893 | \n",
+ " -0.003255 | \n",
+ " 0.524351 | \n",
+ " 1.672642 | \n",
+ " 0.421262 | \n",
+ " 0.701539 | \n",
+ " 0.231094 | \n",
+ " 0.223512 | \n",
+ " 0.134480 | \n",
+ " 0.479852 | \n",
+ " 0.204405 | \n",
+ " 1.043766 | \n",
+ " 0.396896 | \n",
+ " ... | \n",
+ " -0.080731 | \n",
+ " 5.357723 | \n",
+ " 3.973490 | \n",
+ " -0.100555 | \n",
+ " 2.942436 | \n",
+ " 4.120401 | \n",
+ " -0.027611 | \n",
+ " -0.084821 | \n",
+ " -0.000068 | \n",
+ " 1.364740 | \n",
+ " 6.085680 | \n",
+ " 5.550473 | \n",
+ " -0.057929 | \n",
+ " 7.677364 | \n",
+ " -0.111488 | \n",
+ " 0.442242 | \n",
+ " 2.686013 | \n",
+ " -0.008651 | \n",
+ " -0.003287 | \n",
+ " 4.685384 | \n",
+ " 6.117020 | \n",
+ " -0.083324 | \n",
+ " 4.441121 | \n",
+ " -0.002397 | \n",
+ " -0.119943 | \n",
+ " 4.539424 | \n",
+ " -0.006676 | \n",
+ " 4.993636 | \n",
+ " -0.118508 | \n",
+ " -0.169039 | \n",
+ " 1.864953 | \n",
+ " 4.146715 | \n",
+ " -0.118220 | \n",
+ " 3.688882 | \n",
+ " 4.046134 | \n",
+ " -0.045537 | \n",
+ " 1.627209 | \n",
+ " -0.012242 | \n",
+ " -0.016643 | \n",
+ " 4.668972 | \n",
+ " -0.157417 | \n",
+ " -0.043606 | \n",
+ " 2.788701 | \n",
+ " -0.062547 | \n",
+ " 3.700745 | \n",
+ " -0.169827 | \n",
+ " -0.137990 | \n",
+ " -0.024385 | \n",
+ " -0.000295 | \n",
+ " 3.399393 | \n",
+ "
\n",
+ " \n",
+ " | zip/88030 | \n",
+ " NM | \n",
+ " Luna County | \n",
+ " Deming | \n",
+ " 24139 | \n",
+ " 32.191634 | \n",
+ " -107.729431 | \n",
+ " -0.046666 | \n",
+ " 1.414424 | \n",
+ " 0.146803 | \n",
+ " 1.113256 | \n",
+ " 1.119576 | \n",
+ " 1.093199 | \n",
+ " 0.960636 | \n",
+ " 0.179642 | \n",
+ " 0.729488 | \n",
+ " 2.447439 | \n",
+ " 2.274204 | \n",
+ " 2.765325 | \n",
+ " 0.903284 | \n",
+ " 0.520162 | \n",
+ " 2.604348 | \n",
+ " 0.688520 | \n",
+ " 0.164436 | \n",
+ " 2.755828 | \n",
+ " 1.312246 | \n",
+ " 0.452269 | \n",
+ " 0.612660 | \n",
+ " 1.135295 | \n",
+ " 1.440466 | \n",
+ " 0.507069 | \n",
+ " -0.140809 | \n",
+ " 0.481306 | \n",
+ " 1.068717 | \n",
+ " 2.188697 | \n",
+ " 0.254398 | \n",
+ " 1.019234 | \n",
+ " 0.277727 | \n",
+ " 0.716491 | \n",
+ " 0.861136 | \n",
+ " 1.232256 | \n",
+ " 0.210326 | \n",
+ " 0.694031 | \n",
+ " 1.504018 | \n",
+ " 1.430361 | \n",
+ " 0.842551 | \n",
+ " 0.222043 | \n",
+ " 1.114556 | \n",
+ " 0.856425 | \n",
+ " 1.518791 | \n",
+ " 1.487212 | \n",
+ " ... | \n",
+ " 0.472449 | \n",
+ " 4.089555 | \n",
+ " 1.347347 | \n",
+ " 0.128349 | \n",
+ " 2.517892 | \n",
+ " 0.718034 | \n",
+ " 0.546053 | \n",
+ " 2.494347 | \n",
+ " -0.024888 | \n",
+ " 3.658147 | \n",
+ " 5.096304 | \n",
+ " 3.687950 | \n",
+ " -0.162521 | \n",
+ " 4.844636 | \n",
+ " 0.789083 | \n",
+ " 3.025670 | \n",
+ " 0.667166 | \n",
+ " -0.169164 | \n",
+ " -0.109891 | \n",
+ " 3.095128 | \n",
+ " 4.816823 | \n",
+ " -0.169883 | \n",
+ " 4.487709 | \n",
+ " -0.058803 | \n",
+ " -0.167333 | \n",
+ " 2.843048 | \n",
+ " -0.060544 | \n",
+ " 3.279263 | \n",
+ " -0.158699 | \n",
+ " 1.535189 | \n",
+ " 2.787231 | \n",
+ " 3.861916 | \n",
+ " 1.569119 | \n",
+ " 3.487299 | \n",
+ " 2.334693 | \n",
+ " 0.068785 | \n",
+ " -0.162307 | \n",
+ " -0.053849 | \n",
+ " 0.160504 | \n",
+ " 1.895565 | \n",
+ " -0.000654 | \n",
+ " 0.437475 | \n",
+ " 4.229295 | \n",
+ " 0.229199 | \n",
+ " 2.098469 | \n",
+ " 1.150497 | \n",
+ " 0.716122 | \n",
+ " -0.116499 | \n",
+ " -0.051163 | \n",
+ " 3.866543 | \n",
+ "
\n",
+ " \n",
+ " | zip/82633 | \n",
+ " WY | \n",
+ " Converse County | \n",
+ " Douglas | \n",
+ " 9478 | \n",
+ " 43.022270 | \n",
+ " -105.410250 | \n",
+ " -0.090293 | \n",
+ " 1.266280 | \n",
+ " 0.447868 | \n",
+ " 0.781861 | \n",
+ " 1.731813 | \n",
+ " 0.602722 | \n",
+ " 0.737066 | \n",
+ " 0.743392 | \n",
+ " 0.823658 | \n",
+ " 1.475200 | \n",
+ " 1.639734 | \n",
+ " 0.202340 | \n",
+ " 0.545946 | \n",
+ " 0.486171 | \n",
+ " 0.425758 | \n",
+ " 0.951557 | \n",
+ " 0.448131 | \n",
+ " 0.889409 | \n",
+ " 1.116265 | \n",
+ " 0.331308 | \n",
+ " 0.694245 | \n",
+ " 0.774092 | \n",
+ " 0.893476 | \n",
+ " 2.088896 | \n",
+ " -0.012767 | \n",
+ " 1.379420 | \n",
+ " 0.541944 | \n",
+ " 0.903094 | \n",
+ " 1.245158 | \n",
+ " 0.514747 | \n",
+ " 0.240520 | \n",
+ " 1.853385 | \n",
+ " 0.691478 | \n",
+ " 1.095086 | \n",
+ " 0.102779 | \n",
+ " 0.976397 | \n",
+ " 1.508152 | \n",
+ " 1.098709 | \n",
+ " 0.658931 | \n",
+ " 1.544933 | \n",
+ " 3.267990 | \n",
+ " 1.033022 | \n",
+ " 0.948243 | \n",
+ " 0.768377 | \n",
+ " ... | \n",
+ " 4.709711 | \n",
+ " 2.619931 | \n",
+ " 0.377791 | \n",
+ " -0.136090 | \n",
+ " 1.546929 | \n",
+ " 1.914665 | \n",
+ " -0.038279 | \n",
+ " -0.158291 | \n",
+ " 3.846224 | \n",
+ " 1.600872 | \n",
+ " 2.556240 | \n",
+ " 5.028241 | \n",
+ " 3.131569 | \n",
+ " 1.885251 | \n",
+ " 1.723152 | \n",
+ " 3.287659 | \n",
+ " 0.592335 | \n",
+ " -0.169679 | \n",
+ " 0.799571 | \n",
+ " 1.711086 | \n",
+ " 6.434799 | \n",
+ " 2.259457 | \n",
+ " 5.137226 | \n",
+ " -0.157376 | \n",
+ " 3.739257 | \n",
+ " 1.849344 | \n",
+ " 0.817178 | \n",
+ " 4.254727 | \n",
+ " -0.031455 | \n",
+ " 0.860355 | \n",
+ " 3.185768 | \n",
+ " 4.815537 | \n",
+ " 1.889562 | \n",
+ " 3.147158 | \n",
+ " 5.902875 | \n",
+ " 0.248916 | \n",
+ " -0.013526 | \n",
+ " -0.035991 | \n",
+ " -0.037467 | \n",
+ " 2.813852 | \n",
+ " -0.033771 | \n",
+ " 0.579775 | \n",
+ " 2.688665 | \n",
+ " 0.175669 | \n",
+ " 0.990921 | \n",
+ " 1.644879 | \n",
+ " 0.222517 | \n",
+ " -0.047864 | \n",
+ " -0.000042 | \n",
+ " 7.453567 | \n",
+ "
\n",
+ " \n",
+ " | zip/59538 | \n",
+ " MT | \n",
+ " Phillips County | \n",
+ " Malta | \n",
+ " 2936 | \n",
+ " 48.112019 | \n",
+ " -107.845520 | \n",
+ " -0.092886 | \n",
+ " 1.256203 | \n",
+ " -0.050897 | \n",
+ " 0.321954 | \n",
+ " 1.281864 | \n",
+ " 0.737793 | \n",
+ " 1.662178 | \n",
+ " 0.451061 | \n",
+ " 0.190265 | \n",
+ " -0.127765 | \n",
+ " 0.506115 | \n",
+ " 0.792137 | \n",
+ " 0.385507 | \n",
+ " 0.394926 | \n",
+ " 0.477761 | \n",
+ " 1.028206 | \n",
+ " 0.047681 | \n",
+ " 0.879740 | \n",
+ " 0.795730 | \n",
+ " 0.239135 | \n",
+ " 0.282084 | \n",
+ " 0.193326 | \n",
+ " 1.262094 | \n",
+ " 0.453796 | \n",
+ " -0.169351 | \n",
+ " 0.601323 | \n",
+ " 0.670364 | \n",
+ " 0.581992 | \n",
+ " 0.540012 | \n",
+ " 0.218976 | \n",
+ " 1.195483 | \n",
+ " 0.035199 | \n",
+ " 0.274211 | \n",
+ " 0.681594 | \n",
+ " 0.819916 | \n",
+ " 1.234735 | \n",
+ " 0.289213 | \n",
+ " -0.010891 | \n",
+ " 0.911312 | \n",
+ " 0.780166 | \n",
+ " 2.906506 | \n",
+ " 0.524723 | \n",
+ " 1.004237 | \n",
+ " -0.098108 | \n",
+ " ... | \n",
+ " 0.741410 | \n",
+ " 0.435825 | \n",
+ " 0.415687 | \n",
+ " -0.168535 | \n",
+ " 1.068465 | \n",
+ " -0.014837 | \n",
+ " -0.058268 | \n",
+ " -0.168225 | \n",
+ " -0.084327 | \n",
+ " 4.262060 | \n",
+ " 0.444936 | \n",
+ " 2.504024 | \n",
+ " 0.534612 | \n",
+ " 1.366006 | \n",
+ " 0.086276 | \n",
+ " 1.766271 | \n",
+ " 3.652062 | \n",
+ " -0.162912 | \n",
+ " -0.102837 | \n",
+ " 2.123431 | \n",
+ " 6.335544 | \n",
+ " -0.158536 | \n",
+ " 2.916174 | \n",
+ " -0.000554 | \n",
+ " 4.013170 | \n",
+ " 1.245277 | \n",
+ " -0.146109 | \n",
+ " 1.630525 | \n",
+ " 0.193676 | \n",
+ " -0.132476 | \n",
+ " 1.447661 | \n",
+ " 4.800499 | \n",
+ " -0.009952 | \n",
+ " 0.217168 | \n",
+ " 1.960558 | \n",
+ " -0.080472 | \n",
+ " 0.211844 | \n",
+ " -0.045951 | \n",
+ " -0.012506 | \n",
+ " -0.169497 | \n",
+ " -0.169915 | \n",
+ " -0.088829 | \n",
+ " 0.338914 | \n",
+ " -0.102962 | \n",
+ " -0.156583 | \n",
+ " 1.493696 | \n",
+ " 2.259007 | \n",
+ " -0.161916 | \n",
+ " -0.001087 | \n",
+ " 0.972243 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 336 columns
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "variable_name": "zipcode_embeddings"
+ }
+ },
+ "metadata": {},
+ "execution_count": 16
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# Join PDFM embeddings and Ground Truth (PM2.5 data)"
+ ],
+ "metadata": {
+ "id": "L7XAkE0fecVU"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "data = pm25_df.join(zipcode_embeddings, how=\"inner\")\n",
+ "data.head()"
+ ],
+ "metadata": {
+ "id": "KjwmUj5SH9-P",
+ "outputId": "626b6758-821c-486e-9f2b-0b9f3366f433",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 429
+ }
+ },
+ "execution_count": 17,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " zcta pm25 state county ... feature326 feature327 feature328 feature329\n",
+ "zip/10001 10001 13.636975 NY New York County ... 1.581655 -0.015907 -0.0 3.977647\n",
+ "zip/10002 10002 12.896079 NY New York County ... 5.388407 -0.112462 -0.0 4.431747\n",
+ "zip/10003 10003 12.829412 NY New York County ... 4.039156 -0.156848 -0.0 5.094444\n",
+ "zip/10004 10004 13.851765 NY New York County ... 6.799802 -0.078682 -0.0 4.140815\n",
+ "zip/10005 10005 13.730000 NY New York County ... 7.295258 -0.169108 -0.0 3.934241\n",
+ "\n",
+ "[5 rows x 338 columns]"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " zcta | \n",
+ " pm25 | \n",
+ " state | \n",
+ " county | \n",
+ " city | \n",
+ " population | \n",
+ " latitude | \n",
+ " longitude | \n",
+ " feature0 | \n",
+ " feature1 | \n",
+ " feature2 | \n",
+ " feature3 | \n",
+ " feature4 | \n",
+ " feature5 | \n",
+ " feature6 | \n",
+ " feature7 | \n",
+ " feature8 | \n",
+ " feature9 | \n",
+ " feature10 | \n",
+ " feature11 | \n",
+ " feature12 | \n",
+ " feature13 | \n",
+ " feature14 | \n",
+ " feature15 | \n",
+ " feature16 | \n",
+ " feature17 | \n",
+ " feature18 | \n",
+ " feature19 | \n",
+ " feature20 | \n",
+ " feature21 | \n",
+ " feature22 | \n",
+ " feature23 | \n",
+ " feature24 | \n",
+ " feature25 | \n",
+ " feature26 | \n",
+ " feature27 | \n",
+ " feature28 | \n",
+ " feature29 | \n",
+ " feature30 | \n",
+ " feature31 | \n",
+ " feature32 | \n",
+ " feature33 | \n",
+ " feature34 | \n",
+ " feature35 | \n",
+ " feature36 | \n",
+ " feature37 | \n",
+ " feature38 | \n",
+ " feature39 | \n",
+ " feature40 | \n",
+ " feature41 | \n",
+ " ... | \n",
+ " feature280 | \n",
+ " feature281 | \n",
+ " feature282 | \n",
+ " feature283 | \n",
+ " feature284 | \n",
+ " feature285 | \n",
+ " feature286 | \n",
+ " feature287 | \n",
+ " feature288 | \n",
+ " feature289 | \n",
+ " feature290 | \n",
+ " feature291 | \n",
+ " feature292 | \n",
+ " feature293 | \n",
+ " feature294 | \n",
+ " feature295 | \n",
+ " feature296 | \n",
+ " feature297 | \n",
+ " feature298 | \n",
+ " feature299 | \n",
+ " feature300 | \n",
+ " feature301 | \n",
+ " feature302 | \n",
+ " feature303 | \n",
+ " feature304 | \n",
+ " feature305 | \n",
+ " feature306 | \n",
+ " feature307 | \n",
+ " feature308 | \n",
+ " feature309 | \n",
+ " feature310 | \n",
+ " feature311 | \n",
+ " feature312 | \n",
+ " feature313 | \n",
+ " feature314 | \n",
+ " feature315 | \n",
+ " feature316 | \n",
+ " feature317 | \n",
+ " feature318 | \n",
+ " feature319 | \n",
+ " feature320 | \n",
+ " feature321 | \n",
+ " feature322 | \n",
+ " feature323 | \n",
+ " feature324 | \n",
+ " feature325 | \n",
+ " feature326 | \n",
+ " feature327 | \n",
+ " feature328 | \n",
+ " feature329 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | zip/10001 | \n",
+ " 10001 | \n",
+ " 13.636975 | \n",
+ " NY | \n",
+ " New York County | \n",
+ " New York | \n",
+ " 26966 | \n",
+ " 40.750672 | \n",
+ " -73.997281 | \n",
+ " -0.073025 | \n",
+ " 0.621611 | \n",
+ " 3.204059 | \n",
+ " 1.219250 | \n",
+ " 0.086666 | \n",
+ " 0.422733 | \n",
+ " 1.362252 | \n",
+ " 0.448227 | \n",
+ " 0.780789 | \n",
+ " 0.076176 | \n",
+ " 0.744889 | \n",
+ " 0.206153 | \n",
+ " 1.486222 | \n",
+ " 2.341626 | \n",
+ " 2.934966 | \n",
+ " 3.197650 | \n",
+ " 2.227243 | \n",
+ " 0.536704 | \n",
+ " 1.641904 | \n",
+ " 0.215733 | \n",
+ " 0.084727 | \n",
+ " 0.164679 | \n",
+ " 4.511102 | \n",
+ " 2.252191 | \n",
+ " -0.024315 | \n",
+ " 0.212265 | \n",
+ " 3.857031 | \n",
+ " 2.217771 | \n",
+ " 0.308766 | \n",
+ " 4.034735 | \n",
+ " 1.695519 | \n",
+ " 0.567224 | \n",
+ " -0.077346 | \n",
+ " 3.953824 | \n",
+ " 1.425099 | \n",
+ " 2.275840 | \n",
+ " 1.042735 | \n",
+ " 1.490175 | \n",
+ " 1.557218 | \n",
+ " 0.206169 | \n",
+ " 0.049573 | \n",
+ " 2.313412 | \n",
+ " ... | \n",
+ " 1.501178 | \n",
+ " -0.123201 | \n",
+ " -0.167260 | \n",
+ " -0.159472 | \n",
+ " -0.001858 | \n",
+ " -0.129980 | \n",
+ " 0.175728 | \n",
+ " -1.200000e-07 | \n",
+ " -0.008414 | \n",
+ " -0.155363 | \n",
+ " 0.315214 | \n",
+ " -0.009080 | \n",
+ " -0.0 | \n",
+ " -0.000000e+00 | \n",
+ " 0.770695 | \n",
+ " -0.152233 | \n",
+ " -0.049659 | \n",
+ " 8.209490 | \n",
+ " 0.700820 | \n",
+ " 1.000827 | \n",
+ " 5.112181 | \n",
+ " -0.146953 | \n",
+ " 1.654920 | \n",
+ " -0.002328 | \n",
+ " -0.004713 | \n",
+ " -0.162020 | \n",
+ " 9.814237 | \n",
+ " 2.872081 | \n",
+ " 7.905857 | \n",
+ " 9.632621 | \n",
+ " 7.280638 | \n",
+ " -0.017287 | \n",
+ " 0.479747 | \n",
+ " 1.191547 | \n",
+ " 6.030477 | \n",
+ " -0.028483 | \n",
+ " -0.002579 | \n",
+ " -0.150458 | \n",
+ " -0.145124 | \n",
+ " -0.000732 | \n",
+ " -0.006855 | \n",
+ " 2.571859 | \n",
+ " -0.006076 | \n",
+ " 4.313338 | \n",
+ " -0.105290 | \n",
+ " -9.500000e-06 | \n",
+ " 1.581655 | \n",
+ " -0.015907 | \n",
+ " -0.0 | \n",
+ " 3.977647 | \n",
+ "
\n",
+ " \n",
+ " | zip/10002 | \n",
+ " 10002 | \n",
+ " 12.896079 | \n",
+ " NY | \n",
+ " New York County | \n",
+ " New York | \n",
+ " 76807 | \n",
+ " 40.715762 | \n",
+ " -73.986258 | \n",
+ " -0.144769 | \n",
+ " 1.102272 | \n",
+ " 3.980774 | \n",
+ " 1.751827 | \n",
+ " 0.082982 | \n",
+ " 0.474515 | \n",
+ " 0.949866 | \n",
+ " 0.379872 | \n",
+ " 1.394939 | \n",
+ " 0.239189 | \n",
+ " 0.831812 | \n",
+ " 0.231631 | \n",
+ " 1.985357 | \n",
+ " 2.112055 | \n",
+ " 3.209443 | \n",
+ " 3.651924 | \n",
+ " 1.369114 | \n",
+ " 0.449409 | \n",
+ " 2.278484 | \n",
+ " 0.187911 | \n",
+ " 0.342743 | \n",
+ " 0.178030 | \n",
+ " 5.858740 | \n",
+ " 2.435344 | \n",
+ " -0.011125 | \n",
+ " 0.203172 | \n",
+ " 3.586207 | \n",
+ " 2.016004 | \n",
+ " 0.292903 | \n",
+ " 4.168702 | \n",
+ " 2.146975 | \n",
+ " 1.026136 | \n",
+ " -0.085150 | \n",
+ " 4.931133 | \n",
+ " 1.664730 | \n",
+ " 2.572370 | \n",
+ " 0.716705 | \n",
+ " 2.225487 | \n",
+ " 2.063185 | \n",
+ " 0.276354 | \n",
+ " 0.272357 | \n",
+ " 3.370650 | \n",
+ " ... | \n",
+ " 0.361146 | \n",
+ " -0.096761 | \n",
+ " -0.146408 | \n",
+ " -0.155304 | \n",
+ " -0.074113 | \n",
+ " -0.015054 | \n",
+ " 1.665584 | \n",
+ " -1.102000e-05 | \n",
+ " -0.002935 | \n",
+ " -0.169603 | \n",
+ " 0.145326 | \n",
+ " -0.025443 | \n",
+ " -0.0 | \n",
+ " -5.000000e-08 | \n",
+ " -0.161510 | \n",
+ " -0.153942 | \n",
+ " -0.166410 | \n",
+ " 6.376185 | \n",
+ " 1.163428 | \n",
+ " 1.138928 | \n",
+ " 5.270078 | \n",
+ " -0.159924 | \n",
+ " 1.996964 | \n",
+ " -0.000041 | \n",
+ " -0.020128 | \n",
+ " -0.050640 | \n",
+ " 8.771538 | \n",
+ " 2.086100 | \n",
+ " 8.602596 | \n",
+ " 8.060193 | \n",
+ " 8.714463 | \n",
+ " -0.004688 | \n",
+ " 0.136199 | \n",
+ " 0.513982 | \n",
+ " 5.643821 | \n",
+ " -0.002350 | \n",
+ " -0.008118 | \n",
+ " -0.089103 | \n",
+ " -0.156498 | \n",
+ " -0.034799 | \n",
+ " -0.103022 | \n",
+ " 2.710451 | \n",
+ " -0.000021 | \n",
+ " 4.655519 | \n",
+ " -0.159014 | \n",
+ " -0.000000e+00 | \n",
+ " 5.388407 | \n",
+ " -0.112462 | \n",
+ " -0.0 | \n",
+ " 4.431747 | \n",
+ "
\n",
+ " \n",
+ " | zip/10003 | \n",
+ " 10003 | \n",
+ " 12.829412 | \n",
+ " NY | \n",
+ " New York County | \n",
+ " New York | \n",
+ " 54447 | \n",
+ " 40.731829 | \n",
+ " -73.989181 | \n",
+ " -0.131461 | \n",
+ " 1.132039 | \n",
+ " 3.512190 | \n",
+ " 1.176800 | \n",
+ " 0.161716 | \n",
+ " 0.459370 | \n",
+ " 1.291074 | \n",
+ " 0.475496 | \n",
+ " 1.146470 | \n",
+ " 0.199002 | \n",
+ " 0.968685 | \n",
+ " 0.249058 | \n",
+ " 1.788139 | \n",
+ " 2.295915 | \n",
+ " 3.039128 | \n",
+ " 3.726780 | \n",
+ " 1.822461 | \n",
+ " 0.546915 | \n",
+ " 1.955472 | \n",
+ " 0.188880 | \n",
+ " 0.198170 | \n",
+ " 0.138593 | \n",
+ " 5.482426 | \n",
+ " 2.644991 | \n",
+ " -0.030993 | \n",
+ " 0.256750 | \n",
+ " 3.747168 | \n",
+ " 2.160955 | \n",
+ " 0.311789 | \n",
+ " 4.131626 | \n",
+ " 2.050255 | \n",
+ " 0.924002 | \n",
+ " -0.077789 | \n",
+ " 4.401683 | \n",
+ " 1.490332 | \n",
+ " 2.379128 | \n",
+ " 0.871206 | \n",
+ " 2.151970 | \n",
+ " 1.929199 | \n",
+ " 0.361298 | \n",
+ " 0.229650 | \n",
+ " 3.038753 | \n",
+ " ... | \n",
+ " 0.461322 | \n",
+ " -0.141985 | \n",
+ " -0.085863 | \n",
+ " -0.156253 | \n",
+ " -0.005502 | \n",
+ " -0.046846 | \n",
+ " 1.596130 | \n",
+ " -4.810000e-06 | \n",
+ " -0.089342 | \n",
+ " -0.099540 | \n",
+ " 0.272569 | \n",
+ " -0.009312 | \n",
+ " -0.0 | \n",
+ " -0.000000e+00 | \n",
+ " 0.092946 | \n",
+ " -0.169363 | \n",
+ " -0.042557 | \n",
+ " 6.376621 | \n",
+ " 0.606484 | \n",
+ " 0.778049 | \n",
+ " 5.227136 | \n",
+ " -0.169888 | \n",
+ " 2.206395 | \n",
+ " -0.000619 | \n",
+ " -0.166381 | \n",
+ " -0.167345 | \n",
+ " 8.366828 | \n",
+ " 2.290446 | \n",
+ " 7.762557 | \n",
+ " 7.947579 | \n",
+ " 6.410265 | \n",
+ " -0.020947 | \n",
+ " 0.935508 | \n",
+ " 0.415192 | \n",
+ " 5.355077 | \n",
+ " -0.027503 | \n",
+ " -0.000485 | \n",
+ " -0.123136 | \n",
+ " 0.030198 | \n",
+ " -0.001412 | \n",
+ " -0.004739 | \n",
+ " 2.824203 | \n",
+ " -0.000298 | \n",
+ " 4.061482 | \n",
+ " -0.068253 | \n",
+ " -1.350000e-06 | \n",
+ " 4.039156 | \n",
+ " -0.156848 | \n",
+ " -0.0 | \n",
+ " 5.094444 | \n",
+ "
\n",
+ " \n",
+ " | zip/10004 | \n",
+ " 10004 | \n",
+ " 13.851765 | \n",
+ " NY | \n",
+ " New York County | \n",
+ " New York | \n",
+ " 4795 | \n",
+ " 40.688630 | \n",
+ " -74.018244 | \n",
+ " -0.147625 | \n",
+ " 0.546787 | \n",
+ " 3.229492 | \n",
+ " 1.886864 | \n",
+ " 0.108621 | \n",
+ " 0.603010 | \n",
+ " 1.442713 | \n",
+ " 0.471150 | \n",
+ " 1.028882 | \n",
+ " 0.411730 | \n",
+ " 0.708382 | \n",
+ " 0.346837 | \n",
+ " 1.798658 | \n",
+ " 1.940426 | \n",
+ " 3.435431 | \n",
+ " 2.693004 | \n",
+ " 1.956380 | \n",
+ " 0.300429 | \n",
+ " 2.173964 | \n",
+ " 0.440953 | \n",
+ " 0.339811 | \n",
+ " 0.275773 | \n",
+ " 5.271566 | \n",
+ " 2.132799 | \n",
+ " -0.018342 | \n",
+ " 0.398195 | \n",
+ " 3.207484 | \n",
+ " 1.488841 | \n",
+ " 0.526957 | \n",
+ " 4.330931 | \n",
+ " 1.806410 | \n",
+ " 1.111506 | \n",
+ " -0.118441 | \n",
+ " 4.121751 | \n",
+ " 1.604998 | \n",
+ " 1.996196 | \n",
+ " 0.674802 | \n",
+ " 1.756082 | \n",
+ " 2.741566 | \n",
+ " 0.211858 | \n",
+ " 0.289891 | \n",
+ " 2.604892 | \n",
+ " ... | \n",
+ " 0.393506 | \n",
+ " -0.142817 | \n",
+ " -0.135064 | \n",
+ " -0.018993 | \n",
+ " -0.057043 | \n",
+ " -0.131630 | \n",
+ " 0.758817 | \n",
+ " -0.000000e+00 | \n",
+ " -0.000007 | \n",
+ " -0.134696 | \n",
+ " 0.017068 | \n",
+ " -0.007268 | \n",
+ " -0.0 | \n",
+ " -3.080000e-06 | \n",
+ " -0.152431 | \n",
+ " -0.105453 | \n",
+ " -0.141028 | \n",
+ " 1.679086 | \n",
+ " 0.534155 | \n",
+ " 1.851612 | \n",
+ " 5.175022 | \n",
+ " 1.438296 | \n",
+ " 2.622611 | \n",
+ " -0.000090 | \n",
+ " -0.004313 | \n",
+ " -0.119687 | \n",
+ " 2.166597 | \n",
+ " 3.767899 | \n",
+ " 8.035578 | \n",
+ " 4.923587 | \n",
+ " 6.152561 | \n",
+ " -0.007616 | \n",
+ " -0.072199 | \n",
+ " 0.052521 | \n",
+ " 3.980608 | \n",
+ " -0.000069 | \n",
+ " -0.000103 | \n",
+ " -0.027072 | \n",
+ " 0.367801 | \n",
+ " -0.003337 | \n",
+ " -0.010422 | \n",
+ " 1.127201 | \n",
+ " -0.000079 | \n",
+ " 3.496159 | \n",
+ " -0.079587 | \n",
+ " -8.000000e-08 | \n",
+ " 6.799802 | \n",
+ " -0.078682 | \n",
+ " -0.0 | \n",
+ " 4.140815 | \n",
+ "
\n",
+ " \n",
+ " | zip/10005 | \n",
+ " 10005 | \n",
+ " 13.730000 | \n",
+ " NY | \n",
+ " New York County | \n",
+ " New York | \n",
+ " 8637 | \n",
+ " 40.705974 | \n",
+ " -74.008768 | \n",
+ " -0.142861 | \n",
+ " 0.778521 | \n",
+ " 3.640563 | \n",
+ " 1.774548 | \n",
+ " 0.103140 | \n",
+ " 0.462198 | \n",
+ " 1.238112 | \n",
+ " 0.362621 | \n",
+ " 1.067066 | \n",
+ " 0.240392 | \n",
+ " 1.012144 | \n",
+ " 0.450193 | \n",
+ " 2.065349 | \n",
+ " 2.076180 | \n",
+ " 3.030134 | \n",
+ " 3.465373 | \n",
+ " 1.888940 | \n",
+ " 0.374023 | \n",
+ " 2.323722 | \n",
+ " 0.307162 | \n",
+ " 0.243969 | \n",
+ " 0.205338 | \n",
+ " 5.640188 | \n",
+ " 2.588103 | \n",
+ " -0.015375 | \n",
+ " 0.324594 | \n",
+ " 3.737864 | \n",
+ " 1.677935 | \n",
+ " 0.379659 | \n",
+ " 4.378235 | \n",
+ " 2.042296 | \n",
+ " 1.018322 | \n",
+ " -0.109311 | \n",
+ " 4.715398 | \n",
+ " 1.757364 | \n",
+ " 2.500815 | \n",
+ " 0.888996 | \n",
+ " 1.959577 | \n",
+ " 2.203878 | \n",
+ " 0.227336 | \n",
+ " 0.243604 | \n",
+ " 3.124810 | \n",
+ " ... | \n",
+ " -0.117290 | \n",
+ " -0.084119 | \n",
+ " -0.169264 | \n",
+ " -0.115257 | \n",
+ " -0.010591 | \n",
+ " -0.026248 | \n",
+ " 1.397036 | \n",
+ " -0.000000e+00 | \n",
+ " -0.000242 | \n",
+ " -0.167469 | \n",
+ " 0.550521 | \n",
+ " -0.013257 | \n",
+ " -0.0 | \n",
+ " -3.500000e-07 | \n",
+ " -0.156989 | \n",
+ " 0.212693 | \n",
+ " -0.133220 | \n",
+ " 3.382343 | \n",
+ " 0.718064 | \n",
+ " 0.732648 | \n",
+ " 5.014384 | \n",
+ " 0.157389 | \n",
+ " 2.529759 | \n",
+ " -0.000866 | \n",
+ " -0.002382 | \n",
+ " -0.072132 | \n",
+ " 3.905590 | \n",
+ " 2.831104 | \n",
+ " 8.165786 | \n",
+ " 6.136455 | \n",
+ " 6.366154 | \n",
+ " -0.007492 | \n",
+ " 0.567445 | \n",
+ " -0.101746 | \n",
+ " 3.834963 | \n",
+ " -0.009529 | \n",
+ " -0.000505 | \n",
+ " 0.028645 | \n",
+ " 1.454115 | \n",
+ " -0.001952 | \n",
+ " -0.011845 | \n",
+ " 2.385040 | \n",
+ " -0.000017 | \n",
+ " 3.454340 | \n",
+ " -0.159854 | \n",
+ " -1.320000e-06 | \n",
+ " 7.295258 | \n",
+ " -0.169108 | \n",
+ " -0.0 | \n",
+ " 3.934241 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 338 columns
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "variable_name": "data"
+ }
+ },
+ "metadata": {},
+ "execution_count": 17
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "data.shape"
+ ],
+ "metadata": {
+ "id": "UUKLoxY6JkUt",
+ "outputId": "6518e2c5-8dff-4d07-851f-c486f2da50b2",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ }
+ },
+ "execution_count": 18,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "(29298, 338)"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 18
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "embedding_features = [f\"feature{x}\" for x in range(330)]\n",
+ "label = \"pm25\""
+ ],
+ "metadata": {
+ "id": "NvHQ97WYJmOd"
+ },
+ "execution_count": 19,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "data = data.dropna(subset=[label])"
+ ],
+ "metadata": {
+ "id": "35igKORxJ15A"
+ },
+ "execution_count": 20,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# Split Train and Test Data"
+ ],
+ "metadata": {
+ "id": "tuCYm8dcevqo"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "\n",
+ "data = data[embedding_features + [label]]\n",
+ "X = data[embedding_features]\n",
+ "y = data[label]\n",
+ "\n",
+ "X_train, X_test, y_train, y_test = train_test_split(\n",
+ " X, y, test_size=0.2, random_state=42\n",
+ ")"
+ ],
+ "metadata": {
+ "id": "IgSmPhJTJ2QW"
+ },
+ "execution_count": 21,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# Fit K-Nearest Neighbors Model"
+ ],
+ "metadata": {
+ "id": "iHys75z1fFFJ"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "k = 5\n",
+ "model = KNeighborsRegressor(n_neighbors=k)\n",
+ "model.fit(X_train, y_train)\n",
+ "\n",
+ "y_pred = model.predict(X_test)"
+ ],
+ "metadata": {
+ "id": "KCgs-cyoJ5Nm"
+ },
+ "execution_count": 22,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "\n",
+ "evaluation_df = pd.DataFrame({\"y\": y_test, \"y_pred\": y_pred})\n",
+ "# Evaluate the model\n",
+ "metrics = evaluate_model(evaluation_df)\n",
+ "print(metrics)"
+ ],
+ "metadata": {
+ "id": "TkWTkqPCKEYu",
+ "outputId": "01404579-141a-41c3-e7ac-d1576130f5f9",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ }
+ },
+ "execution_count": 23,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "{'r2': 0.9423382724431623, 'r': 0.9708871328512507, 'rmse': 0.6494730499555952, 'mae': 0.4204252179218217, 'mape': 0.05619889943563576}\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# Evaluate K-Nearest Neighbors Model"
+ ],
+ "metadata": {
+ "id": "ywbC7hLSfMZB"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "xy_lim = (0, 30)\n",
+ "plot_actual_vs_predicted(\n",
+ " evaluation_df,\n",
+ " xlim=xy_lim,\n",
+ " ylim=xy_lim,\n",
+ " title=\"Actual vs Predicted PM2.5\",\n",
+ " x_label=\"Actual PM2.5\",\n",
+ " y_label=\"Predicted PM2.5\",\n",
+ ")"
+ ],
+ "metadata": {
+ "id": "LZORS0JQKjQ4",
+ "outputId": "05aa4e15-7008-4663-a364-4406ef00a5a3",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 817
+ }
+ },
+ "execution_count": 28,
+ "outputs": [
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ "\n",
+ ""
+ ]
+ },
+ "metadata": {}
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [],
+ "metadata": {
+ "id": "SjOVAvPYkDYx"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# Fit Random Forest Regressor model"
+ ],
+ "metadata": {
+ "id": "LPXkNlenjGp_"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "model = RandomForestRegressor(n_estimators=10,verbose=10,n_jobs=-1)\n",
+ "model.fit(X_train, y_train)\n",
+ "\n",
+ "y_pred = model.predict(X_test)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "M7S4naYnjN4l",
+ "outputId": "55c7d7d8-b304-4fc2-d8f6-5ed014577bf9"
+ },
+ "execution_count": 29,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.\n"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "building tree 1 of 10building tree 2 of 10\n",
+ "\n"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "[Parallel(n_jobs=-1)]: Done 1 tasks | elapsed: 20.0s\n"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "building tree 3 of 10\n",
+ "building tree 4 of 10\n",
+ "building tree 5 of 10\n"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "[Parallel(n_jobs=-1)]: Done 4 tasks | elapsed: 38.8s\n"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "building tree 6 of 10\n",
+ "building tree 7 of 10\n",
+ "building tree 8 of 10\n",
+ "building tree 9 of 10\n",
+ "building tree 10 of 10\n"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "[Parallel(n_jobs=-1)]: Done 10 out of 10 | elapsed: 1.6min finished\n",
+ "[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.\n",
+ "[Parallel(n_jobs=2)]: Done 1 tasks | elapsed: 0.0s\n",
+ "[Parallel(n_jobs=2)]: Done 4 tasks | elapsed: 0.0s\n",
+ "[Parallel(n_jobs=2)]: Done 10 out of 10 | elapsed: 0.0s finished\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "evaluation_df = pd.DataFrame({\"y\": y_test, \"y_pred\": y_pred})\n",
+ "# Evaluate the model\n",
+ "metrics = evaluate_model(evaluation_df)\n",
+ "print(metrics)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "CaxFfeEtjRiH",
+ "outputId": "590e443f-6f93-46d9-ccfe-d1723600d0a3"
+ },
+ "execution_count": 30,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "{'r2': 0.9086928793143653, 'r': 0.9539294211355118, 'rmse': 0.817277670967286, 'mae': 0.568816793219495, 'mape': 0.07572915522396176}\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# Evaluate Random Forest Model"
+ ],
+ "metadata": {
+ "id": "v96jVyGRjWtE"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "xy_lim = (0, 30)\n",
+ "plot_actual_vs_predicted(\n",
+ " evaluation_df,\n",
+ " xlim=xy_lim,\n",
+ " ylim=xy_lim,\n",
+ " title=\"Actual vs Predicted PM2.5\",\n",
+ " x_label=\"Actual PM2.5\",\n",
+ " y_label=\"Predicted PM2.5\",\n",
+ ")"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 817
+ },
+ "id": "bVnYyd-zjb85",
+ "outputId": "76b5b25d-a9d7-4c16-c1d7-015e814f278d"
+ },
+ "execution_count": 31,
+ "outputs": [
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ "\n",
+ ""
+ ]
+ },
+ "metadata": {}
+ }
+ ]
+ }
+ ],
+ "metadata": {
+ "colab": {
+ "provenance": [],
+ "include_colab_link": true
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "name": "python3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
\ No newline at end of file
From 62b19ac7c6e892ab52adbc6f47be7b7a5ed56114 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
<66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 20 Feb 2025 10:34:00 +0000
Subject: [PATCH 2/2] [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---
PDFM_embeddings_to_predict_PM2_5_in_US.ipynb | 4062 +++---------------
1 file changed, 543 insertions(+), 3519 deletions(-)
diff --git a/PDFM_embeddings_to_predict_PM2_5_in_US.ipynb b/PDFM_embeddings_to_predict_PM2_5_in_US.ipynb
index ce42a4f..be872f5 100644
--- a/PDFM_embeddings_to_predict_PM2_5_in_US.ipynb
+++ b/PDFM_embeddings_to_predict_PM2_5_in_US.ipynb
@@ -1,3522 +1,546 @@
{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "view-in-github",
- "colab_type": "text"
- },
- "source": [
- "
"
- ]
- },
- {
- "cell_type": "markdown",
- "source": [
- "**Predicting US PM2.5 levels using Google's Population Dynamics Foundation Model**\n",
- "\n",
- "Useful Resources:\n",
- "1. https://github.com/opengeos/GeoAI-Tutorials/blob/main/docs/PDFM/zillow_home_value.ipynb\n",
- "2. https://github.com/google-research/population-dynamics/tree/master/notebooks"
- ],
- "metadata": {
- "id": "tE3akitQdA-m"
- }
- },
- {
- "cell_type": "markdown",
- "source": [
- "Acknowledgements:\n",
- "This notebook is based on tutorials - [PDFM notebook](https://github.com/google-research/population-dynamics/tree/master/notebooks) and awesome tutorial by giswqs opengeos PDFM [zillow home price](https://github.com/opengeos/GeoAI-Tutorials/blob/main/docs/PDFM/zillow_home_value.ipynb)"
- ],
- "metadata": {
- "id": "TbTs9lKqddKS"
- }
- },
- {
- "cell_type": "code",
- "source": [
- "%%capture\n",
- "!pip install leafmap"
- ],
- "metadata": {
- "id": "JqjIF4kAKZGR"
- },
- "execution_count": 2,
- "outputs": []
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {
- "id": "lIYdn1woOS1n"
- },
- "outputs": [],
- "source": [
- "#import libraries\n",
- "import pandas as pd\n",
- "import os\n",
- "from sklearn.model_selection import train_test_split\n",
- "from sklearn.linear_model import LinearRegression\n",
- "from sklearn.ensemble import RandomForestRegressor\n",
- "from sklearn.neighbors import KNeighborsRegressor\n",
- "from leafmap.common import evaluate_model, plot_actual_vs_predicted, download_file"
- ]
- },
- {
- "cell_type": "markdown",
- "source": [
- "# Get US PM2.5 data\n",
- "Link to data: https://usc-geohealth-hub-uscssi.hub.arcgis.com/documents/7fc448343d6643f3bb13157fd65aed4f/about"
- ],
- "metadata": {
- "id": "0z3FKT1fgePa"
- }
- },
- {
- "cell_type": "code",
- "source": [
- "df0 = pd.read_excel(\"/content/pm25_and_disparity.xlsx\",sheet_name=\"data_part1\")\n",
- "df1 = pd.read_excel(\"/content/pm25_and_disparity.xlsx\",sheet_name=\"data_part2\")\n",
- "df2 = pd.read_excel(\"/content/pm25_and_disparity.xlsx\",sheet_name=\"data_part3\")\n",
- "df3 = pd.read_excel(\"/content/pm25_and_disparity.xlsx\",sheet_name=\"data_part4\")\n",
- "df4 = pd.read_excel(\"/content/pm25_and_disparity.xlsx\",sheet_name=\"data_part5\")"
- ],
- "metadata": {
- "id": "8VYkB_TeA0kP"
- },
- "execution_count": 4,
- "outputs": []
- },
- {
- "cell_type": "markdown",
- "source": [
- "# Process PM2.5 data"
- ],
- "metadata": {
- "id": "GYC_GWkdgodj"
- }
- },
- {
- "cell_type": "code",
- "source": [
- "df = pd.concat([df0,df1,df2,df3,df4],ignore_index=True)\n",
- "df.head()"
- ],
- "metadata": {
- "id": "QE3MZiRYA6q1",
- "outputId": "b194649f-7e4a-4dfe-e1ed-8bc825fd5cb5",
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 206
- }
- },
- "execution_count": 5,
- "outputs": [
- {
- "output_type": "execute_result",
- "data": {
- "text/plain": [
- " Unnamed: 0 year ZIP zcta popdensity ... asian_pop white_pop medhouseholdincome pm25 urban\n",
- "0 1 2000 1 NaN NaN ... NaN NaN NaN NaN NaN\n",
- "1 2 2001 1 NaN NaN ... NaN NaN NaN NaN NaN\n",
- "2 3 2002 1 NaN NaN ... NaN NaN NaN NaN NaN\n",
- "3 4 2003 1 NaN NaN ... NaN NaN NaN NaN NaN\n",
- "4 5 2004 1 NaN NaN ... NaN NaN NaN NaN NaN\n",
- "\n",
- "[5 rows x 21 columns]"
- ],
- "text/html": [
- "\n",
- " \n",
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " Unnamed: 0 | \n",
- " year | \n",
- " ZIP | \n",
- " zcta | \n",
- " popdensity | \n",
- " population | \n",
- " poverty | \n",
- " education | \n",
- " pct_blk | \n",
- " pct_hisp | \n",
- " pct_native | \n",
- " pct_asian | \n",
- " pct_white | \n",
- " black_pop | \n",
- " hisp_pop | \n",
- " native_pop | \n",
- " asian_pop | \n",
- " white_pop | \n",
- " medhouseholdincome | \n",
- " pm25 | \n",
- " urban | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 1 | \n",
- " 2000 | \n",
- " 1 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 2 | \n",
- " 2001 | \n",
- " 1 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 3 | \n",
- " 2002 | \n",
- " 1 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 4 | \n",
- " 2003 | \n",
- " 1 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " 5 | \n",
- " 2004 | \n",
- " 1 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- "
\n",
- " \n",
- "
\n",
- "
\n",
- "
\n",
- "
\n"
- ],
- "application/vnd.google.colaboratory.intrinsic+json": {
- "type": "dataframe",
- "variable_name": "df"
- }
- },
- "metadata": {},
- "execution_count": 5
- }
- ]
- },
- {
- "cell_type": "code",
- "source": [
- "df.shape"
- ],
- "metadata": {
- "id": "p9NCYMyxGBQB",
- "outputId": "530a1697-c230-4199-d9f2-1cd07130d07e",
- "colab": {
- "base_uri": "https://localhost:8080/"
- }
- },
- "execution_count": 6,
- "outputs": [
- {
- "output_type": "execute_result",
- "data": {
- "text/plain": [
- "(789260, 21)"
- ]
- },
- "metadata": {},
- "execution_count": 6
- }
- ]
- },
- {
- "cell_type": "code",
- "source": [
- "df[\"zcta\"].nunique()"
- ],
- "metadata": {
- "id": "ip7NBwl-EDsf",
- "outputId": "4f5b299d-b6a3-4859-fa0a-f14b96bf3221",
- "colab": {
- "base_uri": "https://localhost:8080/"
- }
- },
- "execution_count": 7,
- "outputs": [
- {
- "output_type": "execute_result",
- "data": {
- "text/plain": [
- "32406"
- ]
- },
- "metadata": {},
- "execution_count": 7
- }
- ]
- },
- {
- "cell_type": "code",
- "source": [
- "pm25_df = df.groupby([\"zcta\"]).mean()[\"pm25\"]\n",
- "pm25_df.head()"
- ],
- "metadata": {
- "id": "8qtI9VgiCcvj",
- "outputId": "c2365f08-5de6-45aa-a4fb-1d5b3f7a86a4",
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 272
- }
- },
- "execution_count": 8,
- "outputs": [
- {
- "output_type": "execute_result",
- "data": {
- "text/plain": [
- "zcta\n",
- "601.0 NaN\n",
- "602.0 NaN\n",
- "603.0 NaN\n",
- "606.0 NaN\n",
- "610.0 NaN\n",
- "Name: pm25, dtype: float64"
- ],
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " pm25 | \n",
- "
\n",
- " \n",
- " | zcta | \n",
- " | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 601.0 | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " | 602.0 | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " | 603.0 | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " | 606.0 | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " | 610.0 | \n",
- " NaN | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ]
- },
- "metadata": {},
- "execution_count": 8
- }
- ]
- },
- {
- "cell_type": "code",
- "source": [
- "pm25_df.dropna(axis=0,inplace=True)\n",
- "pm25_df.head()"
- ],
- "metadata": {
- "id": "6-qSy8lVDDoM",
- "outputId": "1f086442-221a-4f64-ee9d-db456bb21808",
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 272
- }
- },
- "execution_count": 9,
- "outputs": [
- {
- "output_type": "execute_result",
- "data": {
- "text/plain": [
- "zcta\n",
- "1001.0 9.398180\n",
- "1002.0 8.026795\n",
- "1003.0 8.949020\n",
- "1005.0 6.409811\n",
- "1007.0 7.375929\n",
- "Name: pm25, dtype: float64"
- ],
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " pm25 | \n",
- "
\n",
- " \n",
- " | zcta | \n",
- " | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 1001.0 | \n",
- " 9.398180 | \n",
- "
\n",
- " \n",
- " | 1002.0 | \n",
- " 8.026795 | \n",
- "
\n",
- " \n",
- " | 1003.0 | \n",
- " 8.949020 | \n",
- "
\n",
- " \n",
- " | 1005.0 | \n",
- " 6.409811 | \n",
- "
\n",
- " \n",
- " | 1007.0 | \n",
- " 7.375929 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ]
- },
- "metadata": {},
- "execution_count": 9
- }
- ]
- },
- {
- "cell_type": "code",
- "source": [
- "pm25_df.index = pm25_df.index.astype(int)\n",
- "print(pm25_df.shape)\n",
- "pm25_df.head()"
- ],
- "metadata": {
- "id": "5kAxQ3uVGswD",
- "outputId": "5c326987-a0c0-4f4b-f6ec-8094288375c3",
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 290
- }
- },
- "execution_count": 10,
- "outputs": [
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "(31956,)\n"
- ]
- },
- {
- "output_type": "execute_result",
- "data": {
- "text/plain": [
- "zcta\n",
- "1001 9.398180\n",
- "1002 8.026795\n",
- "1003 8.949020\n",
- "1005 6.409811\n",
- "1007 7.375929\n",
- "Name: pm25, dtype: float64"
- ],
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " pm25 | \n",
- "
\n",
- " \n",
- " | zcta | \n",
- " | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 1001 | \n",
- " 9.398180 | \n",
- "
\n",
- " \n",
- " | 1002 | \n",
- " 8.026795 | \n",
- "
\n",
- " \n",
- " | 1003 | \n",
- " 8.949020 | \n",
- "
\n",
- " \n",
- " | 1005 | \n",
- " 6.409811 | \n",
- "
\n",
- " \n",
- " | 1007 | \n",
- " 7.375929 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ]
- },
- "metadata": {},
- "execution_count": 10
- }
- ]
- },
- {
- "cell_type": "code",
- "source": [
- "pm25_df = pm25_df.reset_index(drop=False) # Remove inplace=True\n",
- "pm25_df.index = pm25_df[\"zcta\"].apply(lambda x: f\"zip/{x}\") # Access 'zcta' column\n",
- "pm25_df.head()"
- ],
- "metadata": {
- "id": "dBOUhYaOIU0-",
- "outputId": "32a40749-015a-4d2a-b27f-0ac61bf8b7dc",
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 238
- }
- },
- "execution_count": 11,
- "outputs": [
- {
- "output_type": "execute_result",
- "data": {
- "text/plain": [
- " zcta pm25\n",
- "zcta \n",
- "zip/1001 1001 9.398180\n",
- "zip/1002 1002 8.026795\n",
- "zip/1003 1003 8.949020\n",
- "zip/1005 1005 6.409811\n",
- "zip/1007 1007 7.375929"
- ],
- "text/html": [
- "\n",
- " \n",
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " zcta | \n",
- " pm25 | \n",
- "
\n",
- " \n",
- " | zcta | \n",
- " | \n",
- " | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | zip/1001 | \n",
- " 1001 | \n",
- " 9.398180 | \n",
- "
\n",
- " \n",
- " | zip/1002 | \n",
- " 1002 | \n",
- " 8.026795 | \n",
- "
\n",
- " \n",
- " | zip/1003 | \n",
- " 1003 | \n",
- " 8.949020 | \n",
- "
\n",
- " \n",
- " | zip/1005 | \n",
- " 1005 | \n",
- " 6.409811 | \n",
- "
\n",
- " \n",
- " | zip/1007 | \n",
- " 1007 | \n",
- " 7.375929 | \n",
- "
\n",
- " \n",
- "
\n",
- "
\n",
- "
\n",
- "
\n"
- ],
- "application/vnd.google.colaboratory.intrinsic+json": {
- "type": "dataframe",
- "variable_name": "pm25_df",
- "repr_error": "cannot insert zcta, already exists"
- }
- },
- "metadata": {},
- "execution_count": 11
- }
- ]
- },
- {
- "cell_type": "markdown",
- "source": [
- "# Request access to PDFM Embeddings"
- ],
- "metadata": {
- "id": "0CUwjIxaeWCm"
- }
- },
- {
- "cell_type": "code",
- "source": [
- "!unzip /content/pdfm_embeddings.zip"
- ],
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "yxEkU1GjibgC",
- "outputId": "b0a29cdb-1b14-49d1-9302-604ca2ead218"
- },
- "execution_count": 14,
- "outputs": [
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "Archive: /content/pdfm_embeddings.zip\n",
- " creating: pdfm_embeddings/\n",
- " creating: pdfm_embeddings/v0/\n",
- " creating: pdfm_embeddings/v0/us/\n",
- " inflating: pdfm_embeddings/v0/us/county.geojson \n",
- " inflating: pdfm_embeddings/v0/us/county_embeddings.csv \n",
- " inflating: pdfm_embeddings/v0/us/zcta.geojson \n",
- " inflating: pdfm_embeddings/v0/us/zcta_embeddings.csv \n"
- ]
- }
- ]
- },
- {
- "cell_type": "code",
- "source": [
- "embeddings_file_path = \"/content/pdfm_embeddings/v0/us/zcta_embeddings.csv\""
- ],
- "metadata": {
- "id": "-DugeFaSG2Pi"
- },
- "execution_count": 12,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "if not os.path.exists(embeddings_file_path):\n",
- " raise FileNotFoundError(\"Please request the embeddings from Google\")"
- ],
- "metadata": {
- "id": "JZZX7tzlHYa9"
- },
- "execution_count": 15,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "zipcode_embeddings = pd.read_csv(embeddings_file_path).set_index(\"place\")\n",
- "zipcode_embeddings.head()"
- ],
- "metadata": {
- "id": "cWNluHJYHY57",
- "outputId": "3d62af41-8d41-40c7-ab82-5b9174406cfd",
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 374
- }
- },
- "execution_count": 16,
- "outputs": [
- {
- "output_type": "execute_result",
- "data": {
- "text/plain": [
- " state county city ... feature327 feature328 feature329\n",
- "place ... \n",
- "zip/97910 OR Malheur County Jordan Valley ... -0.001661 -0.001010 4.495589\n",
- "zip/89412 NV Washoe County Gerlach ... -0.024385 -0.000295 3.399393\n",
- "zip/88030 NM Luna County Deming ... -0.116499 -0.051163 3.866543\n",
- "zip/82633 WY Converse County Douglas ... -0.047864 -0.000042 7.453567\n",
- "zip/59538 MT Phillips County Malta ... -0.161916 -0.001087 0.972243\n",
- "\n",
- "[5 rows x 336 columns]"
- ],
- "text/html": [
- "\n",
- " \n",
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " state | \n",
- " county | \n",
- " city | \n",
- " population | \n",
- " latitude | \n",
- " longitude | \n",
- " feature0 | \n",
- " feature1 | \n",
- " feature2 | \n",
- " feature3 | \n",
- " feature4 | \n",
- " feature5 | \n",
- " feature6 | \n",
- " feature7 | \n",
- " feature8 | \n",
- " feature9 | \n",
- " feature10 | \n",
- " feature11 | \n",
- " feature12 | \n",
- " feature13 | \n",
- " feature14 | \n",
- " feature15 | \n",
- " feature16 | \n",
- " feature17 | \n",
- " feature18 | \n",
- " feature19 | \n",
- " feature20 | \n",
- " feature21 | \n",
- " feature22 | \n",
- " feature23 | \n",
- " feature24 | \n",
- " feature25 | \n",
- " feature26 | \n",
- " feature27 | \n",
- " feature28 | \n",
- " feature29 | \n",
- " feature30 | \n",
- " feature31 | \n",
- " feature32 | \n",
- " feature33 | \n",
- " feature34 | \n",
- " feature35 | \n",
- " feature36 | \n",
- " feature37 | \n",
- " feature38 | \n",
- " feature39 | \n",
- " feature40 | \n",
- " feature41 | \n",
- " feature42 | \n",
- " feature43 | \n",
- " ... | \n",
- " feature280 | \n",
- " feature281 | \n",
- " feature282 | \n",
- " feature283 | \n",
- " feature284 | \n",
- " feature285 | \n",
- " feature286 | \n",
- " feature287 | \n",
- " feature288 | \n",
- " feature289 | \n",
- " feature290 | \n",
- " feature291 | \n",
- " feature292 | \n",
- " feature293 | \n",
- " feature294 | \n",
- " feature295 | \n",
- " feature296 | \n",
- " feature297 | \n",
- " feature298 | \n",
- " feature299 | \n",
- " feature300 | \n",
- " feature301 | \n",
- " feature302 | \n",
- " feature303 | \n",
- " feature304 | \n",
- " feature305 | \n",
- " feature306 | \n",
- " feature307 | \n",
- " feature308 | \n",
- " feature309 | \n",
- " feature310 | \n",
- " feature311 | \n",
- " feature312 | \n",
- " feature313 | \n",
- " feature314 | \n",
- " feature315 | \n",
- " feature316 | \n",
- " feature317 | \n",
- " feature318 | \n",
- " feature319 | \n",
- " feature320 | \n",
- " feature321 | \n",
- " feature322 | \n",
- " feature323 | \n",
- " feature324 | \n",
- " feature325 | \n",
- " feature326 | \n",
- " feature327 | \n",
- " feature328 | \n",
- " feature329 | \n",
- "
\n",
- " \n",
- " | place | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | zip/97910 | \n",
- " OR | \n",
- " Malheur County | \n",
- " Jordan Valley | \n",
- " 609 | \n",
- " 42.749076 | \n",
- " -117.511459 | \n",
- " -0.138227 | \n",
- " 1.120377 | \n",
- " 0.072900 | \n",
- " 0.297442 | \n",
- " 0.772673 | \n",
- " 0.286467 | \n",
- " 0.802398 | \n",
- " 0.620847 | \n",
- " 0.060810 | \n",
- " 0.125926 | \n",
- " 0.452905 | \n",
- " 0.317210 | \n",
- " 1.560488 | \n",
- " 0.173717 | \n",
- " 0.338584 | \n",
- " -0.011876 | \n",
- " 0.369918 | \n",
- " 0.734241 | \n",
- " -0.023161 | \n",
- " 0.927918 | \n",
- " 0.131129 | \n",
- " 0.174915 | \n",
- " 0.186962 | \n",
- " 0.777327 | \n",
- " -0.125255 | \n",
- " 0.252997 | \n",
- " 0.126703 | \n",
- " 0.282713 | \n",
- " 0.286217 | \n",
- " 0.108222 | \n",
- " 0.138043 | \n",
- " 1.893893 | \n",
- " 0.034341 | \n",
- " 0.116197 | \n",
- " 1.578618 | \n",
- " 0.594598 | \n",
- " 0.083396 | \n",
- " 0.217705 | \n",
- " 0.085833 | \n",
- " 0.050959 | \n",
- " 0.071389 | \n",
- " 0.046794 | \n",
- " 0.900741 | \n",
- " 0.089737 | \n",
- " ... | \n",
- " 0.795463 | \n",
- " 4.048826 | \n",
- " 4.411071 | \n",
- " -0.072585 | \n",
- " 1.172804 | \n",
- " 2.780721 | \n",
- " -0.007037 | \n",
- " -0.167074 | \n",
- " -0.169071 | \n",
- " 2.451642 | \n",
- " 4.705745 | \n",
- " 2.861044 | \n",
- " 0.964761 | \n",
- " 5.425952 | \n",
- " -0.086446 | \n",
- " -0.004044 | \n",
- " 1.692680 | \n",
- " -0.129304 | \n",
- " 1.262767 | \n",
- " 0.584393 | \n",
- " 4.957980 | \n",
- " -0.113619 | \n",
- " 3.977844 | \n",
- " -0.056266 | \n",
- " 0.154679 | \n",
- " 6.833614 | \n",
- " -0.168595 | \n",
- " 3.852105 | \n",
- " -0.008600 | \n",
- " 0.168367 | \n",
- " 2.672679 | \n",
- " 6.938071 | \n",
- " 1.462526 | \n",
- " 4.700379 | \n",
- " 3.523755 | \n",
- " -0.169971 | \n",
- " 0.279797 | \n",
- " -0.030630 | \n",
- " -0.000014 | \n",
- " 4.489360 | \n",
- " -0.158891 | \n",
- " -0.168708 | \n",
- " 1.231994 | \n",
- " -0.155765 | \n",
- " 3.043214 | \n",
- " -0.169749 | \n",
- " 0.177463 | \n",
- " -0.001661 | \n",
- " -0.001010 | \n",
- " 4.495589 | \n",
- "
\n",
- " \n",
- " | zip/89412 | \n",
- " NV | \n",
- " Washoe County | \n",
- " Gerlach | \n",
- " 98 | \n",
- " 41.102934 | \n",
- " -119.695361 | \n",
- " -0.141379 | \n",
- " 1.422782 | \n",
- " 0.234269 | \n",
- " 0.159156 | \n",
- " 0.890241 | \n",
- " 0.215427 | \n",
- " 0.533200 | \n",
- " 1.125830 | \n",
- " 0.159891 | \n",
- " 0.305449 | \n",
- " 0.673448 | \n",
- " 0.222371 | \n",
- " 1.113196 | \n",
- " 0.147047 | \n",
- " 0.270858 | \n",
- " 0.140051 | \n",
- " 0.591381 | \n",
- " 1.321858 | \n",
- " 0.182956 | \n",
- " 1.600126 | \n",
- " 0.149480 | \n",
- " 1.239454 | \n",
- " 0.326186 | \n",
- " 0.724274 | \n",
- " -0.057049 | \n",
- " 0.327722 | \n",
- " 0.225044 | \n",
- " 0.092865 | \n",
- " 0.636425 | \n",
- " 0.541034 | \n",
- " 0.168866 | \n",
- " 1.489893 | \n",
- " -0.003255 | \n",
- " 0.524351 | \n",
- " 1.672642 | \n",
- " 0.421262 | \n",
- " 0.701539 | \n",
- " 0.231094 | \n",
- " 0.223512 | \n",
- " 0.134480 | \n",
- " 0.479852 | \n",
- " 0.204405 | \n",
- " 1.043766 | \n",
- " 0.396896 | \n",
- " ... | \n",
- " -0.080731 | \n",
- " 5.357723 | \n",
- " 3.973490 | \n",
- " -0.100555 | \n",
- " 2.942436 | \n",
- " 4.120401 | \n",
- " -0.027611 | \n",
- " -0.084821 | \n",
- " -0.000068 | \n",
- " 1.364740 | \n",
- " 6.085680 | \n",
- " 5.550473 | \n",
- " -0.057929 | \n",
- " 7.677364 | \n",
- " -0.111488 | \n",
- " 0.442242 | \n",
- " 2.686013 | \n",
- " -0.008651 | \n",
- " -0.003287 | \n",
- " 4.685384 | \n",
- " 6.117020 | \n",
- " -0.083324 | \n",
- " 4.441121 | \n",
- " -0.002397 | \n",
- " -0.119943 | \n",
- " 4.539424 | \n",
- " -0.006676 | \n",
- " 4.993636 | \n",
- " -0.118508 | \n",
- " -0.169039 | \n",
- " 1.864953 | \n",
- " 4.146715 | \n",
- " -0.118220 | \n",
- " 3.688882 | \n",
- " 4.046134 | \n",
- " -0.045537 | \n",
- " 1.627209 | \n",
- " -0.012242 | \n",
- " -0.016643 | \n",
- " 4.668972 | \n",
- " -0.157417 | \n",
- " -0.043606 | \n",
- " 2.788701 | \n",
- " -0.062547 | \n",
- " 3.700745 | \n",
- " -0.169827 | \n",
- " -0.137990 | \n",
- " -0.024385 | \n",
- " -0.000295 | \n",
- " 3.399393 | \n",
- "
\n",
- " \n",
- " | zip/88030 | \n",
- " NM | \n",
- " Luna County | \n",
- " Deming | \n",
- " 24139 | \n",
- " 32.191634 | \n",
- " -107.729431 | \n",
- " -0.046666 | \n",
- " 1.414424 | \n",
- " 0.146803 | \n",
- " 1.113256 | \n",
- " 1.119576 | \n",
- " 1.093199 | \n",
- " 0.960636 | \n",
- " 0.179642 | \n",
- " 0.729488 | \n",
- " 2.447439 | \n",
- " 2.274204 | \n",
- " 2.765325 | \n",
- " 0.903284 | \n",
- " 0.520162 | \n",
- " 2.604348 | \n",
- " 0.688520 | \n",
- " 0.164436 | \n",
- " 2.755828 | \n",
- " 1.312246 | \n",
- " 0.452269 | \n",
- " 0.612660 | \n",
- " 1.135295 | \n",
- " 1.440466 | \n",
- " 0.507069 | \n",
- " -0.140809 | \n",
- " 0.481306 | \n",
- " 1.068717 | \n",
- " 2.188697 | \n",
- " 0.254398 | \n",
- " 1.019234 | \n",
- " 0.277727 | \n",
- " 0.716491 | \n",
- " 0.861136 | \n",
- " 1.232256 | \n",
- " 0.210326 | \n",
- " 0.694031 | \n",
- " 1.504018 | \n",
- " 1.430361 | \n",
- " 0.842551 | \n",
- " 0.222043 | \n",
- " 1.114556 | \n",
- " 0.856425 | \n",
- " 1.518791 | \n",
- " 1.487212 | \n",
- " ... | \n",
- " 0.472449 | \n",
- " 4.089555 | \n",
- " 1.347347 | \n",
- " 0.128349 | \n",
- " 2.517892 | \n",
- " 0.718034 | \n",
- " 0.546053 | \n",
- " 2.494347 | \n",
- " -0.024888 | \n",
- " 3.658147 | \n",
- " 5.096304 | \n",
- " 3.687950 | \n",
- " -0.162521 | \n",
- " 4.844636 | \n",
- " 0.789083 | \n",
- " 3.025670 | \n",
- " 0.667166 | \n",
- " -0.169164 | \n",
- " -0.109891 | \n",
- " 3.095128 | \n",
- " 4.816823 | \n",
- " -0.169883 | \n",
- " 4.487709 | \n",
- " -0.058803 | \n",
- " -0.167333 | \n",
- " 2.843048 | \n",
- " -0.060544 | \n",
- " 3.279263 | \n",
- " -0.158699 | \n",
- " 1.535189 | \n",
- " 2.787231 | \n",
- " 3.861916 | \n",
- " 1.569119 | \n",
- " 3.487299 | \n",
- " 2.334693 | \n",
- " 0.068785 | \n",
- " -0.162307 | \n",
- " -0.053849 | \n",
- " 0.160504 | \n",
- " 1.895565 | \n",
- " -0.000654 | \n",
- " 0.437475 | \n",
- " 4.229295 | \n",
- " 0.229199 | \n",
- " 2.098469 | \n",
- " 1.150497 | \n",
- " 0.716122 | \n",
- " -0.116499 | \n",
- " -0.051163 | \n",
- " 3.866543 | \n",
- "
\n",
- " \n",
- " | zip/82633 | \n",
- " WY | \n",
- " Converse County | \n",
- " Douglas | \n",
- " 9478 | \n",
- " 43.022270 | \n",
- " -105.410250 | \n",
- " -0.090293 | \n",
- " 1.266280 | \n",
- " 0.447868 | \n",
- " 0.781861 | \n",
- " 1.731813 | \n",
- " 0.602722 | \n",
- " 0.737066 | \n",
- " 0.743392 | \n",
- " 0.823658 | \n",
- " 1.475200 | \n",
- " 1.639734 | \n",
- " 0.202340 | \n",
- " 0.545946 | \n",
- " 0.486171 | \n",
- " 0.425758 | \n",
- " 0.951557 | \n",
- " 0.448131 | \n",
- " 0.889409 | \n",
- " 1.116265 | \n",
- " 0.331308 | \n",
- " 0.694245 | \n",
- " 0.774092 | \n",
- " 0.893476 | \n",
- " 2.088896 | \n",
- " -0.012767 | \n",
- " 1.379420 | \n",
- " 0.541944 | \n",
- " 0.903094 | \n",
- " 1.245158 | \n",
- " 0.514747 | \n",
- " 0.240520 | \n",
- " 1.853385 | \n",
- " 0.691478 | \n",
- " 1.095086 | \n",
- " 0.102779 | \n",
- " 0.976397 | \n",
- " 1.508152 | \n",
- " 1.098709 | \n",
- " 0.658931 | \n",
- " 1.544933 | \n",
- " 3.267990 | \n",
- " 1.033022 | \n",
- " 0.948243 | \n",
- " 0.768377 | \n",
- " ... | \n",
- " 4.709711 | \n",
- " 2.619931 | \n",
- " 0.377791 | \n",
- " -0.136090 | \n",
- " 1.546929 | \n",
- " 1.914665 | \n",
- " -0.038279 | \n",
- " -0.158291 | \n",
- " 3.846224 | \n",
- " 1.600872 | \n",
- " 2.556240 | \n",
- " 5.028241 | \n",
- " 3.131569 | \n",
- " 1.885251 | \n",
- " 1.723152 | \n",
- " 3.287659 | \n",
- " 0.592335 | \n",
- " -0.169679 | \n",
- " 0.799571 | \n",
- " 1.711086 | \n",
- " 6.434799 | \n",
- " 2.259457 | \n",
- " 5.137226 | \n",
- " -0.157376 | \n",
- " 3.739257 | \n",
- " 1.849344 | \n",
- " 0.817178 | \n",
- " 4.254727 | \n",
- " -0.031455 | \n",
- " 0.860355 | \n",
- " 3.185768 | \n",
- " 4.815537 | \n",
- " 1.889562 | \n",
- " 3.147158 | \n",
- " 5.902875 | \n",
- " 0.248916 | \n",
- " -0.013526 | \n",
- " -0.035991 | \n",
- " -0.037467 | \n",
- " 2.813852 | \n",
- " -0.033771 | \n",
- " 0.579775 | \n",
- " 2.688665 | \n",
- " 0.175669 | \n",
- " 0.990921 | \n",
- " 1.644879 | \n",
- " 0.222517 | \n",
- " -0.047864 | \n",
- " -0.000042 | \n",
- " 7.453567 | \n",
- "
\n",
- " \n",
- " | zip/59538 | \n",
- " MT | \n",
- " Phillips County | \n",
- " Malta | \n",
- " 2936 | \n",
- " 48.112019 | \n",
- " -107.845520 | \n",
- " -0.092886 | \n",
- " 1.256203 | \n",
- " -0.050897 | \n",
- " 0.321954 | \n",
- " 1.281864 | \n",
- " 0.737793 | \n",
- " 1.662178 | \n",
- " 0.451061 | \n",
- " 0.190265 | \n",
- " -0.127765 | \n",
- " 0.506115 | \n",
- " 0.792137 | \n",
- " 0.385507 | \n",
- " 0.394926 | \n",
- " 0.477761 | \n",
- " 1.028206 | \n",
- " 0.047681 | \n",
- " 0.879740 | \n",
- " 0.795730 | \n",
- " 0.239135 | \n",
- " 0.282084 | \n",
- " 0.193326 | \n",
- " 1.262094 | \n",
- " 0.453796 | \n",
- " -0.169351 | \n",
- " 0.601323 | \n",
- " 0.670364 | \n",
- " 0.581992 | \n",
- " 0.540012 | \n",
- " 0.218976 | \n",
- " 1.195483 | \n",
- " 0.035199 | \n",
- " 0.274211 | \n",
- " 0.681594 | \n",
- " 0.819916 | \n",
- " 1.234735 | \n",
- " 0.289213 | \n",
- " -0.010891 | \n",
- " 0.911312 | \n",
- " 0.780166 | \n",
- " 2.906506 | \n",
- " 0.524723 | \n",
- " 1.004237 | \n",
- " -0.098108 | \n",
- " ... | \n",
- " 0.741410 | \n",
- " 0.435825 | \n",
- " 0.415687 | \n",
- " -0.168535 | \n",
- " 1.068465 | \n",
- " -0.014837 | \n",
- " -0.058268 | \n",
- " -0.168225 | \n",
- " -0.084327 | \n",
- " 4.262060 | \n",
- " 0.444936 | \n",
- " 2.504024 | \n",
- " 0.534612 | \n",
- " 1.366006 | \n",
- " 0.086276 | \n",
- " 1.766271 | \n",
- " 3.652062 | \n",
- " -0.162912 | \n",
- " -0.102837 | \n",
- " 2.123431 | \n",
- " 6.335544 | \n",
- " -0.158536 | \n",
- " 2.916174 | \n",
- " -0.000554 | \n",
- " 4.013170 | \n",
- " 1.245277 | \n",
- " -0.146109 | \n",
- " 1.630525 | \n",
- " 0.193676 | \n",
- " -0.132476 | \n",
- " 1.447661 | \n",
- " 4.800499 | \n",
- " -0.009952 | \n",
- " 0.217168 | \n",
- " 1.960558 | \n",
- " -0.080472 | \n",
- " 0.211844 | \n",
- " -0.045951 | \n",
- " -0.012506 | \n",
- " -0.169497 | \n",
- " -0.169915 | \n",
- " -0.088829 | \n",
- " 0.338914 | \n",
- " -0.102962 | \n",
- " -0.156583 | \n",
- " 1.493696 | \n",
- " 2.259007 | \n",
- " -0.161916 | \n",
- " -0.001087 | \n",
- " 0.972243 | \n",
- "
\n",
- " \n",
- "
\n",
- "
5 rows × 336 columns
\n",
- "
\n",
- "
\n",
- "
\n"
- ],
- "application/vnd.google.colaboratory.intrinsic+json": {
- "type": "dataframe",
- "variable_name": "zipcode_embeddings"
- }
- },
- "metadata": {},
- "execution_count": 16
- }
- ]
- },
- {
- "cell_type": "markdown",
- "source": [
- "# Join PDFM embeddings and Ground Truth (PM2.5 data)"
- ],
- "metadata": {
- "id": "L7XAkE0fecVU"
- }
- },
- {
- "cell_type": "code",
- "source": [
- "# Inner-join the PM2.5 ground truth with the embeddings on their shared\n",
- "# \"zip/<zcta>\" index; rows present in only one of the tables are dropped.\n",
- "data = pm25_df.join(other=zipcode_embeddings, how=\"inner\")\n",
- "data.head()"
- ],
- "metadata": {
- "id": "KjwmUj5SH9-P",
- "outputId": "626b6758-821c-486e-9f2b-0b9f3366f433",
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 429
- }
- },
- "execution_count": 17,
- "outputs": [
- {
- "output_type": "execute_result",
- "data": {
- "text/plain": [
- " zcta pm25 state county ... feature326 feature327 feature328 feature329\n",
- "zip/10001 10001 13.636975 NY New York County ... 1.581655 -0.015907 -0.0 3.977647\n",
- "zip/10002 10002 12.896079 NY New York County ... 5.388407 -0.112462 -0.0 4.431747\n",
- "zip/10003 10003 12.829412 NY New York County ... 4.039156 -0.156848 -0.0 5.094444\n",
- "zip/10004 10004 13.851765 NY New York County ... 6.799802 -0.078682 -0.0 4.140815\n",
- "zip/10005 10005 13.730000 NY New York County ... 7.295258 -0.169108 -0.0 3.934241\n",
- "\n",
- "[5 rows x 338 columns]"
- ],
- "text/html": [
- "\n",
- " \n",
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " zcta | \n",
- " pm25 | \n",
- " state | \n",
- " county | \n",
- " city | \n",
- " population | \n",
- " latitude | \n",
- " longitude | \n",
- " feature0 | \n",
- " feature1 | \n",
- " feature2 | \n",
- " feature3 | \n",
- " feature4 | \n",
- " feature5 | \n",
- " feature6 | \n",
- " feature7 | \n",
- " feature8 | \n",
- " feature9 | \n",
- " feature10 | \n",
- " feature11 | \n",
- " feature12 | \n",
- " feature13 | \n",
- " feature14 | \n",
- " feature15 | \n",
- " feature16 | \n",
- " feature17 | \n",
- " feature18 | \n",
- " feature19 | \n",
- " feature20 | \n",
- " feature21 | \n",
- " feature22 | \n",
- " feature23 | \n",
- " feature24 | \n",
- " feature25 | \n",
- " feature26 | \n",
- " feature27 | \n",
- " feature28 | \n",
- " feature29 | \n",
- " feature30 | \n",
- " feature31 | \n",
- " feature32 | \n",
- " feature33 | \n",
- " feature34 | \n",
- " feature35 | \n",
- " feature36 | \n",
- " feature37 | \n",
- " feature38 | \n",
- " feature39 | \n",
- " feature40 | \n",
- " feature41 | \n",
- " ... | \n",
- " feature280 | \n",
- " feature281 | \n",
- " feature282 | \n",
- " feature283 | \n",
- " feature284 | \n",
- " feature285 | \n",
- " feature286 | \n",
- " feature287 | \n",
- " feature288 | \n",
- " feature289 | \n",
- " feature290 | \n",
- " feature291 | \n",
- " feature292 | \n",
- " feature293 | \n",
- " feature294 | \n",
- " feature295 | \n",
- " feature296 | \n",
- " feature297 | \n",
- " feature298 | \n",
- " feature299 | \n",
- " feature300 | \n",
- " feature301 | \n",
- " feature302 | \n",
- " feature303 | \n",
- " feature304 | \n",
- " feature305 | \n",
- " feature306 | \n",
- " feature307 | \n",
- " feature308 | \n",
- " feature309 | \n",
- " feature310 | \n",
- " feature311 | \n",
- " feature312 | \n",
- " feature313 | \n",
- " feature314 | \n",
- " feature315 | \n",
- " feature316 | \n",
- " feature317 | \n",
- " feature318 | \n",
- " feature319 | \n",
- " feature320 | \n",
- " feature321 | \n",
- " feature322 | \n",
- " feature323 | \n",
- " feature324 | \n",
- " feature325 | \n",
- " feature326 | \n",
- " feature327 | \n",
- " feature328 | \n",
- " feature329 | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | zip/10001 | \n",
- " 10001 | \n",
- " 13.636975 | \n",
- " NY | \n",
- " New York County | \n",
- " New York | \n",
- " 26966 | \n",
- " 40.750672 | \n",
- " -73.997281 | \n",
- " -0.073025 | \n",
- " 0.621611 | \n",
- " 3.204059 | \n",
- " 1.219250 | \n",
- " 0.086666 | \n",
- " 0.422733 | \n",
- " 1.362252 | \n",
- " 0.448227 | \n",
- " 0.780789 | \n",
- " 0.076176 | \n",
- " 0.744889 | \n",
- " 0.206153 | \n",
- " 1.486222 | \n",
- " 2.341626 | \n",
- " 2.934966 | \n",
- " 3.197650 | \n",
- " 2.227243 | \n",
- " 0.536704 | \n",
- " 1.641904 | \n",
- " 0.215733 | \n",
- " 0.084727 | \n",
- " 0.164679 | \n",
- " 4.511102 | \n",
- " 2.252191 | \n",
- " -0.024315 | \n",
- " 0.212265 | \n",
- " 3.857031 | \n",
- " 2.217771 | \n",
- " 0.308766 | \n",
- " 4.034735 | \n",
- " 1.695519 | \n",
- " 0.567224 | \n",
- " -0.077346 | \n",
- " 3.953824 | \n",
- " 1.425099 | \n",
- " 2.275840 | \n",
- " 1.042735 | \n",
- " 1.490175 | \n",
- " 1.557218 | \n",
- " 0.206169 | \n",
- " 0.049573 | \n",
- " 2.313412 | \n",
- " ... | \n",
- " 1.501178 | \n",
- " -0.123201 | \n",
- " -0.167260 | \n",
- " -0.159472 | \n",
- " -0.001858 | \n",
- " -0.129980 | \n",
- " 0.175728 | \n",
- " -1.200000e-07 | \n",
- " -0.008414 | \n",
- " -0.155363 | \n",
- " 0.315214 | \n",
- " -0.009080 | \n",
- " -0.0 | \n",
- " -0.000000e+00 | \n",
- " 0.770695 | \n",
- " -0.152233 | \n",
- " -0.049659 | \n",
- " 8.209490 | \n",
- " 0.700820 | \n",
- " 1.000827 | \n",
- " 5.112181 | \n",
- " -0.146953 | \n",
- " 1.654920 | \n",
- " -0.002328 | \n",
- " -0.004713 | \n",
- " -0.162020 | \n",
- " 9.814237 | \n",
- " 2.872081 | \n",
- " 7.905857 | \n",
- " 9.632621 | \n",
- " 7.280638 | \n",
- " -0.017287 | \n",
- " 0.479747 | \n",
- " 1.191547 | \n",
- " 6.030477 | \n",
- " -0.028483 | \n",
- " -0.002579 | \n",
- " -0.150458 | \n",
- " -0.145124 | \n",
- " -0.000732 | \n",
- " -0.006855 | \n",
- " 2.571859 | \n",
- " -0.006076 | \n",
- " 4.313338 | \n",
- " -0.105290 | \n",
- " -9.500000e-06 | \n",
- " 1.581655 | \n",
- " -0.015907 | \n",
- " -0.0 | \n",
- " 3.977647 | \n",
- "
\n",
- " \n",
- " | zip/10002 | \n",
- " 10002 | \n",
- " 12.896079 | \n",
- " NY | \n",
- " New York County | \n",
- " New York | \n",
- " 76807 | \n",
- " 40.715762 | \n",
- " -73.986258 | \n",
- " -0.144769 | \n",
- " 1.102272 | \n",
- " 3.980774 | \n",
- " 1.751827 | \n",
- " 0.082982 | \n",
- " 0.474515 | \n",
- " 0.949866 | \n",
- " 0.379872 | \n",
- " 1.394939 | \n",
- " 0.239189 | \n",
- " 0.831812 | \n",
- " 0.231631 | \n",
- " 1.985357 | \n",
- " 2.112055 | \n",
- " 3.209443 | \n",
- " 3.651924 | \n",
- " 1.369114 | \n",
- " 0.449409 | \n",
- " 2.278484 | \n",
- " 0.187911 | \n",
- " 0.342743 | \n",
- " 0.178030 | \n",
- " 5.858740 | \n",
- " 2.435344 | \n",
- " -0.011125 | \n",
- " 0.203172 | \n",
- " 3.586207 | \n",
- " 2.016004 | \n",
- " 0.292903 | \n",
- " 4.168702 | \n",
- " 2.146975 | \n",
- " 1.026136 | \n",
- " -0.085150 | \n",
- " 4.931133 | \n",
- " 1.664730 | \n",
- " 2.572370 | \n",
- " 0.716705 | \n",
- " 2.225487 | \n",
- " 2.063185 | \n",
- " 0.276354 | \n",
- " 0.272357 | \n",
- " 3.370650 | \n",
- " ... | \n",
- " 0.361146 | \n",
- " -0.096761 | \n",
- " -0.146408 | \n",
- " -0.155304 | \n",
- " -0.074113 | \n",
- " -0.015054 | \n",
- " 1.665584 | \n",
- " -1.102000e-05 | \n",
- " -0.002935 | \n",
- " -0.169603 | \n",
- " 0.145326 | \n",
- " -0.025443 | \n",
- " -0.0 | \n",
- " -5.000000e-08 | \n",
- " -0.161510 | \n",
- " -0.153942 | \n",
- " -0.166410 | \n",
- " 6.376185 | \n",
- " 1.163428 | \n",
- " 1.138928 | \n",
- " 5.270078 | \n",
- " -0.159924 | \n",
- " 1.996964 | \n",
- " -0.000041 | \n",
- " -0.020128 | \n",
- " -0.050640 | \n",
- " 8.771538 | \n",
- " 2.086100 | \n",
- " 8.602596 | \n",
- " 8.060193 | \n",
- " 8.714463 | \n",
- " -0.004688 | \n",
- " 0.136199 | \n",
- " 0.513982 | \n",
- " 5.643821 | \n",
- " -0.002350 | \n",
- " -0.008118 | \n",
- " -0.089103 | \n",
- " -0.156498 | \n",
- " -0.034799 | \n",
- " -0.103022 | \n",
- " 2.710451 | \n",
- " -0.000021 | \n",
- " 4.655519 | \n",
- " -0.159014 | \n",
- " -0.000000e+00 | \n",
- " 5.388407 | \n",
- " -0.112462 | \n",
- " -0.0 | \n",
- " 4.431747 | \n",
- "
\n",
- " \n",
- " | zip/10003 | \n",
- " 10003 | \n",
- " 12.829412 | \n",
- " NY | \n",
- " New York County | \n",
- " New York | \n",
- " 54447 | \n",
- " 40.731829 | \n",
- " -73.989181 | \n",
- " -0.131461 | \n",
- " 1.132039 | \n",
- " 3.512190 | \n",
- " 1.176800 | \n",
- " 0.161716 | \n",
- " 0.459370 | \n",
- " 1.291074 | \n",
- " 0.475496 | \n",
- " 1.146470 | \n",
- " 0.199002 | \n",
- " 0.968685 | \n",
- " 0.249058 | \n",
- " 1.788139 | \n",
- " 2.295915 | \n",
- " 3.039128 | \n",
- " 3.726780 | \n",
- " 1.822461 | \n",
- " 0.546915 | \n",
- " 1.955472 | \n",
- " 0.188880 | \n",
- " 0.198170 | \n",
- " 0.138593 | \n",
- " 5.482426 | \n",
- " 2.644991 | \n",
- " -0.030993 | \n",
- " 0.256750 | \n",
- " 3.747168 | \n",
- " 2.160955 | \n",
- " 0.311789 | \n",
- " 4.131626 | \n",
- " 2.050255 | \n",
- " 0.924002 | \n",
- " -0.077789 | \n",
- " 4.401683 | \n",
- " 1.490332 | \n",
- " 2.379128 | \n",
- " 0.871206 | \n",
- " 2.151970 | \n",
- " 1.929199 | \n",
- " 0.361298 | \n",
- " 0.229650 | \n",
- " 3.038753 | \n",
- " ... | \n",
- " 0.461322 | \n",
- " -0.141985 | \n",
- " -0.085863 | \n",
- " -0.156253 | \n",
- " -0.005502 | \n",
- " -0.046846 | \n",
- " 1.596130 | \n",
- " -4.810000e-06 | \n",
- " -0.089342 | \n",
- " -0.099540 | \n",
- " 0.272569 | \n",
- " -0.009312 | \n",
- " -0.0 | \n",
- " -0.000000e+00 | \n",
- " 0.092946 | \n",
- " -0.169363 | \n",
- " -0.042557 | \n",
- " 6.376621 | \n",
- " 0.606484 | \n",
- " 0.778049 | \n",
- " 5.227136 | \n",
- " -0.169888 | \n",
- " 2.206395 | \n",
- " -0.000619 | \n",
- " -0.166381 | \n",
- " -0.167345 | \n",
- " 8.366828 | \n",
- " 2.290446 | \n",
- " 7.762557 | \n",
- " 7.947579 | \n",
- " 6.410265 | \n",
- " -0.020947 | \n",
- " 0.935508 | \n",
- " 0.415192 | \n",
- " 5.355077 | \n",
- " -0.027503 | \n",
- " -0.000485 | \n",
- " -0.123136 | \n",
- " 0.030198 | \n",
- " -0.001412 | \n",
- " -0.004739 | \n",
- " 2.824203 | \n",
- " -0.000298 | \n",
- " 4.061482 | \n",
- " -0.068253 | \n",
- " -1.350000e-06 | \n",
- " 4.039156 | \n",
- " -0.156848 | \n",
- " -0.0 | \n",
- " 5.094444 | \n",
- "
\n",
- " \n",
- " | zip/10004 | \n",
- " 10004 | \n",
- " 13.851765 | \n",
- " NY | \n",
- " New York County | \n",
- " New York | \n",
- " 4795 | \n",
- " 40.688630 | \n",
- " -74.018244 | \n",
- " -0.147625 | \n",
- " 0.546787 | \n",
- " 3.229492 | \n",
- " 1.886864 | \n",
- " 0.108621 | \n",
- " 0.603010 | \n",
- " 1.442713 | \n",
- " 0.471150 | \n",
- " 1.028882 | \n",
- " 0.411730 | \n",
- " 0.708382 | \n",
- " 0.346837 | \n",
- " 1.798658 | \n",
- " 1.940426 | \n",
- " 3.435431 | \n",
- " 2.693004 | \n",
- " 1.956380 | \n",
- " 0.300429 | \n",
- " 2.173964 | \n",
- " 0.440953 | \n",
- " 0.339811 | \n",
- " 0.275773 | \n",
- " 5.271566 | \n",
- " 2.132799 | \n",
- " -0.018342 | \n",
- " 0.398195 | \n",
- " 3.207484 | \n",
- " 1.488841 | \n",
- " 0.526957 | \n",
- " 4.330931 | \n",
- " 1.806410 | \n",
- " 1.111506 | \n",
- " -0.118441 | \n",
- " 4.121751 | \n",
- " 1.604998 | \n",
- " 1.996196 | \n",
- " 0.674802 | \n",
- " 1.756082 | \n",
- " 2.741566 | \n",
- " 0.211858 | \n",
- " 0.289891 | \n",
- " 2.604892 | \n",
- " ... | \n",
- " 0.393506 | \n",
- " -0.142817 | \n",
- " -0.135064 | \n",
- " -0.018993 | \n",
- " -0.057043 | \n",
- " -0.131630 | \n",
- " 0.758817 | \n",
- " -0.000000e+00 | \n",
- " -0.000007 | \n",
- " -0.134696 | \n",
- " 0.017068 | \n",
- " -0.007268 | \n",
- " -0.0 | \n",
- " -3.080000e-06 | \n",
- " -0.152431 | \n",
- " -0.105453 | \n",
- " -0.141028 | \n",
- " 1.679086 | \n",
- " 0.534155 | \n",
- " 1.851612 | \n",
- " 5.175022 | \n",
- " 1.438296 | \n",
- " 2.622611 | \n",
- " -0.000090 | \n",
- " -0.004313 | \n",
- " -0.119687 | \n",
- " 2.166597 | \n",
- " 3.767899 | \n",
- " 8.035578 | \n",
- " 4.923587 | \n",
- " 6.152561 | \n",
- " -0.007616 | \n",
- " -0.072199 | \n",
- " 0.052521 | \n",
- " 3.980608 | \n",
- " -0.000069 | \n",
- " -0.000103 | \n",
- " -0.027072 | \n",
- " 0.367801 | \n",
- " -0.003337 | \n",
- " -0.010422 | \n",
- " 1.127201 | \n",
- " -0.000079 | \n",
- " 3.496159 | \n",
- " -0.079587 | \n",
- " -8.000000e-08 | \n",
- " 6.799802 | \n",
- " -0.078682 | \n",
- " -0.0 | \n",
- " 4.140815 | \n",
- "
\n",
- " \n",
- " | zip/10005 | \n",
- " 10005 | \n",
- " 13.730000 | \n",
- " NY | \n",
- " New York County | \n",
- " New York | \n",
- " 8637 | \n",
- " 40.705974 | \n",
- " -74.008768 | \n",
- " -0.142861 | \n",
- " 0.778521 | \n",
- " 3.640563 | \n",
- " 1.774548 | \n",
- " 0.103140 | \n",
- " 0.462198 | \n",
- " 1.238112 | \n",
- " 0.362621 | \n",
- " 1.067066 | \n",
- " 0.240392 | \n",
- " 1.012144 | \n",
- " 0.450193 | \n",
- " 2.065349 | \n",
- " 2.076180 | \n",
- " 3.030134 | \n",
- " 3.465373 | \n",
- " 1.888940 | \n",
- " 0.374023 | \n",
- " 2.323722 | \n",
- " 0.307162 | \n",
- " 0.243969 | \n",
- " 0.205338 | \n",
- " 5.640188 | \n",
- " 2.588103 | \n",
- " -0.015375 | \n",
- " 0.324594 | \n",
- " 3.737864 | \n",
- " 1.677935 | \n",
- " 0.379659 | \n",
- " 4.378235 | \n",
- " 2.042296 | \n",
- " 1.018322 | \n",
- " -0.109311 | \n",
- " 4.715398 | \n",
- " 1.757364 | \n",
- " 2.500815 | \n",
- " 0.888996 | \n",
- " 1.959577 | \n",
- " 2.203878 | \n",
- " 0.227336 | \n",
- " 0.243604 | \n",
- " 3.124810 | \n",
- " ... | \n",
- " -0.117290 | \n",
- " -0.084119 | \n",
- " -0.169264 | \n",
- " -0.115257 | \n",
- " -0.010591 | \n",
- " -0.026248 | \n",
- " 1.397036 | \n",
- " -0.000000e+00 | \n",
- " -0.000242 | \n",
- " -0.167469 | \n",
- " 0.550521 | \n",
- " -0.013257 | \n",
- " -0.0 | \n",
- " -3.500000e-07 | \n",
- " -0.156989 | \n",
- " 0.212693 | \n",
- " -0.133220 | \n",
- " 3.382343 | \n",
- " 0.718064 | \n",
- " 0.732648 | \n",
- " 5.014384 | \n",
- " 0.157389 | \n",
- " 2.529759 | \n",
- " -0.000866 | \n",
- " -0.002382 | \n",
- " -0.072132 | \n",
- " 3.905590 | \n",
- " 2.831104 | \n",
- " 8.165786 | \n",
- " 6.136455 | \n",
- " 6.366154 | \n",
- " -0.007492 | \n",
- " 0.567445 | \n",
- " -0.101746 | \n",
- " 3.834963 | \n",
- " -0.009529 | \n",
- " -0.000505 | \n",
- " 0.028645 | \n",
- " 1.454115 | \n",
- " -0.001952 | \n",
- " -0.011845 | \n",
- " 2.385040 | \n",
- " -0.000017 | \n",
- " 3.454340 | \n",
- " -0.159854 | \n",
- " -1.320000e-06 | \n",
- " 7.295258 | \n",
- " -0.169108 | \n",
- " -0.0 | \n",
- " 3.934241 | \n",
- "
\n",
- " \n",
- "
\n",
- "
5 rows × 338 columns
\n",
- "
\n",
- "
\n",
- "
\n"
- ],
- "application/vnd.google.colaboratory.intrinsic+json": {
- "type": "dataframe",
- "variable_name": "data"
- }
- },
- "metadata": {},
- "execution_count": 17
- }
- ]
- },
- {
- "cell_type": "code",
- "source": [
- "data.shape"
- ],
- "metadata": {
- "id": "UUKLoxY6JkUt",
- "outputId": "6518e2c5-8dff-4d07-851f-c486f2da50b2",
- "colab": {
- "base_uri": "https://localhost:8080/"
- }
- },
- "execution_count": 18,
- "outputs": [
- {
- "output_type": "execute_result",
- "data": {
- "text/plain": [
- "(29298, 338)"
- ]
- },
- "metadata": {},
- "execution_count": 18
- }
- ]
- },
- {
- "cell_type": "code",
- "source": [
- "# The embedding columns are named feature0 ... feature329 (330 dimensions).\n",
- "embedding_features = [f\"feature{idx}\" for idx in range(330)]\n",
- "# Regression target: the PM2.5 column of the joined frame.\n",
- "label = \"pm25\""
- ],
- "metadata": {
- "id": "NvHQ97WYJmOd"
- },
- "execution_count": 19,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "data = data.dropna(subset=[label])"
- ],
- "metadata": {
- "id": "35igKORxJ15A"
- },
- "execution_count": 20,
- "outputs": []
- },
- {
- "cell_type": "markdown",
- "source": [
- "# Split Train and Test Data"
- ],
- "metadata": {
- "id": "tuCYm8dcevqo"
- }
- },
- {
- "cell_type": "code",
- "source": [
- "# Restrict the frame to the model inputs plus the target column.\n",
- "data = data[embedding_features + [label]]\n",
- "X, y = data[embedding_features], data[label]\n",
- "\n",
- "# Hold out 20% of the rows for testing; the fixed seed keeps the split\n",
- "# identical across runs.\n",
- "X_train, X_test, y_train, y_test = train_test_split(\n",
- "    X,\n",
- "    y,\n",
- "    test_size=0.2,\n",
- "    random_state=42,\n",
- ")"
- ],
- "metadata": {
- "id": "IgSmPhJTJ2QW"
- },
- "execution_count": 21,
- "outputs": []
- },
- {
- "cell_type": "markdown",
- "source": [
- "# Fit K-Nearest Neighbors Model"
- ],
- "metadata": {
- "id": "iHys75z1fFFJ"
- }
- },
- {
- "cell_type": "code",
- "source": [
- "# Number of nearest neighbours used for each prediction.\n",
- "k = 5\n",
- "\n",
- "# fit() returns the estimator itself, so construction and training chain.\n",
- "model = KNeighborsRegressor(n_neighbors=k).fit(X_train, y_train)\n",
- "y_pred = model.predict(X_test)"
- ],
- "metadata": {
- "id": "KCgs-cyoJ5Nm"
- },
- "execution_count": 22,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# Pair the held-out targets with the KNN predictions, then score them.\n",
- "evaluation_df = pd.DataFrame(data={\"y\": y_test, \"y_pred\": y_pred})\n",
- "metrics = evaluate_model(evaluation_df)\n",
- "print(metrics)"
- ],
- "metadata": {
- "id": "TkWTkqPCKEYu",
- "outputId": "01404579-141a-41c3-e7ac-d1576130f5f9",
- "colab": {
- "base_uri": "https://localhost:8080/"
- }
- },
- "execution_count": 23,
- "outputs": [
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "{'r2': 0.9423382724431623, 'r': 0.9708871328512507, 'rmse': 0.6494730499555952, 'mae': 0.4204252179218217, 'mape': 0.05619889943563576}\n"
- ]
- }
- ]
- },
- {
- "cell_type": "markdown",
- "source": [
- "# Evaluate K-Nearest Neighbors Model"
- ],
- "metadata": {
- "id": "ywbC7hLSfMZB"
- }
- },
- {
- "cell_type": "code",
- "source": [
- "# Same 0-30 range on both axes so actual and predicted share one scale.\n",
- "xy_lim = (0, 30)\n",
- "plot_actual_vs_predicted(\n",
- "    evaluation_df,\n",
- "    title=\"Actual vs Predicted PM2.5\",\n",
- "    x_label=\"Actual PM2.5\",\n",
- "    y_label=\"Predicted PM2.5\",\n",
- "    xlim=xy_lim,\n",
- "    ylim=xy_lim,\n",
- ")"
- ],
- "metadata": {
- "id": "LZORS0JQKjQ4",
- "outputId": "05aa4e15-7008-4663-a364-4406ef00a5a3",
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 817
- }
- },
- "execution_count": 28,
- "outputs": [
- {
- "output_type": "display_data",
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- " \n",
- "\n",
- ""
- ]
- },
- "metadata": {}
- }
- ]
- },
- {
- "cell_type": "code",
- "source": [],
- "metadata": {
- "id": "SjOVAvPYkDYx"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "markdown",
- "source": [
- "# Fit Random Forest Regressor model"
- ],
- "metadata": {
- "id": "LPXkNlenjGp_"
- }
- },
- {
- "cell_type": "code",
- "source": [
- "# Seed the forest: its bootstrap sampling is random, so without\n",
- "# random_state the reported metrics change on every run.\n",
- "model = RandomForestRegressor(\n",
- "    n_estimators=10,\n",
- "    random_state=42,  # same seed as the train/test split\n",
- "    verbose=10,\n",
- "    n_jobs=-1,\n",
- ")\n",
- "model.fit(X_train, y_train)\n",
- "\n",
- "y_pred = model.predict(X_test)"
- ],
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "M7S4naYnjN4l",
- "outputId": "55c7d7d8-b304-4fc2-d8f6-5ed014577bf9"
- },
- "execution_count": 29,
- "outputs": [
- {
- "output_type": "stream",
- "name": "stderr",
- "text": [
- "[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.\n"
- ]
- },
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "building tree 1 of 10building tree 2 of 10\n",
- "\n"
- ]
- },
- {
- "output_type": "stream",
- "name": "stderr",
- "text": [
- "[Parallel(n_jobs=-1)]: Done 1 tasks | elapsed: 20.0s\n"
- ]
- },
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "building tree 3 of 10\n",
- "building tree 4 of 10\n",
- "building tree 5 of 10\n"
- ]
- },
- {
- "output_type": "stream",
- "name": "stderr",
- "text": [
- "[Parallel(n_jobs=-1)]: Done 4 tasks | elapsed: 38.8s\n"
- ]
- },
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "building tree 6 of 10\n",
- "building tree 7 of 10\n",
- "building tree 8 of 10\n",
- "building tree 9 of 10\n",
- "building tree 10 of 10\n"
- ]
- },
- {
- "output_type": "stream",
- "name": "stderr",
- "text": [
- "[Parallel(n_jobs=-1)]: Done 10 out of 10 | elapsed: 1.6min finished\n",
- "[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.\n",
- "[Parallel(n_jobs=2)]: Done 1 tasks | elapsed: 0.0s\n",
- "[Parallel(n_jobs=2)]: Done 4 tasks | elapsed: 0.0s\n",
- "[Parallel(n_jobs=2)]: Done 10 out of 10 | elapsed: 0.0s finished\n"
- ]
- }
- ]
- },
- {
- "cell_type": "code",
- "source": [
- "evaluation_df = pd.DataFrame({\"y\": y_test, \"y_pred\": y_pred})\n",
- "# Evaluate the model\n",
- "metrics = evaluate_model(evaluation_df)\n",
- "print(metrics)"
- ],
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "CaxFfeEtjRiH",
- "outputId": "590e443f-6f93-46d9-ccfe-d1723600d0a3"
- },
- "execution_count": 30,
- "outputs": [
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "{'r2': 0.9086928793143653, 'r': 0.9539294211355118, 'rmse': 0.817277670967286, 'mae': 0.568816793219495, 'mape': 0.07572915522396176}\n"
- ]
- }
- ]
- },
- {
- "cell_type": "markdown",
- "source": [
- "# Evaluate Random Forest Model"
- ],
- "metadata": {
- "id": "v96jVyGRjWtE"
- }
- },
- {
- "cell_type": "code",
- "source": [
- "xy_lim = (0, 30)\n",
- "plot_actual_vs_predicted(\n",
- " evaluation_df,\n",
- " xlim=xy_lim,\n",
- " ylim=xy_lim,\n",
- " title=\"Actual vs Predicted PM2.5\",\n",
- " x_label=\"Actual PM2.5\",\n",
- " y_label=\"Predicted PM2.5\",\n",
- ")"
- ],
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 817
- },
- "id": "bVnYyd-zjb85",
- "outputId": "76b5b25d-a9d7-4c16-c1d7-015e814f278d"
- },
- "execution_count": 31,
- "outputs": [
- {
- "output_type": "display_data",
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- " \n",
- "\n",
- ""
- ]
- },
- "metadata": {}
- }
- ]
- }
- ],
- "metadata": {
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "view-in-github"
+ },
+ "source": [
+ "
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "tE3akitQdA-m"
+ },
+ "source": [
+ "**Predicting US PM2.5 levels using Google's Population Dynamics Foundation Model**\n",
+ "\n",
+ "Useful Resources:\n",
+ "1. https://github.com/opengeos/GeoAI-Tutorials/blob/main/docs/PDFM/zillow_home_value.ipynb\n",
+ "2. https://github.com/google-research/population-dynamics/tree/master/notebooks"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "TbTs9lKqddKS"
+ },
+ "source": [
+ "Acknowledgements:\n",
+ "This notebook is based on two tutorials: the [PDFM notebooks](https://github.com/google-research/population-dynamics/tree/master/notebooks) and the excellent opengeos PDFM tutorial by giswqs, [Zillow home value](https://github.com/opengeos/GeoAI-Tutorials/blob/main/docs/PDFM/zillow_home_value.ipynb)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "JqjIF4kAKZGR"
+ },
+ "outputs": [],
+ "source": [
+ "%%capture\n",
+ "%pip install leafmap"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "lIYdn1woOS1n"
+ },
+ "outputs": [],
+ "source": [
+ "# import libraries\n",
+ "import pandas as pd\n",
+ "import os\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn.linear_model import LinearRegression\n",
+ "from sklearn.ensemble import RandomForestRegressor\n",
+ "from sklearn.neighbors import KNeighborsRegressor\n",
+ "from leafmap.common import evaluate_model, plot_actual_vs_predicted, download_file"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "0z3FKT1fgePa"
+ },
+ "source": [
+ "# Get US PM2.5 data\n",
+ "Link to data: https://usc-geohealth-hub-uscssi.hub.arcgis.com/documents/7fc448343d6643f3bb13157fd65aed4f/about"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "8VYkB_TeA0kP"
+ },
+ "outputs": [],
+ "source": [
+ "# Parse the workbook once: passing a list of sheet names makes read_excel return a\n",
+ "# dict of DataFrames keyed by sheet name, instead of re-reading the same .xlsx five times.\n",
+ "pm25_sheet_names = [f\"data_part{part}\" for part in range(1, 6)]\n",
+ "pm25_sheets = pd.read_excel(\"/content/pm25_and_disparity.xlsx\", sheet_name=pm25_sheet_names)\n",
+ "df0, df1, df2, df3, df4 = (pm25_sheets[name] for name in pm25_sheet_names)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "GYC_GWkdgodj"
+ },
+ "source": [
+ "# Process PM2.5 data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
"colab": {
- "provenance": [],
- "include_colab_link": true
- },
- "kernelspec": {
- "display_name": "Python 3",
- "name": "python3"
- }
+ "base_uri": "https://localhost:8080/",
+ "height": 206
+ },
+ "id": "QE3MZiRYA6q1",
+ "outputId": "b194649f-7e4a-4dfe-e1ed-8bc825fd5cb5"
+ },
+ "outputs": [],
+ "source": [
+ "df = pd.concat([df0, df1, df2, df3, df4], ignore_index=True)\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "p9NCYMyxGBQB",
+ "outputId": "530a1697-c230-4199-d9f2-1cd07130d07e"
+ },
+ "outputs": [],
+ "source": [
+ "df.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "ip7NBwl-EDsf",
+ "outputId": "4f5b299d-b6a3-4859-fa0a-f14b96bf3221"
+ },
+ "outputs": [],
+ "source": [
+ "df[\"zcta\"].nunique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 272
+ },
+ "id": "8qtI9VgiCcvj",
+ "outputId": "c2365f08-5de6-45aa-a4fb-1d5b3f7a86a4"
+ },
+ "outputs": [],
+ "source": [
+ "pm25_df = df.groupby(\"zcta\")[\"pm25\"].mean()  # select pm25 first so no other (possibly non-numeric) column is averaged\n",
+ "pm25_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 272
+ },
+ "id": "6-qSy8lVDDoM",
+ "outputId": "1f086442-221a-4f64-ee9d-db456bb21808"
+ },
+ "outputs": [],
+ "source": [
+ "pm25_df = pm25_df.dropna()  # reassign rather than mutate in place; drops ZCTAs whose mean pm25 is NaN\n",
+ "pm25_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 290
+ },
+ "id": "5kAxQ3uVGswD",
+ "outputId": "5c326987-a0c0-4f4b-f6ec-8094288375c3"
+ },
+ "outputs": [],
+ "source": [
+ "pm25_df.index = pm25_df.index.astype(int)\n",
+ "print(pm25_df.shape)\n",
+ "pm25_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 238
+ },
+ "id": "dBOUhYaOIU0-",
+ "outputId": "32a40749-015a-4d2a-b27f-0ac61bf8b7dc"
+ },
+ "outputs": [],
+ "source": [
+ "pm25_df = pm25_df.reset_index(drop=False)  # keep zcta as a regular column next to pm25\n",
+ "pm25_df.index = pm25_df[\"zcta\"].apply(lambda x: f\"zip/{x:05d}\")  # zero-pad: the int cast dropped leading zeros (1001 -> zip/01001); assumes 5-digit place ids, TODO confirm\n",
+ "pm25_df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "0CUwjIxaeWCm"
+ },
+ "source": [
+ "# Request access to PDFM Embeddings"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "yxEkU1GjibgC",
+ "outputId": "b0a29cdb-1b14-49d1-9302-604ca2ead218"
+ },
+ "outputs": [],
+ "source": [
+ "!unzip -o /content/pdfm_embeddings.zip -d /content"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "-DugeFaSG2Pi"
+ },
+ "outputs": [],
+ "source": [
+ "embeddings_file_path = \"/content/pdfm_embeddings/v0/us/zcta_embeddings.csv\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "JZZX7tzlHYa9"
+ },
+ "outputs": [],
+ "source": [
+ "if not os.path.exists(embeddings_file_path):\n",
+ " raise FileNotFoundError(\"Please request the embeddings from Google\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 374
+ },
+ "id": "cWNluHJYHY57",
+ "outputId": "3d62af41-8d41-40c7-ab82-5b9174406cfd"
+ },
+ "outputs": [],
+ "source": [
+ "zipcode_embeddings = pd.read_csv(embeddings_file_path).set_index(\"place\")\n",
+ "zipcode_embeddings.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "L7XAkE0fecVU"
+ },
+ "source": [
+ "# Join PDFM embeddings and Ground Truth (PM2.5 data)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 429
+ },
+ "id": "KjwmUj5SH9-P",
+ "outputId": "626b6758-821c-486e-9f2b-0b9f3366f433"
+ },
+ "outputs": [],
+ "source": [
+ "data = pm25_df.join(zipcode_embeddings, how=\"inner\")\n",
+ "data.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "UUKLoxY6JkUt",
+ "outputId": "6518e2c5-8dff-4d07-851f-c486f2da50b2"
+ },
+ "outputs": [],
+ "source": [
+ "data.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "NvHQ97WYJmOd"
+ },
+ "outputs": [],
+ "source": [
+ "embedding_features = [f\"feature{x}\" for x in range(330)]\n",
+ "label = \"pm25\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "35igKORxJ15A"
+ },
+ "outputs": [],
+ "source": [
+ "data = data.dropna(subset=[label])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "tuCYm8dcevqo"
+ },
+ "source": [
+ "# Split Train and Test Data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "IgSmPhJTJ2QW"
+ },
+ "outputs": [],
+ "source": [
+ "data = data[embedding_features + [label]]\n",
+ "X = data[embedding_features]\n",
+ "y = data[label]\n",
+ "\n",
+ "X_train, X_test, y_train, y_test = train_test_split(\n",
+ " X, y, test_size=0.2, random_state=42\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "iHys75z1fFFJ"
+ },
+ "source": [
+ "# Fit K-Nearest Neighbors Model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "KCgs-cyoJ5Nm"
+ },
+ "outputs": [],
+ "source": [
+ "k = 5\n",
+ "model = KNeighborsRegressor(n_neighbors=k)\n",
+ "model.fit(X_train, y_train)\n",
+ "\n",
+ "y_pred = model.predict(X_test)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "TkWTkqPCKEYu",
+ "outputId": "01404579-141a-41c3-e7ac-d1576130f5f9"
+ },
+ "outputs": [],
+ "source": [
+ "evaluation_df = pd.DataFrame({\"y\": y_test, \"y_pred\": y_pred})\n",
+ "# Evaluate the model\n",
+ "metrics = evaluate_model(evaluation_df)\n",
+ "print(metrics)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "ywbC7hLSfMZB"
+ },
+ "source": [
+ "# Evaluate K-Nearest Neighbors Model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 817
+ },
+ "id": "LZORS0JQKjQ4",
+ "outputId": "05aa4e15-7008-4663-a364-4406ef00a5a3"
+ },
+ "outputs": [],
+ "source": [
+ "xy_lim = (0, 30)\n",
+ "plot_actual_vs_predicted(\n",
+ " evaluation_df,\n",
+ " xlim=xy_lim,\n",
+ " ylim=xy_lim,\n",
+ " title=\"Actual vs Predicted PM2.5\",\n",
+ " x_label=\"Actual PM2.5\",\n",
+ " y_label=\"Predicted PM2.5\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "SjOVAvPYkDYx"
+ },
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "LPXkNlenjGp_"
+ },
+ "source": [
+ "# Fit Random Forest Regressor model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "M7S4naYnjN4l",
+ "outputId": "55c7d7d8-b304-4fc2-d8f6-5ed014577bf9"
+ },
+ "outputs": [],
+ "source": [
+ "model = RandomForestRegressor(n_estimators=10, random_state=42, verbose=10, n_jobs=-1)  # seeded (same value as the train/test split) so metrics are reproducible\n",
+ "model.fit(X_train, y_train)\n",
+ "\n",
+ "y_pred = model.predict(X_test)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "CaxFfeEtjRiH",
+ "outputId": "590e443f-6f93-46d9-ccfe-d1723600d0a3"
+ },
+ "outputs": [],
+ "source": [
+ "evaluation_df = pd.DataFrame({\"y\": y_test, \"y_pred\": y_pred})\n",
+ "# Evaluate the model\n",
+ "metrics = evaluate_model(evaluation_df)\n",
+ "print(metrics)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "v96jVyGRjWtE"
+ },
+ "source": [
+ "# Evaluate Random Forest Model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 817
+ },
+ "id": "bVnYyd-zjb85",
+ "outputId": "76b5b25d-a9d7-4c16-c1d7-015e814f278d"
+ },
+ "outputs": [],
+ "source": [
+ "xy_lim = (0, 30)\n",
+ "plot_actual_vs_predicted(\n",
+ " evaluation_df,\n",
+ " xlim=xy_lim,\n",
+ " ylim=xy_lim,\n",
+ " title=\"Actual vs Predicted PM2.5\",\n",
+ " x_label=\"Actual PM2.5\",\n",
+ " y_label=\"Predicted PM2.5\",\n",
+ ")"
+ ]
+ }
+ ],
+ "metadata": {
+ "colab": {
+ "include_colab_link": true,
+ "provenance": []
},
- "nbformat": 4,
- "nbformat_minor": 0
-}
\ No newline at end of file
+ "kernelspec": {
+ "display_name": "Python 3",
+ "name": "python3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}