# Extract coordinates from NFI data


In [1]:
import pandas as pd
import glob
from datetime import datetime
import os.path
import geopandas as gpd
from shapely.geometry import Point
from pyprojroot.here import here

import sys

# Make the project's src/ directory importable. The path is relative, so it is
# resolved against the kernel's working directory (presumably this notebook's
# folder — verify before running from elsewhere).
sys.path.insert(0, "../../src")
# Star import: helper names used in later cells (e.g. get_latest_nfi_raw_data)
# presumably come from this module.
from nfi_wrangling import *

## Get latest data


In [2]:
# Load the most recent raw NFI dataset through the project helper.
# Judging by the printed cell output, the helper picks the newest
# "*nfi_dataset_for_analysis*" CSV and reports its age — TODO confirm in src/nfi_wrangling.
nfi_data_raw = get_latest_nfi_raw_data()
# Bare last expression: the notebook renders the frame as the cell output
nfi_data_raw

👉 Latest file is 20231201-103649_nfi_dataset_for_analysis copy.csv
👉 Created on Wednesday 2023-12-13, 17:49 which is 2 days ago.


  nfi_data_raw = pd.read_csv(latest_file, index_col=0)


# Save files with IDP (potentially duplicated coords)


In [3]:
# Raw column name -> column name used in the exported site tables
raw_to_site_columns = {
    "visit_1": "first_year",
    "lon": "x",
    "lat": "y",
    "lon_fr": "x_fr",
    "lat_fr": "y_fr",
}

# One row per distinct combination of idp, first year and coordinates
nfi_data = (
    nfi_data_raw[["idp", *raw_to_site_columns]]
    .rename(columns=raw_to_site_columns)
    .drop_duplicates()
    .reset_index(drop=True)
)

# Table that keeps idp: for later merging of NFI data and predictor data.
# SiteID is the 1-based row position in nfi_data.
# The column order is kept as it was; it is unclear whether anything downstream
# depends on it.
with_idp_columns = ["first_year", "SiteID", "x", "y", "idp", "x_fr", "y_fr"]
nfi_data_with_idp = nfi_data.assign(SiteID=nfi_data.index + 1)[with_idp_columns]

nfi_data_with_idp.to_csv("nfi_final_sites_with_idp.csv", index=False)
# nfi_data_with_idp

In [7]:
# Also save as GeoJSON.
# Point geometries are built from the x_fr / y_fr columns with the vectorized
# gpd.points_from_xy rather than a Python-level loop creating one Point per row.
gdf = gpd.GeoDataFrame(
    nfi_data_with_idp,
    geometry=gpd.points_from_xy(nfi_data_with_idp.x_fr, nfi_data_with_idp.y_fr),
    crs="EPSG:2154",  # Set the coordinate reference system to RGF93 / Lambert-93 projection
)
gdf.to_file("nfi_final_sites_with_idp_epsg2154.geojson", driver="GeoJSON")

# Save files without IDP (no duplicated coords)


In [9]:
# Table without idp: for faster extraction of predictor data.
# Once idp is dropped, rows that become identical are collapsed (first one kept).
unique_sites = nfi_data.drop(columns="idp").drop_duplicates()

# SiteID is taken from the index the row had in nfi_data *before* renumbering
# (1-based), so the IDs are not necessarily contiguous after deduplication.
without_idp_columns = ["first_year", "SiteID", "x", "y", "x_fr", "y_fr"]
nfi_data_without_idp = unique_sites.assign(SiteID=unique_sites.index + 1)[
    without_idp_columns
].reset_index(drop=True)

nfi_data_without_idp.to_csv("nfi_final_sites_without_idp.csv", index=False)

In [10]:
# Also save as GeoJSON.
# Point geometries are built from the x_fr / y_fr columns with the vectorized
# gpd.points_from_xy rather than a Python-level loop creating one Point per row.
gdf = gpd.GeoDataFrame(
    nfi_data_without_idp,
    geometry=gpd.points_from_xy(
        nfi_data_without_idp.x_fr, nfi_data_without_idp.y_fr
    ),
    crs="EPSG:2154",  # Set the coordinate reference system to RGF93 / Lambert-93 projection
)
gdf.to_file("nfi_final_sites_without_idp_epsg2154.geojson", driver="GeoJSON")

# Check re-reading files


In [66]:
pd.read_csv("nfi_final_sites_with_idp.csv")

Unnamed: 0,first_year,SiteID,x,y,idp,x_fr,y_fr
0,2011,1,-2.842824,48.337505,632691,2.674150e+05,6.820144e+06
1,2012,2,3.349757,46.198025,702597,7.269712e+05,6.566524e+06
2,2012,3,3.361577,46.827747,706240,7.275625e+05,6.636462e+06
3,2012,4,0.402182,48.201563,708321,5.070276e+05,6.792198e+06
4,2012,5,6.461081,43.281648,708369,9.810095e+05,6.248657e+06
...,...,...,...,...,...,...,...
40017,2016,40018,3.380200,47.790156,1131396,7.284673e+05,6.743375e+06
40018,2016,40019,2.156438,42.686748,1131409,6.307829e+05,6.176717e+06
40019,2016,40020,7.424664,47.899971,1131410,1.030439e+06,6.764769e+06
40020,2016,40021,3.757202,44.924973,1131424,7.597461e+05,6.425373e+06


In [61]:
pd.read_csv("nfi_final_sites_without_idp.csv")

Unnamed: 0,first_year,SiteID,x,y
0,2011,1,-2.842824,48.337505
1,2012,2,3.349757,46.198025
2,2012,3,3.361577,46.827747
3,2012,4,0.402182,48.201563
4,2012,5,6.461081,43.281648
...,...,...,...,...
39280,2016,40018,3.380200,47.790156
39281,2016,40019,2.156438,42.686748
39282,2016,40020,7.424664,47.899971
39283,2016,40021,3.757202,44.924973


---

The code below was a one-off fix for the old formatting structure of the data. Do not rerun it.


In [51]:
# old_data = pd.read_csv(here("python/01_download_raw_gee_data/sites_years.csv"))
# old_data = old_data.rename(columns={"first_visit": "first_year", "id": "SiteID"})
# # old_data

In [52]:
# old_data = old_data.merge(
#     nfi_data[["x", "y", "first_year", "idp"]].drop_duplicates(),
#     on=["x", "y", "first_year"],
#     how="left",
# )
# old_data

In [53]:
# # Save data with idp attached to it for later merging of predictor data and nfi data
# old_data.to_csv("nfi_final_sites_with_idp.csv", index=False)

# # Save data without idp attached for faster download and extraction of predictor data
# old_data[["first_year", "SiteID", "x", "y"]].drop_duplicates().to_csv(
#     "nfi_final_sites_without_idp.csv", index=False
# )

In [54]:
# pd.read_csv("nfi_final_sites_with_idp.csv")

In [55]:
# pd.read_csv("nfi_final_sites_without_idp.csv")