In [5]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import requests
import os
from shapely.geometry import Point

# === Configuration ===
# Raw string literals avoid doubled backslashes; the path values are unchanged.
INPUT_CSV = r"C:\Users\mdcoe\thesis-orchids\data\gbif\epidendrum\occurrences\occurrences_final.csv"
OUTPUT_CSV = r"C:\Users\mdcoe\thesis-orchids\data\gbif\epidendrum\occurrences\occurrences_flagged.csv"
MAPS_FOLDER = r"C:\Users\mdcoe\thesis-orchids\data\gbif\epidendrum\species_maps"
WORLD_SHP = r"C:\Users\mdcoe\thesis-orchids\data\scripts\ne_110m_admin_0_countries\ne_110m_admin_0_countries.shp"

# Make sure the output folder for the per-species maps exists.
os.makedirs(MAPS_FOLDER, exist_ok=True)

# === Load occurrence data ===
# Keep only rows that have coordinates, a species name and a country.
df = pd.read_csv(INPUT_CSV).dropna(
    subset=[
        'decimalLatitude',
        'decimalLongitude',
        'species',
        'country',
    ]
)

# === Function to get known countries from GBIF ===
_GBIF_API_ROOT = "https://api.gbif.org/v1"
# requests has no default timeout; without one a stalled connection hangs the run.
_GBIF_TIMEOUT_SECONDS = 30

# ISO 3166-1 alpha-2 code -> GBIF country title, filled on first successful lookup.
_gbif_country_titles_cache = {}


def _gbif_country_titles():
    """Return GBIF's ISO-2 code -> country title mapping, fetched at most once.

    Best effort: if the lookup fails, an empty mapping is returned (and nothing
    is cached), so callers fall back to the raw ISO codes.
    """
    if _gbif_country_titles_cache:
        return _gbif_country_titles_cache

    try:
        response = requests.get(
            f"{_GBIF_API_ROOT}/enumeration/country",
            timeout=_GBIF_TIMEOUT_SECONDS,
        )
        response.raise_for_status()
        entries = response.json()
        titles = {
            entry["iso2"]: entry["title"]
            for entry in entries
            if isinstance(entry, dict) and entry.get("iso2") and entry.get("title")
        }
    except (requests.RequestException, ValueError, TypeError) as e:
        print(f"❌ Error fetching GBIF country names: {e}")
        return {}

    _gbif_country_titles_cache.update(titles)
    return _gbif_country_titles_cache


def get_gbif_countries(species_name):
    """Return the countries in which GBIF holds occurrences of a species.

    Queries the GBIF occurrence search with a ``country`` facet and ``limit=0``
    so only the per-country counts are returned, not the records themselves.

    NOTE: GBIF reports this facet as ISO 3166-1 alpha-2 codes (e.g. ``"BR"``),
    not country names. The returned set therefore contains both the codes and
    the corresponding GBIF country titles (e.g. ``"Brazil"``), so membership
    tests work whether the caller's data holds codes or full names.

    Args:
        species_name: Scientific name passed as GBIF's ``scientificName`` filter.

    Returns:
        A set of country codes and names; an empty set if the request fails or
        GBIF reports no country facet for the name.
    """
    params = {
        "scientificName": species_name,
        "limit": 0,
        "facet": "country",
        "facetLimit": 200,
    }
    try:
        response = requests.get(
            f"{_GBIF_API_ROOT}/occurrence/search",
            params=params,
            timeout=_GBIF_TIMEOUT_SECONDS,
        )
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON.
        print(f"❌ Error fetching GBIF data for {species_name}: {e}")
        return set()

    facets = data.get('facets') if isinstance(data, dict) else None
    if not facets:
        # Report the real condition instead of an IndexError from facets[0].
        print(f"⚠️ GBIF returned no country facet for {species_name}")
        return set()

    codes = {
        count['name']
        for count in facets[0].get('counts', [])
        if count.get('name')
    }
    titles = _gbif_country_titles()
    names = {titles[code] for code in codes if code in titles}
    return codes | names

# === Flag suspicious records ===
def flag_suspicious_records(df):
    """Flag occurrences recorded outside the species' GBIF-reported countries.

    For every distinct species in ``df`` the known countries are fetched with
    ``get_gbif_countries``; rows of that species whose ``country`` is not in
    the set are marked suspicious. Species for which no countries could be
    obtained are left unflagged.

    Args:
        df: Occurrence table with ``species`` and ``country`` columns and,
            optionally, ``basisOfRecord``. It is modified in place: the
            ``suspicious`` (bool) and ``suspicious_reason`` (str) columns are
            added or overwritten.

    Returns:
        A ``(df, species_country_map)`` tuple, where ``species_country_map``
        maps each species name to the set returned by ``get_gbif_countries``.
    """
    species_country_map = {}

    # Create the columns up front so they exist (with a stable dtype) even for
    # an empty table or when no species has a known range.
    df['suspicious'] = False
    df['suspicious_reason'] = ''

    if 'basisOfRecord' in df.columns:
        # Strip spaces/underscores so "PRESERVED_SPECIMEN", "Preserved Specimen"
        # and "PreservedSpecimen" are all recognised.
        basis = (
            df['basisOfRecord']
            .astype(str)
            .str.lower()
            .str.replace(r'[\s_]+', '', regex=True)
        )
        is_preserved = basis.str.contains('preservedspecimen', regex=False)
    else:
        is_preserved = pd.Series(False, index=df.index)

    for species in df['species'].dropna().unique():
        print(f"\n🔎 Checking species: {species}")
        known_countries = get_gbif_countries(species)
        species_country_map[species] = known_countries

        if not known_countries:
            # Without a reference range nothing can be judged; do not report
            # every record as flagged.
            print(f"⚠️ No known countries for {species} — records left unflagged")
            continue

        outside = (df['species'] == species) & ~df['country'].isin(known_countries)
        df.loc[outside, 'suspicious'] = True
        df.loc[outside & is_preserved, 'suspicious_reason'] = (
            'Outside known range (Preserved Specimen)'
        )
        df.loc[outside & ~is_preserved, 'suspicious_reason'] = 'Outside known range'

        # Count what was actually flagged.
        print(f"✅ Flagged {int(outside.sum())} suspicious occurrences")

    return df, species_country_map

# === Generate maps per species ===
def generate_species_maps(df, species_country_map):
    """Render one world map per species and save it as a PNG in MAPS_FOLDER.

    Each map shows the world basemap read from WORLD_SHP, the countries in the
    species' known set (matched against the shapefile's ``ADMIN`` column) in
    light green, all occurrences in blue, and suspicious ones in red.

    Args:
        df: Occurrence table with ``species``, ``decimalLongitude``,
            ``decimalLatitude`` and ``suspicious`` columns.
        species_country_map: Mapping of species name to its set of known
            countries. Species with no rows in ``df`` are skipped.
    """
    # Local import: only needed to build a legend entry for the polygon layer.
    from matplotlib.patches import Patch

    world = gpd.read_file(WORLD_SHP)

    for species, known_countries in species_country_map.items():
        df_sp = df[df['species'] == species]
        if df_sp.empty:
            print(f"⚠️ No occurrences found for {species} — skipping.")
            continue

        # Copy so attaching the geometry column never writes into a view of df.
        gdf = gpd.GeoDataFrame(
            df_sp.copy(),
            geometry=gpd.points_from_xy(df_sp.decimalLongitude, df_sp.decimalLatitude),
            crs="EPSG:4326",
        )
        # .eq(True) also works when the column is object-typed or holds NaN.
        suspicious_gdf = gdf[gdf['suspicious'].eq(True)]

        fig, ax = plt.subplots(figsize=(12, 8))
        try:
            world.plot(ax=ax, color='lightgrey', edgecolor='black')

            legend_handles = []

            # Plot GBIF known countries
            if known_countries:
                country_shapes = world[world['ADMIN'].isin(known_countries)]
                if not country_shapes.empty:
                    country_shapes.plot(ax=ax, color='lightgreen', alpha=0.5)
                    # Polygon layers are drawn as a collection the automatic
                    # legend cannot represent, so add an explicit proxy patch.
                    legend_handles.append(
                        Patch(facecolor='lightgreen', alpha=0.5, label='GBIF Known Range')
                    )

            # Plot all records
            gdf.plot(ax=ax, color='blue', markersize=10, label='All Occurrences')

            # Highlight suspicious
            if not suspicious_gdf.empty:
                suspicious_gdf.plot(ax=ax, color='red', markersize=20, label='Suspicious')

            point_handles, _ = ax.get_legend_handles_labels()
            ax.legend(handles=legend_handles + point_handles)

            ax.set_title(f'{species} – Occurrences and Known Range')
            ax.axis('off')
            fig.tight_layout()

            map_path = os.path.join(MAPS_FOLDER, f"{species.replace(' ', '_')}.png")
            fig.savefig(map_path, dpi=300)
        finally:
            # Always release this figure, even if plotting or saving failed.
            plt.close(fig)

        print(f"🗺️ Map saved: {map_path}")

# === Run full pipeline ===
# Steps: flag the loaded occurrences, write the flagged table to OUTPUT_CSV
# (without the index column), then render one map per species into MAPS_FOLDER.
if __name__ == "__main__":
    # df is the module-level table loaded from INPUT_CSV above.
    df_flagged, species_country_map = flag_suspicious_records(df)
    df_flagged.to_csv(OUTPUT_CSV, index=False)
    generate_species_maps(df_flagged, species_country_map)
    print(f"\n✅ Done! Output saved to:\n- CSV: {OUTPUT_CSV}\n- Maps: {MAPS_FOLDER}/")


🔎 Checking species: Epidendrum pseudavicula
✅ Flagged 19 suspicious occurrences

🔎 Checking species: Epidendrum rigidum
✅ Flagged 1049 suspicious occurrences

🔎 Checking species: Epidendrum durum
✅ Flagged 37 suspicious occurrences

🔎 Checking species: Epidendrum secundum
✅ Flagged 1762 suspicious occurrences

🔎 Checking species: Epidendrum vesicatum
✅ Flagged 45 suspicious occurrences

🔎 Checking species: Epidendrum dipus
✅ Flagged 8 suspicious occurrences

🔎 Checking species: Epidendrum caldense
✅ Flagged 15 suspicious occurrences

🔎 Checking species: Epidendrum loefgrenii
✅ Flagged 2 suspicious occurrences

🔎 Checking species: Epidendrum densiflorum
✅ Flagged 159 suspicious occurrences

🔎 Checking species: Epidendrum strobiliferum
✅ Flagged 357 suspicious occurrences

🔎 Checking species: Epidendrum erectum
✅ Flagged 8 suspicious occurrences

🔎 Checking species: Epidendrum megaloclinium
✅ Flagged 13 suspicious occurrences

🔎 Checking species: Epidendrum ibaguense
✅ Flagged 291 suspi