In [2]:
import geopandas as gpd
from shapely.geometry import Point
import pandas as pd
from geopandas.tools import sjoin
import math
import numpy as np
import concurrent.futures
import concurrent.futures

dfTreesDRP = pd.read_csv('TreesPortugueseTerritoryDropped.csv')

# Convert dfTreesDRP to a GeoDataFrame
gdf = gpd.GeoDataFrame(dfTreesDRP, geometry=gpd.points_from_xy(dfTreesDRP.decimalLongitude, dfTreesDRP.decimalLatitude))

# Create a spatial index
sindex = gdf.sindex

In [3]:
def DistanceTwoPoints(lat1, lon1, lat2, lon2):
    R = 6371e3  # Radius of the Earth in meters
    phi1 = math.radians(lat1)
    phi2 = math.radians(lat2)
    delta_phi = math.radians(lat2 - lat1)
    delta_lambda = math.radians(lon2 - lon1)

    a = math.sin(delta_phi / 2)**2 + math.cos(phi1) * math.cos(phi2) * math.sin(delta_lambda / 2)**2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    return R * c


def check_byCoord(row, precision=120):
    lat1 = row['latitude']
    lon1 = row['longitude']
    
    # Create a buffer around the point
    point = Point(lon1, lat1)
    buffer = point.buffer(precision)
    
    # Use the spatial index to find points within the buffer
    possible_matches_index = list(sindex.intersection(buffer.bounds))
    possible_matches = gdf.iloc[possible_matches_index]
    precise_matches = possible_matches[possible_matches.intersects(buffer)]
    
    # If there are no close points, return the existing scientificNames or an empty string
    if precise_matches.empty:
        return row['scientificNames'] if 'scientificNames' in row and not pd.isna(row['scientificNames']) else ''
    
    # Get the unique 'scien_yeartificName' values of the 5 closest points
    precise_matches['distance'] = precise_matches.apply(lambda x: DistanceTwoPoints(lat1, lon1, x['decimalLatitude'], x['decimalLongitude']), axis=1)
    close_points_df = precise_matches.sort_values('distance').iloc[:5]
    unique_names = close_points_df['scientificName'].unique()
    
    # If 'scientificNames' exists and is not NaN or empty, append new unique names to it
    if 'scientificNames' in row and not pd.isna(row['scientificNames']) and row['scientificNames'] != '':
        existing_names = row['scientificNames'].split('; ')
        new_names = [name for name in unique_names if name not in existing_names]
        return row['scientificNames'] + '; ' + '; '.join(new_names)
    else:
        return '; '.join(unique_names)

In [None]:
_year = 2023
#dfFires = pd.read_csv(f"Dataset/b{_year}.csv")
dfFires = pd.read_csv(f"DatasetWTrees/PreviousVersions/{_year}_checklocality.csv")
dfTreesDRP = pd.read_csv('TreesPortugueseTerritoryDropped.csv')

num_chunks = 20

# Split the DataFrame into smaller chunks
chunks = np.array_split(dfFires, num_chunks)

# Apply the function to each chunk
for i in range(num_chunks):
    chunks[i]['scientificNames'] = chunks[i].apply(check_byCoord, axis=1)
    empty_count = (chunks[i]['scientificNames'] == '').sum()
    print(f'Empty count in chunk {i}: {empty_count}')
    non_empty_count = (chunks[i]['scientificNames'] != '').sum()
    print(f'Non empty count in chunk {i}: {non_empty_count}')

    chunks[i].to_csv(f'DatasetWTrees/PreviousVersions/check_locality2/{_year}_chunk_{i}.csv', index=False)



dfFires = pd.concat(chunks)


print("Locality")
empty_count = (dfFires['scientificNames'] == '').sum()
print(empty_count)

non_empty_count = (dfFires['scientificNames'] != '').sum()
print(non_empty_count)

In [5]:
_year = 2023
#dfFires = pd.read_csv(f"Dataset/b{_year}.csv")
dfFires = pd.read_csv(f"DatasetWTrees/PreviousVersions/{_year}_checklocality.csv")
dfTreesDRP = pd.read_csv('TreesPortugueseTerritoryDropped.csv')


def process_chunk(chunk):
    chunk['scientificNames'] = chunk.apply(check_byCoord, axis=1)
    empty_count = (chunk['scientificNames'] == '').sum()
    print(f'Empty count in chunk {i}: {empty_count}')
    non_empty_count = (chunk['scientificNames'] != '').sum()
    print(f'Non empty count in chunk {i}: {non_empty_count}')
    return chunk

num_chunks = 20

# Split the DataFrame into smaller chunks
chunks = np.array_split(dfFires, num_chunks)

# Create a ThreadPoolExecutor
with concurrent.futures.ThreadPoolExecutor() as executor:
    # Use the executor to map the function to the chunks
    chunks = list(executor.map(process_chunk, chunks))

# Concatenate the chunks back into a single DataFrame
dfFires = pd.concat(chunks)

  return bound(*args, **kwds)


In [1]:
import concurrent.futures

def process_row(row):
    return check_byCoord(row)

def process_chunk(chunk):
    with concurrent.futures.ThreadPoolExecutor() as executor:
        chunk['scientificNames'] = list(executor.map(process_row, chunk.to_dict('records')))
    empty_count = (chunk['scientificNames'] == '').sum()
    print(f'Empty count in chunk: {empty_count}')
    non_empty_count = (chunk['scientificNames'] != '').sum()
    print(f'Non empty count in chunk: {non_empty_count}')
    return chunk

# Split the DataFrame into smaller chunks
chunks = np.array_split(dfFires, num_chunks)

# Process each chunk sequentially
processed_chunks = [process_chunk(chunk) for chunk in chunks]

# Concatenate the processed chunks back into a single DataFrame
dfFires = pd.concat(processed_chunks)

NameError: name 'np' is not defined