In [1]:
import pandas as pd
import math
import numpy as np
import concurrent.futures

dfTreesDRP = pd.read_csv('TreesPortugueseTerritoryDropped.csv')

def DistanceTwoPoints(lat1, lon1, lat2, lon2):
    R = 6371e3  # Radius of the Earth in meters
    phi1 = math.radians(lat1)
    phi2 = math.radians(lat2)
    delta_phi = math.radians(lat2 - lat1)
    delta_lambda = math.radians(lon2 - lon1)

    a = math.sin(delta_phi / 2)**2 + math.cos(phi1) * math.cos(phi2) * math.sin(delta_lambda / 2)**2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    return R * c  # Distance in meters


def check_district(row, precision=1000):
    district = str(row['district']).lower()
    lat1 = row['latitude']
    lon1 = row['longitude']
    
    # Filter dfTreesDRP based on 'locality' and 'stateProvince'
    filtered_df = dfTreesDRP[(dfTreesDRP['stateProvince'].str.lower() == district)]
    
    # Calculate the distance for each row in the filtered DataFrame
    filtered_df['distance'] = filtered_df.apply(lambda x: DistanceTwoPoints(lat1, lon1, x['decimalLatitude'], x['decimalLongitude']), axis=1)
    
    # Filter the DataFrame based on the distance
    close_points_df = filtered_df[filtered_df['distance'] < precision]
    
    # Get the unique 'scientificName' values, excluding NaN values
    unique_names = close_points_df['scientificName'].dropna().unique()

    # If 'scientificNames' exists in row and is not NaN, append unique names if they don't exist
    if 'scientificNames' in row and pd.notna(row['scientificNames']):
        existing_names = str(row['scientificNames']).split('; ')
        for name in unique_names:
            if name not in existing_names:
                existing_names.append(name)
        return '; '.join(existing_names)
    
    # If 'scientificNames' is NaN, just return the unique names
    return '; '.join(unique_names)

def process_chunkv1(chunk):
    chunk['scientificNames'] = chunk.apply(check_district, axis=1)
    empty_count = (chunk['scientificNames'] == '').sum()
    print(f'Empty count in chunk: {empty_count}')
    non_empty_count = (chunk['scientificNames'] != '').sum()
    print(f'Non empty count in chunk: {non_empty_count}')
    return chunk

In [3]:
for _year in range(2002, 2023):
    dfFires = pd.read_csv(f"DatasetWTrees/PreviousVersions/{_year}_namissing.csv")

    #num_chunks = 20

    print(len(dfFires))

    num_chunks = int(((1/200) * len(dfFires)))

    print(num_chunks)
    # Split the DataFrame into smaller chunks
    chunks = np.array_split(dfFires, num_chunks)

    # Create a ProcessPoolExecutor
    with concurrent.futures.ProcessPoolExecutor(max_workers=5) as executor:
        # Use the executor to map the function to the chunks
        for i, chunk in enumerate(chunks):
            print(f"{_year} - Chunk: ", i)
            chunk = executor.submit(process_chunkv1, chunk).result()
            chunk.to_csv(f'DatasetWTrees/PreviousVersions/check_byCoord/{_year}_chunk_{i}.csv', index=False)
            chunks[i] = chunk

    # Concatenate the chunks back into a single DataFrame
    dfFires = pd.concat(chunks)

    dfFires.to_csv(f'DatasetWTrees/PreviousVersions/{_year}_checkDistrict.csv', index=False)

    print("District")
    empty_count = (dfFires['scientificNames'] == '').sum()
    print(empty_count)

    non_empty_count = (dfFires['scientificNames'] != '').sum()
    print(non_empty_count)

25982
129
2001 - Chunk:  0


  return bound(*args, **kwds)


Empty count in chunk: 0
Non empty count in chunk: 202
2001 - Chunk:  1
Empty count in chunk: 0
Non empty count in chunk: 202
2001 - Chunk:  2
Empty count in chunk: 0
Non empty count in chunk: 202
2001 - Chunk:  3
Empty count in chunk: 0
Non empty count in chunk: 202
2001 - Chunk:  4
Empty count in chunk: 0
Non empty count in chunk: 202
2001 - Chunk:  5
Empty count in chunk: 0
Non empty count in chunk: 202
2001 - Chunk:  6
Empty count in chunk: 1
Non empty count in chunk: 201
2001 - Chunk:  7
Empty count in chunk: 0
Non empty count in chunk: 202
2001 - Chunk:  8
Empty count in chunk: 1
Non empty count in chunk: 201
2001 - Chunk:  9
Empty count in chunk: 0
Non empty count in chunk: 202
2001 - Chunk:  10
Empty count in chunk: 0
Non empty count in chunk: 202
2001 - Chunk:  11
Empty count in chunk: 0
Non empty count in chunk: 202
2001 - Chunk:  12
Empty count in chunk: 0
Non empty count in chunk: 202
2001 - Chunk:  13
Empty count in chunk: 0
Non empty count in chunk: 202
2001 - Chunk:  14
E

  return bound(*args, **kwds)


Empty count in chunk: 0
Non empty count in chunk: 201
2002 - Chunk:  1
Empty count in chunk: 1
Non empty count in chunk: 200
2002 - Chunk:  2
Empty count in chunk: 1
Non empty count in chunk: 200
2002 - Chunk:  3
Empty count in chunk: 0
Non empty count in chunk: 201
2002 - Chunk:  4
Empty count in chunk: 0
Non empty count in chunk: 201
2002 - Chunk:  5
Empty count in chunk: 0
Non empty count in chunk: 201
2002 - Chunk:  6
Empty count in chunk: 0
Non empty count in chunk: 201
2002 - Chunk:  7
Empty count in chunk: 0
Non empty count in chunk: 201
2002 - Chunk:  8
Empty count in chunk: 0
Non empty count in chunk: 201
2002 - Chunk:  9
