In [1]:
import pandas as pd
import math
import numpy as np
import concurrent.futures

dfTreesDRP = pd.read_csv('TreesPortugueseTerritoryDropped.csv')

In [2]:
def DistanceTwoPoints(lat1, lon1, lat2, lon2):
    R = 6371e3  # Radius of the Earth in meters
    phi1 = math.radians(lat1)
    phi2 = math.radians(lat2)
    delta_phi = math.radians(lat2 - lat1)
    delta_lambda = math.radians(lon2 - lon1)

    a = math.sin(delta_phi / 2)**2 + math.cos(phi1) * math.cos(phi2) * math.sin(delta_lambda / 2)**2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    return R * c  # Distance in meters


def check_locality(row):
    parish = str(row['parish']).lower()
    district = str(row['district']).lower()
    unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) & 
                              (dfTreesDRP['stateProvince'].str.lower() == district)]['scientificName'].unique()
    return '; '.join(unique_names)


def check_locality2(row):
    concelho = str(row['municipality']).lower()
    district = str(row['district']).lower()
    unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(concelho, na=False)) & 
                              (dfTreesDRP['stateProvince'].str.lower().str.contains(district, na=False))]['scientificName'].unique()
    
    if 'scientificNames' in row and not pd.isna(row['scientificNames']) and row['scientificNames'] != '':
        existing_names = row['scientificNames'].split('; ')
        new_names = [name for name in unique_names if name not in existing_names]
        return row['scientificNames'] + '; ' + '; '.join(new_names)
    else:
        return '; '.join(unique_names)

def check_district(row, precision=120):
    if 'scientificNames' in row and row['scientificNames'] != '':
        return row['scientificNames']
    
    district = str(row['district']).lower()
    lat1 = row['latitude']
    lon1 = row['longitude']
    
    # Filter dfTreesDRP based on 'locality' and 'stateProvince'
    filtered_df = dfTreesDRP[(dfTreesDRP['stateProvince'].str.lower() == district)]
    
    # Calculate the distance for each row in the filtered DataFrame
    filtered_df['distance'] = filtered_df.apply(lambda x: DistanceTwoPoints(lat1, lon1, x['decimalLatitude'], x['decimalLongitude']), axis=1)
    
    # Filter the DataFrame based on the distance
    close_points_df = filtered_df[filtered_df['distance'] < precision]
    
    # Get the unique 'scientificName' values
    unique_names = close_points_df['scientificName'].unique()
    
    return '; '.join(unique_names)




def check_districtCoordSingular(row, precision=500):
    if 'scientificNames' in row and row['scientificNames'] != '':
        return row['scientificNames']
    
    district = str(row['district']).lower()
    lat1 = row['latitude']
    lon1 = row['longitude']
    
    # Filter dfTreesDRP based on 'locality' and 'stateProvince'
    filtered_df = dfTreesDRP[(dfTreesDRP['stateProvince'].str.lower() == district)]
    
    # Calculate the distance for each row in the filtered DataFrame
    filtered_df['distance'] = filtered_df.apply(lambda x: DistanceTwoPoints(lat1, lon1, x['decimalLatitude'], x['decimalLongitude']), axis=1)
    
    # Filter the DataFrame based on the distance
    close_points_df = filtered_df[filtered_df['distance'] < precision]
    
    # If there are no close points, return an empty list
    if close_points_df.empty:
        return []
    
    # Sort the DataFrame by the 'distance' column
    close_points_df = close_points_df.sort_values('distance')
    
    # Get the 'scientificName' and 'distance' of the 5 closest points
    closest_points = close_points_df.iloc[:5][['scientificName', 'distance']].apply(tuple, axis=1).tolist()
    
    return closest_points


def check_districtCoord(row, precision=500):
    if 'scientificNames' in row and row['scientificNames'] != '':
        return row['scientificNames']
    
    district = str(row['district']).lower()
    lat1 = row['latitude']
    lon1 = row['longitude']
    
    # Filter dfTreesDRP based on 'locality' and 'stateProvince'
    filtered_df = dfTreesDRP[(dfTreesDRP['stateProvince'].str.lower().str == district)]
    
    # Calculate the distance for each row in the filtered DataFrame
    filtered_df['distance'] = filtered_df.apply(lambda x: DistanceTwoPoints(lat1, lon1, x['decimalLatitude'], x['decimalLongitude']), axis=1)
    
    # Filter the DataFrame based on the distance
    close_points_df = filtered_df[filtered_df['distance'] < precision]
    
    # If there are no close points, return an empty string
    if close_points_df.empty:
        return ''
    
    # Sort the DataFrame by the 'distance' column
    close_points_df = close_points_df.sort_values('distance')
    
    # Get the unique 'scientificName' values of the 5 closest points
    unique_names = close_points_df.iloc[:5]['scientificName'].unique()
    
    return '; '.join(unique_names)


from scipy.spatial import KDTree

# Create a KDTree from the coordinates in dfTreesDRP
tree = KDTree(dfTreesDRP[['decimalLatitude', 'decimalLongitude']])

def check_byCoord(row, precision=120):
    lat1 = row['latitude']
    lon1 = row['longitude']
    
    # Query the tree for the indices of the points within 'precision' distance
    indices = tree.query_ball_point([lat1, lon1], precision)
    
    # If there are no close points, return the existing scientificNames or an empty string
    if not indices:
        return row['scientificNames'] if 'scientificNames' in row and not pd.isna(row['scientificNames']) else ''
    
    # Get the unique 'scientificName' values of the 5 closest points
    close_points_df = dfTreesDRP.iloc[indices]
    close_points_df['distance'] = close_points_df.apply(lambda x: DistanceTwoPoints(lat1, lon1, x['decimalLatitude'], x['decimalLongitude']), axis=1)
    close_points_df = close_points_df.sort_values('distance').iloc[:5]
    unique_names = close_points_df['scientificName'].unique()
    
    # If 'scientificNames' exists and is not NaN or empty, append new unique names to it
    if 'scientificNames' in row and not pd.isna(row['scientificNames']) and row['scientificNames'] != '':
        existing_names = row['scientificNames'].split('; ')
        new_names = [name for name in unique_names if name not in existing_names]
        return row['scientificNames'] + '; ' + '; '.join(new_names)
    else:
        return '; '.join(unique_names)


def checkNearestPoint(row):
    if 'scientificNames' in row and row['scientificNames'] != '':
        return row['scientificNames']
    
    district = str(row['district']).lower()
    lat1 = row['latitude']
    lon1 = row['longitude']

    filtered_df = dfTreesDRP[(dfTreesDRP['stateProvince'].str.lower().str == district)]

    filtered_df['distance'] = filtered_df.apply(lambda x: DistanceTwoPoints(lat1, lon1, x['decimalLatitude'], x['decimalLongitude']), axis=1)

    filtered_df = filtered_df.sort_values('distance')
    
    # Get the 'scientificName' and 'distance' of the nearest point
    nearest_name = filtered_df.iloc[0]['scientificName']
    nearest_distance = filtered_df.iloc[0]['distance']
    
    return nearest_name, nearest_distance


def checkNearestPointCoord(row, precision=120):
    if 'scientificNames' in row and row['scientificNames'] != '':
        return row['scientificNames']
    
    lat1 = row['latitude']
    lon1 = row['longitude']
    
    
    # Calculate the distance for each row in the filtered DataFrame
    dfTreesDRP['distance'] = dfTreesDRP.apply(lambda x: DistanceTwoPoints(lat1, lon1, x['decimalLatitude'], x['decimalLongitude']), axis=1)
    
    # Filter the DataFrame based on the distance
    close_points_df = dfTreesDRP[dfTreesDRP['distance'] < precision]
    
    # If there are no close points, return an empty string
    if close_points_df.empty:
        return ''
    
    # Sort the DataFrame by the 'distance' column
    close_points_df = close_points_df.sort_values('distance')
    
    # Get the unique 'scientificName' values of the 5 closest points
    unique_names = close_points_df.iloc[:5]['scientificName'].unique()
    
    return '; '.join(unique_names)

In [35]:
_year = 2022
dfFires = pd.read_csv(f"Dataset/b{_year}.csv")
dfTreesDRP = pd.read_csv('TreesPortugueseTerritoryDropped.csv')
print(len(dfFires))


num_chunks = 20

# Split the DataFrame into smaller chunks
chunks = np.array_split(dfFires, num_chunks)

# Apply the function to each chunk
for i in range(num_chunks):
    chunks[i]['scientificNames'] = chunks[i].apply(check_locality, axis=1)
    empty_count = (chunks[i]['scientificNames'] == '').sum()
    print(f'Empty count in chunk {i}: {empty_count}')
    non_empty_count = (chunks[i]['scientificNames'] != '').sum()
    print(f'Non empty count in chunk {i}: {non_empty_count}')
from geopandas.tools import sjoin
    chunks[i].to_csv(f'DatasetWTrees/PreviousVersions/check_locality/{_year}_chunk_{i}.csv', index=False)



dfFires = pd.concat(chunks)



print("Locality")
empty_count = (dfFires['scientificNames'] == '').sum()
print(empty_count)

non_empty_count = (dfFires['scientificNames'] != '').sum()
print(non_empty_count)

4040


  return bound(*args, **kwds)


Empty count in chunk 0: 200
Non empty count in chunk 0: 2


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 1: 195
Non empty count in chunk 1: 7


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 2: 201
Non empty count in chunk 2: 1


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 3: 135
Non empty count in chunk 3: 67


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 4: 105
Non empty count in chunk 4: 97


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 5: 107
Non empty count in chunk 5: 95


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 6: 158
Non empty count in chunk 6: 44


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 7: 168
Non empty count in chunk 7: 34


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 8: 165
Non empty count in chunk 8: 37


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 9: 148
Non empty count in chunk 9: 54


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 10: 154
Non empty count in chunk 10: 48


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 11: 162
Non empty count in chunk 11: 40


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 12: 151
Non empty count in chunk 12: 51


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 13: 157
Non empty count in chunk 13: 45


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 14: 156
Non empty count in chunk 14: 46


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 15: 158
Non empty count in chunk 15: 44


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 16: 148
Non empty count in chunk 16: 54


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 17: 169
Non empty count in chunk 17: 33


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 18: 180
Non empty count in chunk 18: 22


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 19: 176
Non empty count in chunk 19: 26
Locality
3193
847


In [36]:
dfFires.to_csv(f'DatasetWTrees/PreviousVersions/{_year}_checklocality.csv', index=False)

In [37]:
#1916
#583

_year = 2022
#dfFires = pd.read_csv(f"Dataset/b{_year}.csv")
dfFires = pd.read_csv(f"DatasetWTrees/PreviousVersions/{_year}_checklocality.csv")
dfTreesDRP = pd.read_csv('TreesPortugueseTerritoryDropped.csv')

num_chunks = 20

# Split the DataFrame into smaller chunks
chunks = np.array_split(dfFires, num_chunks)

# Apply the function to each chunk
for i in range(num_chunks):
    chunks[i]['scientificNames'] = chunks[i].apply(check_locality2, axis=1)
    empty_count = (chunks[i]['scientificNames'] == '').sum()
    print(f'Empty count in chunk {i}: {empty_count}')
    non_empty_count = (chunks[i]['scientificNames'] != '').sum()
    print(f'Non empty count in chunk {i}: {non_empty_count}')

    chunks[i].to_csv(f'DatasetWTrees/PreviousVersions/check_locality2/{_year}_chunk_{i}.csv', index=False)



dfFires = pd.concat(chunks)



print("Locality")
empty_count = (dfFires['scientificNames'] == '').sum()
print(empty_count)

non_empty_count = (dfFires['scientificNames'] != '').sum()
print(non_empty_count)

  return bound(*args, **kwds)


Empty count in chunk 0: 90
Non empty count in chunk 0: 35
Empty count in chunk 1: 85
Non empty count in chunk 1: 40
Empty count in chunk 2: 91
Non empty count in chunk 2: 34
Empty count in chunk 3: 85
Non empty count in chunk 3: 40
Empty count in chunk 4: 85
Non empty count in chunk 4: 40
Empty count in chunk 5: 102
Non empty count in chunk 5: 23


KeyboardInterrupt: 

In [None]:
dfFires.to_csv(f'DatasetWTrees/PreviousVersions/{_year}_checklocality2.csv', index=False)

In [38]:
for _year in range(2013, 2023):
    dfFires = pd.read_csv(f"Dataset/b{_year}.csv")
    dfTreesDRP = pd.read_csv('TreesPortugueseTerritoryDropped.csv')
    print(len(dfFires))


    num_chunks = 20

    # Split the DataFrame into smaller chunks
    chunks = np.array_split(dfFires, num_chunks)

    # Apply the function to each chunk
    for i in range(num_chunks):
        chunks[i]['scientificNames'] = chunks[i].apply(check_locality, axis=1)
        empty_count = (chunks[i]['scientificNames'] == '').sum()
        print(f'Empty count in chunk {i}: {empty_count}')
        non_empty_count = (chunks[i]['scientificNames'] != '').sum()
        print(f'Non empty count in chunk {i}: {non_empty_count}')

        chunks[i].to_csv(f'DatasetWTrees/PreviousVersions/check_locality/{_year}_chunk_{i}.csv', index=False)



    dfFires = pd.concat(chunks)



    print("Locality")
    empty_count = (dfFires['scientificNames'] == '').sum()
    print(empty_count)

    non_empty_count = (dfFires['scientificNames'] != '').sum()
    print(non_empty_count)

    dfFires.to_csv(f'DatasetWTrees/PreviousVersions/{_year}_checklocality.csv', index=False)


    dfFires = pd.read_csv(f"DatasetWTrees/PreviousVersions/{_year}_checklocality.csv")

    num_chunks = 20

    # Split the DataFrame into smaller chunks
    chunks = np.array_split(dfFires, num_chunks)

    # Apply the function to each chunk
    for i in range(num_chunks):
        chunks[i]['scientificNames'] = chunks[i].apply(check_locality2, axis=1)
        empty_count = (chunks[i]['scientificNames'] == '').sum()
        print(f'Empty count in chunk {i}: {empty_count}')
        non_empty_count = (chunks[i]['scientificNames'] != '').sum()
        print(f'Non empty count in chunk {i}: {non_empty_count}')

        chunks[i].to_csv(f'DatasetWTrees/PreviousVersions/check_locality2/{_year}_chunk_{i}.csv', index=False)



    dfFires = pd.concat(chunks)



    print("Locality")
    empty_count = (dfFires['scientificNames'] == '').sum()
    print(empty_count)

    non_empty_count = (dfFires['scientificNames'] != '').sum()
    print(non_empty_count)

    dfFires.to_csv(f'DatasetWTrees/PreviousVersions/{_year}_checklocality2.csv', index=False)

11899


  return bound(*args, **kwds)
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 0: 395
Non empty count in chunk 0: 200


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 1: 470
Non empty count in chunk 1: 125


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 2: 562
Non empty count in chunk 2: 33


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 3: 579
Non empty count in chunk 3: 16


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 4: 502
Non empty count in chunk 4: 93


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 5: 461
Non empty count in chunk 5: 134


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 6: 401
Non empty count in chunk 6: 194


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 7: 391
Non empty count in chunk 7: 204


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 8: 388
Non empty count in chunk 8: 207


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 9: 441
Non empty count in chunk 9: 154


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 10: 424
Non empty count in chunk 10: 171


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 11: 423
Non empty count in chunk 11: 172


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 12: 540
Non empty count in chunk 12: 55


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 13: 548
Non empty count in chunk 13: 47


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 14: 391
Non empty count in chunk 14: 204


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 15: 183
Non empty count in chunk 15: 412


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 16: 346
Non empty count in chunk 16: 249


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 17: 278
Non empty count in chunk 17: 317


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 18: 552
Non empty count in chunk 18: 43


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 19: 568
Non empty count in chunk 19: 26
Locality
8843
3056


  return bound(*args, **kwds)


Empty count in chunk 0: 383
Non empty count in chunk 0: 212
Empty count in chunk 1: 450
Non empty count in chunk 1: 145
Empty count in chunk 2: 562
Non empty count in chunk 2: 33
Empty count in chunk 3: 579
Non empty count in chunk 3: 16
Empty count in chunk 4: 466
Non empty count in chunk 4: 129
Empty count in chunk 5: 458
Non empty count in chunk 5: 137
Empty count in chunk 6: 400
Non empty count in chunk 6: 195
Empty count in chunk 7: 390
Non empty count in chunk 7: 205
Empty count in chunk 8: 387
Non empty count in chunk 8: 208
Empty count in chunk 9: 440
Non empty count in chunk 9: 155
Empty count in chunk 10: 416
Non empty count in chunk 10: 179
Empty count in chunk 11: 411
Non empty count in chunk 11: 184
Empty count in chunk 12: 540
Non empty count in chunk 12: 55
Empty count in chunk 13: 548
Non empty count in chunk 13: 47
Empty count in chunk 14: 391
Non empty count in chunk 14: 204
Empty count in chunk 15: 182
Non empty count in chunk 15: 413
Empty count in chunk 16: 343
Non

  return bound(*args, **kwds)
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 0: 116
Non empty count in chunk 0: 76


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 1: 158
Non empty count in chunk 1: 34


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 2: 170
Non empty count in chunk 2: 22


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 3: 187
Non empty count in chunk 3: 5


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 4: 165
Non empty count in chunk 4: 27


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 5: 155
Non empty count in chunk 5: 37


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 6: 164
Non empty count in chunk 6: 28


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 7: 147
Non empty count in chunk 7: 45


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 8: 161
Non empty count in chunk 8: 31


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 9: 160
Non empty count in chunk 9: 32


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 10: 183
Non empty count in chunk 10: 9


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 11: 175
Non empty count in chunk 11: 17


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 12: 87
Non empty count in chunk 12: 105


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 13: 69
Non empty count in chunk 13: 122


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 14: 84
Non empty count in chunk 14: 107


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 15: 127
Non empty count in chunk 15: 64


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 16: 58
Non empty count in chunk 16: 133


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 17: 105
Non empty count in chunk 17: 86


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 18: 164
Non empty count in chunk 18: 27


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 19: 179
Non empty count in chunk 19: 12
Locality
2814
1019


  return bound(*args, **kwds)


Empty count in chunk 0: 109
Non empty count in chunk 0: 83
Empty count in chunk 1: 154
Non empty count in chunk 1: 38
Empty count in chunk 2: 170
Non empty count in chunk 2: 22
Empty count in chunk 3: 179
Non empty count in chunk 3: 13
Empty count in chunk 4: 153
Non empty count in chunk 4: 39
Empty count in chunk 5: 154
Non empty count in chunk 5: 38
Empty count in chunk 6: 145
Non empty count in chunk 6: 47
Empty count in chunk 7: 132
Non empty count in chunk 7: 60
Empty count in chunk 8: 155
Non empty count in chunk 8: 37
Empty count in chunk 9: 151
Non empty count in chunk 9: 41
Empty count in chunk 10: 183
Non empty count in chunk 10: 9
Empty count in chunk 11: 175
Non empty count in chunk 11: 17


In [33]:
#1732
#767


_year = 2023
#dfFires = pd.read_csv(f"Dataset/b{_year}.csv")
dfFires = pd.read_csv(f"DatasetWTrees/PreviousVersions/{_year}_checklocality2.csv")
dfTreesDRP = pd.read_csv('TreesPortugueseTerritoryDropped.csv')

num_chunks = 20

# Split the DataFrame into smaller chunks
chunks = np.array_split(dfFires, num_chunks)

print("here")

# Apply the function to each chunk
for i in range(num_chunks):
    chunks[i]['scientificNames'] = chunks[i].apply(check_byCoord, axis=1)
    empty_count = (chunks[i]['scientificNames'] == '').sum()
    print(f'Empty count in chunk {i}: {empty_count}')
    non_empty_count = (chunks[i]['scientificNames'] != '').sum()
    print(f'Non empty count in chunk {i}: {non_empty_count}')

    chunks[i].to_csv(f'DatasetWTrees/PreviousVersions/check_byCoord/{_year}_chunk_{i}.csv', index=False)



dfFires = pd.concat(chunks)



print("Locality")
empty_count = (dfFires['scientificNames'] == '').sum()
print(empty_count)

non_empty_count = (dfFires['scientificNames'] != '').sum()
print(non_empty_count)

  return bound(*args, **kwds)


here
Empty count in chunk 0: 0
Non empty count in chunk 0: 125


KeyboardInterrupt: 

In [None]:
def process_year(_year):
    dfFires = pd.read_csv(f"Dataset/b{_year}.csv")
    dfTreesDRP = pd.read_csv('TreesPortugueseTerritoryDropped.csv')
    print(len(dfFires))

    num_chunks = 20

    # Split the DataFrame into smaller chunks
    chunks = np.array_split(dfFires, num_chunks)

    # Apply the function to each chunk
    for i in range(num_chunks):
        chunks[i]['scientificNames'] = chunks[i].apply(check_locality, axis=1)
        empty_count = (chunks[i]['scientificNames'] == '').sum()
        print(f'Empty count in chunk {i}: {empty_count}')
        non_empty_count = (chunks[i]['scientificNames'] != '').sum()
        print(f'Non empty count in chunk {i}: {non_empty_count}')

    dfFires = pd.concat(chunks)

    print("Locality")
    empty_count = (dfFires['scientificNames'] == '').sum()
    print(empty_count)

    non_empty_count = (dfFires['scientificNames'] != '').sum()
    print(non_empty_count)

if __name__ == "__main__":
    with Pool() as p:
        p.map(process_year, [2023, 2024])

In [3]:
import concurrent.futures

dfTreesDRP = pd.read_csv('TreesPortugueseTerritoryDropped.csv')

def process_chunkv1(chunk):
    chunk['scientificNames'] = chunk.apply(check_locality, axis=1)
    empty_count = (chunk['scientificNames'] == '').sum()
    print(f'Empty count in chunk: {empty_count}')
    non_empty_count = (chunk['scientificNames'] != '').sum()
    print(f'Non empty count in chunk: {non_empty_count}')
    return chunk

def process_chunkv2(chunk):
    chunk['scientificNames'] = chunk.apply(check_locality2, axis=1)
    empty_count = (chunk['scientificNames'] == '').sum()
    print(f'Empty count in chunk: {empty_count}')
    non_empty_count = (chunk['scientificNames'] != '').sum()
    print(f'Non empty count in chunk: {non_empty_count}')
    return chunk

for _year in range(2021, 2023):

    if ((_year != 2022) or (_year != 2021)):

        dfFires = pd.read_csv(f"Dataset/b{_year}.csv")
        num_chunks = 20

        # Split the DataFrame into smaller chunks
        chunks = np.array_split(dfFires, num_chunks)

        # Create a ProcessPoolExecutor
        with concurrent.futures.ProcessPoolExecutor(max_workers=5) as executor:
            # Use the executor to map the function to the chunks
            chunks = list(executor.map(process_chunkv1, chunks))

        # Concatenate the chunks back into a single DataFrame
        dfFires = pd.concat(chunks)

        dfFires.to_csv(f'DatasetWTrees/PreviousVersions/{_year}_checklocality.csv', index=False)

        print("Locality")
        empty_count = (dfFires['scientificNames'] == '').sum()
        print(empty_count)

        non_empty_count = (dfFires['scientificNames'] != '').sum()
        print(non_empty_count)

    dfFires = pd.read_csv(f"DatasetWTrees/PreviousVersions/{_year}_checklocality.csv")

    num_chunks = 20

    # Split the DataFrame into smaller chunks
    chunks = np.array_split(dfFires, num_chunks)

    with concurrent.futures.ProcessPoolExecutor(max_workers=5) as executor:
        # Use the executor to map the function to the chunks
        chunks = list(executor.map(process_chunkv2, chunks))

    dfFires = pd.concat(chunks)

    dfFires.to_csv(f'DatasetWTrees/PreviousVersions/{_year}_checklocality2.csv', index=False)

    print("Locality")
    empty_count = (dfFires['scientificNames'] == '').sum()
    print(empty_count)

    non_empty_count = (dfFires['scientificNames'] != '').sum()
    print(non_empty_count)

  return bound(*args, **kwds)
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk: 370
Non empty count in chunk: 19
Empty count in chunk: 316
Non empty count in chunk: 73
Empty count in chunk: 274
Non empty count in chunk: 116


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk: 359
Non empty count in chunk: 30


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk: 310
Non empty count in chunk: 80


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk: 273
Non empty count in chunk: 116
Empty count in chunk: 268
Non empty count in chunk: 121
Empty count in chunk: 274
Non empty count in chunk: 115
Empty count in chunk: 283
Non empty count in chunk: 106


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk: 309
Non empty count in chunk: 80


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk: 272
Non empty count in chunk: 117


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk: 305
Non empty count in chunk: 84
Empty count in chunk: 315
Non empty count in chunk: 74
Empty count in chunk: 372
Non empty count in chunk: 17
Empty count in chunk: 355
Non empty count in chunk: 34


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk: 379
Non empty count in chunk: 10
Empty count in chunk: 230
Non empty count in chunk: 159
Empty count in chunk: 217
Non empty count in chunk: 172
Empty count in chunk: 331
Non empty count in chunk: 58
Empty count in chunk: 358
Non empty count in chunk: 31
Locality
6170
1612


  return bound(*args, **kwds)


Empty count in chunk: 286
Non empty count in chunk: 103
Empty count in chunk: 266
Non empty count in chunk: 124
Empty count in chunk: 300
Non empty count in chunk: 90
Empty count in chunk: 359
Non empty count in chunk: 30
Empty count in chunk: 343
Non empty count in chunk: 46
Empty count in chunk: 244
Non empty count in chunk: 145
Empty count in chunk: 250
Non empty count in chunk: 139
Empty count in chunk: 256
Non empty count in chunk: 133
Empty count in chunk: 275
Non empty count in chunk: 114
Empty count in chunk: 249
Non empty count in chunk: 140
Empty count in chunk: 262
Non empty count in chunk: 127
Empty count in chunk: 282
Non empty count in chunk: 107
Empty count in chunk: 262
Non empty count in chunk: 127
Empty count in chunk: 355
Non empty count in chunk: 34
Empty count in chunk: 372
Non empty count in chunk: 17
Empty count in chunk: 230
Non empty count in chunk: 159
Empty count in chunk: 379
Non empty count in chunk: 10
Empty count in chunk: 214
Non empty count in chunk: 17

  return bound(*args, **kwds)
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk: 441
Non empty count in chunk: 80
Empty count in chunk: 396
Non empty count in chunk: 125
Empty count in chunk: 492
Non empty count in chunk: 29


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk: 500
Non empty count in chunk: 21


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk: 328
Non empty count in chunk: 193


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk: 447
Non empty count in chunk: 74
Empty count in chunk: 431
Non empty count in chunk: 90


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk: 412
Non empty count in chunk: 109
Empty count in chunk: 380
Non empty count in chunk: 141


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk: 389
Non empty count in chunk: 132


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk: 373
Non empty count in chunk: 148
Empty count in chunk: 362
Non empty count in chunk: 159


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk: 426
Non empty count in chunk: 94
Empty count in chunk: 490
Non empty count in chunk: 30


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk: 423
Non empty count in chunk: 97


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk: 499
Non empty count in chunk: 21
Empty count in chunk: 334
Non empty count in chunk: 186
Empty count in chunk: 276
Non empty count in chunk: 244
Empty count in chunk: 405
Non empty count in chunk: 115
Empty count in chunk: 489
Non empty count in chunk: 31
Locality
8293
2119


  return bound(*args, **kwds)


Empty count in chunk: 322
Non empty count in chunk: 199
Empty count in chunk: 484
Non empty count in chunk: 37
Empty count in chunk: 492
Non empty count in chunk: 29
Empty count in chunk: 380
Non empty count in chunk: 141
Empty count in chunk: 430
Non empty count in chunk: 91
Empty count in chunk: 369
Non empty count in chunk: 152
Empty count in chunk: 320
Non empty count in chunk: 201
Empty count in chunk: 370
Non empty count in chunk: 151
Empty count in chunk: 348
Non empty count in chunk: 173
Empty count in chunk: 385
Non empty count in chunk: 136
Empty count in chunk: 402
Non empty count in chunk: 118
Empty count in chunk: 320
Non empty count in chunk: 201
Empty count in chunk: 333
Non empty count in chunk: 188
Empty count in chunk: 380
Non empty count in chunk: 140
Empty count in chunk: 490
Non empty count in chunk: 30
Empty count in chunk: 330
Non empty count in chunk: 190
Empty count in chunk: 275
Non empty count in chunk: 245
Empty count in chunk: 499
Non empty count in chunk: 

  return bound(*args, **kwds)
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk: 268
Non empty count in chunk: 10
Empty count in chunk: 257
Non empty count in chunk: 21
Empty count in chunk: 216
Non empty count in chunk: 62
Empty count in chunk: 255
Non empty count in chunk: 23
Empty count in chunk: 162
Non empty count in chunk: 116


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk: 226
Non empty count in chunk: 52
Empty count in chunk: 228
Non empty count in chunk: 50
Empty count in chunk: 211
Non empty count in chunk: 67
Empty count in chunk: 214
Non empty count in chunk: 64


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk: 198
Non empty count in chunk: 80


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk: 225
Non empty count in chunk: 53
Empty count in chunk: 265
Non empty count in chunk: 13
Empty count in chunk: 205
Non empty count in chunk: 73
Empty count in chunk: 232
Non empty count in chunk: 46


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk: 271
Non empty count in chunk: 7


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk: 231
Non empty count in chunk: 47
Empty count in chunk: 114Empty count in chunk: 133

Non empty count in chunk: 164Non empty count in chunk: 145

Empty count in chunk: 215
Non empty count in chunk: 63
Empty count in chunk: 268
Non empty count in chunk: 9
Locality
4394
1165


  return bound(*args, **kwds)


Empty count in chunk: 151
Non empty count in chunk: 127
Empty count in chunk: 255
Non empty count in chunk: 23
Empty count in chunk: 231
Non empty count in chunk: 47
Empty count in chunk: 206
Non empty count in chunk: 72
Empty count in chunk: 257
Non empty count in chunk: 21
Empty count in chunk: 188
Non empty count in chunk: 90
Empty count in chunk: 165
Non empty count in chunk: 113
Empty count in chunk: 161
Non empty count in chunk: 117
Empty count in chunk: 157
Non empty count in chunk: 121
Empty count in chunk: 186
Non empty count in chunk: 92
Empty count in chunk: 225
Non empty count in chunk: 53
Empty count in chunk: 160
Non empty count in chunk: 118
Empty count in chunk: 265
Non empty count in chunk: 13
Empty count in chunk: 182
Non empty count in chunk: 96
Empty count in chunk: 271
Non empty count in chunk: 7
Empty count in chunk: 229
Non empty count in chunk: 49
Empty count in chunk: 107
Non empty count in chunk: 171
Empty count in chunk: 148
Non empty count in chunk: 130
Empt

  return bound(*args, **kwds)
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk: 191
Non empty count in chunk: 11
Empty count in chunk: 123
Non empty count in chunk: 79
Empty count in chunk: 126
Non empty count in chunk: 76
Empty count in chunk: 171
Non empty count in chunk: 31
Empty count in chunk: 195
Non empty count in chunk: 7


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk: 154
Non empty count in chunk: 48
Empty count in chunk: 176
Non empty count in chunk: 26


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk: 168
Non empty count in chunk: 34
Empty count in chunk: 154
Non empty count in chunk: 48
Empty count in chunk: 159
Non empty count in chunk: 43


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk: 168
Non empty count in chunk: 34
Empty count in chunk: 185
Non empty count in chunk: 17
Empty count in chunk: 174
Non empty count in chunk: 28


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk: 159
Non empty count in chunk: 43
Empty count in chunk: 202
Non empty count in chunk: 0


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk: 84
Non empty count in chunk: 118
Empty count in chunk: 95
Non empty count in chunk: 107
Empty count in chunk: 132
Non empty count in chunk: 70
Empty count in chunk: 136
Non empty count in chunk: 66
Empty count in chunk: 197
Non empty count in chunk: 5
Locality
3149
891


  return bound(*args, **kwds)


Empty count in chunk: 195
Non empty count in chunk: 7
Empty count in chunk: 176
Non empty count in chunk: 26
Empty count in chunk: 121
Non empty count in chunk: 81
Empty count in chunk: 171
Non empty count in chunk: 31
Empty count in chunk: 122
Non empty count in chunk: 80
Empty count in chunk: 158
Non empty count in chunk: 44
Empty count in chunk: 111
Non empty count in chunk: 91
Empty count in chunk: 140
Non empty count in chunk: 62
Empty count in chunk: 112
Non empty count in chunk: 90
Empty count in chunk: 122
Non empty count in chunk: 80
Empty count in chunk: 185
Non empty count in chunk: 17
Empty count in chunk: 129
Non empty count in chunk: 73
Empty count in chunk: 157
Non empty count in chunk: 45
Empty count in chunk: 134
Non empty count in chunk: 68
Empty count in chunk: 202
Non empty count in chunk: 0
Empty count in chunk: 131
Non empty count in chunk: 71
Empty count in chunk: 93
Non empty count in chunk: 109
Empty count in chunk: 82
Non empty count in chunk: 120
Empty count 

  return bound(*args, **kwds)
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk: 128
Non empty count in chunk: 70
Empty count in chunk: 173
Non empty count in chunk: 25
Empty count in chunk: 186
Non empty count in chunk: 12
Empty count in chunk: 174
Non empty count in chunk: 24
Empty count in chunk: 167
Non empty count in chunk: 31


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk: 178
Non empty count in chunk: 20
Empty count in chunk: 166
Non empty count in chunk: 32
Empty count in chunk: 174
Non empty count in chunk: 24


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk: 163
Non empty count in chunk: 35
Empty count in chunk: 179
Non empty count in chunk: 19


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk: 170
Non empty count in chunk: 28
Empty count in chunk: 150
Non empty count in chunk: 48


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk: 153
Non empty count in chunk: 45


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk: 178
Non empty count in chunk: 19
Empty count in chunk: 162
Non empty count in chunk: 35


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk: 143
Non empty count in chunk: 54
Empty count in chunk: 195
Non empty count in chunk: 2
Empty count in chunk: 99
Non empty count in chunk: 98
Empty count in chunk: 188
Non empty count in chunk: 9
Empty count in chunk: 144
Non empty count in chunk: 53
Locality
3270
683


  return bound(*args, **kwds)


Empty count in chunk: 126
Non empty count in chunk: 72
Empty count in chunk: 129
Non empty count in chunk: 69
Empty count in chunk: 160
Non empty count in chunk: 38
Empty count in chunk: 185
Non empty count in chunk: 13
Empty count in chunk: 116
Non empty count in chunk: 82
Empty count in chunk: 116
Non empty count in chunk: 82
Empty count in chunk: 100
Non empty count in chunk: 98
Empty count in chunk: 113
Non empty count in chunk: 85
Empty count in chunk: 113
Non empty count in chunk: 85
Empty count in chunk: 130
Non empty count in chunk: 68
Empty count in chunk: 132
Non empty count in chunk: 66
Empty count in chunk: 130
Non empty count in chunk: 68
Empty count in chunk: 178
Non empty count in chunk: 19
Empty count in chunk: 132
Non empty count in chunk: 66
Empty count in chunk: 141
Non empty count in chunk: 56
Empty count in chunk: 181
Non empty count in chunk: 16
Empty count in chunk: 195
Non empty count in chunk: 2
Empty count in chunk: 120
Non empty count in chunk: 77
Empty count

  return bound(*args, **kwds)
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk: 93
Non empty count in chunk: 38
Empty count in chunk: 130
Non empty count in chunk: 1
Empty count in chunk: 78
Non empty count in chunk: 53
Empty count in chunk: 114
Non empty count in chunk: 17
Empty count in chunk: 107
Non empty count in chunk: 24


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk: 90
Non empty count in chunk: 41
Empty count in chunk: 101
Non empty count in chunk: 30
Empty count in chunk: 89
Non empty count in chunk: 42
Empty count in chunk: 94
Non empty count in chunk: 37
Empty count in chunk: 99
Non empty count in chunk: 32


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk: 83
Non empty count in chunk: 47
Empty count in chunk: 118
Non empty count in chunk: 12
Empty count in chunk: 121
Non empty count in chunk: 9
Empty count in chunk: 94
Non empty count in chunk: 36
Empty count in chunk: 69
Non empty count in chunk: 61


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk: 71
Non empty count in chunk: 59
Empty count in chunk: 70
Non empty count in chunk: 60
Empty count in chunk: 106
Non empty count in chunk: 24
Empty count in chunk: 72
Non empty count in chunk: 58
Empty count in chunk: 121
Non empty count in chunk: 9
Locality
1920
690


  return bound(*args, **kwds)


In [None]:
import concurrent.futures

dfTreesDRP = pd.read_csv('TreesPortugueseTerritoryDropped.csv')

def process_chunkv1(chunk):
    chunk['scientificNames'] = chunk.apply(check_locality, axis=1)
    empty_count = (chunk['scientificNames'] == '').sum()
    print(f'Empty count in chunk: {empty_count}')
    non_empty_count = (chunk['scientificNames'] != '').sum()
    print(f'Non empty count in chunk: {non_empty_count}')
    return chunk

def process_chunkv2(chunk):
    chunk['scientificNames'] = chunk.apply(check_locality2, axis=1)
    empty_count = (chunk['scientificNames'] == '').sum()
    print(f'Empty count in chunk: {empty_count}')
    non_empty_count = (chunk['scientificNames'] != '').sum()
    print(f'Non empty count in chunk: {non_empty_count}')
    return chunk

for _year in range(2001, 2013):

    if _year != 2022:

        dfFires = pd.read_csv(f"Dataset/b{_year}.csv")
        num_chunks = 20

        # Split the DataFrame into smaller chunks
        chunks = np.array_split(dfFires, num_chunks)

        # Create a ProcessPoolExecutor
        with concurrent.futures.ProcessPoolExecutor(max_workers=5) as executor:
            # Use the executor to map the function to the chunks
            chunks = list(executor.map(process_chunkv1, chunks))

        # Concatenate the chunks back into a single DataFrame
        dfFires = pd.concat(chunks)

        dfFires.to_csv(f'DatasetWTrees/PreviousVersions/{_year}_checklocality.csv', index=False)

        print("Locality")
        empty_count = (dfFires['scientificNames'] == '').sum()
        print(empty_count)

        non_empty_count = (dfFires['scientificNames'] != '').sum()
        print(non_empty_count)

    dfFires = pd.read_csv(f"DatasetWTrees/PreviousVersions/{_year}_checklocality.csv")

    num_chunks = 20

    # Split the DataFrame into smaller chunks
    chunks = np.array_split(dfFires, num_chunks)

    with concurrent.futures.ProcessPoolExecutor(max_workers=5) as executor:
        # Use the executor to map the function to the chunks
        chunks = list(executor.map(process_chunkv2, chunks))

    dfFires = pd.concat(chunks)

    dfFires.to_csv(f'DatasetWTrees/PreviousVersions/{_year}_checklocality2.csv', index=False)

    print("Locality")
    empty_count = (dfFires['scientificNames'] == '').sum()
    print(empty_count)

    non_empty_count = (dfFires['scientificNames'] != '').sum()
    print(non_empty_count)