In [1]:
import pandas as pd
import math
import numpy as np
import concurrent.futures

dfTreesDRP = pd.read_csv('TreesPortugueseTerritoryDropped.csv')

In [2]:
def DistanceTwoPoints(lat1, lon1, lat2, lon2):
    R = 6371e3  # Radius of the Earth in meters
    phi1 = math.radians(lat1)
    phi2 = math.radians(lat2)
    delta_phi = math.radians(lat2 - lat1)
    delta_lambda = math.radians(lon2 - lon1)

    a = math.sin(delta_phi / 2)**2 + math.cos(phi1) * math.cos(phi2) * math.sin(delta_lambda / 2)**2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    return R * c  # Distance in meters


def check_locality(row):
    parish = str(row['parish']).lower()
    district = str(row['district']).lower()
    unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) & 
                              (dfTreesDRP['stateProvince'].str.lower() == district)]['scientificName'].unique()
    return '; '.join(unique_names)


def check_locality2(row):
    concelho = str(row['municipality']).lower()
    district = str(row['district']).lower()
    unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(concelho, na=False)) & 
                              (dfTreesDRP['stateProvince'].str.lower().str.contains(district, na=False))]['scientificName'].unique()
    
    if 'scientificNames' in row and not pd.isna(row['scientificNames']) and row['scientificNames'] != '':
        existing_names = row['scientificNames'].split('; ')
        new_names = [name for name in unique_names if name not in existing_names]
        return row['scientificNames'] + '; ' + '; '.join(new_names)
    else:
        return '; '.join(unique_names)

def check_district(row, precision=120):
    if 'scientificNames' in row and row['scientificNames'] != '':
        return row['scientificNames']
    
    district = str(row['district']).lower()
    lat1 = row['latitude']
    lon1 = row['longitude']
    
    # Filter dfTreesDRP based on 'locality' and 'stateProvince'
    filtered_df = dfTreesDRP[(dfTreesDRP['stateProvince'].str.lower() == district)]
    
    # Calculate the distance for each row in the filtered DataFrame
    filtered_df['distance'] = filtered_df.apply(lambda x: DistanceTwoPoints(lat1, lon1, x['decimalLatitude'], x['decimalLongitude']), axis=1)
    
    # Filter the DataFrame based on the distance
    close_points_df = filtered_df[filtered_df['distance'] < precision]
    
    # Get the unique 'scientificName' values
    unique_names = close_points_df['scientificName'].unique()
    
    return '; '.join(unique_names)




def check_districtCoordSingular(row, precision=500):
    if 'scientificNames' in row and row['scientificNames'] != '':
        return row['scientificNames']
    
    district = str(row['district']).lower()
    lat1 = row['latitude']
    lon1 = row['longitude']
    
    # Filter dfTreesDRP based on 'locality' and 'stateProvince'
    filtered_df = dfTreesDRP[(dfTreesDRP['stateProvince'].str.lower() == district)]
    
    # Calculate the distance for each row in the filtered DataFrame
    filtered_df['distance'] = filtered_df.apply(lambda x: DistanceTwoPoints(lat1, lon1, x['decimalLatitude'], x['decimalLongitude']), axis=1)
    
    # Filter the DataFrame based on the distance
    close_points_df = filtered_df[filtered_df['distance'] < precision]
    
    # If there are no close points, return an empty list
    if close_points_df.empty:
        return []
    
    # Sort the DataFrame by the 'distance' column
    close_points_df = close_points_df.sort_values('distance')
    
    # Get the 'scientificName' and 'distance' of the 5 closest points
    closest_points = close_points_df.iloc[:5][['scientificName', 'distance']].apply(tuple, axis=1).tolist()
    
    return closest_points


def check_districtCoord(row, precision=500):
    if 'scientificNames' in row and row['scientificNames'] != '':
        return row['scientificNames']
    
    district = str(row['district']).lower()
    lat1 = row['latitude']
    lon1 = row['longitude']
    
    # Filter dfTreesDRP based on 'locality' and 'stateProvince'
    filtered_df = dfTreesDRP[(dfTreesDRP['stateProvince'].str.lower().str == district)]
    
    # Calculate the distance for each row in the filtered DataFrame
    filtered_df['distance'] = filtered_df.apply(lambda x: DistanceTwoPoints(lat1, lon1, x['decimalLatitude'], x['decimalLongitude']), axis=1)
    
    # Filter the DataFrame based on the distance
    close_points_df = filtered_df[filtered_df['distance'] < precision]
    
    # If there are no close points, return an empty string
    if close_points_df.empty:
        return ''
    
    # Sort the DataFrame by the 'distance' column
    close_points_df = close_points_df.sort_values('distance')
    
    # Get the unique 'scientificName' values of the 5 closest points
    unique_names = close_points_df.iloc[:5]['scientificName'].unique()
    
    return '; '.join(unique_names)


from scipy.spatial import KDTree

# Create a KDTree from the coordinates in dfTreesDRP
tree = KDTree(dfTreesDRP[['decimalLatitude', 'decimalLongitude']])

def check_byCoord(row, precision=120):
    lat1 = row['latitude']
    lon1 = row['longitude']
    
    # Query the tree for the indices of the points within 'precision' distance
    indices = tree.query_ball_point([lat1, lon1], precision)
    
    # If there are no close points, return the existing scientificNames or an empty string
    if not indices:
        return row['scientificNames'] if 'scientificNames' in row and not pd.isna(row['scientificNames']) else ''
    
    # Get the unique 'scientificName' values of the 5 closest points
    close_points_df = dfTreesDRP.iloc[indices]
    close_points_df['distance'] = close_points_df.apply(lambda x: DistanceTwoPoints(lat1, lon1, x['decimalLatitude'], x['decimalLongitude']), axis=1)
    close_points_df = close_points_df.sort_values('distance').iloc[:5]
    unique_names = close_points_df['scientificName'].unique()
    
    # If 'scientificNames' exists and is not NaN or empty, append new unique names to it
    if 'scientificNames' in row and not pd.isna(row['scientificNames']) and row['scientificNames'] != '':
        existing_names = row['scientificNames'].split('; ')
        new_names = [name for name in unique_names if name not in existing_names]
        return row['scientificNames'] + '; ' + '; '.join(new_names)
    else:
        return '; '.join(unique_names)


def checkNearestPoint(row):
    if 'scientificNames' in row and row['scientificNames'] != '':
        return row['scientificNames']
    
    district = str(row['district']).lower()
    lat1 = row['latitude']
    lon1 = row['longitude']

    filtered_df = dfTreesDRP[(dfTreesDRP['stateProvince'].str.lower().str == district)]

    filtered_df['distance'] = filtered_df.apply(lambda x: DistanceTwoPoints(lat1, lon1, x['decimalLatitude'], x['decimalLongitude']), axis=1)

    filtered_df = filtered_df.sort_values('distance')
    
    # Get the 'scientificName' and 'distance' of the nearest point
    nearest_name = filtered_df.iloc[0]['scientificName']
    nearest_distance = filtered_df.iloc[0]['distance']
    
    return nearest_name, nearest_distance


def checkNearestPointCoord(row, precision=120):
    if 'scientificNames' in row and row['scientificNames'] != '':
        return row['scientificNames']
    
    lat1 = row['latitude']
    lon1 = row['longitude']
    
    
    # Calculate the distance for each row in the filtered DataFrame
    dfTreesDRP['distance'] = dfTreesDRP.apply(lambda x: DistanceTwoPoints(lat1, lon1, x['decimalLatitude'], x['decimalLongitude']), axis=1)
    
    # Filter the DataFrame based on the distance
    close_points_df = dfTreesDRP[dfTreesDRP['distance'] < precision]
    
    # If there are no close points, return an empty string
    if close_points_df.empty:
        return ''
    
    # Sort the DataFrame by the 'distance' column
    close_points_df = close_points_df.sort_values('distance')
    
    # Get the unique 'scientificName' values of the 5 closest points
    unique_names = close_points_df.iloc[:5]['scientificName'].unique()
    
    return '; '.join(unique_names)

In [35]:
_year = 2022
dfFires = pd.read_csv(f"Dataset/b{_year}.csv")
dfTreesDRP = pd.read_csv('TreesPortugueseTerritoryDropped.csv')
print(len(dfFires))


num_chunks = 20

# Split the DataFrame into smaller chunks
chunks = np.array_split(dfFires, num_chunks)

# Apply the function to each chunk
for i in range(num_chunks):
    chunks[i]['scientificNames'] = chunks[i].apply(check_locality, axis=1)
    empty_count = (chunks[i]['scientificNames'] == '').sum()
    print(f'Empty count in chunk {i}: {empty_count}')
    non_empty_count = (chunks[i]['scientificNames'] != '').sum()
    print(f'Non empty count in chunk {i}: {non_empty_count}')
from geopandas.tools import sjoin
    chunks[i].to_csv(f'DatasetWTrees/PreviousVersions/check_locality/{_year}_chunk_{i}.csv', index=False)



dfFires = pd.concat(chunks)



print("Locality")
empty_count = (dfFires['scientificNames'] == '').sum()
print(empty_count)

non_empty_count = (dfFires['scientificNames'] != '').sum()
print(non_empty_count)

4040


  return bound(*args, **kwds)


Empty count in chunk 0: 200
Non empty count in chunk 0: 2


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 1: 195
Non empty count in chunk 1: 7


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 2: 201
Non empty count in chunk 2: 1


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 3: 135
Non empty count in chunk 3: 67


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 4: 105
Non empty count in chunk 4: 97


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 5: 107
Non empty count in chunk 5: 95


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 6: 158
Non empty count in chunk 6: 44


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 7: 168
Non empty count in chunk 7: 34


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 8: 165
Non empty count in chunk 8: 37


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 9: 148
Non empty count in chunk 9: 54


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 10: 154
Non empty count in chunk 10: 48


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 11: 162
Non empty count in chunk 11: 40


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 12: 151
Non empty count in chunk 12: 51


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 13: 157
Non empty count in chunk 13: 45


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 14: 156
Non empty count in chunk 14: 46


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 15: 158
Non empty count in chunk 15: 44


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 16: 148
Non empty count in chunk 16: 54


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 17: 169
Non empty count in chunk 17: 33


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 18: 180
Non empty count in chunk 18: 22


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk 19: 176
Non empty count in chunk 19: 26
Locality
3193
847


In [36]:
dfFires.to_csv(f'DatasetWTrees/PreviousVersions/{_year}_checklocality.csv', index=False)

In [37]:
#1916
#583

_year = 2022
#dfFires = pd.read_csv(f"Dataset/b{_year}.csv")
dfFires = pd.read_csv(f"DatasetWTrees/PreviousVersions/{_year}_checklocality.csv")
dfTreesDRP = pd.read_csv('TreesPortugueseTerritoryDropped.csv')

num_chunks = 20

# Split the DataFrame into smaller chunks
chunks = np.array_split(dfFires, num_chunks)

# Apply the function to each chunk
for i in range(num_chunks):
    chunks[i]['scientificNames'] = chunks[i].apply(check_locality2, axis=1)
    empty_count = (chunks[i]['scientificNames'] == '').sum()
    print(f'Empty count in chunk {i}: {empty_count}')
    non_empty_count = (chunks[i]['scientificNames'] != '').sum()
    print(f'Non empty count in chunk {i}: {non_empty_count}')

    chunks[i].to_csv(f'DatasetWTrees/PreviousVersions/check_locality2/{_year}_chunk_{i}.csv', index=False)



dfFires = pd.concat(chunks)



print("Locality")
empty_count = (dfFires['scientificNames'] == '').sum()
print(empty_count)

non_empty_count = (dfFires['scientificNames'] != '').sum()
print(non_empty_count)

  return bound(*args, **kwds)


Empty count in chunk 0: 90
Non empty count in chunk 0: 35
Empty count in chunk 1: 85
Non empty count in chunk 1: 40
Empty count in chunk 2: 91
Non empty count in chunk 2: 34
Empty count in chunk 3: 85
Non empty count in chunk 3: 40
Empty count in chunk 4: 85
Non empty count in chunk 4: 40
Empty count in chunk 5: 102
Non empty count in chunk 5: 23


KeyboardInterrupt: 

In [None]:
dfFires.to_csv(f'DatasetWTrees/PreviousVersions/{_year}_checklocality2.csv', index=False)

In [3]:
for _year in range(2013, 2023):
    dfFires = pd.read_csv(f"Dataset/b{_year}.csv")
    dfTreesDRP = pd.read_csv('TreesPortugueseTerritoryDropped.csv')
    print(len(dfFires))


    num_chunks = 20

    # Split the DataFrame into smaller chunks
    chunks = np.array_split(dfFires, num_chunks)

    # Apply the function to each chunk
    for i in range(num_chunks):
        chunks[i]['scientificNames'] = chunks[i].apply(check_locality, axis=1)
        empty_count = (chunks[i]['scientificNames'] == '').sum()
        print(f'Empty count in chunk {i}: {empty_count}')
        non_empty_count = (chunks[i]['scientificNames'] != '').sum()
        print(f'Non empty count in chunk {i}: {non_empty_count}')

        chunks[i].to_csv(f'DatasetWTrees/PreviousVersions/check_locality/{_year}_chunk_{i}.csv', index=False)



    dfFires = pd.concat(chunks)



    print("Locality")
    empty_count = (dfFires['scientificNames'] == '').sum()
    print(empty_count)

    non_empty_count = (dfFires['scientificNames'] != '').sum()
    print(non_empty_count)

    dfFires.to_csv(f'DatasetWTrees/PreviousVersions/{_year}_checklocality.csv', index=False)


    dfFires = pd.read_csv(f"DatasetWTrees/PreviousVersions/{_year}_checklocality.csv")

    num_chunks = 20

    # Split the DataFrame into smaller chunks
    chunks = np.array_split(dfFires, num_chunks)

    # Apply the function to each chunk
    for i in range(num_chunks):
        chunks[i]['scientificNames'] = chunks[i].apply(check_locality2, axis=1)
        empty_count = (chunks[i]['scientificNames'] == '').sum()
        print(f'Empty count in chunk {i}: {empty_count}')
        non_empty_count = (chunks[i]['scientificNames'] != '').sum()
        print(f'Non empty count in chunk {i}: {non_empty_count}')

        chunks[i].to_csv(f'DatasetWTrees/PreviousVersions/check_locality2/{_year}_chunk_{i}.csv', index=False)



    dfFires = pd.concat(chunks)



    print("Locality")
    empty_count = (dfFires['scientificNames'] == '').sum()
    print(empty_count)

    non_empty_count = (dfFires['scientificNames'] != '').sum()
    print(non_empty_count)

    dfFires.to_csv(f'DatasetWTrees/PreviousVersions/{_year}_checklocality2.csv', index=False)

11899


  return bound(*args, **kwds)
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


KeyboardInterrupt: 

In [33]:
#1732
#767


_year = 2023
#dfFires = pd.read_csv(f"Dataset/b{_year}.csv")
dfFires = pd.read_csv(f"DatasetWTrees/PreviousVersions/{_year}_checklocality2.csv")
dfTreesDRP = pd.read_csv('TreesPortugueseTerritoryDropped.csv')

num_chunks = 20

# Split the DataFrame into smaller chunks
chunks = np.array_split(dfFires, num_chunks)

print("here")

# Apply the function to each chunk
for i in range(num_chunks):
    chunks[i]['scientificNames'] = chunks[i].apply(check_byCoord, axis=1)
    empty_count = (chunks[i]['scientificNames'] == '').sum()
    print(f'Empty count in chunk {i}: {empty_count}')
    non_empty_count = (chunks[i]['scientificNames'] != '').sum()
    print(f'Non empty count in chunk {i}: {non_empty_count}')

    chunks[i].to_csv(f'DatasetWTrees/PreviousVersions/check_byCoord/{_year}_chunk_{i}.csv', index=False)



dfFires = pd.concat(chunks)



print("Locality")
empty_count = (dfFires['scientificNames'] == '').sum()
print(empty_count)

non_empty_count = (dfFires['scientificNames'] != '').sum()
print(non_empty_count)

  return bound(*args, **kwds)


here
Empty count in chunk 0: 0
Non empty count in chunk 0: 125


KeyboardInterrupt: 

In [None]:
def process_year(_year):
    dfFires = pd.read_csv(f"Dataset/b{_year}.csv")
    dfTreesDRP = pd.read_csv('TreesPortugueseTerritoryDropped.csv')
    print(len(dfFires))

    num_chunks = 20

    # Split the DataFrame into smaller chunks
    chunks = np.array_split(dfFires, num_chunks)

    # Apply the function to each chunk
    for i in range(num_chunks):
        chunks[i]['scientificNames'] = chunks[i].apply(check_locality, axis=1)
        empty_count = (chunks[i]['scientificNames'] == '').sum()
        print(f'Empty count in chunk {i}: {empty_count}')
        non_empty_count = (chunks[i]['scientificNames'] != '').sum()
        print(f'Non empty count in chunk {i}: {non_empty_count}')

    dfFires = pd.concat(chunks)

    print("Locality")
    empty_count = (dfFires['scientificNames'] == '').sum()
    print(empty_count)

    non_empty_count = (dfFires['scientificNames'] != '').sum()
    print(non_empty_count)

if __name__ == "__main__":
    with Pool() as p:
        p.map(process_year, [2023, 2024])

In [4]:
import concurrent.futures

dfTreesDRP = pd.read_csv('TreesPortugueseTerritoryDropped.csv')

def process_chunkv1(chunk):
    chunk['scientificNames'] = chunk.apply(check_locality, axis=1)
    empty_count = (chunk['scientificNames'] == '').sum()
    print(f'Empty count in chunk: {empty_count}')
    non_empty_count = (chunk['scientificNames'] != '').sum()
    print(f'Non empty count in chunk: {non_empty_count}')
    return chunk

def process_chunkv2(chunk):
    chunk['scientificNames'] = chunk.apply(check_locality2, axis=1)
    empty_count = (chunk['scientificNames'] == '').sum()
    print(f'Empty count in chunk: {empty_count}')
    non_empty_count = (chunk['scientificNames'] != '').sum()
    print(f'Non empty count in chunk: {non_empty_count}')
    return chunk

for _year in range(2021, 2023):

    if ((_year != 2022) or (_year != 2021)):

        dfFires = pd.read_csv(f"Dataset/b{_year}.csv")
        num_chunks = 20

        # Split the DataFrame into smaller chunks
        chunks = np.array_split(dfFires, num_chunks)

        # Create a ProcessPoolExecutor
        with concurrent.futures.ProcessPoolExecutor(max_workers=5) as executor:
            # Use the executor to map the function to the chunks
            chunks = list(executor.map(process_chunkv1, chunks))

        # Concatenate the chunks back into a single DataFrame
        dfFires = pd.concat(chunks)

        dfFires.to_csv(f'DatasetWTrees/PreviousVersions/{_year}_checklocality.csv', index=False)

        print("Locality")
        empty_count = (dfFires['scientificNames'] == '').sum()
        print(empty_count)

        non_empty_count = (dfFires['scientificNames'] != '').sum()
        print(non_empty_count)

    dfFires = pd.read_csv(f"DatasetWTrees/PreviousVersions/{_year}_checklocality.csv")

    num_chunks = 20

    # Split the DataFrame into smaller chunks
    chunks = np.array_split(dfFires, num_chunks)

    with concurrent.futures.ProcessPoolExecutor(max_workers=5) as executor:
        # Use the executor to map the function to the chunks
        chunks = list(executor.map(process_chunkv2, chunks))

    dfFires = pd.concat(chunks)

    dfFires.to_csv(f'DatasetWTrees/PreviousVersions/{_year}_checklocality2.csv', index=False)

    print("Locality")
    empty_count = (dfFires['scientificNames'] == '').sum()
    print(empty_count)

    non_empty_count = (dfFires['scientificNames'] != '').sum()
    print(non_empty_count)

  return bound(*args, **kwds)
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk: 93
Non empty count in chunk: 38
Empty count in chunk: 78
Non empty count in chunk: 53
Empty count in chunk: 107
Non empty count in chunk: 24


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk: 114
Non empty count in chunk: 17


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Empty count in chunk: 130
Non empty count in chunk: 1


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &
  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


In [None]:
import concurrent.futures

dfTreesDRP = pd.read_csv('TreesPortugueseTerritoryDropped.csv')

def process_chunkv1(chunk):
    chunk['scientificNames'] = chunk.apply(check_locality, axis=1)
    empty_count = (chunk['scientificNames'] == '').sum()
    print(f'Empty count in chunk: {empty_count}')
    non_empty_count = (chunk['scientificNames'] != '').sum()
    print(f'Non empty count in chunk: {non_empty_count}')
    return chunk

def process_chunkv2(chunk):
    chunk['scientificNames'] = chunk.apply(check_locality2, axis=1)
    empty_count = (chunk['scientificNames'] == '').sum()
    print(f'Empty count in chunk: {empty_count}')
    non_empty_count = (chunk['scientificNames'] != '').sum()
    print(f'Non empty count in chunk: {non_empty_count}')
    return chunk

for _year in range(2001, 2013):

    if _year != 2022:

        dfFires = pd.read_csv(f"Dataset/b{_year}.csv")
        num_chunks = 20

        # Split the DataFrame into smaller chunks
        chunks = np.array_split(dfFires, num_chunks)

        # Create a ProcessPoolExecutor
        with concurrent.futures.ProcessPoolExecutor(max_workers=5) as executor:
            # Use the executor to map the function to the chunks
            chunks = list(executor.map(process_chunkv1, chunks))

        # Concatenate the chunks back into a single DataFrame
        dfFires = pd.concat(chunks)

        dfFires.to_csv(f'DatasetWTrees/PreviousVersions/{_year}_checklocality.csv', index=False)

        print("Locality")
        empty_count = (dfFires['scientificNames'] == '').sum()
        print(empty_count)

        non_empty_count = (dfFires['scientificNames'] != '').sum()
        print(non_empty_count)

    dfFires = pd.read_csv(f"DatasetWTrees/PreviousVersions/{_year}_checklocality.csv")

    num_chunks = 20

    # Split the DataFrame into smaller chunks
    chunks = np.array_split(dfFires, num_chunks)

    with concurrent.futures.ProcessPoolExecutor(max_workers=5) as executor:
        # Use the executor to map the function to the chunks
        chunks = list(executor.map(process_chunkv2, chunks))

    dfFires = pd.concat(chunks)

    dfFires.to_csv(f'DatasetWTrees/PreviousVersions/{_year}_checklocality2.csv', index=False)

    print("Locality")
    empty_count = (dfFires['scientificNames'] == '').sum()
    print(empty_count)

    non_empty_count = (dfFires['scientificNames'] != '').sum()
    print(non_empty_count)