In [None]:
import pandas as pd
import math

In [None]:
# Remove columns that are not needed
dftrees = pd.read_csv('TreesPortugueseTerritory.csv', sep='\t', usecols=['scientificName', 'locality', 'stateProvince','occurrenceStatus','individualCount','decimalLatitude', 'decimalLongitude', 'coordinateUncertaintyInMeters','coordinatePrecision','elevation','elevationAccuracy','depth','depthAccuracy'])
dftrees.to_csv('TreesPortugueseTerritoryDropped.csv', index=False)

In [None]:
dfTreesDRP = pd.read_csv('TreesPortugueseTerritoryDropped.csv')
print(dfTreesDRP['occurrenceStatus'].unique())
print(dfTreesDRP['individualCount'].unique())
print(dfTreesDRP['coordinatePrecision'].unique())
print(dfTreesDRP['elevation'].unique())

In [None]:
dfTreesDRP['stateProvince'] = dfTreesDRP['stateProvince'].replace('Bragança District', 'Bragança')
dfTreesDRP['locality'] = dfTreesDRP['locality'].replace('Ovadas e Panchora', 'Ovadas e Panchorra')

dfTreesDRP.to_csv('TreesPortugueseTerritoryDropped.csv', index=False)

In [83]:
import math

def DistanceTwoPoints(lat1, lon1, lat2, lon2):
    R = 6371e3  # Radius of the Earth in meters
    phi1 = math.radians(lat1)
    phi2 = math.radians(lat2)
    delta_phi = math.radians(lat2 - lat1)
    delta_lambda = math.radians(lon2 - lon1)

    a = math.sin(delta_phi / 2)**2 + math.cos(phi1) * math.cos(phi2) * math.sin(delta_lambda / 2)**2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    return R * c  # Distance in meters


def check_locality(row):
    parish = str(row['parish']).lower()
    district = str(row['district']).lower()
    unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) & 
                              (dfTreesDRP['stateProvince'] == district)]['scientificName'].unique()
    return '; '.join(unique_names)


def check_locality2(row):
    if 'scientificNames' in row and row['scientificNames'] != '':
        return row['scientificNames']
    
    concelho = str(row['municipality']).lower()
    district = str(row['district']).lower()
    unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(concelho, na=False)) & 
                              (dfTreesDRP['stateProvince'].str.lower().str == district)]['scientificName'].unique()
    return '; '.join(unique_names)


def check_district(row, precision=120):
    if 'scientificNames' in row and row['scientificNames'] != '':
        return row['scientificNames']
    
    district = str(row['district']).lower()
    lat1 = row['latitude']
    lon1 = row['longitude']
    
    # Filter dfTreesDRP based on 'locality' and 'stateProvince'
    filtered_df = dfTreesDRP[(dfTreesDRP['stateProvince'].str.lower() == district)]
    
    # Calculate the distance for each row in the filtered DataFrame
    filtered_df['distance'] = filtered_df.apply(lambda x: DistanceTwoPoints(lat1, lon1, x['decimalLatitude'], x['decimalLongitude']), axis=1)
    
    # Filter the DataFrame based on the distance
    close_points_df = filtered_df[filtered_df['distance'] < precision]
    
    # Get the unique 'scientificName' values
    unique_names = close_points_df['scientificName'].unique()
    
    return '; '.join(unique_names)




def check_districtCoordSingular(row, precision=500):
    if 'scientificNames' in row and row['scientificNames'] != '':
        return row['scientificNames']
    
    district = str(row['district']).lower()
    lat1 = row['latitude']
    lon1 = row['longitude']
    
    # Filter dfTreesDRP based on 'locality' and 'stateProvince'
    filtered_df = dfTreesDRP[(dfTreesDRP['stateProvince'].str.lower() == district)]
    
    # Calculate the distance for each row in the filtered DataFrame
    filtered_df['distance'] = filtered_df.apply(lambda x: DistanceTwoPoints(lat1, lon1, x['decimalLatitude'], x['decimalLongitude']), axis=1)
    
    # Filter the DataFrame based on the distance
    close_points_df = filtered_df[filtered_df['distance'] < precision]
    
    # If there are no close points, return an empty list
    if close_points_df.empty:
        return []
    
    # Sort the DataFrame by the 'distance' column
    close_points_df = close_points_df.sort_values('distance')
    
    # Get the 'scientificName' and 'distance' of the 5 closest points
    closest_points = close_points_df.iloc[:5][['scientificName', 'distance']].apply(tuple, axis=1).tolist()
    
    return closest_points


def check_districtCoord(row, precision=500):
    if 'scientificNames' in row and row['scientificNames'] != '':
        return row['scientificNames']
    
    district = str(row['district']).lower()
    lat1 = row['latitude']
    lon1 = row['longitude']
    
    # Filter dfTreesDRP based on 'locality' and 'stateProvince'
    filtered_df = dfTreesDRP[(dfTreesDRP['stateProvince'].str.lower().str == district)]
    
    # Calculate the distance for each row in the filtered DataFrame
    filtered_df['distance'] = filtered_df.apply(lambda x: DistanceTwoPoints(lat1, lon1, x['decimalLatitude'], x['decimalLongitude']), axis=1)
    
    # Filter the DataFrame based on the distance
    close_points_df = filtered_df[filtered_df['distance'] < precision]
    
    # If there are no close points, return an empty string
    if close_points_df.empty:
        return ''
    
    # Sort the DataFrame by the 'distance' column
    close_points_df = close_points_df.sort_values('distance')
    
    # Get the unique 'scientificName' values of the 5 closest points
    unique_names = close_points_df.iloc[:5]['scientificName'].unique()
    
    return '; '.join(unique_names)


def checkNearestPoint(row):
    if 'scientificNames' in row and row['scientificNames'] != '':
        return row['scientificNames']
    
    district = str(row['district']).lower()
    lat1 = row['latitude']
    lon1 = row['longitude']

    filtered_df = dfTreesDRP[(dfTreesDRP['stateProvince'].str.lower().str == district)]

    filtered_df['distance'] = filtered_df.apply(lambda x: DistanceTwoPoints(lat1, lon1, x['decimalLatitude'], x['decimalLongitude']), axis=1)

    filtered_df = filtered_df.sort_values('distance')
    
    # Get the 'scientificName' and 'distance' of the nearest point
    nearest_name = filtered_df.iloc[0]['scientificName']
    nearest_distance = filtered_df.iloc[0]['distance']
    
    return nearest_name, nearest_distance


def checkNearestPointCoord(row, precision=120):
    if 'scientificNames' in row and row['scientificNames'] != '':
        return row['scientificNames']
    
    lat1 = row['latitude']
    lon1 = row['longitude']
    
    
    # Calculate the distance for each row in the filtered DataFrame
    dfTreesDRP['distance'] = dfTreesDRP.apply(lambda x: DistanceTwoPoints(lat1, lon1, x['decimalLatitude'], x['decimalLongitude']), axis=1)
    
    # Filter the DataFrame based on the distance
    close_points_df = dfTreesDRP[dfTreesDRP['distance'] < precision]
    
    # If there are no close points, return an empty string
    if close_points_df.empty:
        return ''
    
    # Sort the DataFrame by the 'distance' column
    close_points_df = close_points_df.sort_values('distance')
    
    # Get the unique 'scientificName' values of the 5 closest points
    unique_names = close_points_df.iloc[:5]['scientificName'].unique()
    
    return '; '.join(unique_names)

In [None]:
DistanceTwoPoints(51.5007, 0.1246, 40.6892, 74.0445)

In [None]:
#Faro,Olhão,Moncarapacho

original_lat, original_lon = 37.0787222226461, -7.8068333334393 

#decimalLatitude,decimalLongitude

dfTreesDRP['distance'] = dfTreesDRP.apply(lambda row: DistanceTwoPoints(original_lat, original_lon, row['decimalLatitude'], row['decimalLongitude']), axis=1)

dfNearby = dfTreesDRP[dfTreesDRP['distance'] <= 500]

In [None]:
print(dfNearby['scientificName'].unique())
print(dfNearby.head())
#search for locality 

In [84]:
_year = 2023
dfFires = pd.read_csv(f"Dataset/b{_year}.csv")
dfTreesDRP = pd.read_csv('TreesPortugueseTerritoryDropped.csv')
print(len(dfFires))

dfFires['scientificNames'] = dfFires.apply(check_locality, axis=1)

# dfFires.to_csv(f"v2ParishTreeSpeciesb{_year}.csv", index=False)

print("Locality")
empty_count = (dfFires['scientificNames'] == '').sum()
print(empty_count)

non_empty_count = (dfFires['scientificNames'] != '').sum()
print(non_empty_count)

print("Locality2")
dfFires['scientificNames'] = dfFires.apply(check_locality2, axis=1)
empty_count = (dfFires['scientificNames'] == '').sum()
print(empty_count)

non_empty_count = (dfFires['scientificNames'] != '').sum()
print(non_empty_count)

2499


  unique_names = dfTreesDRP[(dfTreesDRP['locality'].str.lower().str.contains(parish, na=False)) &


Locality
2499
0
Locality2


KeyboardInterrupt: 

In [55]:
dfFiresDistrict = dfFires.copy()
dfFiresCoord = dfFires.copy()


print("District")

dfFiresDistrict['scientificNames'] = dfFiresDistrict.apply(check_district, axis=1)
empty_count = (dfFiresDistrict['scientificNames'] == '').sum()
print(empty_count)

non_empty_count = (dfFiresDistrict['scientificNames'] != '').sum()
print(non_empty_count)


print("Coordinates")

dfFiresCoord['scientificNames'] = dfFiresCoord.apply(checkNearestPointCoord, axis=1, precision=120)
empty_count = (dfFiresCoord['scientificNames'] == '').sum()
print(empty_count)

non_empty_count = (dfFiresCoord['scientificNames'] != '').sum()
print(non_empty_count)

1761
738


In [60]:
dfFires['scientificNames'] = dfFires.apply(check_district, axis=1)
empty_count = (dfFires['scientificNames'] == '').sum()
print(empty_count)

non_empty_count = (dfFires['scientificNames'] != '').sum()
print(non_empty_count)

1481
1018


In [61]:
dfFires.to_csv(f"v3ParishTreeSpeciesb{_year}.csv", index=False)

In [66]:
vazio = dfFires.iloc[1]
print(vazio['scientificNames'])
check_districtCoordSingular(vazio, precision=300)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['distance'] = filtered_df.apply(lambda x: DistanceTwoPoints(lat1, lon1, x['decimalLatitude'], x['decimalLongitude']), axis=1)


[('Tracheophyta', 188.554610087213), ('Tracheophyta', 188.554610087213)]

In [70]:
dfFires['scientificNames'] = dfFires.apply(check_districtCoord, axis=1)
empty_count = (dfFires['scientificNames'] == '').sum()
print(empty_count)

non_empty_count = (dfFires['scientificNames'] != '').sum()
print(non_empty_count)

KeyboardInterrupt: 

In [72]:
dfFires.to_csv(f"v4ParishTreeSpeciesb{_year}.csv", index=False)

In [73]:
dfFires['scientificNames'] = dfFires.apply(check_districtCoord, axis=1, precision=1000)
empty_count = (dfFires['scientificNames'] == '').sum()
print(empty_count)

non_empty_count = (dfFires['scientificNames'] != '').sum()
print(non_empty_count)

1
2498


In [78]:
empty_scientificNames = dfFires[dfFires['scientificNames'] == '']
print(empty_scientificNames)
checkNearestPoint(empty_scientificNames.iloc[0])

      year        date district municipality  \
2171  2023  2023-09-03    Viseu  Sernancelhe   

                                             parish                   local  \
2171  União das Freguesias de Sernancelhe e Sarzeda  Loteamento do Pinheiro   

       latitude  longitude    cause  elevation  ...  \
2171  45.900753  -8.001098  Natural        0.0  ...   

     hourly.direct_normal_irradiance_instant  \
2171                                   787.6   

      hourly.global_tilted_irradiance_instant  \
2171                                    739.0   

      hourly.terrestrial_radiation_instant  hourly.shortwave_radiation  \
2171                                1045.9                       720.0   

      hourly.direct_radiation  hourly.diffuse_radiation  \
2171                    597.0                     123.0   

      hourly.direct_normal_irradiance  hourly.global_tilted_irradiance  \
2171                            787.6                            720.0   

      hourly.terrest

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['distance'] = filtered_df.apply(lambda x: DistanceTwoPoints(lat1, lon1, x['decimalLatitude'], x['decimalLongitude']), axis=1)


('Olea europaea L.', 523145.3541043815)