In [40]:
import pandas as pd
import math
import requests
import os
import ast

current_dir = os.getcwd()
root_dir = os.path.dirname(os.path.dirname(current_dir))
print(root_dir)
_RAWFIRESPATH = f"{root_dir}/FirePrediction/RawData/Historical_FiresRAW"

dfTreesDRP = pd.read_csv('TreesPortugueseTerritoryDropped.csv')

def DistanceTwoPoints(lat1, lon1, lat2, lon2):
    R = 6371e3  # Radius of the Earth in meters
    phi1 = math.radians(lat1)
    phi2 = math.radians(lat2)
    delta_phi = math.radians(lat2 - lat1)
    delta_lambda = math.radians(lon2 - lon1)

    a = math.sin(delta_phi / 2)**2 + math.cos(phi1) * math.cos(phi2) * math.sin(delta_lambda / 2)**2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    return R * c  # Distance in meters

def checkNearestPoint(row):    
    district = str(row['district']).lower()
    lat1 = row['latitude']
    lon1 = row['longitude']

    filtered_df = dfTreesDRP[dfTreesDRP['stateProvince'].str.lower() == district]

    filtered_df['distance'] = filtered_df.apply(lambda x: DistanceTwoPoints(lat1, lon1, x['decimalLatitude'], x['decimalLongitude']), axis=1)

    filtered_df = filtered_df.sort_values('distance')
    
    # Get the 'scientificName' and 'distance' of the nearest point
    nearest_name = filtered_df.iloc[0]['scientificName']
    nearest_distance = filtered_df.iloc[0]['distance']
    
    return nearest_name, nearest_distance

    
def check_district(row, precision=1000):
    district = str(row['district']).lower()
    lat1 = row['latitude']
    lon1 = row['longitude']
    
    # Filter dfTreesDRP based on 'locality' and 'stateProvince'
    filtered_df = dfTreesDRP[(dfTreesDRP['stateProvince'].str.lower() == district)]
    
    # Calculate the distance for each row in the filtered DataFrame
    filtered_df['distance'] = filtered_df.apply(lambda x: DistanceTwoPoints(lat1, lon1, x['decimalLatitude'], x['decimalLongitude']), axis=1)
    
    # Filter the DataFrame based on the distance
    close_points_df = filtered_df[filtered_df['distance'] < precision]
    
    # Get the unique 'scientificName' values, excluding NaN values
    unique_names = close_points_df['scientificName'].dropna().unique()

    # If 'scientificNames' exists in row and is not NaN, append unique names if they don't exist
    if 'scientificNames' in row and pd.notna(row['scientificNames']):
        existing_names = str(row['scientificNames']).split('; ')
        for name in unique_names:
            if name not in existing_names:
                existing_names.append(name)
        return '; '.join(existing_names)
    
    # If 'scientificNames' is NaN, just return the unique names
    return '; '.join(unique_names)

/home/ori/Desktop/SPAWN


In [7]:
_year = 2023
_currentYearMissingValues = pd.read_csv(f'DatasetWTrees/PreviousVersions/{_year}_checkDistrict.csv')

na_count = _currentYearMissingValues['scientificNames'].isna().sum()
print(na_count)

non_na_count = _currentYearMissingValues['scientificNames'].notna().sum()
print(non_na_count)

1
2498


In [8]:
na_rows = _currentYearMissingValues[_currentYearMissingValues['scientificNames'].isna()]
print(na_rows)

      year        date district municipality  \
2171  2023  2023-09-03    Viseu  Sernancelhe   

                                             parish                   local  \
2171  União das Freguesias de Sernancelhe e Sarzeda  Loteamento do Pinheiro   

       latitude  longitude    cause  elevation  ...  \
2171  45.900753  -8.001098  Natural        0.0  ...   

     hourly.direct_normal_irradiance_instant  \
2171                                   787.6   

      hourly.global_tilted_irradiance_instant  \
2171                                    739.0   

      hourly.terrestrial_radiation_instant  hourly.shortwave_radiation  \
2171                                1045.9                       720.0   

      hourly.direct_radiation  hourly.diffuse_radiation  \
2171                    597.0                     123.0   

      hourly.direct_normal_irradiance  hourly.global_tilted_irradiance  \
2171                            787.6                            720.0   

      hourly.terrest

In [18]:
print(checkNearestPoint(na_rows.iloc[0]))

('Olea europaea L.', 523145.3541043815)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['distance'] = filtered_df.apply(lambda x: DistanceTwoPoints(lat1, lon1, x['decimalLatitude'], x['decimalLongitude']), axis=1)


In [25]:
_year = 2023
#40.8998417, -7.4900188
row = na_rows.iloc[0]

LAT = 40.8998417
LON = -7.4900188
_DATA = row['date']
HORA = row['hourly.time'].split("T")[1]
DIA = row['date'].split("-")[2]
MES = row['date'].split("-")[1]

_fileName = f"{DIA}_{MES}_{HORA}_{LAT}_{LON}.csv.csv"

_fileName_radiation = f"{DIA}_{MES}_{HORA}_{LAT}_{LON}_radiation.csv"


#url_rad = f"https://archive-api.open-meteo.com/v1/archive?latitude={LAT}&longitude={LON}&start_date={_DATA}&end_date={_DATA}&hourly=shortwave_radiation,direct_radiation,diffuse_radiation,direct_normal_irradiance,global_tilted_irradiance,terrestrial_radiation,shortwave_radiation_instant,direct_radiation_instant,diffuse_radiation_instant,direct_normal_irradiance_instant,global_tilted_irradiance_instant,terrestrial_radiation_instant"
url = f"https://archive-api.open-meteo.com/v1/archive?latitude={LAT}&longitude={LON}&start_date={_DATA}&end_date={_DATA}&hourly=temperature_2m,relative_humidity_2m,dew_point_2m,apparent_temperature,precipitation,rain,snowfall,snow_depth,weather_code,pressure_msl,surface_pressure,cloud_cover,cloud_cover_low,cloud_cover_mid,cloud_cover_high,et0_fao_evapotranspiration,vapour_pressure_deficit,wind_speed_10m,wind_speed_100m,wind_direction_10m,wind_direction_100m,wind_gusts_10m,soil_temperature_0_to_7cm,soil_temperature_7_to_28cm,soil_temperature_28_to_100cm,soil_temperature_100_to_255cm,soil_moisture_0_to_7cm,soil_moisture_7_to_28cm,soil_moisture_28_to_100cm,soil_moisture_100_to_255cm,is_day,sunshine_duration,shortwave_radiation_instant,direct_radiation_instant,diffuse_radiation_instant,direct_normal_irradiance_instant,global_tilted_irradiance_instant,terrestrial_radiation_instant&timezone=GMT"
response = requests.get(url)
#response_rad = requests.get(url_rad)

#print(response_rad.status_code)

if((response.status_code == 200)):
    data = response.json()
    #data_rad = response_rad.json()

    df = pd.json_normalize(data)
    df.to_csv(f"MissingValues/{_fileName}", index=False)

    #AlgorithmTreesOnDataset/MissingValues
    #df_rad = pd.json_normalize(data_rad)
    #df_rad.to_csv(f"MissingValues/{_fileName_radiation}", index=False)
else:
    print("Error row:")

In [32]:
_year = 2005
_file = f"{_RAWFIRESPATH}/{_year}/2005-01-01_37.7388357_-8.750062888101874.csv"
df = pd.read_csv(f"{_file}")
# print(df.head())
column_names = df.columns.tolist()
print(column_names)LAT = row['LAT']
LON = row['LON']

# Get all column names
column_names = df.columns.tolist()

# Find columns that contain 'hourly_units'
columns_to_remove = [col for col in column_names if 'hourly_units' in col]
columns_to_remove.extend(['latitude', 'longitude', 'generationtime_ms', 'utc_offset_seconds', 'timezone', 'timezone_abbreviation'])

# Remove the columns
df = df.drop(columns=columns_to_remove)

# Print the updated column names
updated_column_names = df.columns.tolist()
print(updated_column_names)

_file_radiation = _file.replace(".csv", "_radiation.csv")
df = pd.read_csv(f"{_file_radiation}")
print(df.head())

df = pd.read_csv(f"{_file_radiation}")
# print(df.head())
column_names = df.columns.tolist()
print(column_names)

# Get all column names
column_names = df.columns.tolist()
LAT = row['LAT']
LON = row['LON']
# Find columns that contain 'hourly_units'
columns_to_remove = [col for col in column_names if 'hourly_units' in col]
columns_to_remove.extend(['latitude', 'longitude', 'generationtime_ms', 'utc_offset_seconds', 'timezone', 'timezone_abbreviation', 'elevation', 'hourly.time', 'hourly.global_tilted_irradiance_instant', 'hourly.diffuse_radiation_instant', 'hourly.terrestrial_radiation_instant', 'hourly.direct_radiation_instant', 'hourly.shortwave_radiation_instant', 'hourly.direct_normal_irradiance_instant'])

# Remove the columns
df = df.drop(columns=columns_to_remove)

# Print the updated column names
updated_column_names_radiation = df.columns.tolist()
print(updated_column_names_radiation)

# Convert the lists to sets
set1 = set(updated_column_names_radiation)
set2 = set(updated_column_names)

# Find the common elements
common_elements = set1 & set2

# Print the common elements
print(common_elements)


header = ['year', 'date', 'district', 'municipality', 'parish', 'local', 'latitude', 'longitude', 'cause']
LAT = row['LAT']
LON = row['LON']
columns = header + updated_column_names + updated_column_names_radiation

#/home/ori/Desktop/SPAWN/FirePrediction/RawData/Historical_FiresRAW/2005/2005-01-01_37.7388357_-8.750062888101874.csv
df = pd.DataFrame(columns=columns)

df.to_csv('MissingValues/missingValues2023.csv', index=False)


df.to_csv('MissingValues/missingValues2023.csv', index=False)

#/home/ori/Desktop/SPAWN/FirePrediction/RawData/Historical_FiresRAW/2005/2005-01-01_37.7388357_-8.750062888101874.csv

['latitude', 'longitude', 'generationtime_ms', 'utc_offset_seconds', 'timezone', 'timezone_abbreviation', 'elevation', 'hourly_units.time', 'hourly_units.temperature_2m', 'hourly_units.relative_humidity_2m', 'hourly_units.dew_point_2m', 'hourly_units.apparent_temperature', 'hourly_units.precipitation', 'hourly_units.rain', 'hourly_units.snowfall', 'hourly_units.snow_depth', 'hourly_units.weather_code', 'hourly_units.pressure_msl', 'hourly_units.surface_pressure', 'hourly_units.cloud_cover', 'hourly_units.cloud_cover_low', 'hourly_units.cloud_cover_mid', 'hourly_units.cloud_cover_high', 'hourly_units.et0_fao_evapotranspiration', 'hourly_units.vapour_pressure_deficit', 'hourly_units.wind_speed_10m', 'hourly_units.wind_speed_100m', 'hourly_units.wind_direction_10m', 'hourly_units.wind_direction_100m', 'hourly_units.wind_gusts_10m', 'hourly_units.soil_temperature_0_to_7cm', 'hourly_units.soil_temperature_7_to_28cm', 'hourly_units.soil_temperature_28_to_100cm', 'hourly_units.soil_temperatur

In [37]:
ANO = row['year']
DISTRICTO = row['district']
CONCELHO = row['municipality']
FREGUESIA = row['parish']
CAUSA = row['cause']
LOCAL = row['local']
print(HORA)
HORA = int(HORA.split(":")[0])

_fileName = f"{DIA}_{MES}_{HORA}_{LAT}_{LON}.csv.csv"
_fileName_radiation = f"{DIA}_{MES}_{HORA}_{LAT}_{LON}_radiation.csv"

dfMet = pd.read_csv(f"MissingValues/{_fileName}")
dfRad = pd.read_csv(f"MissingValues/{_fileName_radiation}")

list_from_string = ast.literal_eval(dfMet[updated_column_names[2]].iloc[0])[HORA]

print(list_from_string)

list_from_string = ast.literal_eval(dfMet[updated_column_names[1]].iloc[0])[HORA]

print(list_from_string)

new_data = {
    'year': ANO,
    'date': f"{_DATA}",
    'district': DISTRICTO,
    'municipality': CONCELHO,
    'parish': FREGUESIA,
    'local': LOCAL,
    'latitude': LAT,
    'longitude': LON,
    'cause': CAUSA,
    'elevation': dfMet['elevation'].iloc[0]
}

try:
    for i in range(1, len(updated_column_names)):
        new_data[updated_column_names[i]] = ast.literal_eval(dfMet[updated_column_names[i]].iloc[0])[HORA]
except Exception as e:
    print("->", e)

try:
    for i in range(0, len(updated_column_names_radiation)):
        new_data[updated_column_names_radiation[i]] = ast.literal_eval(dfRad[updated_column_names_radiation[i]].iloc[0])[HORA]
except Exception as e:
    print("-x", e)

new_df = pd.DataFrame(new_data, index=[0])

new_df.to_csv('MissingValues/missingValues2023.csv', mode='a', header=False, index=False)

12:00
18.7
2023-09-03T12:00


In [39]:
_year = 2023
newValues = pd.read_csv(f'MissingValues/missingValues{_year}.csv')
print(checkNearestPoint(newValues.iloc[0]))

newValues['scientificNames'] = newValues.apply(check_district, axis=1)
newValues.to_csv(f'MissingValues/{_year}_checkDistrict.csv', index=False)

('Tracheophyta', 213.8836457305809)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['distance'] = filtered_df.apply(lambda x: DistanceTwoPoints(lat1, lon1, x['decimalLatitude'], x['decimalLongitude']), axis=1)


NameError: name 'check_district' is not defined