In [1]:
import pandas as pd
import math
import requests
import os
import ast

current_dir = os.getcwd()
root_dir = os.path.dirname(os.path.dirname(current_dir))
print(root_dir)
_RAWFIRESPATH = f"{root_dir}/FirePrediction/RawData/Historical_FiresRAW"

dfTreesDRP = pd.read_csv('TreesPortugueseTerritoryDropped.csv')

def DistanceTwoPoints(lat1, lon1, lat2, lon2):
    R = 6371e3  # Radius of the Earth in meters
    phi1 = math.radians(lat1)
    phi2 = math.radians(lat2)
    delta_phi = math.radians(lat2 - lat1)
    delta_lambda = math.radians(lon2 - lon1)

    a = math.sin(delta_phi / 2)**2 + math.cos(phi1) * math.cos(phi2) * math.sin(delta_lambda / 2)**2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    return R * c  # Distance in meters

def checkNearestPoint(row):    
    district = str(row['district']).lower()
    lat1 = row['latitude']
    lon1 = row['longitude']

    filtered_df = dfTreesDRP[dfTreesDRP['stateProvince'].str.lower() == district]

    filtered_df['distance'] = filtered_df.apply(lambda x: DistanceTwoPoints(lat1, lon1, x['decimalLatitude'], x['decimalLongitude']), axis=1)

    filtered_df = filtered_df.sort_values('distance')
    
    # Get the 'scientificName' and 'distance' of the nearest point
    nearest_name = filtered_df.iloc[0]['scientificName']
    nearest_distance = filtered_df.iloc[0]['distance']
    
    return nearest_name, nearest_distance

def returnNearestTree(row):    
    district = str(row['district']).lower()
    lat1 = row['latitude']
    lon1 = row['longitude']

    filtered_df = dfTreesDRP[dfTreesDRP['stateProvince'].str.lower() == district].copy()
    filtered_df['distance'] = filtered_df.apply(lambda x: DistanceTwoPoints(lat1, lon1, x['decimalLatitude'], x['decimalLongitude']), axis=1)

    filtered_df = filtered_df.sort_values('distance')
    
    # Get the 'scientificName' and 'distance' of the nearest point
    nearest_name = filtered_df.iloc[0]['scientificName']
    nearest_distance = filtered_df.iloc[0]['distance']
    
    return nearest_name

    
def check_district(row, precision=1000):
    district = str(row['district']).lower()
    lat1 = row['latitude']
    lon1 = row['longitude']
    
    # Filter dfTreesDRP based on 'locality' and 'stateProvince'
    filtered_df = dfTreesDRP[(dfTreesDRP['stateProvince'].str.lower() == district)]
    
    # Calculate the distance for each row in the filtered DataFrame
    filtered_df['distance'] = filtered_df.apply(lambda x: DistanceTwoPoints(lat1, lon1, x['decimalLatitude'], x['decimalLongitude']), axis=1)
    
    # Filter the DataFrame based on the distance
    close_points_df = filtered_df[filtered_df['distance'] < precision]
    
    # Get the unique 'scientificName' values, excluding NaN values
    unique_names = close_points_df['scientificName'].dropna().unique()

    # If 'scientificNames' exists in row and is not NaN, append unique names if they don't exist
    if 'scientificNames' in row and pd.notna(row['scientificNames']):
        existing_names = str(row['scientificNames']).split('; ')
        for name in unique_names:
            if name not in existing_names:
                existing_names.append(name)
        return '; '.join(existing_names)
    
    # If 'scientificNames' is NaN, just return the unique names
    return '; '.join(unique_names)

/home/ori/Desktop/SPAWN


In [2]:
_year = 2010
_currentYearMissingValues = pd.read_csv(f'DatasetWTrees/PreviousVersions/{_year}_checkDistrict.csv')

na_count = _currentYearMissingValues['scientificNames'].isna().sum()
print(na_count)

non_na_count = _currentYearMissingValues['scientificNames'].notna().sum()
print(non_na_count)

empty_count = (_currentYearMissingValues['scientificNames'] == '').sum()
print(empty_count)

non_empty_count = (_currentYearMissingValues['scientificNames'] != '').sum()
print(non_empty_count)

4
14427
0
14431


In [3]:
na_rows = _currentYearMissingValues[_currentYearMissingValues['scientificNames'].isna()]
print(na_rows)
#39.0805764,-8.771974

      year        date        district       municipality             parish  \
0     2010  2010-02-14           Porto  Vila Nova de Gaia  Oliveira do Douro   
1968  2010  2010-06-02        Santarém            Cartaxo             Valada   
6551  2010  2010-08-14  Castelo Branco              Sertã           Cabeçudo   
9638  2010  2010-08-12           Porto         Felgueiras           Pinheiro   

                                       local   latitude  longitude  \
0                            Monte da Virgem  41.112581  -8.597502   
1968  ESTRADA DO SETIL (VILA CHÃ DE OURIQUE)  39.081195  -8.760559   
6551                                   Tojal  38.848634  -9.144522   
9638                                  Igreja  41.362214  -8.113012   

             cause  elevation  ... hourly.direct_normal_irradiance_instant  \
0               NC      206.0  ...                                   253.2   
1968            NC        6.0  ...                                   278.0   
6551  Desconhe

In [5]:
na_rows['scientificNames'] = na_rows.apply(check_district, axis=1)
#newValues.to_csv(f'MissingValues/{_year}_checkDistrict.csv', index=False)
print(na_rows.iloc[0]['scientificNames'])

Magnoliopsida; Tracheophyta; Pinus pinaster Aiton; Pinus pinea L.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  na_rows['scientificNames'] = na_rows.apply(check_district, axis=1)


In [10]:
_year = 2010
_currentYearMissingValuesX = pd.read_csv(f'DatasetWTrees/PreviousVersions/{_year}_checkDistrict.csv')
print(check_district(_currentYearMissingValuesX.iloc[2]))

Vitis vinifera L.; Tracheophyta; Magnoliopsida


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['distance'] = filtered_df.apply(lambda x: DistanceTwoPoints(lat1, lon1, x['decimalLatitude'], x['decimalLongitude']), axis=1)


In [23]:
index = 3
print(checkNearestPoint(na_rows.iloc[index]))

('Pinales', 1960.7743005616587)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['distance'] = filtered_df.apply(lambda x: DistanceTwoPoints(lat1, lon1, x['decimalLatitude'], x['decimalLongitude']), axis=1)


In [126]:
na_rows.at[na_rows.index[2], 'scientificNames'] = returnNearestTree(na_rows.iloc[2])
print(na_rows.at[na_rows.index[2], 'scientificNames'])
na_rows.to_csv('/home/ori/Desktop/SPAWN/FirePrediction/AlgorithmTreesOnDataset/DatasetWTrees/PreviousVersions/2016_checkDistrict2v2.csv')

Tracheophyta


In [25]:
_year = 2001
#40.8998417, -7.4900188
#38.6584269,-9.0193437
#40.9603373,-8.5505025
#41.4465162,-8.034185

#41.362214, -8.113012

#39.8304404,-8.1582464

#39.8097855,-7.7609527
#39.8304179,-8.1868721
#40.0122252,-8.8899399
#39.825922,-7.4729952

#41.0965013,-8.3492967
#41.0965114,-8.3779225

#41.8117489,-6.7458274
#40.5496832,-7.2439643

#41.37295,-8.3307829

#39.8474782,-7.5565182

#39.8116193,-7.5456793

#40.1348529,-7.4758689

#41.1389565,-7.9702532

#41.3086548,-8.6318609

#38.6997227,-8.9636176

#41.3732232,-8.156543

#38.6679667,-9.2048318

#41.5850409,-8.107566

#38.9549497,-8.7373542

#40.4684347,-7.1253137
#40.5450153,-7.2617102

row = na_rows.iloc[index]


#39.8052068,-7.7524215,15
#40.8454766,-8.6909437
#41.4684787,-8.582788
#41.5603721,-7.9120912
#41.6764835,-8.1744881 - rossa
#41.5853434,-8.1051347
#41.729705,-8.1305239
#41.3103597,-8.6095624
#38.6997227,-8.9636176
#39.7982847,-7.8147625
#40.0274487,-7.5208524
#41.5846572,-8.6081022
#41.7376564,-8.2257958
#41.6219633,-8.3040071
#41.4399383,-8.6026327
#41.1417954,-7.9611701
#41.3674592,-8.1661637
#38.9549497,-8.7373542
#38.6997227,-8.9636176
#39.1079604,-8.7484528
#39.1729556,-8.7925102
#39.8304404,-8.1582679
#41.3678107,-8.1611126
LAT = 41.3678107
LON = -8.1611126
_DATA = row['date']
HORA = row['hourly.time'].split("T")[1]
DIA = row['date'].split("-")[2]
MES = row['date'].split("-")[1]

_fileName = f"{DIA}_{MES}_{HORA}_{LAT}_{LON}.csv.csv"

_fileName_radiation = f"{DIA}_{MES}_{HORA}_{LAT}_{LON}_radiation.csv"


url_rad = f"https://archive-api.open-meteo.com/v1/archive?latitude={LAT}&longitude={LON}&start_date={_DATA}&end_date={_DATA}&hourly=shortwave_radiation,direct_radiation,diffuse_radiation,direct_normal_irradiance,global_tilted_irradiance,terrestrial_radiation,shortwave_radiation_instant,direct_radiation_instant,diffuse_radiation_instant,direct_normal_irradiance_instant,global_tilted_irradiance_instant,terrestrial_radiation_instant"
url = f"https://archive-api.open-meteo.com/v1/archive?latitude={LAT}&longitude={LON}&start_date={_DATA}&end_date={_DATA}&hourly=temperature_2m,relative_humidity_2m,dew_point_2m,apparent_temperature,precipitation,rain,snowfall,snow_depth,weather_code,pressure_msl,surface_pressure,cloud_cover,cloud_cover_low,cloud_cover_mid,cloud_cover_high,et0_fao_evapotranspiration,vapour_pressure_deficit,wind_speed_10m,wind_speed_100m,wind_direction_10m,wind_direction_100m,wind_gusts_10m,soil_temperature_0_to_7cm,soil_temperature_7_to_28cm,soil_temperature_28_to_100cm,soil_temperature_100_to_255cm,soil_moisture_0_to_7cm,soil_moisture_7_to_28cm,soil_moisture_28_to_100cm,soil_moisture_100_to_255cm,is_day,sunshine_duration,shortwave_radiation_instant,direct_radiation_instant,diffuse_radiation_instant,direct_normal_irradiance_instant,global_tilted_irradiance_instant,terrestrial_radiation_instant&timezone=GMT"
response = requests.get(url)
#response_rad = requests.get(url_rad)

#print(response.status_code)

if((response.status_code == 200)):
    data = response.json()
    #data_rad = response_rad.json()

    df = pd.json_normalize(data)
    df.to_csv(f"MissingValues/{_fileName}", index=False)

    #AlgorithmTreesOnDataset/MissingValues
    #df_rad = pd.json_normalize(data_rad)
    #df_rad.to_csv(f"MissingValues/{_fileName_radiation}", index=False)
else:
    print("Error row:")

In [26]:
print(_fileName)

12_08_12:00_41.3678107_-8.1611126.csv.csv


In [15]:
_year = 2005
_file = f"{_RAWFIRESPATH}/{_year}/2005-01-01_37.7388357_-8.750062888101874.csv"
df = pd.read_csv(f"{_file}")
# print(df.head())
column_names = df.columns.tolist()
print(column_names)

# Get all column names
column_names = df.columns.tolist()

# Find columns that contain 'hourly_units'
columns_to_remove = [col for col in column_names if 'hourly_units' in col]
columns_to_remove.extend(['latitude', 'longitude', 'generationtime_ms', 'utc_offset_seconds', 'timezone', 'timezone_abbreviation'])

# Remove the columns
df = df.drop(columns=columns_to_remove)

# Print the updated column names
updated_column_names = df.columns.tolist()
print(updated_column_names)

_file_radiation = _file.replace(".csv", "_radiation.csv")
df = pd.read_csv(f"{_file_radiation}")
print(df.head())

df = pd.read_csv(f"{_file_radiation}")
# print(df.head())
column_names = df.columns.tolist()
print(column_names)

# Get all column names
column_names = df.columns.tolist()

# Find columns that contain 'hourly_units'
columns_to_remove = [col for col in column_names if 'hourly_units' in col]
columns_to_remove.extend(['latitude', 'longitude', 'generationtime_ms', 'utc_offset_seconds', 'timezone', 'timezone_abbreviation', 'elevation', 'hourly.time', 'hourly.global_tilted_irradiance_instant', 'hourly.diffuse_radiation_instant', 'hourly.terrestrial_radiation_instant', 'hourly.direct_radiation_instant', 'hourly.shortwave_radiation_instant', 'hourly.direct_normal_irradiance_instant'])

# Remove the columns
df = df.drop(columns=columns_to_remove)

# Print the updated column names
updated_column_names_radiation = df.columns.tolist()
print(updated_column_names_radiation)

# Convert the lists to sets
set1 = set(updated_column_names_radiation)
set2 = set(updated_column_names)

# Find the common elements
common_elements = set1 & set2

# Print the common elements
print(common_elements)


header = ['year', 'date', 'district', 'municipality', 'parish', 'local', 'latitude', 'longitude', 'cause']

columns = header + updated_column_names + updated_column_names_radiation

df = pd.DataFrame(columns=columns)

df.to_csv('MissingValues/missingValues2010.csv', index=False)

['latitude', 'longitude', 'generationtime_ms', 'utc_offset_seconds', 'timezone', 'timezone_abbreviation', 'elevation', 'hourly_units.time', 'hourly_units.temperature_2m', 'hourly_units.relative_humidity_2m', 'hourly_units.dew_point_2m', 'hourly_units.apparent_temperature', 'hourly_units.precipitation', 'hourly_units.rain', 'hourly_units.snowfall', 'hourly_units.snow_depth', 'hourly_units.weather_code', 'hourly_units.pressure_msl', 'hourly_units.surface_pressure', 'hourly_units.cloud_cover', 'hourly_units.cloud_cover_low', 'hourly_units.cloud_cover_mid', 'hourly_units.cloud_cover_high', 'hourly_units.et0_fao_evapotranspiration', 'hourly_units.vapour_pressure_deficit', 'hourly_units.wind_speed_10m', 'hourly_units.wind_speed_100m', 'hourly_units.wind_direction_10m', 'hourly_units.wind_direction_100m', 'hourly_units.wind_gusts_10m', 'hourly_units.soil_temperature_0_to_7cm', 'hourly_units.soil_temperature_7_to_28cm', 'hourly_units.soil_temperature_28_to_100cm', 'hourly_units.soil_temperatur

In [27]:
row = na_rows.iloc[index]

ANO = row['year']
DISTRICTO = row['district']
CONCELHO = row['municipality']
FREGUESIA = row['parish']
CAUSA = row['cause']
LOCAL = row['local']
print(HORA)
#HORA = int(HORA.split(":")[0])
#print(str(HORA))
HORA = str("12")
_fileName = f"{DIA}_{MES}_{HORA}_{LAT}_{LON}.csv.csv"
_fileName_radiation = f"{DIA}_{MES}_{HORA}_{LAT}_{LON}_radiation.csv"

dfMet = pd.read_csv(f"MissingValues/{_fileName}")
dfRad = pd.read_csv(f"MissingValues/{_fileName_radiation}")

#MUDAR AQUI TAMBEM
HORA = 12

list_from_string = ast.literal_eval(dfMet[updated_column_names[2]].iloc[0])[HORA]

print(list_from_string)

list_from_string = ast.literal_eval(dfMet[updated_column_names[1]].iloc[0])[HORA]

print(list_from_string)

new_data = {
    'year': ANO,
    'date': f"{_DATA}",
    'district': DISTRICTO,
    'municipality': CONCELHO,
    'parish': FREGUESIA,
    'local': LOCAL,
    'latitude': LAT,
    'longitude': LON,
    'cause': CAUSA,
    'elevation': dfMet['elevation'].iloc[0]
}

try:
    for i in range(1, len(updated_column_names)):
        new_data[updated_column_names[i]] = ast.literal_eval(dfMet[updated_column_names[i]].iloc[0])[HORA]
except Exception as e:
    print("->", e)

try:
    for i in range(0, len(updated_column_names_radiation)):
        new_data[updated_column_names_radiation[i]] = ast.literal_eval(dfRad[updated_column_names_radiation[i]].iloc[0])[HORA]
except Exception as e:
    print("-x", e)

new_df = pd.DataFrame(new_data, index=[0])

new_df.to_csv('MissingValues/missingValues2010.csv', mode='a', header=False, index=False)

12:00
26.3
2010-08-12T12:00


In [28]:
_year = 2010
newValues = pd.read_csv(f'MissingValues/missingValues{_year}.csv')
print(checkNearestPoint(newValues.iloc[0]))

newValues['scientificNames'] = newValues.apply(check_district, axis=1)
newValues.to_csv(f'MissingValues/{_year}_checkDistrict.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['distance'] = filtered_df.apply(lambda x: DistanceTwoPoints(lat1, lon1, x['decimalLatitude'], x['decimalLongitude']), axis=1)


('Tracheophyta', 235.5433930495615)
