In [1]:
from sklearn.linear_model import LinearRegression
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns 
import warnings
warnings.filterwarnings('ignore')

# Read the raw observation export; the CSV is expected to sit next to this notebook.
# NOTE: the frame keeps the name `df_crane` used by the rest of the notebook,
# although the file loaded here is the flamingo dataset.
file_path = "flamingo.csv"
df_crane = pd.read_csv(file_path)

# Quick look: first rows, then the (rows, columns) size
print(df_crane.head(), df_crane.shape, sep='\n')


                          GLOBAL UNIQUE IDENTIFIER OBSERVATION COUNT  \
0  URN:CornellLabOfOrnithology:EBIRD:OBS2783288224                80   
1  URN:CornellLabOfOrnithology:EBIRD:OBS2786783152               700   
2  URN:CornellLabOfOrnithology:EBIRD:OBS2784026626               550   
3  URN:CornellLabOfOrnithology:EBIRD:OBS2803371669                80   
4  URN:CornellLabOfOrnithology:EBIRD:OBS2826101970                40   

        COUNTRY   LATITUDE  LONGITUDE OBSERVATION DATE  \
0         India  19.141094  72.960988       11/01/2025   
1         Spain  38.166360  -0.730569       15/01/2025   
2  Saudi Arabia  26.490437  50.056602       14/01/2025   
3         Spain  40.658953   0.783074       21/01/2025   
4          Oman  17.881930  55.601978       29/01/2025   

  TIME OBSERVATIONS STARTED  DURATION MINUTES  
0                  07:34:00             299.0  
1                  10:33:00              59.0  
2                  08:15:00              80.0  
3                  08:52:0

In [2]:
# Parse the day/month/year observation dates. Values that do not match the
# format become NaT (errors='coerce'); NaT never satisfies the range test
# below, so such rows are dropped by the filter.
df_crane['OBSERVATION DATE'] = pd.to_datetime(
    df_crane['OBSERVATION DATE'], format='%d/%m/%Y', errors='coerce'
)

# Study window (both bounds are inclusive)
start_date = '2001-01-01'
end_date = '2020-06-30'

# Keep only observations inside the window. The explicit .copy() makes the
# result an independent frame, so the column assignments made on df_crane in
# later cells do not write into a selection of the original frame.
in_window = df_crane['OBSERVATION DATE'].between(start_date, end_date)
df_crane = df_crane[in_window].copy()

# Preview the filtered data
print(df_crane.head())
print(df_crane.shape)

                              GLOBAL UNIQUE IDENTIFIER OBSERVATION COUNT  \
10090   URN:CornellLabOfOrnithology:EBIRD:OBS893538867                 1   
10091   URN:CornellLabOfOrnithology:EBIRD:OBS843912433                 1   
10092  URN:CornellLabOfOrnithology:EBIRD:OBS2143453118                 1   
10093  URN:CornellLabOfOrnithology:EBIRD:OBS2112747536                 3   
10094  URN:CornellLabOfOrnithology:EBIRD:OBS2218298031                 X   

                    COUNTRY   LATITUDE  LONGITUDE OBSERVATION DATE  \
10090  United Arab Emirates  25.169707  55.432797       2001-12-15   
10091                  Oman  25.672686  56.271832       2001-04-19   
10092          South Africa -26.350960  28.511238       2001-02-11   
10093          South Africa -26.350960  28.511238       2001-01-20   
10094          South Africa -26.350960  28.511238       2001-10-26   

      TIME OBSERVATIONS STARTED  DURATION MINUTES  
10090                       NaN               NaN  
10091             

In [3]:
# Read the two precipitation extracts (kept available as df_rain1 / df_rain2)
df_rain1, df_rain2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        'datasets/cleaned_precipitation.csv',
        'datasets/cleaned_precipitation2.csv',
    )
)

# Stack them row-wise; ignore_index renumbers the combined rows from 0
df_rain = pd.concat((df_rain1, df_rain2), ignore_index=True)

print(df_rain.head(), df_rain.shape, sep='\n')

  code             name      continent        date  precipitation  year
0  ABW    Aruba (Neth.)  North America  1901-01-01          33.30   NaN
1  AFG      Afghanistan           Asia  1901-01-01          48.01   NaN
2  AGO           Angola         Africa  1901-01-01         152.68   NaN
3  AIA  Anguilla (U.K.)  North America  1901-01-01          92.60   NaN
4  ALA          Finland         Europe  1901-01-01          15.14   NaN
(352764, 6)


In [4]:
# Load the temperature table and show its first rows and its size
temperature_csv = 'datasets/cleaned_temperature.csv'
df_temp = pd.read_csv(temperature_csv)
for preview in (df_temp.head(), df_temp.shape):
    print(preview)

  Country_Code     Country_Name  Year_Month  Temperature
0          ABW    Aruba (Neth.)  1901-01-01        27.20
1          AFG      Afghanistan  1901-01-01        -0.82
2          AGO           Angola  1901-01-01        22.50
3          AIA  Anguilla (U.K.)  1901-01-01        26.10
4          ALA    Åland Islands  1901-01-01        -4.15
(360144, 4)


In [5]:
# --- 1. Standardize country names in the observation dataset ---
# Map spellings used by the observation data onto the names used in the
# temperature and precipitation tables.
df_crane['COUNTRY'] = df_crane['COUNTRY'].replace({
    'TÃ¼rkiye': 'Turkey',  # mis-decoded spelling of 'Türkiye' (UTF-8 bytes read as Latin-1)
    'Türkiye': 'Turkey',
    'Western Sahara': 'Morocco',
    'DR Congo': 'Democratic Republic of Congo',
    'Vatican City (Holy See)': 'Vatican City',
})

# --- 2. Standardize country names in the temperature and precipitation datasets ---
# Both weather tables use the same long-form names, so a single mapping is
# defined once and applied to each of them.
WEATHER_NAME_FIXES = {
    'United States of America': 'United States',
    'Arab Republic of Egypt': 'Egypt',
    'Republic of Korea': 'South Korea',
    'D. P. R. of Korea': 'North Korea',
    'Islamic Republic of Iran': 'Iran',
    'Slovak Republic': 'Slovakia',
    'Svalbard and Jan Mayen (Nor.)': 'Svalbard',
    'Syrian Arab Republic': 'Syria',
    'Isle of Man (U.K.)': 'Isle of Man',
    'Gibraltar (U.K.)': 'Gibraltar',
    'Republic of Yemen': 'Yemen',
    'Kyrgyz Republic': 'Kyrgyzstan',
    'Faroe Islands (Den.)': 'Faroe Islands',
    'Jersey (U.K.)': 'Jersey',
    'Guernsey (U.K.)': 'Guernsey',
    'Russian Federation': 'Russia',
    'Türkiye': 'Turkey',
    'The Gambia': 'Gambia',
}

df_temp['Country_Name'] = df_temp['Country_Name'].replace(WEATHER_NAME_FIXES)

# --- 3. Precipitation-specific fixes ---
# Name rows by their country code where the 'name' column is not specific
# enough (the original note says the TWN rows are labelled 'China').
df_rain.loc[df_rain['code'] == 'TWN', 'name'] = 'Taiwan'
df_rain.loc[df_rain['code'] == 'MTQ', 'name'] = 'Martinique'

df_rain['name'] = df_rain['name'].replace(WEATHER_NAME_FIXES)

# Some codes are labelled with another country's name (the precipitation
# preview shows code ALA under the name 'Finland'). Keep only the FRA / FIN
# rows for France / Finland so each name maps to one series when joining on
# the name. .copy() gives an independent frame for the column assignments
# made on df_rain later.
mislabelled = (
    ((df_rain['name'] == 'France') & (df_rain['code'] != 'FRA'))
    | ((df_rain['name'] == 'Finland') & (df_rain['code'] != 'FIN'))
)
df_rain = df_rain[~mislabelled].copy()

In [6]:
# --- 3. Derive a month key for the observation data ---
# Make sure 'OBSERVATION DATE' is a datetime column (day/month/year text;
# values that cannot be parsed become NaT).
obs_dates = pd.to_datetime(
    df_crane['OBSERVATION DATE'], format='%d/%m/%Y', errors='coerce'
)
df_crane['OBSERVATION DATE'] = obs_dates

# 'Year_Month' holds the timestamp of the first day of the observation's month
df_crane['Year_Month'] = obs_dates.dt.to_period('M').dt.to_timestamp()

# NOTE(review): an earlier heading here announced a filter excluding 2024 data,
# but no such filter is applied in this cell.
print(df_crane.shape)


(126349, 9)


In [7]:

# --- 5. Ensure the weather data has a month key ---
# Temperature: 'Year_Month' is converted to datetime so it can be matched
# against the observation month key.
df_temp['Year_Month'] = pd.to_datetime(df_temp['Year_Month'])

# Precipitation: parse 'date', then derive the first-of-month timestamp
df_rain['date'] = pd.to_datetime(df_rain['date'])
df_rain['Year_Month'] = df_rain['date'].dt.to_period('M').dt.to_timestamp()

# --- 6. Attach the monthly temperature to each observation ---
# Left join on (country, month) so every observation row is kept.
merged_temp = pd.merge(
    df_crane,
    df_temp[['Country_Name', 'Year_Month', 'Temperature']],
    how='left',
    left_on=['COUNTRY', 'Year_Month'],
    right_on=['Country_Name', 'Year_Month'],
)
print(merged_temp.shape)

# A left join silently multiplies rows when the right-hand keys are not
# unique; fail loudly instead of carrying duplicated observations forward.
assert len(merged_temp) == len(df_crane), (
    "temperature merge changed the number of observation rows"
)

merged_temp = merged_temp.rename(columns={'Temperature': 'AVERAGE MONTH TEMPERATURE'})
print(merged_temp.shape)

# --- 7. Attach the monthly precipitation ---
merged = pd.merge(
    merged_temp,
    df_rain[['name', 'Year_Month', 'precipitation']],
    how='left',
    left_on=['COUNTRY', 'Year_Month'],
    right_on=['name', 'Year_Month'],
)
assert len(merged) == len(merged_temp), (
    "precipitation merge changed the number of observation rows"
)
print(merged.shape)

# --- 8. Clean up ---
# Rename the precipitation column and drop the join helpers: the right-hand
# country columns and the month key.
merged = (
    merged
    .rename(columns={'precipitation': 'AVERAGE MONTH RAINFALL'})
    .drop(columns=['Country_Name', 'name', 'Year_Month'])
)

# --- 9. Check the result ---
print(merged.head())
print(merged.shape)

(126349, 11)
(126349, 11)
(126349, 13)
                          GLOBAL UNIQUE IDENTIFIER OBSERVATION COUNT  \
0   URN:CornellLabOfOrnithology:EBIRD:OBS893538867                 1   
1   URN:CornellLabOfOrnithology:EBIRD:OBS843912433                 1   
2  URN:CornellLabOfOrnithology:EBIRD:OBS2143453118                 1   
3  URN:CornellLabOfOrnithology:EBIRD:OBS2112747536                 3   
4  URN:CornellLabOfOrnithology:EBIRD:OBS2218298031                 X   

                COUNTRY   LATITUDE  LONGITUDE OBSERVATION DATE  \
0  United Arab Emirates  25.169707  55.432797       2001-12-15   
1                  Oman  25.672686  56.271832       2001-04-19   
2          South Africa -26.350960  28.511238       2001-02-11   
3          South Africa -26.350960  28.511238       2001-01-20   
4          South Africa -26.350960  28.511238       2001-10-26   

  TIME OBSERVATIONS STARTED  DURATION MINUTES  AVERAGE MONTH TEMPERATURE  \
0                       NaN               NaN          

In [8]:
# --- 1. Remove observations from the excluded territories ---
# NOTE(review): presumably these have no counterpart in the weather tables — confirm.
# A Kosovo exclusion was also drafted here but left disabled, so Kosovo rows are kept.
excluded_territories = ['Palestinian Territory', 'British Indian Ocean Territory']
merged = merged[~merged['COUNTRY'].isin(excluded_territories)]

In [9]:
# Count the observations that have no temperature / rainfall value after the weather merges
nan_temperature, nan_rainfall = (
    merged[weather_column].isna().sum()
    for weather_column in ('AVERAGE MONTH TEMPERATURE', 'AVERAGE MONTH RAINFALL')
)

# Report both counts
print(f"Number of NaN values in 'AVERAGE MONTH TEMPERATURE': {nan_temperature}")
print(f"Number of NaN values in 'AVERAGE MONTH RAINFALL': {nan_rainfall}")

Number of NaN values in 'AVERAGE MONTH TEMPERATURE': 0
Number of NaN values in 'AVERAGE MONTH RAINFALL': 0


In [10]:
# Observations without a temperature value
temperature_missing = merged['AVERAGE MONTH TEMPERATURE'].isna()
nan_temperature_rows = merged.loc[temperature_missing]
print("Rows with NaN in 'AVERAGE MONTH TEMPERATURE':")
print(nan_temperature_rows)

# Observations without a rainfall value
rainfall_missing = merged['AVERAGE MONTH RAINFALL'].isna()
nan_rainfall_rows = merged.loc[rainfall_missing]
print("\nRows with NaN in 'AVERAGE MONTH RAINFALL':")
print(nan_rainfall_rows)

Rows with NaN in 'AVERAGE MONTH TEMPERATURE':
Empty DataFrame
Columns: [GLOBAL UNIQUE IDENTIFIER, OBSERVATION COUNT, COUNTRY, LATITUDE, LONGITUDE, OBSERVATION DATE, TIME OBSERVATIONS STARTED, DURATION MINUTES, AVERAGE MONTH TEMPERATURE, AVERAGE MONTH RAINFALL]
Index: []

Rows with NaN in 'AVERAGE MONTH RAINFALL':
Empty DataFrame
Columns: [GLOBAL UNIQUE IDENTIFIER, OBSERVATION COUNT, COUNTRY, LATITUDE, LONGITUDE, OBSERVATION DATE, TIME OBSERVATIONS STARTED, DURATION MINUTES, AVERAGE MONTH TEMPERATURE, AVERAGE MONTH RAINFALL]
Index: []


In [11]:
#merged.to_csv('crane_temp_rain_2023.csv', index=False)

In [12]:
# Load the tree-cover table and preview it (first rows, then size)
df_tree = pd.read_csv('datasets/cleaned_tree_cover.csv')
print(df_tree.head(), df_tree.shape, sep='\n')

   Unnamed: 0      country   area_ha  tree_cover_2000_ha  tree_cover_2010_ha  \
0           0  Afghanistan  64385715              205791               71797   
1           1  Afghanistan  64385715              205791               71797   
2           2  Afghanistan  64385715              205791               71797   
3           3  Afghanistan  64385715              205791               71797   
4           4  Afghanistan  64385715              205791               71797   

   gain_2000-2020_ha  year  tree_loss_ha  
0              10741  2001            88  
1              10741  2002           179  
2              10741  2003           244  
3              10741  2004           201  
4              10741  2005           236  
(5428, 8)


In [13]:
# Calendar year of each observation: the join key for the tree-cover table.
# assign() builds a new frame instead of writing a column into `merged`,
# which at this point is a filtered selection of the weather-merged frame.
merged = merged.assign(year=pd.to_datetime(merged['OBSERVATION DATE']).dt.year)

# Align the tree-cover country names with the names used in `merged`
df_tree['country'] = df_tree['country'].replace({
    'Svalbard and Jan Mayen': 'Svalbard',
    'Swaziland': 'Eswatini',
    'Republic of Congo': 'Congo',
    'Macedonia': 'North Macedonia',
    'Democratic Republic of the Congo': 'Democratic Republic of Congo',
})

# Tree loss of the year as a percentage of the country's area
df_tree['percentage_loss'] = (df_tree['tree_loss_ha'] / df_tree['area_ha']) * 100

# Left join on (year, country) so every observation row is kept. The
# right-hand 'country' key stays in the result because its name differs from
# 'COUNTRY'.
merged2 = pd.merge(
    merged,
    df_tree[['year', 'country', 'tree_loss_ha', 'percentage_loss']],
    how='left',
    left_on=['year', 'COUNTRY'],
    right_on=['year', 'country'],
)

# Duplicate (year, country) rows in df_tree would silently multiply observations
assert len(merged2) == len(merged), (
    "tree-cover merge changed the number of observation rows"
)

print(merged2.head())
print(merged2.shape)




                          GLOBAL UNIQUE IDENTIFIER OBSERVATION COUNT  \
0   URN:CornellLabOfOrnithology:EBIRD:OBS893538867                 1   
1   URN:CornellLabOfOrnithology:EBIRD:OBS843912433                 1   
2  URN:CornellLabOfOrnithology:EBIRD:OBS2143453118                 1   
3  URN:CornellLabOfOrnithology:EBIRD:OBS2112747536                 3   
4  URN:CornellLabOfOrnithology:EBIRD:OBS2218298031                 X   

                COUNTRY   LATITUDE  LONGITUDE OBSERVATION DATE  \
0  United Arab Emirates  25.169707  55.432797       2001-12-15   
1                  Oman  25.672686  56.271832       2001-04-19   
2          South Africa -26.350960  28.511238       2001-02-11   
3          South Africa -26.350960  28.511238       2001-01-20   
4          South Africa -26.350960  28.511238       2001-10-26   

  TIME OBSERVATIONS STARTED  DURATION MINUTES  AVERAGE MONTH TEMPERATURE  \
0                       NaN               NaN                      23.39   
1                 

In [14]:
# Observations that found no tree-cover row in the merge
tree_loss_missing = merged2['tree_loss_ha'].isna()

nan_tree = tree_loss_missing.sum()
print(nan_tree)

nan_tree_rows = merged2.loc[tree_loss_missing]
print(nan_tree_rows)

0
Empty DataFrame
Columns: [GLOBAL UNIQUE IDENTIFIER, OBSERVATION COUNT, COUNTRY, LATITUDE, LONGITUDE, OBSERVATION DATE, TIME OBSERVATIONS STARTED, DURATION MINUTES, AVERAGE MONTH TEMPERATURE, AVERAGE MONTH RAINFALL, year, country, tree_loss_ha, percentage_loss]
Index: []


In [15]:
# Load the population table and preview it
population_csv = 'datasets/Processed_PopulationData.csv'
df_pop = pd.read_csv(population_csv)
print(df_pop.head())
print(df_pop.shape)

   Year      Country  Population
0  1975  Afghanistan  12185168.7
1  1976  Afghanistan  12251504.3
2  1977  Afghanistan  12317839.9
3  1978  Afghanistan  12384175.5
4  1979  Afghanistan  12450511.1
(12250, 3)


In [16]:
# Reconcile the country spellings that differ between the two tables
observation_renames = {'Czech Republic': 'Czechia'}
population_renames = {'Svalbard and Jan Mayen': 'Svalbard'}
merged2['COUNTRY'] = merged2['COUNTRY'].replace(observation_renames)
df_pop['Country'] = df_pop['Country'].replace(population_renames)

# Left-join the population by (year, country), then discard the right-hand
# key columns, which duplicate 'year' / 'COUNTRY'.
population_columns = df_pop[['Year', 'Country', 'Population']]
merged3 = merged2.merge(
    population_columns,
    how='left',
    left_on=['year', 'COUNTRY'],
    right_on=['Year', 'Country'],
).drop(columns=['Country', 'Year'])

print(merged3.head())

                          GLOBAL UNIQUE IDENTIFIER OBSERVATION COUNT  \
0   URN:CornellLabOfOrnithology:EBIRD:OBS893538867                 1   
1   URN:CornellLabOfOrnithology:EBIRD:OBS843912433                 1   
2  URN:CornellLabOfOrnithology:EBIRD:OBS2143453118                 1   
3  URN:CornellLabOfOrnithology:EBIRD:OBS2112747536                 3   
4  URN:CornellLabOfOrnithology:EBIRD:OBS2218298031                 X   

                COUNTRY   LATITUDE  LONGITUDE OBSERVATION DATE  \
0  United Arab Emirates  25.169707  55.432797       2001-12-15   
1                  Oman  25.672686  56.271832       2001-04-19   
2          South Africa -26.350960  28.511238       2001-02-11   
3          South Africa -26.350960  28.511238       2001-01-20   
4          South Africa -26.350960  28.511238       2001-10-26   

  TIME OBSERVATIONS STARTED  DURATION MINUTES  AVERAGE MONTH TEMPERATURE  \
0                       NaN               NaN                      23.39   
1                 

In [17]:
# Observations that found no population row in the merge
population_missing = merged3['Population'].isna()

nan_pop = population_missing.sum()
print(nan_pop)

nan_pop_rows = merged3.loc[population_missing]
print(nan_pop_rows)

0
Empty DataFrame
Columns: [GLOBAL UNIQUE IDENTIFIER, OBSERVATION COUNT, COUNTRY, LATITUDE, LONGITUDE, OBSERVATION DATE, TIME OBSERVATIONS STARTED, DURATION MINUTES, AVERAGE MONTH TEMPERATURE, AVERAGE MONTH RAINFALL, year, country, tree_loss_ha, percentage_loss, Population]
Index: []


In [18]:
# Save the intermediate table. It still contains the 'country' and 'year'
# helper columns; a drop of 'country' was drafted here but left disabled.
merged3.to_csv('flamingo_temp_rain_pop_tree_2001-2020.csv', index=False)

# Fully duplicated rows: show them, then display how many there are
duplicate_mask = merged3.duplicated()
duplicates = merged3.loc[duplicate_mask]
print(duplicates)
duplicate_mask.sum()


Empty DataFrame
Columns: [GLOBAL UNIQUE IDENTIFIER, OBSERVATION COUNT, COUNTRY, LATITUDE, LONGITUDE, OBSERVATION DATE, TIME OBSERVATIONS STARTED, DURATION MINUTES, AVERAGE MONTH TEMPERATURE, AVERAGE MONTH RAINFALL, year, country, tree_loss_ha, percentage_loss, Population]
Index: []


np.int64(0)

In [19]:
# Load the per-observation table that carries NDVI and WT_COUNT_10KM_RADIUS
df_veg = pd.read_csv('datasets/flamingo_ndvi_turbine.csv')
print(df_veg.head(), df_veg.shape, sep='\n')

# Missing values per column (displayed as the cell result)
df_veg.isna().sum()

   Unnamed: 0                         GLOBAL UNIQUE IDENTIFIER  \
0           0   URN:CornellLabOfOrnithology:EBIRD:OBS893538867   
1           1   URN:CornellLabOfOrnithology:EBIRD:OBS843912433   
2           2  URN:CornellLabOfOrnithology:EBIRD:OBS2143453118   
3           3  URN:CornellLabOfOrnithology:EBIRD:OBS2112747536   
4           4  URN:CornellLabOfOrnithology:EBIRD:OBS2218298031   

  OBSERVATION COUNT               COUNTRY   LATITUDE  LONGITUDE  \
0                 1  United Arab Emirates  25.169707  55.432797   
1                 1                  Oman  25.672686  56.271832   
2                 1          South Africa -26.350960  28.511238   
3                 3          South Africa -26.350960  28.511238   
4                 X          South Africa -26.350960  28.511238   

  OBSERVATION DATE TIME OBSERVATIONS STARTED  DURATION MINUTES  \
0       2001-12-15                       NaN               NaN   
1       2001-04-19                       NaN               NaN   
2 

Unnamed: 0                       0
GLOBAL UNIQUE IDENTIFIER         0
OBSERVATION COUNT                0
COUNTRY                          0
LATITUDE                         0
LONGITUDE                        0
OBSERVATION DATE                 0
TIME OBSERVATIONS STARTED    22605
DURATION MINUTES             28601
WT_COUNT_10KM_RADIUS             0
NDVI                          8569
dtype: int64

In [22]:

# Attach NDVI and the turbine count to each observation, matching on the
# observation identifier (left join: every row of merged3 is kept).
merged4 = pd.merge(
    merged3,
    df_veg[['GLOBAL UNIQUE IDENTIFIER', 'NDVI', 'WT_COUNT_10KM_RADIUS']],
    how='left',
    on='GLOBAL UNIQUE IDENTIFIER',
)

# Repeated identifiers in df_veg would silently multiply observations
assert len(merged4) == len(merged3), (
    "NDVI/turbine merge changed the number of observation rows"
)

print(merged4.head())
print(merged4.shape)

# 'country' (right-hand key of the tree-cover join) and 'year' were only
# needed as join helpers; remove them from the final table.
merged4 = merged4.drop(columns=['country', 'year'])
print(merged4.shape)

                          GLOBAL UNIQUE IDENTIFIER OBSERVATION COUNT  \
0   URN:CornellLabOfOrnithology:EBIRD:OBS893538867                 1   
1   URN:CornellLabOfOrnithology:EBIRD:OBS843912433                 1   
2  URN:CornellLabOfOrnithology:EBIRD:OBS2143453118                 1   
3  URN:CornellLabOfOrnithology:EBIRD:OBS2112747536                 3   
4  URN:CornellLabOfOrnithology:EBIRD:OBS2218298031                 X   

                COUNTRY   LATITUDE  LONGITUDE OBSERVATION DATE  \
0  United Arab Emirates  25.169707  55.432797       2001-12-15   
1                  Oman  25.672686  56.271832       2001-04-19   
2          South Africa -26.350960  28.511238       2001-02-11   
3          South Africa -26.350960  28.511238       2001-01-20   
4          South Africa -26.350960  28.511238       2001-10-26   

  TIME OBSERVATIONS STARTED  DURATION MINUTES  AVERAGE MONTH TEMPERATURE  \
0                       NaN               NaN                      23.39   
1                 


df_veg['date'] = pd.to_datetime(df_veg['date'], errors='coerce')
print(df_veg.head())

merged4 = pd.merge(
    merged3,
    df_veg[['date', 'latitude', 'longitude', 'NDVI']],  # Select only relevant columns
    how='left', 
    left_on=['OBSERVATION DATE', 'LATITUDE', 'LONGITUDE'], 
    right_on=['date', 'latitude', 'longitude']
)
merged4 = merged4[merged4['OBSERVATION DATE'] <= '2020-06-30']

print(merged4.head())
print(merged4.shape)


In [23]:
# Observations without an NDVI value
ndvi_missing = merged4['NDVI'].isna()

nan_veg = ndvi_missing.sum()
print(nan_veg)

nan_veg_rows = merged4.loc[ndvi_missing]
print(nan_veg_rows)

8568
                              GLOBAL UNIQUE IDENTIFIER OBSERVATION COUNT  \
20      URN:CornellLabOfOrnithology:EBIRD:OBS102007777               200   
57      URN:CornellLabOfOrnithology:EBIRD:OBS771932551                 X   
318     URN:CornellLabOfOrnithology:EBIRD:OBS495396789                14   
325     URN:CornellLabOfOrnithology:EBIRD:OBS495985624                15   
345     URN:CornellLabOfOrnithology:EBIRD:OBS535814466                 X   
...                                                ...               ...   
126205  URN:CornellLabOfOrnithology:EBIRD:OBS868411651              2000   
126223  URN:CornellLabOfOrnithology:EBIRD:OBS944333324                35   
126227  URN:CornellLabOfOrnithology:EBIRD:OBS877094646                50   
126247  URN:CornellLabOfOrnithology:EBIRD:OBS850453960                12   
126261  URN:CornellLabOfOrnithology:EBIRD:OBS859996649                 X   

        COUNTRY   LATITUDE  LONGITUDE OBSERVATION DATE  \
20       Turkey  38.4611

In [24]:
# Missing values per column of the final table (column-wise sum of the NaN mask)
merged4.isna().sum(axis=0)

GLOBAL UNIQUE IDENTIFIER         0
OBSERVATION COUNT                0
COUNTRY                          0
LATITUDE                         0
LONGITUDE                        0
OBSERVATION DATE                 0
TIME OBSERVATIONS STARTED    22604
DURATION MINUTES             28600
AVERAGE MONTH TEMPERATURE        0
AVERAGE MONTH RAINFALL           0
tree_loss_ha                     0
percentage_loss                  0
Population                       0
NDVI                          8568
WT_COUNT_10KM_RADIUS             0
dtype: int64

In [25]:

# Fully duplicated rows in the final table: show them, then display the count
duplicate_mask = merged4.duplicated()
duplicates = merged4.loc[duplicate_mask]
print(duplicates)
duplicate_mask.sum()

Empty DataFrame
Columns: [GLOBAL UNIQUE IDENTIFIER, OBSERVATION COUNT, COUNTRY, LATITUDE, LONGITUDE, OBSERVATION DATE, TIME OBSERVATIONS STARTED, DURATION MINUTES, AVERAGE MONTH TEMPERATURE, AVERAGE MONTH RAINFALL, tree_loss_ha, percentage_loss, Population, NDVI, WT_COUNT_10KM_RADIUS]
Index: []


np.int64(0)

In [26]:
# Write the fully merged table to disk without the index column
final_csv = 'flamingo_temp_rain_tree_veg_pop_turb_2001-2020.csv'
merged4.to_csv(final_csv, index=False)

In [None]:
# Number of distinct observation identifiers in the final table
identifier_column = ['GLOBAL UNIQUE IDENTIFIER']
merged4[identifier_column].nunique(axis=0)


In [None]:
# Number of distinct (year, country) pairs among the observations.
# `merged` only has a 'year' column once the tree-cover cell has run.
year_country_pairs = merged[['year', 'COUNTRY']].drop_duplicates()
num_unique_combinations = year_country_pairs.shape[0]

print(num_unique_combinations)

Open question: NDVI has missing (NaN) values within the 2001-2020 window.
What about including more dates, accepting that some attributes will then be missing?


Data coverage by source:
temperature and rainfall: up to 2023
tree loss: 2001-2023
NDVI: 1999-2020 (with missing data)
windmill (wind turbine) counts: all dates
