In [1]:
import pandas as pd
import requests
import time
import geopandas as gpd
import numpy as np

In [2]:
df = pd.read_csv('WindTurbineData.csv')
print(df.head())

   Capacity (MW)        Status  Start Year  Latitude  Longitude
0           10.0     Operating      2014.0   28.4624    -0.0576
1         1100.0  Construction      2026.0   26.2540    29.2675
2        10000.0     Accounced        -1.0   26.5583    31.6773
3          160.0  Construction        -1.0   29.6607    32.3314
4          502.0  Construction      2025.0   28.1338    33.2602


# Converting Latitude/Longitude to Country

In [54]:
df = pd.read_csv("WindTurbineData.csv")

df['Latitude'] = df['Latitude'].replace([np.inf, -np.inf], np.nan)
df['Longitude'] = df['Longitude'].replace([np.inf, -np.inf], np.nan)
df.dropna(subset=['Latitude', 'Longitude'], inplace=True)

gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.Longitude, df.Latitude), crs="EPSG:4326")

world = gpd.read_file("50m_cultural/ne_50m_admin_0_countries.shp") 

gdf = gpd.sjoin(gdf, world, how="left", predicate="within")

df['Country'] = gdf['ADMIN'] 

print(df.head(20))
df.to_csv("geocoded_data.csv", index=False)

    Capacity (MW)        Status  Start Year  Latitude  Longitude  Country
0            10.0     Operating      2014.0   28.4624    -0.0576  Algeria
1          1100.0  Construction      2026.0   26.2540    29.2675    Egypt
2         10000.0     Accounced        -1.0   26.5583    31.6773    Egypt
3           160.0  Construction        -1.0   29.6607    32.3314    Egypt
4           502.0  Construction      2025.0   28.1338    33.2602    Egypt
5           250.0  Construction      2023.0   28.3553    33.0622    Egypt
6           240.0     Operating      2018.0   25.8577    34.4182    Egypt
7           220.0     Operating      2018.0   25.8577    34.4182    Egypt
8           120.0     Operating      2018.0   25.8577    34.4182    Egypt
9           263.0     Operating      2019.0   28.4005    32.9572    Egypt
10          500.0  Construction      2025.0   28.1338    33.2602    Egypt
11          252.0     Operating      2021.0   28.1338    33.2602    Egypt
12           30.0     Operating      2

In [63]:
df2 = pd.read_csv("geocoded_data.csv")

total_nan_count = df2['Country'].isna().sum()

print(f"Total NaN values in 'Country' column: {total_nan_count}")

Total NaN values in 'Country' column: 1159


Next, we are going to try using 'Open Cage', an API, to fill NaN values that the previous code missed.

In [61]:
import pandas as pd
import requests
import time

# OpenCage API Key (replace with your actual key)
API_KEY = "5051ab3a7e0a4703a29281814f42cb7f"

# Load the dataset
file_path = "geocoded_data.csv"  # Update with your file path
df = pd.read_csv(file_path)

# Function to get country from latitude and longitude
def get_country(lat, lon):
    url = f"https://api.opencagedata.com/geocode/v1/json?q={lat}+{lon}&key={API_KEY}"
    try:
        response = requests.get(url)
        data = response.json()
        if data["results"]:
            return data["results"][0]["components"].get("country", None)
    except Exception as e:
        print(f"Error: {e}")
    return None

# Find missing country values
missing_indices = df[df['Country'].isna()].index

# Process each missing value with a delay
for i, idx in enumerate(missing_indices):
    lat, lon = df.loc[idx, ["Latitude", "Longitude"]]
    country = get_country(lat, lon)
    if country:
        df.at[idx, "Country"] = country
    time.sleep(1)  # Respect API rate limit (1 request per second)

# Save the updated file
output_path = "geocoded_data_filled.csv"
df.to_csv(output_path, index=False)
print(f"Updated file saved as {output_path}")


Updated file saved as geocoded_data_filled.csv


In [64]:
df3 = pd.read_csv("geocoded_data_filled.csv")
total_nan_count = df3['Country'].isna().sum()
print(f"Total NaN values in 'Country' column: {total_nan_count}")

Total NaN values in 'Country' column: 211


In [65]:
missing_data = df3[df3["Country"].isna()]
print(missing_data[["Latitude", "Longitude"]])

       Latitude  Longitude
324    -33.9250   -52.9320
418     -4.4760   -36.4912
584     -2.9066   -38.9909
942     -2.9066   -38.9909
943     -2.9066   -38.9909
...         ...        ...
19605  -29.5386   114.5990
19606  -29.5386   114.5990
19661  -30.6558   114.8078
19671  -32.5601   115.4301
19673  -38.6301   142.5101

[211 rows x 2 columns]


There are still NaN values. I shall use another API to try and fill these.

In [66]:
import pandas as pd
import requests
import time

file_path = "geocoded_data_filled.csv"  
df = pd.read_csv(file_path)

# Getting country from latitude and longitude using BigDataCloud API
def get_country_bigdatacloud(lat, lon):
    url = f"https://api.bigdatacloud.net/data/reverse-geocode-client?latitude={lat}&longitude={lon}&localityLanguage=en"
    try:
        response = requests.get(url)
        data = response.json()
        return data.get("countryName", "Unknown")
    except Exception as e:
        print(f"Error fetching {lat}, {lon}: {e}")
    return "Unknown"

missing_indices = df[df['Country'].isna()].index

for i, idx in enumerate(missing_indices):
    lat, lon = df.loc[idx, ["Latitude", "Longitude"]]
    country = get_country_bigdatacloud(lat, lon)
    if country:
        df.at[idx, "Country"] = country
    time.sleep(1) 

output_path = "geocoded_data_final.csv"
df.to_csv(output_path, index=False)
print(f"Updated file saved as {output_path}")

Error fetching 55.21, 2.33: Expecting value: line 1 column 1 (char 0)
Updated file saved as geocoded_data_final.csv


In [67]:
df4 = pd.read_csv("geocoded_data_final.csv")

total_nan_count = df4['Country'].isna().sum()

print(f"Total NaN values in 'Country' column: {total_nan_count}")

Total NaN values in 'Country' column: 200


There are 200 values that are not matching to a Country name. This is likely due the wind turbines being located in the ocean or away from a countries coast line. Some API's may struggle to find these if they don't fall in country boundaries, therefore leading to NaN values.

For the final 200, we shall manually add the country name. To do this, I shall search the location of the wind turbine using its latitude and longitude and assign a country to it (or assign its nearest country if it is located in the ocean).

In [69]:
pd.set_option('display.max_rows', None)
missing_data = df4[df4["Country"].isna()]
print(missing_data[["Latitude", "Longitude"]])

       Latitude  Longitude
324    -33.9250   -52.9320
418     -4.4760   -36.4912
584     -2.9066   -38.9909
942     -2.9066   -38.9909
943     -2.9066   -38.9909
946    -31.7346   -50.6296
947    -31.7346   -50.6296
948    -31.7346   -50.6296
949    -31.7346   -50.6296
952    -21.3524   -40.2324
1090   -31.7346   -50.6296
1092    -2.9066   -38.9909
1131   -32.6187   -52.0601
1270   -22.0489   -40.7135
1914    39.1941   -73.3129
1915    39.2785   -74.1364
1916    40.1702   -73.3093
1931    41.0183   -70.7695
2081    46.6972  -124.4351
2135    36.8868   -75.4916
2288    28.8557   -94.5151
2289    28.3541   -94.7237
2290    38.6426   -74.7825
2308    35.4426  -121.8682
2413    40.2821   -73.4259
2414    40.8569  -124.7813
2415    40.9598  -124.5479
2483    36.4060   -75.3222
2496    29.3931   -93.5647
2511    39.0186   -74.1338
2557    43.4293   -69.0045
2634    36.0146  -123.5201
2635    35.5492  -121.5289
2699    39.1214   -74.2462
2710    47.7057  -125.3357
2836    40.7809  -124.7241
2

In [83]:
df5 = pd.read_csv('geocoded_data_final.csv')
df5.at[324, 'Country'] = 'Australia'
df5.to_csv('WindTurbineData_WithCountry.csv', index=False)

In [88]:
df5 = pd.read_csv('geocoded_data_final.csv')

lat_lon_to_country = {
    (-33.9250, -52.9320): 'Brazil',
    (-4.4760, -36.4912): 'Brazil',
    (-2.9066, -38.9909): 'Brazil',
    (-2.9066, -38.9909): 'Brazil',
    (-2.9066, -38.9909): 'Brazil',
    (-31.7346, -50.6296): 'Brazil',
    (-31.7346, -50.6296): 'Brazil',
    (-31.7346, -50.6296): 'Brazil',
    (-31.7346, -50.6296): 'Brazil',
    (-21.3524, -40.2324): 'Brazil',
    (-31.7346, -50.6296): 'Brazil',
    (-2.9066, -38.9909): 'Brazil',
    (-32.6187, -52.0601): 'Brazil',
    (-22.0489, -40.7135): 'Brazil',
    (39.1941, -73.3129): 'United States',
    (39.2785, -74.1364): 'United States',
    (40.1702, -73.3093): 'United States',
    (41.0183, -70.7695): 'United States',
    (46.6972, -124.4351): 'United States',
    (36.8868, -75.4916): 'United States',
    (28.8557, -94.5151): 'United States',
    (28.3541, -94.7237): 'United States',
    (38.6426, -74.7825): 'United States',
    (35.4426, -121.8682): 'United States',
    (40.2821, -73.4259): 'United States',
    (40.8569, -124.7813): 'United States',
    (40.9598, -124.5479): 'United States',
    (36.4060, -75.3222): 'United States',
    (29.3931, -93.5647): 'United States',
    (39.0186, -74.1338): 'United States',
    (43.4293, -69.0045): 'United States',
    (36.0146, -123.5201): 'United States',
    (35.5492, -121.5289): 'United States',
    (39.1214, -74.2462): 'United States',
    (47.7057, -125.3357): 'United States',
    (40.7809, -124.7241): 'United States',
    (41.0584, -71.2074): 'United States',
    (41.0471, -70.4874): 'United States',
    (22.7333, 118.0333): 'China',
    (22.6525, 117.4932): 'China',
    (18.7667, 108.3667): 'China',
    (39.4150, 123.3942): 'United States',
    (38.0798, 121.3540): 'United States',
    (37.3269, 124.6275): 'South Korea',
    (36.2630, 130.0280): 'South Korea',
    (36.1630, 125.6050): 'South Korea',
    (17.7493, 106.8826): 'Vietnam',
    (11.3603, 109.5507): 'Vietnam',
    (21.3077, 90.6633): 'Bangladesh',
    (54.9501, 16.1901): 'Poland',
    (55.6001, 17.6701): 'Poland',
    (55.0501, 17.6501): 'Poland',
    (54.8693, 16.0744): 'Poland',
    (55.0082, 16.3271): 'Poland',
    (55.5059, 17.5795): 'Poland',
    (55.0523, 17.0796): 'Poland',
    (54.4292, 15.2916): 'Poland',
    (55.3469, 17.1346): 'Poland',
    (54.4962, 15.4372): 'Poland',
    (54.4371, 15.5937): 'Poland',
    (55.0302, 17.6674): 'Poland',
    (55.0365, 16.4314): 'Poland',
    (56.3500, 7.4953): 'Germany',
    (55.6084, 7.6114): 'Germany',
    (54.9855, 12.9965): 'Germany',
    (55.8797, 6.5344): 'Germany',
    (55.8797, 6.5344): 'Germany',
    (58.2375, 20.6027): 'Latvia',
    (58.2287, 20.9649): 'Latvia',
    (58.1578, 21.0709): 'Latvia',
    (58.2239, 21.0794): 'Latvia',
    (65.0379, 23.9922): 'Finland',
    (63.7922, 21.9274): 'Finland',
    (51.8301, -6.4901): 'Ireland',
    (52.4836, -5.4648): 'Ireland',
    (51.5200, -7.2551): 'Ireland',
    (52.4610, -11.0610): 'Ireland',
    (51.2792, -8.8903): 'Ireland',
    (51.2792, -8.8903): 'Ireland',
    (51.4076, -7.1092): 'Ireland',
    (51.5700, -6.7400): 'Ireland',
    (57.1221, 20.5247): 'Latvia',
    (55.7012, 20.3006): 'Latvia',
    (61.3284, 2.2592): 'Sweden',
    (56.6477, 17.5332): 'Sweden',
    (55.6643, 15.5781): 'Sweden',
    (58.1607, 17.7001): 'Sweden',
    (58.0882, -2.9502): 'United Kingdom',
    (55.9673, 1.6679): 'United Kingdom',
    (55.9673, 1.6679): 'United Kingdom',
    (55.9673, 1.6679): 'United Kingdom',
    (51.2018, -6.2691): 'United Kingdom',
    (51.2018, -6.2691): 'United Kingdom',
    (51.2018, -6.2691): 'United Kingdom',
    (52.1370, 2.1728): 'United Kingdom',
    (52.1370, 2.1728): 'United Kingdom',
    (52.1370, 2.1728): 'United Kingdom',
    (51.8810, 2.0401): 'United Kingdom',
    (51.9176, 1.9284): 'United Kingdom',
    (51.0253, -6.5322): 'United Kingdom',
    (58.8618, -5.6100): 'United Kingdom',
    (53.6815, 1.4207): 'United Kingdom',
    (53.9601, 1.5400): 'United Kingdom',
    (51.6217, 1.4958): 'United Kingdom',
    (58.1621, -0.9218): 'United Kingdom',
    (57.9673, -2.7905): 'United Kingdom',
    (57.9673, -2.7905): 'United Kingdom',
    (57.9673, -2.7905): 'United Kingdom',
    (58.0804, -3.1226): 'United Kingdom',
    (56.6355, -1.9266): 'United Kingdom',
    (59.1559, -7.7598): 'United Kingdom',
    (58.7948, -6.0515): 'United Kingdom',
    (53.2123, 0.8616): 'United Kingdom',
    (54.0910, -3.7410): 'United Kingdom',
    (37.6487, 12.1193): 'Italy',
    (40.1438, 10.3501): 'Italy',
    (44.1930, 13.5763): 'Italy',
    (44.1930, 13.5763): 'Italy',
    (37.4612, 11.8770): 'Italy',
    (41.2107, 9.8571): 'Italy',
    (41.2767, 10.0032): 'Italy',
    (44.4518, 12.6917): 'Italy',
    (41.5026, -9.1917): 'Portugal',
    (41.6510, -9.3060): 'Portugal',
    (44.0352, -4.5247): 'Portugal',
    (28.5624, -13.3001): 'Canary Islands',
    (43.9530, -8.3527): 'Spain',
    (51.6484, 2.8349): 'United Kingdom',
    (51.5400, 2.9400): 'United Kingdom',
    (51.5600, 2.9700): 'United Kingdom',
    (51.5400, 2.9200): 'United Kingdom',
    (51.6484, 2.8349): 'United Kingdom',
    (51.6190, 2.9010): 'United Kingdom',
    (51.5900, 2.9400): 'United Kingdom',
    (51.6300, 2.8600): 'United Kingdom',
    (51.7200, 2.7400): 'United Kingdom',
    (51.6860, 2.7570): 'United Kingdom',
    (50.1876, 1.0274): 'United Kingdom',
    (42.8469, 3.4718): 'France',
    (46.2335, -2.6015): 'France',
    (43.0286, 3.4177): 'France',
    (45.8912, -1.7802): 'France',
    (54.4868, 6.2832): 'Germany',
    (54.0167, 6.6000): 'Germany',
    (54.0167, 6.6000): 'Germany',
    (54.5200, 7.7080): 'Germany',
    (54.7800, 14.1200): 'Germany',
    (54.3550, 5.9800): 'Germany',
    (54.8506, 13.8014): 'Germany',
    (53.9741, 6.4912): 'Germany',
    (53.9665, 6.5493): 'Germany',
    (53.9667, 6.5623): 'Germany',
    (53.9667, 6.4956): 'Germany',
    (54.0470, 6.2340): 'Germany',
    (55.0190, 7.7740): 'Germany',
    (55.1400, 7.2000): 'Germany',
    (54.3036, 5.7894): 'Germany',
    (54.9998, 13.1999): 'Germany',
    (54.5000, 6.3580): 'Germany',
    (54.0160, 6.9830): 'Germany',
    (54.0750, 7.0070): 'Germany',
    (54.4393, 6.3327): 'Germany',
    (54.4900, 7.7000): 'Germany',
    (54.1388, 7.1687): 'Germany',
    (54.4020, 7.7070): 'Germany',
    (54.0564, 6.5536): 'Germany',
    (54.4440, 7.6820): 'Germany',
    (54.4440, 7.6820): 'Germany',
    (55.1900, 6.8600): 'Germany',
    (54.0417, 6.4667): 'Germany',
    (54.0417, 6.4667): 'Germany',
    (54.3210, 5.8600): 'Germany',
    (54.8340, 14.0680): 'Germany',
    (52.4661, 3.9651): 'Germany',
    (51.7101, 3.0001): 'Germany',
    (51.6401, 3.0701): 'Germany',
    (52.6878, 4.2486): 'Germany',
    (52.6878, 4.2486): 'Germany',
    (52.4393, 3.4169): 'Germany',
    (52.9032, 4.3268): 'Germany',
    (52.4032, 4.1652): 'Germany',
    (53.7190, 5.9811): 'Germany',
    (52.1453, 3.4577): 'Germany',
    (52.6705, 3.9456): 'Germany',
    (52.6705, 3.9456): 'Germany',
    (52.3717, 3.8833): 'Germany',
    (52.5943, 4.2125): 'Germany',
    (-36.7301, 139.3193): 'Australia',
    (-35.2024, 137.7715): 'Australia',
    (-36.3282, 138.5374): 'Australia',
    (-37.7049, 144.6111): 'Australia',
    (-37.5000, 144.3000): 'Australia',
    (-32.9900, 151.7600): 'Australia',
    (-29.1177, 152.7793): 'Australia',
    (-31.9745, 150.5506): 'Australia',
    (-32.2263, 151.2081): 'Australia',
    (-33.0409, 151.7620): 'Australia',
    (-32.0764, 150.8939): 'Australia',
    (-37.2137, 141.0372): 'Australia',
    (-38.2499, 145.2341): 'Australia',
    (-35.7357, 140.8990): 'Australia',
    (-37.3932, 141.4569): 'Australia',
    (-34.9369, 151.6795): 'Australia',
    (-37.5914, 144.9490): 'Australia',
    (-38.1744, 145.5523): 'Australia',
    (-39.4521, 141.4977): 'Australia',
    (-38.3652, 145.8722): 'Australia',
    (-33.8610, 151.2110): 'Australia',
    (-38.8799, 147.0503): 'Australia',
    (-38.8003, 142.4555): 'Australia',
    (-32.8736, 152.4788): 'Australia',
    (-37.3352, 150.3932): 'Australia',
    (-38.8647, 146.9758): 'Australia',
    (-33.0239, 152.1731): 'Australia',
    (-39.0803, 145.5356): 'Australia',
    (-29.5386, 114.5990): 'Australia',
    (-29.5386, 114.5990): 'Australia',
    (-29.5386, 114.5990): 'Australia',
    (-29.5386, 114.5990): 'Australia',
    (-30.6558, 114.8078): 'Australia'

}

def fill_country_manually(row):
    lat_lon = (row['Latitude'], row['Longitude'])
    # Only change the 'Country' if the coordinates exist in the dictionary and 'Country' is NaN
    if lat_lon in lat_lon_to_country:
        if pd.isna(row['Country']):
            return lat_lon_to_country[lat_lon]
    return row['Country']  # Don't change if the country is already filled or coordinates don't match

# Apply the function to only change rows with NaN in the 'Country' column
df5['Country'] = df5.apply(fill_country_manually, axis=1)

# Save the updated DataFrame to CSV
df5.to_csv('WindTurbineData_WithCountry.csv', index=False)

print("The DataFrame has been saved as 'WindTurbineData_WithCountry.csv'.")

The DataFrame has been saved as 'WindTurbineData_WithCountry.csv'.


In [89]:
df6 = pd.read_csv("WindTurbineData_WithCountry.csv")
total_nan_count = df6['Country'].isna().sum()
print(f"Total NaN values in 'Country' column: {total_nan_count}")

Total NaN values in 'Country' column: 0
