# Get Geodata from Wikipedia

## Example 1: Get Nuclear Power Plants (in service)

https://en.wikipedia.org/wiki/List_of_nuclear_power_stations

In [1]:
import pandas as pd

# Wikipedia page listing nuclear power stations; every HTML table on it is parsed below.
POWER_STATIONS_URL = "https://en.wikipedia.org/wiki/List_of_nuclear_power_stations"

# pd.read_html returns a list with one DataFrame per table found on the page,
# so `df` is a list of tables here, not a single frame.
df = pd.read_html(POWER_STATIONS_URL)

# Report how many tables were found.
print(len(df))
      

8


In [2]:
# Preview the table at index 2 of the list returned by pd.read_html
# (per the rendered output, this is the list of stations under construction).
df[2]

Unnamed: 0,Power station,No. of units,Net capacity under construction (MW),Construction start,Planned connection,Country,Location
0,Akkuyu,4,4456,2015,2023,Turkey,36°08′40″N 33°32′28″E﻿ / ﻿36.14444°N 33.54111°E
1,El Dabaa,1,1194,2022,2026,Egypt,31°2′39″N 28°29′52″E﻿ / ﻿31.04417°N 28.49778°E
2,Hinkley Point C,2,3300,2018,2027,United Kingdom,51°12′22″N 3°8′38″W﻿ / ﻿51.20611°N 3.14389°W
3,Ōma,1,1325,2010,2028,Japan,41°30′35″N 140°54′37″E﻿ / ﻿41.50972°N 140.91028°E
4,Rooppur,2,2160,2017,2024,Bangladesh,24°4′0″N 89°2′50″E﻿ / ﻿24.06667°N 89.04722°E
5,San’ao[18][19][20],2,2234,2020,2026,China,27°12′5″N 120°30′56″E﻿ / ﻿27.20139°N 120.51556°E
6,Xudabao,2,2400,2021,2028,China,40°21′5″N 120°32′47″E﻿ / ﻿40.35139°N 120.54639°E
7,Vogtle (expansion),2,2234,2012,2024,United States,40°21′5″N 120°32′47″E﻿ / ﻿40.35139°N 120.54639°E
8,Zhangzhou,2,2252,2019,2024,China,23°49′45″N 117°29′30″E﻿ / ﻿23.82917°N 117.49167°E


In [3]:
# Dump the table at index 1 to CSV (DataFrame index included as the first column).
# NOTE(review): presumably this is the "in service" table; the table's position on
# the Wikipedia page may change over time — verify before re-running.
df[1].to_csv("geodata/powerplants.csv")

In [4]:
def removenote(s, k=120):
    """Strip Wikipedia footnote markers from a string.

    Removes every literal ``[note i]`` and ``[dp i]`` marker for
    ``i`` in ``1..k`` (inclusive) and returns the cleaned string.
    Markers numbered above ``k`` are left untouched.
    """
    cleaned = s
    index = 1
    while index <= k:
        # Same order as before: the "note" marker first, then the "dp" marker.
        for tag in ("note", "dp"):
            cleaned = cleaned.replace(f"[{tag} {index}]", "")
        index += 1
    return cleaned

In [5]:
def numbersonly(s):
    """Return ``s`` with everything except ASCII digits, spaces and dots removed.

    Note that hemisphere letters (N/S/E/W) and minus signs are dropped too,
    so the result carries no sign information.
    """
    allowed = "0123456789 ."
    kept = []
    for symbol in s:
        if symbol in allowed:
            kept.append(symbol)
    return "".join(kept)

In [7]:
import string
import csv

# Convert the raw table dump into a clean CSV with a WKT point geometry column.
# Input columns are read by position: 0=index, 1=name, 2=units, 3=capacity,
# 4=country, 5=location (e.g. "51°12′22″N 3°8′38″W / 51.20611°N 3.14389°W").
#
# `with` guarantees both files are closed even if a row fails to parse;
# newline="" is what the csv module expects for files it reads/writes.
with open("geodata/powerplants.csv", encoding="utf-8", newline="") as file, \
        open("geodata/powerplants2023.csv", "w", encoding="utf-8", newline="") as outfile:
    reader = csv.reader(file)
    next(reader)  # skip the header row of the input file

    # csv.writer quotes any field containing a comma or quote, so such values
    # can no longer shift columns in the output file.
    writer = csv.writer(outfile, lineterminator="\n")
    writer.writerow(["id", "name", "units", "capacity", "country", "geometry"])

    for line in reader:
        uid = line[0]
        name = line[1]
        units = removenote(line[2])
        capacity = removenote(line[3])
        # NOTE(review): on en.wikipedia a comma is normally a thousands separator,
        # so this would turn "1,878" into 1.878 — confirm this is intended.
        capacity = capacity.replace(",", ".")
        country = line[4]

        raw_pos = line[5]
        # Keep only digits/spaces/dots, then drop any leading dots or spaces.
        pos = numbersonly(raw_pos).lstrip(". ")

        # After filtering, tokens 0-1 are the squashed DMS values, token 2 is empty
        # (where the "/" separator was) and tokens 3-4 are the decimal degrees.
        lnglat = pos.split(" ")
        lat = float(lnglat[3])
        lng = float(lnglat[4])

        # numbersonly() discards the hemisphere letters, so restore the sign:
        # southern latitudes and western longitudes are negative.
        if "S" in raw_pos:
            lat = -lat
        if "W" in raw_pos:
            lng = -lng

        geometry = f"POINT ({lng} {lat})"

        writer.writerow([uid, name, units, capacity, country, geometry])

In [8]:
import geopandas as gpd
import pandas as pd
from shapely import wkt

# Reload the cleaned CSV and turn the WKT text in the "geometry" column
# into shapely geometry objects.
df = pd.read_csv("geodata/powerplants2023.csv", encoding="utf-8")
df = df.assign(geometry=df["geometry"].apply(wkt.loads))

# Wrap the frame as a GeoDataFrame, using "geometry" as the active geometry column.
gdf = gpd.GeoDataFrame(df, geometry="geometry")

In [9]:
# Spot-check: show the rows whose country column equals "Czech Republic".
gdf.query('country == "Czech Republic"')

Unnamed: 0,id,name,units,capacity,country,geometry
48,48,Dukovany,4,1878.0,Czech Republic,POINT (16.14889 49.08500)
155,155,Temelín,2,2056.0,Czech Republic,POINT (14.37611 49.18000)


Export to GeoJSON:

In [10]:
# Serialize the GeoDataFrame to a GeoJSON string and write it to disk.
geojson = gdf.to_json()

# `with` closes the file even if the write raises.
with open("geodata/powerplants2023.json", "w", encoding="utf-8") as file:
    file.write(geojson)

## Example 2: Get Highest Mountain Peaks

In [12]:
# Wikipedia page listing the highest mountains; every HTML table on it is parsed below.
MOUNTAINS_URL = "https://en.wikipedia.org/wiki/List_of_highest_mountains_on_Earth"

# pd.read_html returns a list with one DataFrame per table found on the page.
df = pd.read_html(MOUNTAINS_URL)

# Report how many tables were found.
print(len(df))

3


In [13]:
# Dump the table at index 2 to CSV (DataFrame index included as the first column).
# The table's position in the list depends on the page layout, so the index may
# change when the Wikipedia article is edited — verify before re-running.
df[2].to_csv("geodata/mountainpeaks.csv")  # index may change...

In [14]:
import string
import csv

# Convert the raw mountain table dump into a clean CSV with a WKT point geometry column.
# Input columns are read by position: 0=index, 2=name, 3=height, 8=location, 13=country.
#
# `with` guarantees both files are closed even if a row fails to parse;
# newline="" is what the csv module expects for files it reads/writes.
with open("geodata/mountainpeaks.csv", encoding="utf-8", newline="") as file, \
        open("geodata/mountainpeaks2023.csv", "w", encoding="utf-8", newline="") as outfile:
    reader = csv.reader(file)
    # Skip the first three rows of the input file.
    # NOTE(review): presumably these are multi-level column headers — confirm.
    next(reader)
    next(reader)
    next(reader)

    # csv.writer quotes any field containing a comma or quote, so such values
    # can no longer shift columns in the output file.
    writer = csv.writer(outfile, lineterminator="\n")
    writer.writerow(["id", "name", "height", "country", "geometry"])

    for line in reader:
        uid = line[0]
        name = line[2]
        height = removenote(line[3])
        country = removenote(line[13])
        # Multiple countries are listed comma-separated; join them with "/" instead.
        country = country.replace(",", "/")

        raw_pos = line[8]
        # Keep only digits/spaces/dots, then drop any leading dots or spaces.
        pos = numbersonly(raw_pos).lstrip(". ")

        # After filtering, tokens 3 and 4 are read as decimal latitude and longitude.
        lnglat = pos.split(" ")
        lat = float(lnglat[3])
        lng = float(lnglat[4])

        # numbersonly() discards the hemisphere letters, so restore the sign:
        # southern latitudes and western longitudes are negative.
        if "S" in raw_pos:
            lat = -lat
        if "W" in raw_pos:
            lng = -lng

        geometry = f"POINT ({lng} {lat})"

        writer.writerow([uid, name, height, country, geometry])