In [1]:
import os
import pandas as pd

## Read the onshore wind turbine (Windkraftanlagen) data

In [2]:
# Location of the raw CSV with the onshore wind parks, relative to this notebook.
Windanlagen = "../data/Windkraftanlagen/_Onshore_Windkraftanlagen_in_Deutschland.csv"

# Load the file with pandas' default parsing and print a column/dtype overview.
df = pd.read_csv(filepath_or_buffer=Windanlagen)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5208 entries, 0 to 5207
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   X                         5194 non-null   float64
 1   Y                         5194 non-null   float64
 2   FID                       5208 non-null   int64  
 3   Bundesland                5208 non-null   object 
 4   Name                      5208 non-null   object 
 5   Baujahr                   4668 non-null   object 
 6   Gesamtleistung__MW_       2634 non-null   object 
 7   Anzahl                    2637 non-null   object 
 8   Typ__WKA_                 4569 non-null   object 
 9   Ort                       3248 non-null   object 
 10  Landkreis                 2683 non-null   object 
 11  Breitengrad               5194 non-null   float64
 12  Längengrad                5194 non-null   float64
 13  Projektierer___Betreiber  2210 non-null   object 
 14  Bemerkun

## Drop rows without coordinates and unused columns

In [3]:
# Columns that are removed from the frame in this step.
unusefulCols = [
    "Typ__WKA_",
    "Projektierer___Betreiber",
    "Bemerkungen",
    "Breitengrad",
    "Längengrad",
    "Landkreis",
    "Ort",
]

# Discard wind parks that lack an X or Y coordinate, then remove the columns
# listed above. The result is rebound to `df` instead of mutating it in place.
df = df.dropna(axis=0, subset=["X", "Y"]).drop(columns=unusefulCols)

In [4]:
# Print the column overview (non-null counts and dtypes) of the current frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5194 entries, 0 to 5193
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   X                    5194 non-null   float64
 1   Y                    5194 non-null   float64
 2   FID                  5194 non-null   int64  
 3   Bundesland           5194 non-null   object 
 4   Name                 5194 non-null   object 
 5   Baujahr              4654 non-null   object 
 6   Gesamtleistung__MW_  2621 non-null   object 
 7   Anzahl               2624 non-null   object 
dtypes: float64(2), int64(1), object(5)
memory usage: 365.2+ KB


In [5]:
# Convert Anzahl, Baujahr and Gesamtleistung__MW_ from free-text columns to floats.
#
# Every step assigns the cleaned column back to `df`. Calling
# `df.<col>.replace(..., inplace=True)` mutates a temporary Series, which is
# deprecated and has no effect on `df` once pandas copy-on-write is active.

# --- Anzahl -----------------------------------------------------------------
# These entries are treated as missing (NaN after the float conversion).
# NOTE(review): "9[36]" and "5[51]" look like counts followed by a footnote
# marker; they are still mapped to missing, as before - confirm this is intended.
anzahl_missing = ["unklar", "9[36]", "5[51]"]
df["Anzahl"] = df["Anzahl"].mask(df["Anzahl"].isin(anzahl_missing))

# --- Baujahr ----------------------------------------------------------------
# Entries that are not a single year are mapped to one year; in every entry
# below that is the last year mentioned in the text.
baujahr_fixes = {
    "2013–2014[33]": 2014,
    "2015–2017": 2017,
    "1990–1991": 1991,
    "2017–2018": 2018,
    "1994–1995": 1995,
    "2015/2016": 2016,
    "1997-2001": 2001,
    "2013/2017–2018": 2018,
    "2002 2005": 2005,
    "2005-2006": 2006,
    "2005-2007": 2007,
    "2004-2005": 2005,
    "2016–2017": 2017,
    "2002–2003": 2003,
    "2008-2010": 2010,
    "2015–2016": 2016,
    "2012–2013": 2013,
    "2004–2005": 2005,
    "2010–2011": 2011,
    "2007–2008": 2008,
    "2017-2018": 2018,
    "1999–2001": 2001,
    "2003–2004": 2004,
    "1997–1998": 1998,
    "2001–2002": 2002,
    "1996–1997": 1997,
    "2011–2012": 2012,
    "1999–2000": 2000,
    "1998–1999": 1999,
    "2010–2016": 2016,
    "2008–2009": 2009,
    "2014–2015": 2015,
    "2005–2006": 2006,
    "2009–2010": 2010,
    "2006–2007": 2007,
    "2000–2001": 2001,
    "2013–2014": 2014,
}
# A single replace call handles all keys at once.
df["Baujahr"] = df["Baujahr"].replace(baujahr_fixes)

# --- Gesamtleistung__MW_ ----------------------------------------------------
leistung = df["Gesamtleistung__MW_"]

# The raw entry "35,45" is overridden with 80. This has to happen BEFORE the
# decimal commas are normalised: afterwards the text reads "35.45", the key
# "35,45" can never match, and the value would silently be parsed as 35.45.
# The replacement is the string "80" (not the int) so the `.str` call below
# does not turn it into NaN.
leistung = leistung.replace({"35,45": "80"})

# German decimal comma -> decimal point (plain text replacement, no regex).
leistung = leistung.str.replace(",", ".", regex=False)

# Strip the footnote markers from the one annotated value and treat "unklar"
# as missing.
leistung = leistung.replace({"27.0[34][35]": "27"})
leistung = leistung.mask(leistung.isin(["unklar"]))

df["Gesamtleistung__MW_"] = leistung

# Any remaining non-numeric text makes this raise, so unexpected raw values
# are noticed instead of being dropped silently.
df = df.astype({"Anzahl": float, "Baujahr": float, "Gesamtleistung__MW_": float})

## Merge rows with duplicate location values

In [6]:
# Build a text key "<X>, <Y>" per row from the two coordinate columns.
df["loc"] = df["X"].astype(str).str.cat(df["Y"].astype(str), sep=", ")

In [7]:
# Create a new data frame `ndf` with exactly one row per location string.
#
# For every location:
#   * Anzahl and Gesamtleistung__MW_ are summed over all rows of that location,
#   * Baujahr is the latest year among those rows,
#   * all other columns (and the index label) come from the first row.
#
# This uses a single groupby pass instead of re-filtering `df` per location and
# growing the result with `DataFrame.append`, which was removed in pandas 2.0.
by_loc = df.groupby("loc", sort=False)

# min_count=1 keeps a location whose values are all missing as NaN instead of
# reporting a misleading 0 - the same as a location that has only one row.
anzahl = by_loc["Anzahl"].sum(min_count=1)
leistung = by_loc["Gesamtleistung__MW_"].sum(min_count=1)

# The groupby max ignores missing years; Python's built-in max() on a list
# containing NaN returns a result that depends on the element order.
baujahr = by_loc["Baujahr"].max()

# First row of every location, in order of first appearance.
# The name `df1` is kept on purpose: the clean-up cell below deletes it.
df1 = df.drop_duplicates(subset=["loc"])

# Overwrite the three aggregated columns; `assign` returns a new frame and
# keeps the existing column order.
ndf = df1.assign(
    Baujahr=df1["loc"].map(baujahr),
    Gesamtleistung__MW_=df1["loc"].map(leistung),
    Anzahl=df1["loc"].map(anzahl),
)
    

In [8]:
# Drop the names of the intermediate frames; only `ndf` is used from here on.
del df
del df1

In [9]:
# The helper column with the location string is no longer needed.
ndf = ndf.drop(columns="loc")

In [10]:
# Renumber the rows from 0. With drop=False (the default) the previous index
# labels are kept as a regular column named "index".
# NOTE(review): that "index" column is part of any later export of `ndf` -
# confirm it is wanted.
ndf = ndf.reset_index(drop=False)

In [16]:
# Write the prepared frame to disk; the row index itself is not written.
ndf.to_csv(path_or_buf="../prep_data/Windkraftanlagen.csv", index=False)