# Area of Interest
In this section, will collect data of all the station located in 13 counties of interest. There are 44 precipitation stations in the region and time of interest.


## FTP Connection

### Connection Parameters

In [None]:
server = "opendata.dwd.de"
user   = "anonymous"
passwd = ""

### FTP Directory Definition and Station Description Filename Pattern

In [None]:
# The topic of interest.
topic_dir = "/hourly/precipitation/historical/"

# This is the search pattern common to ALL station description file names 
station_desc_pattern = "_Beschreibung_Stationen.txt"

# Below this directory tree node all climate data are stored.
ftp_climate_data_dir = "/climate_environment/CDC/observations_germany/climate/"
ftp_dir =  ftp_climate_data_dir + topic_dir

### Local Directories

In [None]:
local_ftp_dir         = "../data/original/DWD/"      # Local directory to store local ftp data copies, the local data source or input data. 
local_ftp_station_dir = local_ftp_dir + topic_dir # Local directory where local station info is located
local_ftp_ts_dir      = local_ftp_dir + topic_dir # Local directory where time series downloaded from ftp are located

local_generated_dir   = "../data/generated/DWD/" # The generated of derived data in contrast to local_ftp_dir
local_station_dir     = local_generated_dir + topic_dir # Derived station data, i.e. the CSV file
local_ts_merged_dir   = local_generated_dir + topic_dir # Parallelly merged time series, wide data frame with one TS per column
local_ts_appended_dir = local_generated_dir + topic_dir # Serially appended time series, long data frame for QGIS TimeManager Plugin

print(local_ftp_dir)
print(local_ftp_station_dir)
print(local_ftp_ts_dir)
print()
print(local_generated_dir)
print(local_station_dir)
print(local_ts_merged_dir)
print(local_ts_appended_dir)

In [4]:
import os
os.makedirs(local_ftp_dir,exist_ok = True) # it does not complain if the dir already exists.
os.makedirs(local_ftp_station_dir,exist_ok = True) # it does not complain if the dir already exists. # it does not complain if the dir already exists.
os.makedirs(local_ftp_ts_dir,exist_ok = True) # it does not complain if the dir already exists.

os.makedirs(local_generated_dir,exist_ok = True) # it does not complain if the dir already exists.
os.makedirs(local_station_dir,exist_ok = True) # it does not complain if the dir already exists.
os.makedirs(local_ts_merged_dir,exist_ok = True) # it does not complain if the dir already exists.
os.makedirs(local_ts_appended_dir,exist_ok = True) # it does not complain if the dir already exists.

### FTP Connect

In [5]:
import ftplib
ftp = ftplib.FTP(server)
response = ftp.login(user=user, passwd = passwd)
print(response)

230 Login successful.


In [6]:
ret = ftp.cwd(".")

### FTP Grab File Function

In [7]:
def grabFile(ftpfullname,localfullname):
    try:
        ret = ftp.cwd(".") # A dummy action to chack the connection and to provoke an exception if necessary.
        localfile = open(localfullname, 'wb')
        ftp.retrbinary('RETR ' + ftpfullname, localfile.write, 1024)
        localfile.close()
    
    except ftplib.error_perm:
        print("FTP ERROR. Operation not permitted. File not found?")

    except ftplib.error_temp:
        print("FTP ERROR. Timeout.")

    except ConnectionAbortedError:
        print("FTP ERROR. Connection aborted.")



### Generate Pandas Dataframe from FTP Directory Listing

In [8]:
import pandas as pd
import os

def gen_df_from_ftp_dir_listing(ftp, ftpdir):
    lines = []
    flist = []
    try:    
        res = ftp.retrlines("LIST "+ftpdir, lines.append)
    except:
        print("Error: ftp.retrlines() failed. ftp timeout? Reconnect!")
        return
        
    if len(lines) == 0:
        print("Error: ftp dir is empty")
        return
    
    for line in lines:
#        print(line)
        [ftype, fsize, fname] = [line[0:1], int(line[31:42]), line[56:]]
#        itemlist = [line[0:1], int(line[31:42]), line[56:]]
#        flist.append(itemlist)
        
        fext = os.path.splitext(fname)[-1]
        
        if fext == ".zip":
            station_id = int(fname.split("_")[2])
        else:
            station_id = -1 
        
        flist.append([station_id, fname, fext, fsize, ftype])
        
        

    df_ftpdir = pd.DataFrame(flist,columns=["station_id", "name", "ext", "size", "type"])
    return(df_ftpdir)

In [9]:
# Generate a pandas dataframe from the FTP directory listing 
df_ftpdir = gen_df_from_ftp_dir_listing(ftp, ftp_dir)

In [10]:
df_ftpdir.head(10)

Unnamed: 0,station_id,name,ext,size,type
0,-1,BESCHREIBUNG_obsgermany_climate_hourly_precipi...,.pdf,166317,-
1,-1,DESCRIPTION_obsgermany_climate_hourly_precipit...,.pdf,161348,-
2,-1,RR_Stundenwerte_Beschreibung_Stationen.txt,.txt,303615,-
3,3,stundenwerte_RR_00003_19950901_20110401_hist.zip,.zip,418905,-
4,20,stundenwerte_RR_00020_20040814_20211231_hist.zip,.zip,456263,-
5,44,stundenwerte_RR_00044_20070401_20211231_hist.zip,.zip,378416,-
6,53,stundenwerte_RR_00053_20051001_20211231_hist.zip,.zip,409591,-
7,71,stundenwerte_RR_00071_20041022_20200101_hist.zip,.zip,402406,-
8,73,stundenwerte_RR_00073_20070401_20211231_hist.zip,.zip,380526,-
9,78,stundenwerte_RR_00078_20041101_20211231_hist.zip,.zip,445888,-


### Dataframe with TS Zip Files

In [11]:
#df_ftpdir["ext"]==".zip"
df_zips = df_ftpdir[df_ftpdir["ext"]==".zip"]
df_zips.set_index("station_id", inplace = True)
df_zips.head(10)

Unnamed: 0_level_0,name,ext,size,type
station_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3,stundenwerte_RR_00003_19950901_20110401_hist.zip,.zip,418905,-
20,stundenwerte_RR_00020_20040814_20211231_hist.zip,.zip,456263,-
44,stundenwerte_RR_00044_20070401_20211231_hist.zip,.zip,378416,-
53,stundenwerte_RR_00053_20051001_20211231_hist.zip,.zip,409591,-
71,stundenwerte_RR_00071_20041022_20200101_hist.zip,.zip,402406,-
73,stundenwerte_RR_00073_20070401_20211231_hist.zip,.zip,380526,-
78,stundenwerte_RR_00078_20041101_20211231_hist.zip,.zip,445888,-
87,stundenwerte_RR_00087_20050201_20211231_hist.zip,.zip,426930,-
91,stundenwerte_RR_00091_20040901_20211231_hist.zip,.zip,443441,-
96,stundenwerte_RR_00096_20190409_20211231_hist.zip,.zip,79249,-


### Download the Station Description File

In [12]:
station_fname = df_ftpdir[df_ftpdir['name'].str.contains(station_desc_pattern)]["name"].values[0]
print(station_fname)



RR_Stundenwerte_Beschreibung_Stationen.txt


In [13]:
print("grabFile: ")
print("From: " + ftp_dir + station_fname)
print("To:   " + local_ftp_station_dir + station_fname)
grabFile(ftp_dir + station_fname, local_ftp_station_dir + station_fname)

grabFile: 
From: /climate_environment/CDC/observations_germany/climate//hourly/precipitation/historical/RR_Stundenwerte_Beschreibung_Stationen.txt
To:   ../data/original/DWD//hourly/precipitation/historical/RR_Stundenwerte_Beschreibung_Stationen.txt


In [14]:
# extract column names. They are in German (de)
# We have to use codecs because of difficulties with character encoding (German Umlaute)
import codecs

def station_desc_txt_to_csv(txtfile, csvfile):
    file = codecs.open(txtfile,"r","utf-8")
    r = file.readline()
    file.close()
    colnames_de = r.split()
    colnames_de
    
    translate = \
    {'Stations_id':'station_id',
     'von_datum':'date_from',
     'bis_datum':'date_to',
     'Stationshoehe':'altitude',
     'geoBreite': 'latitude',
     'geoLaenge': 'longitude',
     'Stationsname':'name',
     'Bundesland':'state'}
    
    colnames_en = [translate[h] for h in colnames_de]
    
    # Skip the first two rows and set the column names.
    df = pd.read_fwf(txtfile,skiprows=2,names=colnames_en, parse_dates=["date_from","date_to"],index_col = 0)
    
    # write csv
    df.to_csv(csvfile, sep = ";")
    return(df)

In [15]:
basename = os.path.splitext(station_fname)[0]
dataframe_stations = station_desc_txt_to_csv(local_ftp_station_dir + station_fname, local_station_dir + basename + ".csv")
dataframe_stations.head(5)

Unnamed: 0_level_0,date_from,date_to,altitude,latitude,longitude,name,state
station_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3,1995-09-01,2011-04-01,202,50.7827,6.0941,Aachen,Nordrhein-Westfalen
20,2004-08-14,2022-04-05,432,48.9219,9.9129,Abtsgm�nd-Untergr�ningen,Baden-W�rttemberg
29,2006-01-10,2022-04-05,260,49.7175,10.9101,Adelsdorf (Kl�ranlage),Bayern
44,2007-04-01,2022-04-05,44,52.9336,8.237,Gro�enkneten,Niedersachsen
46,2006-01-03,2022-04-05,325,48.945,12.4639,Aholfing,Bayern


In [16]:
dataframe_stations

Unnamed: 0_level_0,date_from,date_to,altitude,latitude,longitude,name,state
station_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3,1995-09-01,2011-04-01,202,50.7827,6.0941,Aachen,Nordrhein-Westfalen
20,2004-08-14,2022-04-05,432,48.9219,9.9129,Abtsgm�nd-Untergr�ningen,Baden-W�rttemberg
29,2006-01-10,2022-04-05,260,49.7175,10.9101,Adelsdorf (Kl�ranlage),Bayern
44,2007-04-01,2022-04-05,44,52.9336,8.2370,Gro�enkneten,Niedersachsen
46,2006-01-03,2022-04-05,325,48.9450,12.4639,Aholfing,Bayern
...,...,...,...,...,...,...,...
19361,2021-10-29,2022-04-05,540,50.4159,11.0437,Frankenblick-Rauenstein,Th�ringen
19362,2021-10-29,2022-04-05,720,50.4602,11.0858,Neuhaus-Steinheid (Th�r.),Th�ringen
19363,2022-03-21,2022-04-05,409,48.9532,10.9135,Treuchtlingen,Bayern
19366,2021-11-29,2022-04-05,242,49.1505,7.9174,Annweiler am Trifels-Stein,Rheinland-Pfalz
