In [8]:
import csv
import getpass
import html
import json
import math
import os
import re

import numpy as np
import pandas as pd
import requests
# Show 50 rows whenever pandas truncates a long DataFrame repr
pd.options.display.min_rows = 50

In [9]:
# GENESIS-Online credentials.
# They are read from the GENESIS_USERNAME / GENESIS_PASSWORD environment variables so
# that no secret is stored in the notebook; a missing value is prompted for instead
# (the password prompt does not echo).
# NOTE(review): the credentials that used to be hard-coded here should be rotated.
username = os.environ.get('GENESIS_USERNAME') or input('GENESIS username: ')
password = os.environ.get('GENESIS_PASSWORD') or getpass.getpass('GENESIS password: ')

In [10]:
# Reuse the cached list of statistic (topic) codes when topics.csv exists
try:
    topics = pd.read_csv('topics.csv')

# No usable cache yet: query the catalogue once and store the codes for later runs
except (FileNotFoundError, pd.errors.EmptyDataError):
    
    topicsURL = 'https://www-genesis.destatis.de/genesisWS/rest/2020/catalogue/statistics?username={}&password={}&selection=*&area=all&&pagelength=25000&language=en'.format(username, password)
    codes = [item['Code'] for item in requests.get(topicsURL).json()['List']]
    
    # Keep the DataFrame under the name `topics` so the conversion below works on both
    # paths (previously a plain list was indexed with ['Topic'], raising TypeError)
    topics = pd.DataFrame(codes, columns = ['Topic'])
    topics.to_csv('topics.csv', index = False)
    
# Downstream cells expect a plain Python list of codes.
# NOTE(review): codes read back from the CSV are parsed by pandas, while freshly
# downloaded codes keep whatever type the API returned - confirm this does not matter.
topics = list(topics['Topic'])

In [11]:
# Report how many topic codes were loaded, then preview the first three on their own line
topicCount, topicPreview = len(topics), topics[:3]
print ('There are {} topics. Here are the first 3:{}{}'.format(topicCount, '\n', topicPreview))

There are 306 topics. Here are the first 3:
[11111, 12111, 12211]


In [12]:
def Timeseries (topic):
    """List the time-series tables that belong to one statistic (topic) code.

    The table catalogue is queried for every table whose code starts with
    `topic`. Tables whose 'Time' entry is empty, is not of the form
    '<from> to <to>', or has identical from/to values are skipped. For each
    remaining table the metadata is fetched and the row/column names are
    searched for a time dimension ('Year', 'Reference month', 'Reference date').

    Parameters
    ----------
    topic : statistic code, formatted into the catalogue URL as the selection prefix.

    Returns
    -------
    list of [code, content, transpose] lists. `transpose` is False when a time
    dimension appears among the columns and True when it appears only among
    the rows. Tables without any time dimension are left out.

    Relies on the notebook-level `username` and `password` and performs one
    HTTP request per candidate table in addition to the catalogue request.
    """
        
    # Row/column names that identify a time dimension
    timeDims = ('Year', 'Reference month', 'Reference date')
    
    timeseriesURL = 'https://www-genesis.destatis.de/genesisWS/rest/2020/catalogue/tables?username={}&password={}&selection={}*&area=all&pagelength=2500&language=en'.format(username, password, topic)
    tables = requests.get(timeseriesURL).json()['List']
    
    result = []
    for table in tables:
        
        # Keep only tables covering more than one period; an entry without the
        # ' to ' separator is treated as a single period instead of raising IndexError
        period = table['Time'].split(' to ') if table['Time'] else []
        if len(period) < 2 or period[0] == period[1]:
            continue
    
        metadataURL = 'https://www-genesis.destatis.de/genesisWS/rest/2020/metadata/table?username={}&password={}&name={}&area=all&language=en'.format(username, password, table['Code'])
        structure = requests.get(metadataURL).json()['Object']['Structure']

        # A missing Rows/Columns entry becomes an empty list. The previous '-'
        # placeholder was a string and made the row + column concatenation fail.
        rowItems = [row['Content'] for row in (structure['Rows'] or [])]
        columnItems = [column['Content'] for column in (structure['Columns'] or [])]

        if any(name in columnItems for name in timeDims):
            # Time runs across the columns: no transposition needed
            result.append([table['Code'], table['Content'], False])
            
        elif any(name in rowItems for name in timeDims):
            # Time runs down the rows only: flag the table for transposition
            result.append([table['Code'], table['Content'], True])

    return result    

In [13]:
# WARNING - THIS CELL TAKES >1 HOUR TO RUN WHEN catalog.csv DOES NOT EXIST YET

# Reuse the cached catalogue of time-series tables when one exists
try:
    catalog = pd.read_csv('catalog.csv')

# Otherwise build it topic by topic and cache it for later runs
except (FileNotFoundError, pd.errors.EmptyDataError):
    
    records = []
    
    for topic in topics:
    
        records += Timeseries (topic)
        
        # Progress indicator: one line per processed topic
        print (topic)

    # Bind the DataFrame to `catalog` so later cells can call catalog.head() and
    # catalog.iterrows() after a fresh build too (previously `catalog` stayed a list)
    catalog = pd.DataFrame(records, columns = ['ID','Name','Transpose'])
    catalog.to_csv('catalog.csv', index = False)

In [18]:
# Report the size of the table catalogue, then preview its first five rows on a new line
tableCount, tablePreview = len(catalog), catalog.head()
print ('There are {} tables. Here are the first 5:{}{}'.format(tableCount, '\n', tablePreview))

There are 1175 tables. Here are the first 5:
           ID                                               Name  Transpose
0  11111-0001          Size of territory: Länder, reference date      False
1  11111-0002  Size of territory: Administrative districts, r...      False
2  12211-9000  Population, persons in employment, unemployed ...       True
3  12211-9001  Population, persons in employment, unemployed ...       True
4  12211-9002  Population, persons in employment, unemployed ...       True


In [662]:
### AUXILIARY FUNCTIONS ###

def Metadata (item):
    """Fetch the metadata of one table and summarise it.

    The metadata endpoint is called twice, in English and in German, so that
    German row names can be mapped onto their English counterparts.

    Parameters
    ----------
    item : table code, formatted into the metadata URL as the `name` parameter.

    Returns
    -------
    (df, translation)
        df          : one-column DataFrame (column label '') indexed by
                      'Resource ID', 'Resource Name', 'Date From', 'Date To',
                      'Rows' and 'Columns'. 'Rows' / 'Columns' hold the list of
                      names, or '-' when the table has none.
        translation : dict mapping German row names to English row names,
                      paired by position; empty when the table has no rows.

    Relies on the notebook-level `username` and `password`.
    """
    
    metadataURL = 'https://www-genesis.destatis.de/genesisWS/rest/2020/metadata/table?username={}&password={}&name={}&area=all&language={}'
    
    # Same resource in both languages
    metadata = requests.get(metadataURL.format(username, password, item, 'en')).json()['Object']
    metadataDe = requests.get(metadataURL.format(username, password, item, 'de')).json()['Object']
    
    rows = metadata['Structure']['Rows']
    columns = metadata['Structure']['Columns']
    
    if rows:
        rowNames = [row['Content'] for row in rows]
        rowNamesDe = [row['Content'] for row in (metadataDe['Structure']['Rows'] or [])]
        
        # German label -> English label, paired by position
        translation = dict(zip(rowNamesDe, rowNames))
    
    else:
        # No row information: keep the '-' placeholder and an empty translation
        # (the German names used to be left undefined here, raising UnboundLocalError)
        rowNames = '-'
        translation = {}
                     
    columnNames = [column['Content'] for column in columns] if columns else '-'
    
    Index = {
        'Resource ID': metadata['Code'],
        'Resource Name': metadata['Content'],
        'Date From': metadata['Time']['From'],
        'Date To': metadata['Time']['To'],
        'Rows': rowNames,
        'Columns': columnNames,
    }
    
    # One row per metadata field, single unnamed column
    df = pd.DataFrame.from_dict(Index, orient = 'index', columns = [''])
    
    return (df, translation)


# Trim a raw table down to the rows between its leading and trailing descriptive lines
def DF (df):
    """Cut the surrounding descriptive rows off a freshly read table.

    Leading rows are skipped for as long as their first column holds a value;
    the first row whose first column is null becomes the start of the result.
    Trailing rows are skipped for as long as their last column is null.
    Remaining NaN values are replaced by empty strings.

    Returns the trimmed DataFrame (the original row labels are kept).
    """

    # Position of the first row whose first column is null (itertuples puts the
    # row label at position 0, so position 1 is the first column); len(df) if none
    beginning = next(
        (position for position, record in enumerate(df.itertuples()) if pd.isnull(record[1])),
        len(df),
    )

    # Number of rows at the bottom whose last column is null, counted upwards
    # until the first row that has a value there
    trailing = next(
        (offset for offset, record in enumerate(df[::-1].itertuples()) if not pd.isnull(record[-1])),
        len(df),
    )
    end = len(df) - trailing

    # Keep the middle part and blank out the remaining NaN cells
    trimmed = df.iloc[beginning:end,:]
    
    return trimmed.replace(np.nan, '')

def RowFixer (row):
    """Fill null cells of a row with the closest non-null value to their left.

    Cells before the first non-null value become ''. The row is modified in
    place and also returned.

    Parameters
    ----------
    row : pandas Series (accessed by position, whatever its labels are) or any
          other mutable sequence supporting integer indexing, e.g. a list.
    """
    
    # Series are addressed through .iloc so that integer keys always mean
    # positions; plain row[0] on a label-indexed Series is deprecated in pandas
    cells = row.iloc if isinstance(row, pd.Series) else row
    
    replacement = ''
    
    for index in range(len(row)):
        
        if pd.isnull(cells[index]):
            cells[index] = replacement
        
        else:
            # Remember the latest real value for the gaps that follow
            replacement = cells[index]
    
    return row

def ColumnFixer (df):
    """Clean a table: fill header gaps, drop sparse rows, convert numbers, build headers.

    Steps, in order:
      1. rows whose first cell is null are gap-filled with RowFixer;
      2. rows containing exactly two distinct values are dropped;
      3. every column whose label is not '' is converted to numeric, first
         as-is, then after replacing ',' by '.' and blanking cells equal to
         '-' or '.';
      4. rows whose first cell is '' are collected as header rows. With two
         of them the columns become a MultiIndex and the first two rows are
         removed; otherwise as many leading rows as header rows were found
         are removed.

    The input frame is not modified; a new DataFrame with a fresh 0..n-1 index
    is returned.
    """
    
    # Work on a renumbered copy: labels then equal positions, which the
    # positional assignment below relies on, and the caller's frame stays intact
    df = df.reset_index(drop=True)
    
    for index,row in df.iterrows():
        
        # .iloc[0] reads the first cell whatever the column labels are
        if pd.isnull(row.iloc[0]):
            
            df.iloc[index,:] = RowFixer(row)
            
    # NOTE(review): this removes every row with exactly two distinct values,
    # which also hits a data row whose values all coincide - confirm intended.
    sparseRows = [index for index,row in df.iterrows() if len(set(row)) == 2]
    df = df.drop(index = sparseRows).reset_index(drop=True)
                
    for column in [column for column in df.columns if column != '']:
        
        try:
            df[column] = pd.to_numeric(df[column])
            # Already numeric: the text clean-up below does not apply
            continue
        
        except Exception:
            pass
        
        # Second attempt with decimal commas converted and the placeholder
        # cells '-' / '.' blanked.
        # NOTE(review): if the final conversion fails, the column keeps the
        # replaced text (unchanged from the earlier behaviour).
        try:
            df[column] = df[column].str.replace(',','.')
            df[column] = df[column].replace('-','').replace('.','')
            df[column] = pd.to_numeric(df[column])
        
        except Exception:
            pass
    
    headerRows = []
    
    for index,row in df.iterrows():
        
        if df.iloc[index,0] == '':
            
            headerRows.append(list(row))
            
            if len(headerRows) == 2:
                
                # Two header rows form a two-level column index
                df.columns = pd.MultiIndex.from_arrays(headerRows)

                return df.iloc[2:,:].reset_index(drop=True)
            
    return df.iloc[len(headerRows):,:].reset_index(drop=True)

In [642]:
def CSV (item, metadata, translation, transpose):
    """Download one table as CSV and return it as a cleaned DataFrame.

    Parameters
    ----------
    item        : table code, formatted into the download URL as `name`.
    metadata    : not used here; kept so existing calls keep working.
    translation : dict used to replace first-column labels (after removing
                  'nachr.: ' and surrounding whitespace); labels without an
                  entry are kept in their stripped form.
    transpose   : value formatted into the URL as the `transpose` parameter.

    Returns the result of ColumnFixer applied to the trimmed table whose first
    remaining row has been promoted to column labels.

    Relies on the notebook-level `username` and `password`.
    """
        
    # URL of the timeseries
    resourceURL = 'https://www-genesis.destatis.de/genesisWS/rest/2020/data/tablefile?username={}&password={}&name={}&area=all&transpose={}&startyear=1900&endyear=2100&format=csv&language=en'.format(username,password,item,transpose)    
    
    df = pd.read_csv (resourceURL, delimiter = ';').reset_index()
    
    # Strip the descriptive lines around the actual table
    df = DF (df)
    
    labels = []
    
    for label in df.iloc[:,0]:
        
        label = label.replace('nachr.: ','').strip()
        
        # Translate where an entry exists, otherwise keep the cleaned label
        labels.append(translation.get(label, label))
                
    df.iloc[:,0] = labels     
    
    # Promote the first row to column labels and renumber the remaining rows.
    # drop=True avoids inserting and then removing a helper 'index' column,
    # which failed or removed data when a header was itself called 'index'.
    df.columns = df.iloc[0,:]
    df = df.iloc[1:,:].reset_index(drop=True)
    
    return ColumnFixer(df)

In [664]:
# Download every catalogued table and store its data and metadata under
# <working directory>/<topic>/<table ID>/
for _, entry in catalog.iterrows():
    
    # Columns are read by label; integer keys on a label-indexed row are deprecated in pandas
    tableID = entry['ID']
    topic = tableID.split('-')[0]
    
    try:
        metadata,translation = Metadata (tableID)
        df = CSV(tableID, metadata, translation, entry['Transpose'])
    
    # Best effort: report the failing table and carry on with the next one
    except Exception as exception:
        print (tableID,exception)
        continue

    path = os.path.join(os.getcwd(), topic, tableID)

    # exist_ok avoids the separate existence check and the race that comes with it
    os.makedirs(path, exist_ok = True)

    df.to_csv(os.path.join(path, 'data.csv'), index = False)
    metadata.to_csv(os.path.join(path, 'metadata.csv'), index = True)

    print (tableID,'-',path)

11111-0001 - /Users/muditsharma/Documents/Knoema/Federal Statistical Office (Destatis)/11111/11111-0001
11111-0002 - /Users/muditsharma/Documents/Knoema/Federal Statistical Office (Destatis)/11111/11111-0002
12211-9000 - /Users/muditsharma/Documents/Knoema/Federal Statistical Office (Destatis)/12211/12211-9000
12211-9001 - /Users/muditsharma/Documents/Knoema/Federal Statistical Office (Destatis)/12211/12211-9001
12211-9002 - /Users/muditsharma/Documents/Knoema/Federal Statistical Office (Destatis)/12211/12211-9002
12211-9003 - /Users/muditsharma/Documents/Knoema/Federal Statistical Office (Destatis)/12211/12211-9003
12211-9004 - /Users/muditsharma/Documents/Knoema/Federal Statistical Office (Destatis)/12211/12211-9004
12211-9005 - /Users/muditsharma/Documents/Knoema/Federal Statistical Office (Destatis)/12211/12211-9005
12211-9006 - /Users/muditsharma/Documents/Knoema/Federal Statistical Office (Destatis)/12211/12211-9006
12211-9007 - /Users/muditsharma/Documents/Knoema/Federal Statist