In [None]:
import matplotlib.pyplot as plt
import missingno as msno
import pandas as pd

from src.opendata import OpenDataZH

# Project helper from src.opendata; later cells call odz.get_package() on it.
odz = OpenDataZH()

In [None]:
# Name of the package to load; passed to odz.get_package(name=...) further down.
package_id = "bau_hae_lima_zuordnung_adr_quartier_bzo16_bzo99_od5143"
# Optional id of one specific tabular resource. When left as None (or set to the
# string "none" in any letter case) the resource-selection cell falls back to
# package.tabular_resource(0).
resource_id = None

In [None]:
# Use the name of a dataset, e.g., from the OpenDataZH site or the list above, to get the package.
# `package` is the object the following metadata and resource cells work on.
package = odz.get_package(name=package_id)

In [None]:
# Show the package-level metadata through the project's own display helper.
# NOTE(review): what exactly is rendered is defined in src.opendata, not visible here.
package.display_metadata()

In [None]:
# Show a summary of the package's resources through the project's display helper.
# NOTE(review): the summary's contents are defined in src.opendata, not visible here.
package.display_resource_summary()

In [None]:
# Preview file name and format for the first rows of the tabular-resource metadata.
package.tabular_resource_metadata_df.loc[:, ["filename", "format"]].head()

In [None]:
# Pick the resource to analyse: without a usable resource_id (None or the string
# "none" in any letter case) take the first tabular resource, otherwise look the
# resource up by its id.
if resource_id is None or resource_id.upper() == "NONE":
    resource = package.tabular_resource(0)
else:
    resource = package.tabular_resource(id=resource_id)
resource.display_metadata()

In [None]:
# Tabular content of the selected resource; the cells below use `df` as a pandas
# DataFrame (dropna, duplicated, describe, hist, ...).
df = resource.df

In [None]:
# Drop columns that have no values at all.
# `df` is rebound to the filtered result instead of using inplace=True, so the
# object originally obtained from `resource.df` is left untouched and this cell
# stays a pure "input -> output" step that is safe to re-run.
df = df.dropna(how="all", axis=1)

In [None]:
# Report the frame's dimensions, then the number of rows flagged by df.duplicated().
n_rows, n_cols = df.shape
print(f'The dataset has {n_rows:,.0f} rows (observations) and {n_cols:,.0f} columns (variables).')
n_duplicates = df.duplicated().sum()
print(f'There seem to be {n_duplicates} exact duplicates in the data.')

In [None]:
# Per-column overview (dtype, non-null count); verbose=True forces the full column
# list, memory_usage='deep' has pandas inspect object columns for their real size.
df.info(memory_usage='deep', verbose=True)

In [None]:
# Quick look at the first rows (pandas' default of five) of the frame.
df.head()

In [None]:
# Display a small random sample transposed in order to see all variables.
# The sample size is capped at the number of rows: DataFrame.sample raises a
# ValueError when asked for more rows than exist (replace=False), which would
# break this cell for datasets with fewer than three rows.
df.sample(n=min(3, len(df))).T

In [None]:
# Describe non-numerical features.
# An explicit column check replaces the former bare `except:`, which reported
# "No categorical data" for *any* failure (including NameError or a keyboard
# interrupt). Genuine errors from describe() now surface instead of being hidden.
if df.select_dtypes(exclude='number').shape[1] == 0:
    print("No categorical data in dataset.")
else:
    with pd.option_context('display.float_format', '{:,.2f}'.format):
        display(df.describe(exclude='number'))

In [None]:
# Describe numerical features.
# An explicit column check replaces the former bare `except:`, which reported
# "no data" for *any* failure (including NameError or a keyboard interrupt).
# Genuine errors from describe() now surface instead of being hidden.
# The message's spelling ("numercial") is corrected as well.
if df.select_dtypes(include='number').shape[1] == 0:
    print("No numerical data in dataset.")
else:
    with pd.option_context('display.float_format', '{:,.2f}'.format):
        display(df.describe(include='number'))

In [None]:
# Check missing values with missingno.
# https://github.com/ResidentMario/missingno
# `msno` is imported in the notebook's import cell at the top, so all dependencies
# are declared in one place and a missing package fails on the first cell.
# The trailing `;` suppresses the returned Axes repr in the cell output.
msno.matrix(df, labels=True, sort='descending');

In [None]:
# Plot a histogram for each numerical feature.
try:
    df.hist(bins=25, rwidth=.9)
except ValueError:
    # pandas raises ValueError when the frame has no column it can plot.
    # Only that case is reported as "no data"; the former bare `except:` also
    # swallowed unrelated errors (NameError, keyboard interrupt, ...).
    print("No numerical data to plot.")
else:
    # Only lay out and show the figure when the histograms were actually drawn.
    plt.tight_layout()
    plt.show()