# EBI Metadata - Missing Values Analysis

In [None]:
import pandas as pd

# Load the preprocessed EBI metadata. The columns listed below are read with
# dtype `object` instead of letting pandas infer a dtype for them.
df = pd.read_csv(
    '../data/metadata_EBI_preprocessed.csv',
    dtype=dict.fromkeys(
        [
            'collected_by',
            'collection_date',
            'culture_collection',
            'identified_by',
            'isolate',
            'isolation_source',
            'keywords',
            'lab_host',
            'location',
            'sample_accession',
            'strain',
            'study_accession',
        ],
        object,
    ),
)

## Dependence of Missing Values on Countries and Institutes

In [None]:
# Number of missing values in each column
df.isna().sum(axis=0)

In [None]:
# Names of all columns that contain at least one missing value
cols_missing_vals = df.columns[df.isna().any(axis=0)]

In [None]:
import json

# Load number of inhabitants per country (taken from Wikipedia).
# The encoding is given explicitly so that non-ASCII country names are decoded
# the same way on every platform instead of depending on the locale default.
with open('json_data/country_inhabitants_map.json', encoding='utf-8') as f:
    country_inhabitants_map = json.load(f)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Plot country distribution for each column with missing values.
# For every such column, one bar chart is saved that shows, per country, the
# number of rows lacking a value in that column divided by the country's
# number of inhabitants.
for col in cols_missing_vals:
    # Only the country of each affected row is needed; the values of `col`
    # are all NaN in this subset. Selecting just 'country' also avoids a
    # duplicated column when `col` is 'country' itself.
    df_temp = df.loc[df[col].isna(), ['country']]

    # Bars are ordered by raw (un-normalized) count, most frequent first
    country_order = df_temp['country'].value_counts().index

    fig, ax = plt.subplots(figsize=(42, 22))
    # A single `color` replaces a per-row palette list: passing `palette`
    # without `hue` is deprecated in recent seaborn versions.
    sns.countplot(data=df_temp,
                  x='country',
                  color='blue',
                  order=country_order,
                  ax=ax)

    fig.suptitle(f'Distribution of Countries for Rows with Missing Values in "{col}"', fontsize=22)
    ax.set_xlabel('Country', fontsize=18)
    ax.set_ylabel('Count Normalized by Number of Inhabitants', fontsize=18)
    ax.tick_params(axis='x', labelsize=14, labelrotation=90)
    ax.tick_params(axis='y', labelsize=14)

    # Normalize height of bars with number of inhabitants.
    # Bars are paired with the same `order` that was handed to countplot, so
    # the pairing does not depend on the rendered tick label text.
    # A country missing from `country_inhabitants_map` raises a KeyError.
    max_y = 0
    for patch, country in zip(ax.patches, country_order):
        if country == 'Unknown':
            # No inhabitant count exists for 'Unknown'; its bar is hidden
            new_height = 0.0
        else:
            new_height = patch.get_height() / country_inhabitants_map[country]

        max_y = max(max_y, new_height)
        patch.set_height(new_height)

    # Rescale the y-axis to the normalized bars with 5% headroom; skipped when
    # every bar is zero because identical lower/upper limits are invalid.
    if max_y > 0:
        ax.set_ylim([0, max_y + (max_y / 20)])

    fig.savefig(f'../plots/missing_vals/country_count_of_missing_vals_in_{col}_normalized.png', dpi=300)

In [None]:
# Frequency of each distinct (non-missing) value in 'collected_by'
df['collected_by'].value_counts(dropna=True)
# 385 unique entries
# -> the per-country style plots above cannot be created for this column
#    out-of-the-box; entries would first have to be summarized and/or deleted

## Country Cards

For each country, the following code cells generate a PDF page showing the percentage of missing values in each column.

In [None]:
# TODO: generate one PDF page per country showing the percentage of missing values in each column