# EBI Metadata - Missing Values Analysis

In [2]:
import pandas as pd

# Columns that are read as generic Python objects instead of letting pandas
# infer a dtype for them.
# NOTE(review): presumably these hold free text / mixed content - confirm.
object_columns = [
    'collected_by',
    'collection_date',
    'culture_collection',
    'identified_by',
    'isolate',
    'isolation_source',
    'keywords',
    'lab_host',
    'location',
    'sample_accession',
    'strain',
    'study_accession',
]

# Load the preprocessed EBI metadata table
df = pd.read_csv('../data/metadata_EBI_preprocessed.csv',
                 dtype=dict.fromkeys(object_columns, object))

## Dependence of Missing Values on Countries and Institutes

In [None]:
# Number of missing entries in every column of the metadata table
missing_per_column = df.isna().sum()
missing_per_column

In [None]:
# Names of all columns that contain at least one missing value
has_missing = df.isna().any(axis=0)
cols_missing_vals = df.columns[has_missing]

In [None]:
import json

# Load number of inhabitants per country (taken from Wikipedia).
# The encoding is given explicitly so that country names with non-ASCII
# characters are decoded the same way on every platform instead of depending
# on the locale's default encoding.
with open('json_data/country_inhabitants.json', encoding='utf-8') as f:
    country_inhabitants_map = json.load(f)

In [None]:
import os

# Make sure the output directory for the per-column plots exists.
# `makedirs` also creates a missing '../plots' parent, and `exist_ok=True`
# keeps the cell safe to re-run without a separate existence check.
os.makedirs('../plots/missing_vals_per_col', exist_ok=True)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Plot country distribution for each column with missing values.
# For every such column one PNG is written that shows, per country, the number
# of rows in which the column is missing divided by the country's number of
# inhabitants (taken from `country_inhabitants_map`).
for col in cols_missing_vals:
    # Countries of all rows in which `col` is missing
    df_temp = df.loc[df[col].isna(), ['country']]
    # Bars are ordered by the raw (un-normalized) count, most frequent first
    country_order = df_temp['country'].value_counts().index.tolist()

    fig, ax = plt.subplots(figsize=(42, 22))
    # A single `color` replaces the former palette list, which held one entry
    # per row and relied on passing `palette` without `hue`.
    sns.countplot(data=df_temp,
                  x='country',
                  color='blue',
                  order=country_order,
                  ax=ax)

    fig.suptitle(f'Distribution of Countries for Rows with Missing Values in "{col}"', fontsize=22)
    ax.set_xlabel('Country', fontsize=18)
    ax.set_ylabel('Count Normalized by Number of Inhabitants', fontsize=18)
    ax.tick_params(axis='x', labelsize=14, labelrotation=90)
    ax.tick_params(axis='y', labelsize=14)

    # Normalize height of bars with number of inhabitants.
    # The bars are drawn in `country_order`, so the country of each bar is
    # taken from that list rather than parsed back from the tick-label text.
    max_y = 0
    for patch, country in zip(ax.patches, country_order):
        if country == 'Unknown':
            # No population figure exists for unknown countries
            new_height = 0.0
        else:
            new_height = patch.get_height() / country_inhabitants_map[country]

        max_y = max(max_y, new_height)
        patch.set_height(new_height)

    # Leave 5% headroom above the tallest bar; skip when every bar is zero,
    # because identical lower and upper limits are not a valid axis range.
    if max_y > 0:
        ax.set_ylim([0, max_y + (max_y / 20)])

    fig.savefig(f'../plots/missing_vals_per_col/country_count_of_missing_vals_in_{col}_normalized.png', dpi=300)
    # Close explicitly so that figures do not pile up in memory across the loop
    plt.close(fig)

In [None]:
# How often each distinct value occurs in the 'collected_by' column.
# 385 unique entries
# -> the per-country plots above cannot be created for this column
#    (at least not out-of-the-box; entries would have to be summarized and/or deleted)
collected_by_counts = df['collected_by'].value_counts()
collected_by_counts

## Missing Values Overview per Country

In [9]:
# Get absolute amount of missing values per country and column.
# Rows are indexed by country, every cell is turned into a missing/not-missing
# flag, and the flags are summed within each country.
# `groupby(level=0).sum()` replaces the deprecated `sum(level=0)`, which emits a
# FutureWarning in pandas 1.3+ and is no longer accepted in pandas 2.0.
n_missing = (
    df.set_index('country')
      .isna()
      .groupby(level=0)
      .sum()
      .sort_index()
)
n_missing

  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,accession,base_count,collected_by,collection_date,country_and_region,culture_collection,dataclass,description,environmental_sample,first_public,...,sequence_md5,sequence_version,strain,study_accession,tax_division,tax_id,topology,gender,age,n_inhabitants
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Argentina,0,0,43,0,0,43,0,0,0,0,...,0,0,43,0,0,0,0,0,43,0
Armenia,0,0,48,0,0,48,0,0,0,0,...,0,0,48,48,0,0,0,0,48,0
Australia,0,0,12977,0,0,13311,0,0,0,0,...,0,0,13311,7755,0,0,0,0,13311,0
Austria,0,0,242,0,0,242,0,0,0,0,...,0,0,242,242,0,0,0,0,242,0
Bahrain,0,0,2884,0,0,2884,0,0,0,0,...,0,0,2884,2884,0,0,0,0,2884,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Venezuela,0,0,40,0,0,40,0,0,0,0,...,0,0,40,40,0,0,0,0,40,0
Viet Nam,0,0,8,0,0,11,0,0,0,0,...,0,0,11,11,0,0,0,0,11,0
West Bank,0,0,63,0,0,63,0,0,0,0,...,0,0,63,63,0,0,0,0,63,0
Zambia,0,0,1,0,0,1,0,0,0,0,...,0,0,1,1,0,0,0,0,1,0


In [4]:
# Number of rows (samples) per country, as a plain {country: count} dict
rows_per_country = df.groupby('country').size()
country_counts = rows_per_country.to_dict()
country_counts

{'Argentina': 43,
 'Armenia': 48,
 'Australia': 13311,
 'Austria': 242,
 'Bahrain': 2884,
 'Bangladesh': 666,
 'Belarus': 2,
 'Belgium': 5,
 'Belize': 4,
 'Benin': 12,
 'Brazil': 322,
 'Cambodia': 2,
 'Cameroon': 1,
 'Canada': 169,
 'Chile': 383,
 'China': 297,
 'Colombia': 9,
 'Croatia': 1,
 'Cuba': 2,
 'Czech Republic': 24,
 'Denmark': 15,
 'Djibouti': 288,
 'Dominican Republic': 9,
 'Ecuador': 4,
 'Egypt': 1071,
 'Estonia': 1681,
 'Ethiopia': 7,
 'Finland': 28,
 'France': 1618,
 'Gabon': 17,
 'Gambia': 6,
 'Georgia': 21,
 'Germany': 165037,
 'Ghana': 297,
 'Greece': 98,
 'Guam': 3,
 'Guatemala': 10,
 'Guinea': 13,
 'Hong Kong': 347,
 'Hungary': 36,
 'Iceland': 5365,
 'India': 2062,
 'Indonesia': 9,
 'Iran': 416,
 'Iraq': 284,
 'Israel': 125,
 'Italy': 333,
 'Jamaica': 8,
 'Japan': 880,
 'Jordan': 28,
 'Kazakhstan': 18,
 'Kenya': 2320,
 'Lebanon': 931,
 'Libya': 46,
 'Liechtenstein': 164,
 'Malaysia': 77,
 'Mali': 42,
 'Malta': 12,
 'Mexico': 2922,
 'Morocco': 17,
 'Myanmar': 45,
 'N

In [10]:
# Turn the absolute counts into fractions (0.0 - 1.0): every row of `n_missing`
# is divided by the total number of rows of the corresponding country.
percentage_missing = n_missing.divide(country_counts, axis='index')
percentage_missing

Unnamed: 0_level_0,accession,base_count,collected_by,collection_date,country_and_region,culture_collection,dataclass,description,environmental_sample,first_public,...,sequence_md5,sequence_version,strain,study_accession,tax_division,tax_id,topology,gender,age,n_inhabitants
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Argentina,0.0,0.0,1.000000,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.000000,0.0,0.0,0.0,0.0,1.0,0.0
Armenia,0.0,0.0,1.000000,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.000000,0.0,0.0,0.0,0.0,1.0,0.0
Australia,0.0,0.0,0.974908,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.582601,0.0,0.0,0.0,0.0,1.0,0.0
Austria,0.0,0.0,1.000000,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.000000,0.0,0.0,0.0,0.0,1.0,0.0
Bahrain,0.0,0.0,1.000000,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.000000,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Venezuela,0.0,0.0,1.000000,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.000000,0.0,0.0,0.0,0.0,1.0,0.0
Viet Nam,0.0,0.0,0.727273,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.000000,0.0,0.0,0.0,0.0,1.0,0.0
West Bank,0.0,0.0,1.000000,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.000000,0.0,0.0,0.0,0.0,1.0,0.0
Zambia,0.0,0.0,1.000000,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.000000,0.0,0.0,0.0,0.0,1.0,0.0


In [17]:
import os

# Make sure the output directory for the per-country plots exists.
# `makedirs` also creates a missing '../plots' parent, and `exist_ok=True`
# keeps the cell safe to re-run without a separate existence check.
os.makedirs('../plots/missing_vals_per_country', exist_ok=True)

In [20]:
import matplotlib.pyplot as plt
import seaborn as sns

# One bar chart per country: the fraction of missing values in every column.
# Each chart is written as a PNG into '../plots/missing_vals_per_country'.
sns.set_style('whitegrid')
cols = percentage_missing.columns.tolist()

for country, count in country_counts.items():
    # Row of `percentage_missing` for this country, in the same order as `cols`
    country_details = percentage_missing.loc[country].tolist()
    # Spaces in the country name are replaced for the output file name
    file_stem = country.replace(" ", "_")

    fig, ax = plt.subplots(figsize=(16, 10))
    ax.bar(cols, country_details)

    fig.suptitle(f'Missing Values per Column for {country} ({count} Samples)', fontsize=22)
    ax.set_xlabel('Column Names', fontsize=18)
    ax.set_ylabel('Missing Values', fontsize=18)
    plt.xticks(fontsize=14, rotation=90)
    plt.yticks(fontsize=14)

    fig.tight_layout()
    fig.savefig(f'../plots/missing_vals_per_country/missing_vals_for_{file_stem}.png', dpi=300)
    plt.close(fig)