# EBI Metadata

## Descriptive Statistics

In [None]:
import pandas as pd

# Columns that are read with an explicit `object` dtype instead of an inferred one
object_columns = ['collected_by',
                  'collection_date',
                  'culture_collection',
                  'identified_by',
                  'isolate',
                  'isolation_source',
                  'keywords',
                  'lab_host',
                  'location',
                  'sample_accession',
                  'strain',
                  'study_accession']

# Load the tab-separated raw EBI metadata export
metadata_ebi = pd.read_csv('../data/results_sequence_tsv.txt',
                           sep='\t',
                           dtype=dict.fromkeys(object_columns, object))

metadata_ebi.info()  # 51 cols, 2.5 million entries

In [None]:
# Count the missing (NaN) values in each column
metadata_ebi.isna().sum()

In [None]:
# Collect the names of the columns in which every single value is NaN
is_all_nan = metadata_ebi.isna().all()
nan_cols = is_all_nan[is_all_nan].index.tolist()
nan_cols

In [None]:
# Drop the all-NaN columns, leaving the 29 relevant ones (5 of them still contain a lot of NaN values)
metadata_ebi_relevant_cols = metadata_ebi.drop(columns=nan_cols)
metadata_ebi_relevant_cols.isna().sum()

In [None]:
# Rename 'country' to 'country_and_region' and replace its NaN values with empty strings
region_column = 'country_and_region'
metadata_ebi_relevant_cols.rename(columns={'country': region_column}, inplace=True)
metadata_ebi_relevant_cols[region_column] = metadata_ebi_relevant_cols[region_column].fillna('')

# Save as CSV (without the index column)
# NOTE: pandas.read_csv treats empty fields as NaN by default, so the empty
# strings written here come back as NaN when this file is loaded again.
metadata_ebi_relevant_cols.to_csv('../data/metadata_EBI_relevant_cols.csv', index=False)

In [None]:
import pandas_profiling
import json

# Correlation measures to compute for the report (everything except Cramér's V)
correlation_settings = {'pearson': {'calculate': True},
                        'spearman': {'calculate': True},
                        'kendall': {'calculate': True},
                        'phi_k': {'calculate': True},
                        'cramers': {'calculate': False}}

# Create fancy profile report
# See https://pandas-profiling.github.io/pandas-profiling/docs/master/rtd/pages/advanced_usage.html for advanced usage
profile_ebi = pandas_profiling.ProfileReport(metadata_ebi_relevant_cols,
                                             title='EBI Metadata Profiling',
                                             minimal=True,
                                             correlations=correlation_settings)

# Attach the column definitions stored next to the notebook as variable descriptions
with open('definitions_EBI_metadata.json') as definitions_file:
    definitions_ebi_metadata = json.load(definitions_file)
profile_ebi.config.variables.descriptions = definitions_ebi_metadata

# Save as html
profile_ebi.to_file(output_file='../data/profile_EBI_metadata.html')

## Next Steps & Open Questions

- are empty columns telling us something and which other columns can be deleted?
-> meaning of cols & col values (like 'dataclass' categories)? 'location' = 'country'?
- extract age and race from 'host' for data enrichment/ more insights/ correlations
-> which other columns encode several pieces of information at once?
- 'last_updated': count over time -> is there an unequal distribution over time? / temporal bias?
- are there any empty or duplicate rows?
- look at columns with a lot of missing values -> undercoverage/ negative set bias?
- look at constant columns -> does the same 'tax_id', 'tax_division' or 'culture_collection' everywhere indicate overcoverage bias?
- look at columns with high cardinality
- how useful are correlation plots? -> do 'sequence_version' and 'base_count' correlate? 'identified_by' and 'base_count'? 'dataclass' and 'mol_type'?

## Data Cleaning

In [None]:
import pandas as pd

# Columns that are read with an explicit `object` dtype instead of an inferred one
text_columns = ('collected_by', 'collection_date', 'culture_collection',
                'identified_by', 'isolate', 'isolation_source',
                'keywords', 'lab_host', 'location',
                'sample_accession', 'strain', 'study_accession')

# Reload the reduced metadata written by the descriptive-statistics section
df = pd.read_csv('../data/metadata_EBI_relevant_cols.csv',
                 dtype={column: object for column in text_columns})
df.info()

In [None]:
# TODO: Which columns are not relevant for further analysis?
cols_to_drop = [
    # all 'SARS-CoV-2'
    'scientific_name',
    # MD5 checksum should have no further meaning as it is a hash value
    'sequence_md5',
    # nearly all version 1
    'sequence_version',
    # name of study accession should have no influence on biases - or does it encode e.g. country?
    'study_accession',
    # same as country?
    'location',
    # ?
    'environmental_sample',
]

In [None]:
# Clean up the keyword column: start by inspecting the raw values
df.loc[:, 'keywords'].value_counts()

In [None]:
# Map every observed spelling variant of a keyword to one canonical spelling.
keyword_variants = {
    'purpose_of_sampling:baseline_surveillance': ['purposeofsampling:baselinesurveillance',
                                                  'puposeofsampling:baselinesurveillance',
                                                  'purpose_of_sequencing:baselinesurveillance',
                                                  'purposeofsampling=baselinesurveillance'],
    'purpose_of_sampling:targeted_efforts': ['purposeofsampling:targetedefforts',
                                             'purposeofsampling=targetedefforts'],
    'purpose_of_sampling:targeted_sequencing': ['purposeofsampling:targeted_sequencing'],
}
keyword_replacements = {variant: canonical
                        for canonical, variants in keyword_variants.items()
                        for variant in variants}

# Assign the result back instead of calling replace(..., inplace=True) on
# df['keywords']: the in-place call on a column selection is chained assignment,
# which leaves df unchanged when pandas Copy-on-Write is active.
df['keywords'] = df['keywords'].replace(keyword_replacements)

df['keywords'].value_counts()

## Data Enrichment

In [None]:
# Add gender column: start by inspecting the raw 'host' values
df.loc[:, 'host'].value_counts()

In [None]:
def get_gender(host_value):
    """Classify a host description as 'female', 'male' or 'unknown'.

    The match is a case-insensitive substring search for a few known
    spellings (including the typo 'femle' and the 'gender: f' / 'gender: m'
    notation).

    Args:
        host_value: Host description as a string.

    Returns:
        'female', 'male' or 'unknown'.
    """
    text = host_value.lower()
    # Order matters: 'female' contains 'male', so the female markers go first.
    markers_by_gender = (
        ('female', ('female', 'femle', 'gender: f')),
        ('male', ('male', 'gender: m')),
    )
    for gender, markers in markers_by_gender:
        if any(marker in text for marker in markers):
            return gender
    return 'unknown'

In [None]:
# Derive the gender from the string form of every 'host' value
df['gender'] = df['host'].map(lambda host: get_gender(str(host)))
df['gender'].value_counts()

In [None]:
# Verify that all gender descriptions are caught: list the hosts still labelled 'unknown'
is_unknown_gender = df['gender'] == 'unknown'
df_unknown_gender = df.loc[is_unknown_gender]

# Lift the row limit so the complete list of host values is displayed
pd.set_option('display.max_rows', None)
df_unknown_gender['host'].value_counts()

In [None]:
# TODO: Add age column
def get_age(host_value):
    """Extract a two-character age snippet from a host description.

    Two notations are recognised, both case-insensitively:

    * 'age' followed by an optional ':' and an optional single space, then
      the age ('age 45', 'age: 45', 'age:45', 'age45'); only the first
      occurrence of 'age' is considered;
    * the age followed by ' year old' ('45 year old').

    The result is a fixed-width slice of the input, so it is not guaranteed
    to be numeric (e.g. one-digit ages drag along a neighbouring character).

    Args:
        host_value: Host description as a string.

    Returns:
        The extracted snippet (possibly empty), or None when the description
        contains neither 'age' nor 'year old'.
    """
    lowered = host_value.lower()

    marker = lowered.find('age')
    if marker != -1:
        after = marker + 3  # position of the first character following 'age'
        # Slices (unlike indexing) never raise, so a description that ends
        # right after 'age' or 'age:' yields '' instead of an IndexError.
        separator = host_value[after:after + 1]
        if separator == ' ':
            start = after + 1
        elif separator == ':':
            start = after + 2 if host_value[after + 1:after + 2] == ' ' else after + 1
        else:
            start = after
        return host_value[start:start + 2]

    marker = lowered.find('year old')
    if marker != -1:
        # Clamp at 0: a negative bound would wrap around to the end of the
        # string and return unrelated characters (or nothing) for short inputs.
        return host_value[max(marker - 3, 0):max(marker - 1, 0)]

    return None

# Derive the age snippet from the string form of every 'host' value
df['age'] = df['host'].map(lambda host: get_age(str(host)))
df['age'].value_counts()

In [None]:
# TODO: Check how host is set if age is not numeric
df['age'] = df['age'].astype(str)
age_is_not_numeric = df['age'].map(lambda age: not age.isnumeric())
df.loc[age_is_not_numeric, ['age', 'host']]

# TODO: Investigate ages < 10

In [None]:
# TODO: Add race column

## Investigation of National Bias (Undercoverage)

In [None]:
# Extract the country, i.e. the part of 'country_and_region' before the first ':'.
# Missing values are replaced by '' first: read_csv loads empty fields as NaN, and
# astype(str) alone would turn those into a bogus country literally named 'nan',
# which the `!= ''` filter applied to the rare countries cannot remove.
df['country_and_region'] = df['country_and_region'].fillna('').astype(str)
df['country'] = [country_and_region.split(':')[0] for country_and_region in df['country_and_region']]
df['country'].value_counts()

In [None]:
# Save the cleaned and enriched data frame as CSV (without the index column)
cleaned_csv_path = '../data/metadata_EBI_cleaned_and_enriched.csv'
df.to_csv(cleaned_csv_path, index=False)

In [None]:
# Separate the data frame into countries appearing often and rarely
threshold = 1000

# Number of samples of the row's country, aligned with the rows of df
samples_per_country = df['country'].map(df['country'].value_counts())

# Rare = fewer than `threshold` samples; rows without a country ('') are excluded
is_rare = samples_per_country < threshold
has_country = df['country'] != ''
rare_countries = df[is_rare & has_country]
rare_countries['country'].value_counts()

In [None]:
# Keep the rows whose country occurs at least `threshold` times
country_sample_counts = df['country'].map(df['country'].value_counts())
popular_countries = df[country_sample_counts >= threshold]
popular_countries['country'].value_counts()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Plot the distribution of countries appearing at least 1,000 times, one figure per hue column
popular_country_order = popular_countries['country'].value_counts().index

for hue in ('dataclass', 'mol_type', 'keywords', 'gender'):
    fig, ax = plt.subplots(figsize=(18, 14))
    sns.countplot(x='country',
                  hue=hue,
                  order=popular_country_order,  # most frequent country first
                  data=popular_countries,
                  ax=ax)

    fig.suptitle('Distribution of Countries with >= 1,000 Samples', fontsize=20)
    ax.set_xlabel('Country', fontsize=16)
    ax.set_ylabel('Count', fontsize=16)
    for tick_label in ax.xaxis.get_majorticklabels():
        tick_label.set_rotation(45)
    ax.legend(loc='upper right', title=f'{hue.capitalize()}:')

    fig.savefig(f'../plots/popular_countries_count_by_{hue}.png')

In [None]:
# Plot the distribution of countries appearing less than 1,000 times, one figure per hue column
rare_country_order = rare_countries['country'].value_counts().index

for hue in ('dataclass', 'mol_type', 'keywords', 'gender'):
    fig, ax = plt.subplots(figsize=(36, 18))
    sns.countplot(x='country',
                  hue=hue,
                  order=rare_country_order,  # most frequent country first
                  data=rare_countries,
                  ax=ax)

    fig.suptitle('Distribution of Countries with < 1,000 Samples', fontsize=20)
    ax.set_xlabel('Country', fontsize=16)
    ax.set_ylabel('Count', fontsize=16)
    for tick_label in ax.xaxis.get_majorticklabels():
        tick_label.set_rotation(90)
    ax.legend(loc='upper right', title=f'{hue.capitalize()}:')

    fig.savefig(f'../plots/rare_countries_count_by_{hue}.png')