# Data understanding and Exploratory analysis (EDA)

Dataset: 2 March 2022 from GDrive
https://drive.google.com/drive/folders/1hm4YNib_kI5Y4k7xp4E47w8qDjUIsagA?usp=sharing

## Data Processing

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load dataset
fn_sed = "../Dataset/Sediment_Prevalence_10__commat_2021-09-29.csv"
fn_water = "../Dataset/Water_Prevalence_10__commat_2021-09-29.csv"
df_sed = pd.read_csv(fn_sed)
print('Sediment: ' + str(df_sed.shape))

df_water = pd.read_csv(fn_water)
print('Water: ' + str(df_water.shape))

# Data quality check
print(df_sed.columns.nunique())
print(df_water.columns.nunique())

# Check how many overlapping CF
print(len(set(df_sed.columns).intersection(set(df_water.columns))))

# Join datasets
df = pd.concat([df_sed, df_water], join="outer")
print(df.shape)

# Check if there is any identical columns / CF across all samples
print("Duplicated column: " + str(df.columns.duplicated().any()))

# Pre-processing
df = df.rename(columns={"Unnamed: 0": "sample_id"})
df = df.set_index("sample_id").fillna(0)
df = df.apply(pd.to_numeric).astype(int).reset_index()
df['type'] = df['sample_id'].str.split('_').str[0]
print(df.shape)
df.head()

Sediment: (239, 4053)
Water: (265, 4936)
4053
4936
2109
(504, 6880)
Duplicated column: False
(504, 6881)


Unnamed: 0,sample_id,C10H10O5,C10H10O5N4,C10H10O6,C10H10O6N4,C10H10O7,C10H10O8,C10H11O4N5,C10H11O5N,C10H11O5N5,...,C9H14O6S,C9H16O5S,C9H16O9N4S,C9H20O5NS2P,C9H7O5N,C9H7O6N,C9H8O6S,C9H8O8,C9H9O6N,type
0,SED_S19S.0001_D_Field,1,1,1,1,1,1,0,0,1,...,0,0,0,0,0,0,0,0,0,SED
1,SED_S19S.0001_M_Field,1,1,1,1,1,1,0,0,1,...,0,0,0,0,0,0,0,0,0,SED
2,SED_S19S.0001_U_Field,1,1,1,1,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,SED
3,SED_S19S.0003_D_Field,1,0,1,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,SED
4,SED_S19S.0003_M_Field,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,SED


## Clustering the samples based on the CF, suggested not to reduce the dimension of CF for now

In [3]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler