**DATA CLEANING**

Import Packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

Load Dataset

In [None]:
# Read the Salmonella enterica pathogen-detection CSV; the path is relative,
# so it is resolved against the notebook's working directory.
salmonella_df = pd.read_csv(
    filepath_or_buffer="Pathogen detection Salmonella enterica.csv",
)

Basic Info about Dataset

In [None]:
# Report the frame's dimensions and column labels in a single call;
# sep="\n" places each piece on its own line.
print(
    "\nDataset Info:",
    f"Shape: {salmonella_df.shape}",
    "\nColumn names:",
    salmonella_df.columns.tolist(),
    sep="\n",
)

Missing Values Check (Serovar)

In [None]:
# Profile the Serovar column before cleaning. The three categories reported
# below (null, empty string, whitespace-only) are disjoint, so no row is
# counted under two labels.
serovar_values = salmonella_df['Serovar']
serovar_stripped = serovar_values.str.strip()  # missing values stay missing

print("\nSerovar column analysis:")
print(f"Total rows: {len(salmonella_df)}")
print(f"Missing/null Serovar values: {serovar_values.isnull().sum()}")
print(f"Empty string Serovar values: {(serovar_values == '').sum()}")

# An empty string also strips to '', so it must be excluded here; otherwise it
# would be reported both as "empty" and as "whitespace-only".
whitespace_count = (serovar_stripped.eq('') & serovar_values.ne('')).sum()
print(f"Whitespace-only Serovar values: {whitespace_count}")

Remove rows where Serovar is missing, empty, or whitespace-only

In [None]:
# Drop rows whose Serovar is missing, empty, or whitespace-only, and report
# how much of the dataset survives.
initial_count = len(salmonella_df)

# .str.strip() leaves missing values missing, so the explicit notna() test is
# still required. An empty string strips to '', so one comparison on the
# stripped values covers both the empty and the whitespace-only case.
serovar_stripped = salmonella_df['Serovar'].str.strip()
mask_to_keep = salmonella_df['Serovar'].notna() & (serovar_stripped != '')

# Boolean indexing followed by a non-inplace reset_index produces a new frame
# with a fresh 0..n-1 index; salmonella_df itself is never modified, so this
# cell can be re-run safely.
salmonella_clean = salmonella_df[mask_to_keep].reset_index(drop=True)

final_count = len(salmonella_clean)
removed_count = initial_count - final_count

# Guard the ratio: a file with a header but no data rows would otherwise
# raise ZeroDivisionError here.
retained_pct = (final_count / initial_count) * 100 if initial_count else 0.0

print(f"Rows removed: {removed_count}")
print(f"Remaining rows: {final_count}")
print(f"Percentage of data retained: {retained_pct:.2f}%")

NameError: name 'salmonella_df' is not defined

Display the most common Serovars and AMR Genotypes

In [None]:
# Ten most frequent Serovar labels in the cleaned frame; value_counts()
# orders them by descending frequency.
serovar_counts = salmonella_clean['Serovar'].value_counts().head(10)
print(f"\nTop 10 most common Serovars after cleaning:", serovar_counts, sep="\n")

# How many cleaned rows do / do not carry an 'AMR genotypes' value.
amr_column = salmonella_clean['AMR genotypes']
print(f"\nAMR Genotypes Analysis:")
print(f"Rows with AMR genotypes data: {amr_column.notna().sum()}")
print(f"Rows without AMR genotypes data: {amr_column.isna().sum()}")

Parsing AMR Genotypes

In [None]:
def parse_amr_genotypes(amr_string):
    """
    Turn a comma-separated "gene=status" string into a dictionary.

    Each comma-separated entry that contains '=' is split on its first '=';
    the text before it becomes the key and the text after it the value, both
    with surrounding whitespace removed. Entries without '=' are ignored.
    If a gene appears more than once, the last status wins.

    Parameters:
        amr_string: the raw genotype string, or a missing value (NaN/None).

    Returns:
        dict mapping gene name -> status; empty for missing or '' input.
    """
    # Missing values and empty strings carry no genes.
    if pd.isna(amr_string) or amr_string == '':
        return {}

    entries = (chunk.strip() for chunk in amr_string.split(','))
    # maxsplit=1 keeps any further '=' characters inside the status text.
    pairs = (entry.split('=', 1) for entry in entries if '=' in entry)
    return {name.strip(): state.strip() for name, state in pairs}

# Build the AMR_parsed column: one gene -> status dict per row, produced by
# running parse_amr_genotypes over the raw 'AMR genotypes' values.
print("Parsing AMR genotypes...")
raw_amr_strings = salmonella_clean['AMR genotypes']
salmonella_clean['AMR_parsed'] = raw_amr_strings.apply(parse_amr_genotypes)

Counts of unique AMR genes

In [None]:
# Flatten every row's parsed dict into one list of gene names; a gene shows
# up once for each row that carries it.
all_genes = [
    gene_name
    for parsed_row in salmonella_clean['AMR_parsed']
    for gene_name in parsed_row.keys()
]

# Tally how often each gene occurs across all rows.
gene_counts = Counter(all_genes)

unique_genes = list(set(all_genes))
print(f"Total unique AMR genes found: {len(unique_genes)}")

print(f"\nTop 10 most common AMR genes:")
for gene, count in gene_counts.most_common(10):
    print(f"{gene}: {count}")

Helper functions for the per-Serovar AMR summary

In [None]:
def count_amr_genes(amr_dict):
    """
    Return how many AMR genes a sample carries.

    Parameters:
        amr_dict: mapping of gene name -> status for one sample.

    Returns:
        The number of entries in amr_dict.
    """
    total_genes = len(amr_dict)
    return total_genes

def count_complete_genes(amr_dict):
    """
    Return how many genes in a sample have the status 'COMPLETE'.

    Parameters:
        amr_dict: mapping of gene name -> status for one sample.

    Returns:
        The number of values exactly equal to 'COMPLETE' (case-sensitive).
    """
    complete_total = 0
    for gene_status in amr_dict.values():
        if gene_status == 'COMPLETE':
            complete_total += 1
    return complete_total

Add summary columns to `salmonella_clean`

In [None]:
# Per-row gene totals derived from the parsed dicts.
parsed_amr = salmonella_clean['AMR_parsed']
salmonella_clean['AMR_gene_count'] = parsed_amr.apply(count_amr_genes)
salmonella_clean['AMR_complete_count'] = parsed_amr.apply(count_complete_genes)

# Statistics to compute for each Serovar group, keyed by column.
amr_aggregations = {
    'AMR_gene_count': ['count', 'mean', 'std'],
    'AMR_complete_count': ['mean', 'std'],
}

# One row per Serovar, values rounded to two decimals.
serovar_amr_summary = (
    salmonella_clean
    .groupby('Serovar')
    .agg(amr_aggregations)
    .round(2)
)

Save Cleaned Serovar Data

In [None]:
output_filename = "Salmonella_enterica_cleaned.csv"

# AMR_parsed holds the parsed per-row dicts (a derived helper column), so it
# is left out of the export; the count columns derived from it are kept.
export_frame = salmonella_clean.drop(columns=['AMR_parsed'])
export_frame.to_csv(output_filename, index=False)
print(f"\nCleaned dataset saved as '{output_filename}'")

# Report the row count of the frame that was actually written, rather than a
# counter left over from an earlier cell that may be stale or undefined when
# cells are run out of order.
print(f"\nData cleaning complete! Your cleaned dataset has {len(export_frame)} rows and is ready for AMR Serovar analysis.")