In [None]:
import os
import random
import shutil
import subprocess
import warnings
import zipfile
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from Bio import SeqIO
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

warnings.filterwarnings('ignore')

## Explore train, test and subject info

In [None]:
# Read the four CSV files from the working directory into DataFrames.
# The unpacking order matches the order of the file names below.
train_df, test_df, train_subjects_df, ss = map(
    pd.read_csv,
    ("Train.csv", "Test.csv", "Train_Subjects.csv", "SampleSubmission.csv"),
)

In [None]:
# Distinct-value counts for the key columns of the training table.
for count_label, count_column in (
    ("subjects", "SubjectID"),
    ("SampleType", "SampleType"),
    ("Files", "filename"),
):
    print(f"Number of unique {count_label} in train_df: {train_df[count_column].nunique()}")
print(f"Unique values in SampleType: {train_df['SampleType'].unique()}")

# Non-null file names per subject, as a two-column frame (SubjectID, FileCount).
files_per_subject = (
    train_df.groupby('SubjectID')['filename']
    .count()
    .reset_index(name='FileCount')
)
average_files_per_subject = files_per_subject['FileCount'].mean()
print(f"Average number of files per SubjectID in train_df: {average_files_per_subject:.2f}")

# Histogram of how many files each subject contributes.
fig_files, ax_files = plt.subplots(figsize=(10, 6))
ax_files.hist(files_per_subject['FileCount'], bins=20, edgecolor='black')
ax_files.set_title('Distribution of Files per SubjectID in Train Data')
ax_files.set_xlabel('Number of Files')
ax_files.set_ylabel('Number of SubjectIDs')
ax_files.grid(axis='y', alpha=0.75)
plt.show()

# Last expression of the cell: preview the first rows of the training table.
train_df.head()

In [None]:
# Count distinct file names in the test table, then preview its first rows.
n_test_files = test_df['filename'].nunique()
print(f"Number of unique Files in test_df: {n_test_files}")
test_df.head()

## Explore fastq files

In [None]:
fastq_dir = "TrainFiles"

# Column order of the per-file statistics table. Declared up front so the
# frame keeps these columns even when no FASTQ file yields any reads.
FASTQ_STATS_COLUMNS = [
    "file", "num_reads", "avg_read_length", "avg_gc_content",
    "A", "T", "G", "C",
]


def summarize_fastq(path):
    """Stream one FASTQ file and compute its summary statistics.

    Reads are consumed one at a time with running totals, so memory use does
    not grow with the number of reads in the file.

    Parameters
    ----------
    path : str
        Path of the FASTQ file to parse.

    Returns
    -------
    dict or None
        ``None`` when the file contains no reads. Otherwise a dict with:
        ``file`` (base name of ``path``), ``num_reads``, ``avg_read_length``
        (total bases / reads), ``avg_gc_content`` (G + C characters / total
        bases, NaN when every read is empty) and the raw ``A``/``T``/``G``/``C``
        character counts. Only upper-case letters are counted, exactly as the
        sequences are returned by the parser.
    """
    num_reads = 0
    total_bases = 0
    gc_bases = 0
    nt_counts = Counter()

    for record in SeqIO.parse(path, "fastq"):
        seq = str(record.seq)
        num_reads += 1
        total_bases += len(seq)
        gc_bases += seq.count("G") + seq.count("C")
        nt_counts.update(seq)

    if num_reads == 0:
        return None

    return {
        "file": os.path.basename(path),
        "num_reads": num_reads,
        "avg_read_length": total_bases / num_reads,
        # Guard against files whose reads are all zero-length.
        "avg_gc_content": gc_bases / total_bases if total_bases else float("nan"),
        "A": nt_counts["A"],
        "T": nt_counts["T"],
        "G": nt_counts["G"],
        "C": nt_counts["C"],
    }


all_stats = []

# os.listdir returns entries in arbitrary order; sort so the rows of the
# resulting table are in the same order on every run.
for fname in sorted(os.listdir(fastq_dir)):
    if not fname.endswith(".fastq"):
        continue

    stats = summarize_fastq(os.path.join(fastq_dir, fname))
    if stats is not None:
        all_stats.append(stats)

# Convert to DataFrame
df = pd.DataFrame(all_stats, columns=FASTQ_STATS_COLUMNS)
df.head()

In [None]:
# Print the column dtypes and non-null counts of the per-file statistics table.
df.info()

# Histogram of the per-file average GC content.
fig_gc, ax_gc = plt.subplots()
ax_gc.hist(df["avg_gc_content"], bins=20, color='gray')
ax_gc.set_title("GC Content Distribution")
ax_gc.set_xlabel("GC Content")
ax_gc.set_ylabel("Number of Files")
plt.show()

In [None]:
# Define your folder and labels
fastq_dir = "TrainFiles"
labels = ['Nasal', 'Stool', 'Mouth', 'Skin']
LABEL_SEED = 42  # fixed seed so the random labels are identical on every run

# NOTE(review): the labels below are drawn at random, NOT read from
# train_df['SampleType']. They are placeholders only; anything trained or
# plotted on them carries no real signal. Replace with a join against
# train_df once the filename mapping is confirmed.

# List all .fastq files in the folder. Sorted because os.listdir order is
# arbitrary, which would otherwise change which file gets which label.
file_list = sorted(f for f in os.listdir(fastq_dir) if f.endswith(".fastq"))

# Use a dedicated, seeded generator instead of the global `random` state so
# this cell is reproducible and does not depend on what ran before it.
label_rng = random.Random(LABEL_SEED)

# Assign a random label to each file
data = {
    "filename": file_list,
    "label": [label_rng.choice(labels) for _ in file_list]
}

# Create the DataFrame
df_labels = pd.DataFrame(data)

# Optional: sort for cleaner viewing
df_labels = df_labels.sort_values("filename").reset_index(drop=True)

# Show the result
df_labels.head()

In [None]:
# Bar chart of how many files carry each label.
# value_counts() returns the labels ordered by descending frequency.
label_counts = df_labels['label'].value_counts()

fig_labels, ax_labels = plt.subplots(figsize=(8, 6))
label_counts.plot(kind='bar', color='skyblue', ax=ax_labels)
ax_labels.set_title('Distribution of Labels')
ax_labels.set_xlabel('Label')
ax_labels.set_ylabel('Count')
# Tilt the category names so longer labels do not overlap.
plt.setp(ax_labels.get_xticklabels(), rotation=45, ha='right')
fig_labels.tight_layout()

In [None]:
# Correlation heatmap of the per-file FASTQ statistics.
# `df` also holds the string column "file"; since pandas 2.0 DataFrame.corr()
# no longer drops non-numeric columns silently and raises instead, so restrict
# to numeric columns explicitly (select_dtypes works on every pandas version).
numeric_stats = df.select_dtypes(include="number")
sns.heatmap(numeric_stats.corr(), annot=True)
plt.show()

## Extract features