# 🎯 Visual EDA - Feature Plots

### 📦 Install & Import 🔧

In [None]:
!pip install sweetviz pandas-profiling matplotlib seaborn numpy==1.24.4

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sweetviz as sv
import os

### 🔍 Run Sweetviz Report

In [None]:
# Load the cleaned FASTA table that the report will summarize.
fasta_df = pd.read_csv("data/processed/human_fasta_clean.csv")

# Make sure the report folder exists before anything is written into it;
# exist_ok=True keeps re-running this cell harmless.
os.makedirs("eda", exist_ok=True)

# Build the Sweetviz analysis of the dataframe and save it as an HTML report.
report = sv.analyze(fasta_df)
report.show_html("eda/fasta_eda_report.html")

print("✅ Sweetviz report generated → eda/fasta_eda_report.html")

### 📊 Top 20 Most Frequent 6-mers

In [None]:
# Load k-mer encoded data
kmer_df = pd.read_csv("data/processed/fasta_kmer_6mer.csv")

# Total every k-mer column and keep the 20 largest totals.
# numeric_only=True guards against the CSV carrying non-numeric columns
# (e.g. sequence identifiers), which would otherwise be string-concatenated
# or break the sort.
kmer_sums = (
    kmer_df.sum(numeric_only=True)
    .sort_values(ascending=False)
    .head(20)
)

# Horizontal bars: total on the x-axis, k-mer name on the y-axis.
plt.figure(figsize=(10, 5))
sns.barplot(x=kmer_sums.values, y=kmer_sums.index, palette="viridis")
plt.title("Top 20 Most Frequent 6-mers")
plt.xlabel("Frequency")
plt.ylabel("6-mer")
plt.tight_layout()

# savefig does not create missing parent directories, so create the output
# folder up front; exist_ok=True makes this safe to re-run.
os.makedirs("data/outputs", exist_ok=True)
# Save before show() so the written image is the fully drawn figure.
plt.savefig("data/outputs/top_20_kmers.png")
plt.show()

### 📏 Sequence Length Distribution

In [None]:
# Reload the cleaned FASTA table (presumably so this cell can be run without
# the Sweetviz cell having been executed first).
fasta_df = pd.read_csv("data/processed/human_fasta_clean.csv")

# Histogram of the "Length" column in 30 bins with a KDE curve overlaid.
plt.figure(figsize=(10, 4))
sns.histplot(fasta_df["Length"], bins=30, kde=True, color="skyblue")
plt.title("Distribution of Sequence Lengths")
plt.xlabel("Length (bp)")
plt.ylabel("Count")
plt.tight_layout()

# savefig does not create missing parent directories, so create the output
# folder up front; exist_ok=True makes this safe to re-run.
os.makedirs("data/outputs", exist_ok=True)
# Save before show() so the written image is the fully drawn figure.
plt.savefig("data/outputs/sequence_length_distribution.png")
plt.show()