# Exploratory Data Analysis & Visualization

## Prerequisites

In [None]:
# Install the EDA and visualisation packages used by this notebook.
# NOTE(review): no versions are pinned on this line, so pip resolves whatever
# release is current; pin versions here if reproducible installs are required.
# NOTE(review): kaleido is presumably needed for the fig.write_image() call
# further down — confirm before removing it.
!pip install sweetviz ydata-profiling kaleido --quiet

In [None]:
import pandas as pd
import sweetviz as sv
from ydata_profiling import ProfileReport
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import numpy as np

## Automated EDA Reports

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Map each distinct value of the SEQUENCE column to its occurrence count.
sequence_counts = df['SEQUENCE'].value_counts()
frequencies = sequence_counts.to_dict()

# An empty mapping cannot be rendered, so stop early with a clear error.
if len(frequencies) == 0:
    raise ValueError("❌ No valid sequence frequencies found for word cloud.")

# Configure the renderer first, then feed it the counts; only the 100 most
# frequent entries are drawn.
cloud_renderer = WordCloud(
    width=800,
    height=400,
    background_color='white',
    max_words=100,
)
wordcloud = cloud_renderer.generate_from_frequencies(frequencies)

# Draw the cloud on a single axes with the frame and ticks hidden.
_, cloud_axes = plt.subplots(figsize=(10, 5))
cloud_axes.imshow(wordcloud, interpolation='bilinear')
cloud_axes.axis("off")
cloud_axes.set_title("🧬 Word Cloud of Most Frequent Sequences")
plt.show()


In [None]:
# Every column is cast to str before analysis; the original author added this
# cast as a workaround for a mixed-type failure in Sweetviz.
stringified_df = df.astype(str)
sweetviz_report = sv.analyze(stringified_df)

# Write the report to an HTML file and confirm where it went.
sweetviz_report.show_html("eda_sweetviz_report.html")
print("✅ Sweetviz report saved to: eda_sweetviz_report.html")

## Sequence Visualizations

### Standardization of data

In [None]:
# Show the DataFrame's current column labels.
column_names = df.columns.tolist()
print(column_names)


In [None]:
# Normalise every column label in place: trim surrounding whitespace and
# upper-case it, so later cells can address columns by one canonical spelling.
df.rename(
    columns=lambda header: header.strip().upper(),
    inplace=True,
)

In [None]:
# Show the DataFrame's current column labels.
current_columns = df.columns.tolist()
print(current_columns)

In [None]:
# Sequence length distribution.
# Store the character count of each SEQUENCE entry in a new 'Length' column.
df['Length'] = df['SEQUENCE'].str.len()

# histplot returns the axes it drew on, so labels are set on that object
# directly; a KDE curve is overlaid on the 50-bin histogram.
length_axes = sns.histplot(df['Length'], kde=True, bins=50)
length_axes.set_title("🧬 Distribution of RNA Sequence Lengths")
length_axes.set_xlabel("Sequence Length")
length_axes.set_ylabel("Count")
plt.show()

In [None]:
# Ten most frequent sequences.
# value_counts() sorts by descending frequency, so the first ten rows are the
# ten most common SEQUENCE values.
sequence_counts = df['SEQUENCE'].value_counts()
top_snippets = sequence_counts.head(10)

snippet_axes = top_snippets.plot.barh(
    title="🔝 Top 10 Most Frequent Gene Sequences",
    color='mediumseagreen',
)
snippet_axes.set_xlabel("Frequency")
# Flip the y-axis so the most frequent sequence is drawn at the top.
snippet_axes.invert_yaxis()
plt.show()

In [None]:
# ✅ GC Content Heatmap
def gc_content(seq):
    """Return the fraction of G and C characters in ``seq``.

    Counting is case-insensitive, so lowercase (e.g. soft-masked) bases are
    included rather than silently reported as zero GC.

    Args:
        seq: The sequence string to inspect.

    Returns:
        A float between 0.0 and 1.0: the number of G/C characters divided by
        the length of ``seq``. An empty sequence yields 0.0.
    """
    total = len(seq)
    if total == 0:
        # Avoid dividing by zero; an empty sequence has no GC content.
        return 0.0
    # Upper-case a copy for counting only; the denominator stays the original
    # length so the ratio is always relative to the input as given.
    bases = seq.upper()
    return (bases.count('G') + bases.count('C')) / total

# Compute the GC fraction for every sequence and store it as a new column.
gc_fractions = df['SEQUENCE'].apply(gc_content)
df['GC_Content'] = gc_fractions

# Transpose the single column so the heatmap is one row with one cell per
# sequence.
gc_row = df[['GC_Content']].T
gc_axes = sns.heatmap(gc_row, cmap="viridis", cbar=True)
gc_axes.set_title("🌡️ Heatmap of GC Content in Sequences")
plt.show()

## Visual Storytelling Snippets

In [None]:
# Storytelling figure: interactive histogram of the 'Length' column, with the
# caption carried in the figure title.
fig = px.histogram(
    df,
    x='Length',
    nbins=60,
    title="📈 DNA Sequence Lengths (~600K markers scale)",
)
axis_titles = {"xaxis_title": "Sequence Length", "yaxis_title": "Frequency"}
fig.update_layout(**axis_titles)

# Export a static PNG (kept for blog/video caption use), then render the
# interactive version inline.
fig.write_image("story_dna_lengths.png")
fig.show()

In [None]:
# Storytelling caption: quote the first 20 characters of the first sequence.
first_sequence = df['SEQUENCE'].iloc[0]
example_kmers = first_sequence[:20]
print(f"🔬 'Here are real human genes from NCBI, sliced into k-mers, ready for AI: {example_kmers}'")