In [2]:
import pandas as pd
import io

# --- Step 1: Load data ---
df = pd.read_csv("data/song_lyrics.csv")

# --- Step 2: Write the exploration report ---
# The report is a plain-text file containing, in order: the frame's shape,
# the output of DataFrame.info(), per-column missing-value counts, and a
# per-column breakdown (value counts or a describe() summary).
with open("data_exploration.txt", "w", encoding="utf-8") as f:
    # Basic info
    f.write("DATASET EXPLORATION REPORT\n")
    f.write("=" * 40 + "\n\n")
    f.write(f"Shape: {df.shape[0]} rows × {df.shape[1]} columns\n\n")

    # Column info: DataFrame.info() prints to a stream instead of returning
    # a string, so capture it in an in-memory buffer first.
    buffer = io.StringIO()
    df.info(buf=buffer)
    f.write("BASIC INFO\n")
    f.write(buffer.getvalue() + "\n")

    # Missing values (count of nulls per column)
    f.write("MISSING VALUES\n")
    f.write(df.isnull().sum().to_string() + "\n\n")

    # Column analysis
    for col in df.columns:
        series = df[col]
        f.write(f"\n--- {col} ---\n")
        nunique = series.nunique()
        f.write(f"Unique values: {nunique}\n")

        # Treat a column as categorical when it is non-numeric or has few
        # distinct values. Checking "is not numeric" rather than
        # `dtype == "object"` keeps text columns out of the numeric branch
        # when pandas stores them with a dedicated string dtype instead of
        # object.
        is_categorical_like = (
            not pd.api.types.is_numeric_dtype(series.dtype) or nunique < 50
        )

        if is_categorical_like:
            # dropna=False so missing values show up as their own bucket.
            value_counts = series.value_counts(dropna=False)
            f.write("\nMost popular 5:\n")
            f.write(value_counts.head(5).to_string() + "\n")
            f.write("\nLeast popular 5:\n")
            f.write(value_counts.tail(5).to_string() + "\n")
        else:
            f.write("\nNumeric column summary:\n")
            f.write(series.describe().to_string() + "\n")

print("✅ Report saved to data_exploration.txt")


✅ Report saved to data_exploration.txt
