In [2]:
import pandas as pd

In [3]:
def split_and_sort_ngrams(input_csv: str, output_prefix: str = "ngrams"):
    """Split an n-gram table into one frequency-sorted CSV per value of ``n``.

    The input CSV is read with pandas and its headers are stripped and
    lower-cased. It must then contain an ``n`` column (the n-gram order) and a
    ``freq`` column (the sort key). For every distinct non-missing ``n`` the
    matching rows are sorted by ``freq`` in descending order and written,
    without the index, to:

    * ``<output_prefix>_unigrams.csv``     when ``n == 1``
    * ``<output_prefix>_bigrams_llr.csv``  when ``n == 2``
    * ``<output_prefix>_<n>grams.csv``     otherwise

    Rows whose ``n`` is missing are not written anywhere. Existing files with
    the same names are overwritten.

    Args:
        input_csv: Path of the CSV file to split.
        output_prefix: Prefix (may include a directory) for the output files.

    Returns:
        None. One line is printed per file written.

    Raises:
        KeyError: If the ``n`` or ``freq`` column is absent after header
            normalisation.
    """
    df = pd.read_csv(input_csv)

    # Ensure consistent column names (" N " / "Freq" -> "n" / "freq")
    df.columns = [col.strip().lower() for col in df.columns]

    # Fail early with a message that names the file and the missing columns.
    missing = [col for col in ("n", "freq") if col not in df.columns]
    if missing:
        raise KeyError(
            f"{input_csv} is missing required column(s) {missing}; "
            f"found {list(df.columns)}"
        )

    # dropna(): a blank ``n`` cell would otherwise yield a NaN group, which
    # matches no rows and produces an empty "<prefix>_nangrams.csv".
    for n_val in sorted(df["n"].dropna().unique()):
        # A stable sort keeps rows with equal freq in their input order, so the
        # output is reproducible from run to run.
        df_n = df[df["n"] == n_val].sort_values(
            by="freq", ascending=False, kind="mergesort"
        )

        # If any ``n`` was blank the column is float64; render 3.0 as "3" so the
        # filename does not become "<prefix>_3.0grams.csv".
        label = n_val
        if isinstance(label, float) and label.is_integer():
            label = int(label)

        # Create filename based on n
        if label == 1:
            out_file = f"{output_prefix}_unigrams.csv"
        elif label == 2:
            out_file = f"{output_prefix}_bigrams_llr.csv"
        else:
            out_file = f"{output_prefix}_{label}grams.csv"

        df_n.to_csv(out_file, index=False)
        print(f"✅ Saved: {out_file}")


In [4]:
# Write one frequency-sorted CSV per n-gram order for the labelled vocabulary.
split_and_sort_ngrams(
    input_csv="mimic_cxr_training_labelled_vocab.csv",
    output_prefix="mimic_cxr_ngrams",
)

✅ Saved: mimic_cxr_ngrams_unigrams.csv
✅ Saved: mimic_cxr_ngrams_bigrams_llr.csv
✅ Saved: mimic_cxr_ngrams_3grams.csv
