In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so that files
# stored on the drive can be opened by path in later cells.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import re

# Path to the annotations file on the mounted Google Drive
file_path = "/content/drive/My Drive/compscfiles/inferCNV_out/annotations.txt"

# Column layout of the parsed table. Passing it to the DataFrame constructor
# keeps these columns present even when no CNV entry could be parsed.
CNV_COLUMNS = ["cell_id", "event", "chromosome", "start", "end"]

# One CNV entry has the form "<loss|gain>_<chromosome>_<start>-<end>".
# Compiled once instead of on every entry.
# NOTE: the chromosome group is digits only, so entries naming a non-numeric
# chromosome (e.g. "X") do not match and are skipped.
CNV_PATTERN = re.compile(r"(loss|gain)_(\d+)_([\d]+)-([\d]+)")


def parse_cnv_annotations(lines):
    """Parse tab-separated ``cell_id<TAB>annotation`` lines into CNV records.

    Each annotation is either the word "normal" (case-insensitive) or a
    ';'-separated list of CNV entries matching ``CNV_PATTERN``.

    Parameters
    ----------
    lines : iterable of str
        Raw lines, e.g. an open text file.

    Returns
    -------
    list of dict
        One dict per recognised CNV entry with the keys ``cell_id``,
        ``event``, ``chromosome`` (prefixed with "chr"), ``start`` and ``end``
        (both ``int``). Blank lines, lines without an annotation field,
        "normal" cells and entries that do not match the pattern are skipped.
    """
    records = []
    for line in lines:
        line = line.strip()
        if not line:
            continue

        parts = line.split('\t')
        if len(parts) < 2:
            continue  # Skip if no annotation

        # Use only the first two fields: unpacking `parts` directly would
        # raise ValueError on a line that carries extra tab-separated columns.
        cell_id, annotation = parts[0], parts[1]
        if annotation.lower() == "normal":
            continue  # Skip normal cells

        for cnv in annotation.split(';'):
            match = CNV_PATTERN.match(cnv.strip())
            if match:
                event, chrom, start, end = match.groups()
                records.append({
                    "cell_id": cell_id,
                    "event": event,
                    "chromosome": f"chr{chrom}",
                    "start": int(start),
                    "end": int(end)
                })
    return records


# Parse the file
with open(file_path, 'r') as f:
    rows = parse_cnv_annotations(f)

# Create DataFrame (explicit columns so an empty parse still has the schema)
df = pd.DataFrame(rows, columns=CNV_COLUMNS)

# Preview
print(df.head())



              cell_id event chromosome  start  end
0  TTTCATGGTAGCTAAA-1  loss       chr1      5   20
1  TTTCATGGTAGCTAAA-1  gain       chr9    150  300
2  TGATGCAGTCTTCAAG-1  gain      chr22      5   20
3  TGATGCAGTCTTCAAG-1  loss      chr16      5   20
4  TGATGCAGTCTTCAAG-1  loss       chr1      5   20


In [None]:
import pandas as pd

# Work on an independent (deep) copy so the parsed table `df` is left untouched
df2 = df.copy(deep=True)

# Step 1: Merge overlapping CNVs per chromosome and event
def merge_overlapping_regions(sub_df):
    """Merge overlapping intervals into single regions.

    Rows are sorted by ``start`` and swept once. A row whose ``start`` is less
    than or equal to the running region's ``end`` (touching intervals count as
    overlapping) extends that region; otherwise the region is closed and a new
    one is started.

    Parameters
    ----------
    sub_df : pandas.DataFrame
        Must contain ``start`` and ``end`` columns. The caller is expected to
        pass rows that belong to one chromosome and one event type; this
        function does not check that.

    Returns
    -------
    pandas.DataFrame
        One row per merged region with the same columns as ``sub_df``. Columns
        other than ``end`` carry the values of the first (lowest ``start``)
        row of each region. An empty input yields an empty frame with the
        same columns instead of raising ``IndexError``.
    """
    # Guard: `.iloc[0]` below would raise IndexError on an empty frame.
    if sub_df.empty:
        return sub_df.iloc[0:0].copy()

    ordered = sub_df.sort_values(by='start').reset_index(drop=True)
    merged = []
    current = ordered.iloc[0].copy()

    for _, row in ordered.iloc[1:].iterrows():
        if row['start'] <= current['end']:  # overlapping (or touching)
            current['end'] = max(current['end'], row['end'])
        else:
            # `current` is rebound right after, so no defensive copy is needed
            merged.append(current)
            current = row.copy()
    merged.append(current)
    return pd.DataFrame(merged)

# Step 2: Collapse overlapping intervals separately for each
# (chromosome, event) combination
merged_dfs = []
grouped = df2.groupby(['chromosome', 'event'])
for key, group in grouped:
    chrom, event = key
    intervals = group.loc[:, ['chromosome', 'start', 'end']]
    merged = merge_overlapping_regions(intervals)
    # Tag the merged intervals with the group they came from
    merged['event'] = event
    merged['chromosome'] = chrom
    merged_dfs.append(merged)

# Stack the per-group results into a single table with a fresh 0..n-1 index
merged_regions = pd.concat(merged_dfs, axis=0, ignore_index=True)

# Step 3: Assign each row in df2 to a merged region string
def assign_region(row, regions):
    """Return the label of the first region in ``regions`` that overlaps ``row``.

    A region qualifies when it has the same ``chromosome`` and ``event`` as
    ``row`` and its ``[start, end]`` interval intersects the row's interval
    (shared endpoints count). The label has the form
    ``"<event>_<chromosome without 'chr'>_<start>-<end>"``; ``None`` is
    returned when no region qualifies.
    """
    same_chrom = regions['chromosome'] == row['chromosome']
    same_event = regions['event'] == row['event']
    intersects = (regions['start'] <= row['end']) & (regions['end'] >= row['start'])

    hits = regions[same_chrom & same_event & intersects]
    if hits.empty:
        return None

    first = hits.iloc[0]
    chrom_label = first['chromosome'].replace('chr', '')
    return f"{first['event']}_{chrom_label}_{first['start']}-{first['end']}"

# Label every CNV row with the merged region it overlaps
df2['region_str'] = df2.apply(assign_region, axis=1, args=(merged_regions,))

# Step 4: Aggregate CNVs per cell
def _join_unique_regions(region_strs):
    """Return the distinct non-null region labels, sorted and joined by ';'."""
    return ';'.join(sorted(set(region_strs.dropna())))

cell_to_cnv = (
    df2.groupby('cell_id')['region_str']
    .apply(_join_unique_regions)
    .reset_index()
)
cell_to_cnv.columns = ['cell_id', 'simulated_cnvs']

# Step 5: Count how many cells share each combined CNV pattern
cnv_counts = cell_to_cnv['simulated_cnvs'].value_counts()

# Show the result
print(cnv_counts)



simulated_cnvs
loss_2_30-100                                                             41
loss_1_5-20;loss_2_30-100                                                 25
gain_9_150-300;loss_1_5-20                                                23
loss_1_5-20                                                               20
gain_9_150-300;loss_2_30-100                                              18
                                                                          ..
gain_16_150-300;gain_4_30-100;gain_9_150-300;loss_16_5-20;loss_1_5-20      1
gain_16_150-300;gain_4_30-100;gain_8_30-100;loss_16_5-20;loss_2_30-100     1
gain_16_150-300;gain_22_5-20;loss_16_5-20                                  1
gain_8_30-100;loss_16_5-20;loss_1_5-20                                     1
gain_16_150-300;gain_8_30-100;loss_2_30-100                                1
Name: count, Length: 132, dtype: int64


In [None]:
# Columns that together identify one distinct CNV
cnv_key_cols = ['event', 'chromosome', 'start', 'end']

# Step 1: Keep one row per (cell, CNV) pair
df_detected = df2[['cell_id'] + cnv_key_cols].drop_duplicates()

# Denominator for the frequency column: distinct cells present in df2.
# df2 only holds cells with at least one parsed CNV (cells annotated "normal"
# were skipped during parsing), so the frequency is relative to those cells.
total_cells = df2['cell_id'].nunique()

# Steps 2-4: count the cells per CNV, add the frequency, sort by cell count
cnv_summary = (
    df_detected
    .groupby(cnv_key_cols)
    .agg(n_cells=('cell_id', 'nunique'))
    .reset_index()
    .assign(frequency=lambda summary: summary['n_cells'] / total_cells)
    .sort_values('n_cells', ascending=False)
)

# View
print(cnv_summary)


  event chromosome  start  end  n_cells  frequency
8  loss       chr2     30  100      314   0.566787
6  loss       chr1      5   20      250   0.451264
5  gain       chr9    150  300      224   0.404332
3  gain       chr4     30  100      156   0.281588
1  gain      chr16    150  300      155   0.279783
7  loss      chr16      5   20      146   0.263538
4  gain       chr8     30  100       48   0.086643
2  gain      chr22      5   20       48   0.086643
0  gain      chr11    150  300       30   0.054152
